In [None]:
# Step 1: Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.preprocessing import LabelEncoder

# Step 2: Load the dataset
df = pd.read_csv("data.csv")

# Step 3: Data Preprocessing
# Replace the integer codes of 'weathersit' and 'season' with readable labels.
# Both are nominal (unordered) categories; the labels are used to name the
# indicator columns created below.
df['weathersit'] = df['weathersit'].map({1: 'Clear', 2: 'Mist', 3: 'Light Snow', 4: 'Heavy Snow'})
df['season'] = df['season'].map({1: 'Spring', 2: 'Summer', 3: 'Fall', 4: 'Winter'})

# Series.map turns any code that is missing from the dictionaries above into
# NaN, and get_dummies would then silently fold those rows into the baseline
# category. Fail loudly instead.
categorical_cols = ['season', 'weathersit']
bad_cols = [col for col in categorical_cols if df[col].isna().any()]
if bad_cols:
    raise ValueError(f"Unmapped or missing category codes in columns: {bad_cols}")

# 'yr' is kept as its 0/1 code (0 = 2018, 1 = 2019). A binary indicator can be
# used directly by a linear model, so it needs no further encoding.

# One-hot encode the nominal variables. Integer label encoding would impose an
# arbitrary (alphabetical) order and spacing on the categories, which a linear
# model would treat as a real numeric scale. drop_first=True removes one
# indicator per variable so the remaining ones are not perfectly collinear
# with the intercept; the dropped category acts as the baseline.
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True, dtype=int)

# Step 4: Target variable 'cnt' and feature selection
# 'casual' and 'registered' are removed together with the target
# (NOTE(review): presumably they sum to 'cnt' and would leak it - confirm).
X = df.drop(['cnt', 'casual', 'registered'], axis=1)
y = df['cnt']

# LinearRegression requires numeric inputs; a leftover text column would make
# fit() raise. Report and drop such columns explicitly.
# NOTE(review): data.csv is not visible here - confirm which columns (e.g. a
# raw date string) this actually removes, if any.
non_numeric_cols = X.select_dtypes(exclude=['number', 'bool']).columns.tolist()
if non_numeric_cols:
    print("Dropping non-numeric feature columns: ", non_numeric_cols)
    X = X.drop(columns=non_numeric_cols)

# Step 5: Train-Test Split (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 6: Build the Linear Regression Model
model = LinearRegression()
model.fit(X_train, y_train)

# Step 7: Predictions
y_pred = model.predict(X_test)

# Step 8: Evaluate Model
# Calculate R-squared score on the held-out test set
r2 = r2_score(y_test, y_pred)
print("R-squared score: ", r2)

# Optional: Plot residuals (actual minus predicted) against the predictions
residuals = y_test - y_pred
plt.scatter(y_pred, residuals)
plt.xlabel('Predicted Values')
plt.ylabel('Residuals')
plt.title('Residual Plot')
plt.show()

# Step 9: Conclusion
# You can now analyze the R-squared score to understand how well the model is performing.


In [None]:
# Step 1: Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.preprocessing import LabelEncoder

# Step 2: Load the dataset
# The data is read from a CSV file called "data.csv" in the working directory.
df = pd.read_csv("data.csv")

# Step 3: Data Preprocessing
# Replace the integer codes of 'weathersit' and 'season' with readable labels.
# Both are nominal (unordered) categories; the labels are used to name the
# indicator columns created below.
df['weathersit'] = df['weathersit'].map({1: 'Clear', 2: 'Mist', 3: 'Light Snow', 4: 'Heavy Snow'})
df['season'] = df['season'].map({1: 'Spring', 2: 'Summer', 3: 'Fall', 4: 'Winter'})

# Series.map turns any code that is missing from the dictionaries above into
# NaN, and get_dummies would then silently fold those rows into the baseline
# category. Fail loudly instead.
categorical_cols = ['season', 'weathersit']
bad_cols = [col for col in categorical_cols if df[col].isna().any()]
if bad_cols:
    raise ValueError(f"Unmapped or missing category codes in columns: {bad_cols}")

# 'yr' is kept as its 0/1 code (0 = 2018, 1 = 2019). A binary indicator can be
# used directly by a linear model, so it needs no further encoding.

# One-hot encode the nominal variables. Integer label encoding would impose an
# arbitrary (alphabetical) order and spacing on the categories, which a linear
# model would treat as a real numeric scale. drop_first=True removes one
# indicator per variable so the remaining ones are not perfectly collinear
# with the intercept; the dropped category acts as the baseline.
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True, dtype=int)

# Step 4: Target variable 'cnt' and feature selection
# 'casual' and 'registered' are removed together with the target
# (NOTE(review): presumably they sum to 'cnt' and would leak it - confirm).
X = df.drop(['cnt', 'casual', 'registered'], axis=1)
y = df['cnt']

# LinearRegression requires numeric inputs; a leftover text column would make
# fit() raise. Report and drop such columns explicitly.
# NOTE(review): data.csv is not visible here - confirm which columns (e.g. a
# raw date string) this actually removes, if any.
non_numeric_cols = X.select_dtypes(exclude=['number', 'bool']).columns.tolist()
if non_numeric_cols:
    print("Dropping non-numeric feature columns: ", non_numeric_cols)
    X = X.drop(columns=non_numeric_cols)

# Step 5: Train-Test Split (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 6: Build the Linear Regression Model
model = LinearRegression()
model.fit(X_train, y_train)

# Step 7: Predictions
y_pred = model.predict(X_test)

# Step 8: Evaluate Model
# Calculate R-squared score on the held-out test set
r2 = r2_score(y_test, y_pred)
print("R-squared score: ", r2)

# Optional: Plot residuals (actual minus predicted) against the predictions
residuals = y_test - y_pred
plt.scatter(y_pred, residuals)
plt.xlabel('Predicted Values')
plt.ylabel('Residuals')
plt.title('Residual Plot')
plt.show()

# Step 9: Conclusion
# The R-squared value indicates the goodness of fit of the model.
# A higher value of R-squared means a better fit of the model to the data.
