In [None]:
# ---------------------------------------
# 1. Import Libraries
# ---------------------------------------
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm

In [None]:
# ---------------------------------------
# 2. Load Dataset
# ---------------------------------------
# Read the CSV into `df`; the path is relative to the notebook's working
# directory.
df = pd.read_csv('data/day.csv')  # Adjust path if needed

# Only the final expression of a cell is rendered automatically, so a bare
# `df.head()` in the middle of the cell would be silently discarded.
# `display` is provided by the IPython/Jupyter kernel without an import.
display(df.head())

# Basic info: df.info() prints column dtypes and non-null counts itself;
# the describe() table is rendered as the cell's output.
df.info()
df.describe()

In [None]:
# ---------------------------------------
# 3. Data Preprocessing
# ---------------------------------------

# Convert numeric categorical variables to strings so that get_dummies
# treats them as categories rather than ordinal numbers.
season_labels = {1:'Spring', 2:'Summer', 3:'Fall', 4:'Winter'}
weathersit_labels = {1:'Clear', 2:'Mist', 3:'Light Snow/Rain', 4:'Heavy Rain/Snow'}
df['season'] = df['season'].map(season_labels)
df['weathersit'] = df['weathersit'].map(weathersit_labels)

# Series.map() turns any value missing from the dict into NaN without
# warning, and get_dummies would then encode that row as all zeros, which
# is indistinguishable from the dropped baseline level. Fail fast instead.
assert df[['season', 'weathersit']].notna().all().all(), \
    "Unmapped season/weathersit code found (or this cell was run twice)"

df['mnth'] = df['mnth'].astype(str)
df['weekday'] = df['weekday'].astype(str)

# Drop unnecessary columns (reassign instead of mutating in place)
df = df.drop(columns=['instant', 'dteday', 'casual', 'registered'])

# Create dummy variables. dtype=int keeps every column numeric: recent
# pandas versions default to bool dummies, and a frame mixing bool with
# int/float columns yields an object array from `.values`, which numeric
# routines downstream (e.g. the VIF computation) cannot handle.
df = pd.get_dummies(df, drop_first=True, dtype=int)

In [None]:
# ---------------------------------------
# 4. Exploratory Data Analysis (EDA)
# ---------------------------------------

# Annotated heatmap of the pairwise correlations between all columns of df
corr_matrix = df.corr()
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()

# Scatter-matrix restricted to the continuous predictors and the target
pairplot_columns = ['temp', 'atemp', 'hum', 'windspeed', 'cnt']
sns.pairplot(df[pairplot_columns])
plt.show()

In [None]:
# ---------------------------------------
# 5. Split Dataset
# ---------------------------------------
# Target is the 'cnt' column; every remaining column is a predictor.
y = df['cnt']
X = df.drop(columns=['cnt'])

# Hold out 20% of the rows for testing; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
# ---------------------------------------
# 6. Feature Scaling
# ---------------------------------------
# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits so no test-set information leaks into
# the fitted scaler.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:
# ---------------------------------------
# 7. Build Linear Regression Model
# ---------------------------------------
# fit() returns the estimator itself, so construction and fitting chain.
lr = LinearRegression().fit(X_train_scaled, y_train)

# Predictions for the held-out rows, used by the evaluation below
y_pred = lr.predict(X_test_scaled)


In [None]:
# ---------------------------------------
# 8. Model Evaluation
# ---------------------------------------
# Coefficient of determination on the held-out split
r2 = r2_score(y_test, y_pred)
print(f"R-squared score on test set: {r2:.4f}")

# Residual analysis: observed minus predicted, in the units of the target
residuals = y_test - y_pred

# Residual plot: a structureless band around zero is what a well-specified
# linear model should produce.
plt.figure(figsize=(8,5))
plt.scatter(y_pred, residuals)
plt.axhline(0, color='red', linestyle='--')
plt.xlabel('Predicted')
plt.ylabel('Residuals')
plt.title('Residuals vs Predicted')
plt.show()

# Q-Q plot for normality of residuals.
# The 45-degree reference line is only meaningful when sample and
# theoretical quantiles share a scale. The residuals are raw (unstandardised)
# values, so fit=True is required: it fits the normal loc/scale to the sample
# and forms the quantiles from the standardised data.
sm.qqplot(residuals, line='45', fit=True)
plt.title("Q-Q Plot of Residuals")
plt.show()

In [None]:
# ---------------------------------------
# 9. Check Multicollinearity (VIF)
# ---------------------------------------
# Build the design matrix the VIF regressions run on:
#  * cast to float so `.values` is a plain numeric array -- if X mixes bool
#    dummy columns with int/float columns, `.values` is an object array,
#    which the OLS fits inside variance_inflation_factor cannot process;
#  * add an intercept column, because variance_inflation_factor does not add
#    one itself and the regression model above is fitted with an intercept.
X_vif = sm.add_constant(X.astype(float))
vif_exog = X_vif.values

# Report a VIF for each real feature only (the constant is not a feature).
# Columns are looked up by name so the index stays correct whether or not
# add_constant actually prepended a column.
vif_data = pd.DataFrame({
    "feature": X.columns,
    "VIF": [
        variance_inflation_factor(vif_exog, X_vif.columns.get_loc(col))
        for col in X.columns
    ],
})
print(vif_data.sort_values(by='VIF', ascending=False))


In [None]:
# ---------------------------------------
# 10. Top Features by Coefficient
# ---------------------------------------
# Pair every predictor name with its fitted coefficient.
coef_df = pd.DataFrame({'Feature': X.columns, 'Coefficient': lr.coef_})

# Rank by absolute value: sorting on the signed coefficient would list only
# the largest positive effects and hide strongly negative ones, which
# influence the prediction just as much. The sign is kept in the output.
magnitude_order = coef_df['Coefficient'].abs().sort_values(ascending=False).index
coef_df = coef_df.loc[magnitude_order]
print("Top 10 features by coefficient magnitude:")
print(coef_df.head(10))


In [None]:
# ---------------------------------------
# 11. Conclusion
# ---------------------------------------
# Text of the closing summary; the single placeholder receives the test-set
# R-squared computed in the evaluation step.
summary_template = """
Model Summary:
- The Linear Regression model has R-squared score of {:.4f} on test data.
- Top features influencing bike demand are displayed above.
- Residual and Q-Q plots indicate assumptions of linear regression are reasonably satisfied.
"""
print(summary_template.format(r2))