In [4]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# ---------------------------
# Load dataset
# ---------------------------
df = pd.read_csv("data/day.csv", parse_dates=['dteday'])

# ---------------------------
# Time-aware split
# ---------------------------
train = df[df['dteday'].dt.year == 2011]
valid = df[(df['dteday'].dt.year == 2012) & (df['dteday'].dt.month <= 6)]
test = df[(df['dteday'].dt.year == 2012) & (df['dteday'].dt.month > 6)]

# Separate target
target = 'cnt'
X_train = train.drop(columns=[target, 'dteday'])
y_train = train[target]
X_valid = valid.drop(columns=[target, 'dteday'])
y_valid = valid[target]
X_test = test.drop(columns=[target, 'dteday'])
y_test = test[target]

# ---------------------------
# Dummy Regressor (baseline)
# ---------------------------
dummy = DummyRegressor(strategy='mean')
dummy.fit(X_train, y_train)
y_pred_dummy = dummy.predict(X_valid)

# Metrics
mae_dummy = mean_absolute_error(y_valid, y_pred_dummy)
rmse_dummy = np.sqrt(mean_squared_error(y_valid, y_pred_dummy))
r2_dummy = r2_score(y_valid, y_pred_dummy)

print(f"Dummy - MAE: {mae_dummy:.2f}, RMSE: {rmse_dummy:.2f}, R2: {r2_dummy:.3f}")

# ---------------------------
# Linear Regression
# ---------------------------
lr_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('lr', LinearRegression())
])
lr_pipeline.fit(X_train, y_train)
y_pred_lr = lr_pipeline.predict(X_valid)

# Metrics
mae_lr = mean_absolute_error(y_valid, y_pred_lr)
rmse_lr = np.sqrt(mean_squared_error(y_valid, y_pred_lr))
r2_lr = r2_score(y_valid, y_pred_lr)

print(f"Linear Regression - MAE: {mae_lr:.2f}, RMSE: {rmse_lr:.2f}, R2: {r2_lr:.3f}")

# ---------------------------
# Ridge Regression
# ---------------------------
ridge_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('ridge', Ridge(alpha=1.0))
])
ridge_pipeline.fit(X_train, y_train)
y_pred_ridge = ridge_pipeline.predict(X_valid)

# Metrics
mae_ridge = mean_absolute_error(y_valid, y_pred_ridge)
rmse_ridge = np.sqrt(mean_squared_error(y_valid, y_pred_ridge))
r2_ridge = r2_score(y_valid, y_pred_ridge)

print(f"Ridge - MAE: {mae_ridge:.2f}, RMSE: {rmse_ridge:.2f}, R2: {r2_ridge:.3f}")

# ---------------------------
# Save results to CSV
# ---------------------------
results = pd.DataFrame({
    'model': ['Dummy', 'LinearRegression', 'Ridge'],
    'MAE': [mae_dummy, mae_lr, mae_ridge],
    'RMSE': [rmse_dummy, rmse_lr, rmse_ridge],
    'R2': [r2_dummy, r2_lr, r2_ridge]
})

# Ensure folder exists
os.makedirs('results', exist_ok=True)

results.to_csv('results/baseline_and_linear.csv', index=False)
print("\nResults saved to 'results/baseline_and_linear.csv'")
print(results)


Dummy - MAE: 2052.44, RMSE: 2449.89, R2: -1.033
Linear Regression - MAE: 0.00, RMSE: 0.00, R2: 1.000
Ridge - MAE: 31.96, RMSE: 32.93, R2: 1.000

Results saved to 'results/baseline_and_linear.csv'
              model           MAE          RMSE        R2
0             Dummy  2.052435e+03  2.449887e+03 -1.032913
1  LinearRegression  1.402971e-12  1.626831e-12  1.000000
2             Ridge  3.196429e+01  3.293397e+01  0.999633
