In [1]:
import numpy as np
import pandas as pd

In [2]:
data = pd.read_csv('data/hour.csv')
data.head()

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,3,13,16
1,2,2011-01-01,1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,8,32,40
2,3,2011-01-01,1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,5,27,32
3,4,2011-01-01,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,3,10,13
4,5,2011-01-01,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,0,1,1


In [3]:
# Feature Enginnering

def categorize_temp(temp):
    if temp < 0.2:
        return 'cold'
    elif 0.2 <= temp < 0.4:
        return 'cool'
    elif 0.4 <= temp < 0.6:
        return 'moderate'
    elif 0.6 <= temp < 0.8:
        return 'warm'
    else:
        return 'hot'

def categorize_rush_hour(hr):
    if 7 <= hr <= 9 or 16 <= hr <= 19:
        return 'rush_hour'
    else:
        return 'non_rush_hour'
data['day_night'] = data['hr'].apply(lambda x: 'day' if 6 <= x <= 18 else 'night')
data['temp_range'] = data['temp'].apply(categorize_temp)
data['rush_hour'] = data['hr'].apply(categorize_rush_hour)

In [4]:
data.drop(['instant', 'casual', 'registered'], axis=1, inplace=True)
data['dteday'] = pd.to_datetime(data.dteday)
data['season'] = data.season.astype('category')
data['holiday'] = data.holiday.astype('category')
data['weekday'] = data.weekday.astype('category')
data['weathersit'] = data.weathersit.astype('category')
data['workingday'] = data.workingday.astype('category')
data['mnth'] = data.mnth.astype('category')
data['yr'] = data.yr.astype('category')
data['hr'] = data.hr.astype('category')
data.drop(columns=['dteday'], inplace=True)
#data.columns = data.columns.astype(str)
data.head(5)

Unnamed: 0,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,cnt,day_night,temp_range,rush_hour
0,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,16,night,cool,non_rush_hour
1,1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,40,night,cool,non_rush_hour
2,1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,32,night,cool,non_rush_hour
3,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,13,night,cool,non_rush_hour
4,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,1,night,cool,non_rush_hour


In [5]:
# Separating features and target variable
X = data.drop(columns=['cnt']) # Features
y = data['cnt'] # Target

In [6]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from category_encoders import TargetEncoder

# Numerical features
numerical_features = ['temp', 'hum', 'windspeed']
numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),  # Impute missing values with mean
    ('scaler', MinMaxScaler())  # Normalize using MinMaxScaler
])

# Transforming numerical features
X[numerical_features] = numerical_pipeline.fit_transform(X[numerical_features])

# Categorical features
categorical_features = ['season', 'weathersit', 'day_night', 'temp_range', 'rush_hour']
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Impute missing values with the most frequent value
    ('target_encoder', TargetEncoder())  # Replace OneHotEncoder with TargetEncoder
])

X_encoded = categorical_pipeline.fit_transform(X[categorical_features], y)

# Combine encoded categorical features back into the dataset
X = X.drop(columns=categorical_features)
X = X.join(X_encoded)
X.columns = X.columns.astype(str)

  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)


In [7]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the model
model_a = RandomForestRegressor(random_state=42)
model_a.fit(X_train, y_train)

# Predict and evaluate with the test set
y_pred = model_a.predict(X_test)
rmse = mean_squared_error(y_test, y_pred, squared=False)
r2 = r2_score(y_test, y_pred)

print(f"Sklearn Linear Regression - MSE: {rmse}, R²: {r2}")

Sklearn Linear Regression - MSE: 41.80012989714289, R²: 0.9448214696833452




In [8]:
from sklearn.linear_model import LinearRegression

# Initialize the LinearRegression model
sklearn_model = LinearRegression()

# Train the model
sklearn_model.fit(X_train, y_train)

# Predict on the test set
y_pred_sklearn = sklearn_model.predict(X_test)

# Evaluate performance
mse_sklearn = mean_squared_error(y_test, y_pred_sklearn)
r2_sklearn = r2_score(y_test, y_pred_sklearn)

print(f"Sklearn Linear Regression - MSE: {mse_sklearn}, R²: {r2_sklearn}")

Sklearn Linear Regression - MSE: 12264.087263488926, R²: 0.6126976803434367


In [9]:
import numpy as np

# Add a column of ones to X_train and X_test to account for the intercept term
X_train_b = np.c_[np.ones((X_train.shape[0], 1)), X_train]  # Add intercept term
X_test_b = np.c_[np.ones((X_test.shape[0], 1)), X_test]  # Add intercept term

# Compute the parameter vector theta using the Normal Equation
theta_best = np.linalg.inv(X_train_b.T.dot(X_train_b)).dot(X_train_b.T).dot(y_train)

# Predict using the calculated parameters
y_pred_scratch = X_test_b.dot(theta_best)

# Evaluate performance
mse_scratch = mean_squared_error(y_test, y_pred_scratch)
r2_scratch = r2_score(y_test, y_pred_scratch)

print(f"Scratch Linear Regression - MSE: {mse_scratch}, R²: {r2_scratch}")

Scratch Linear Regression - MSE: 12264.087263488549, R²: 0.6126976803434486


In [10]:
print(f"RandomForest Regression - MSE: {rmse}, R²: {r2}")
print(f"Sklearn Linear Regression - MSE: {mse_sklearn}, R²: {r2_sklearn}")
print(f"Linear Regression  from scratch - MSE: {mse_scratch}, R²: {r2_scratch}")

RandomForest Regression - MSE: 41.80012989714289, R²: 0.9448214696833452
Sklearn Linear Regression - MSE: 12264.087263488926, R²: 0.6126976803434367
Linear Regression  from scratch - MSE: 12264.087263488549, R²: 0.6126976803434486


In [11]:
from sklearn import set_config
final_pipeline = Pipeline([
('num_preprocess', numerical_pipeline),
('cat_preprocess', categorical_pipeline),
('model', RandomForestRegressor(n_estimators=100, random_state=42))
])
set_config(display='diagram')# To display
final_pipeline

In [12]:
# Create the final pipeline for linear regression
final_pipeline = Pipeline([
    ('num_preprocess', numerical_pipeline),
    ('cat_preprocess', categorical_pipeline),
    ('model', LinearRegression())
])

# Configure the display to show the pipeline as a diagram
set_config(display='diagram')

# Display the pipeline
final_pipeline

In [14]:
## Multiple models:

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, VotingRegressor
from sklearn import set_config
from sklearn.pipeline import Pipeline

# Define individual models
linear_reg = LinearRegression()
random_forest = RandomForestRegressor(n_estimators=100, random_state=42)

# Create a VotingRegressor
voting_regressor = VotingRegressor(estimators=[
    ('lr', linear_reg), 
    ('rf', random_forest)
])

# Create the final pipeline with the VotingRegressor
final_pipeline = Pipeline([
    ('num_preprocess', numerical_pipeline),
    ('cat_preprocess', categorical_pipeline),
    ('voting_regressor', voting_regressor)
])

# Configure the display to show the pipeline as a diagram
set_config(display='diagram')

# Display the pipeline
final_pipeline


In [15]:
## Comparing Models

from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn import set_config
from sklearn.pipeline import Pipeline

# Define the pipeline without specifying a model
pipeline = Pipeline([
    ('num_preprocess', numerical_pipeline),
    ('cat_preprocess', categorical_pipeline),
    ('model', LinearRegression())  # Placeholder model
])

# Define the parameter grid to search across different models
param_grid = {
    'model': [LinearRegression(), RandomForestRegressor(n_estimators=100, random_state=42)]
}

# Perform grid search
grid_search = GridSearchCV(pipeline, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Configure the display to show the pipeline as a diagram
set_config(display='diagram')

# Display the best pipeline found
grid_search.best_estimator_


  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.