# Importing Necessary Packages

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import TargetEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Data loading and exploration

In [4]:
# Load the data
df = pd.read_csv('G:/M.Tech - Ph.D/Sem 1/MLOps/Assignment 2/hour.csv')
df.head()

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,3,13,16
1,2,2011-01-01,1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,8,32,40
2,3,2011-01-01,1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,5,27,32
3,4,2011-01-01,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,3,10,13
4,5,2011-01-01,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,0,1,1


In [5]:
# Information about the type of data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17379 entries, 0 to 17378
Data columns (total 17 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   instant     17379 non-null  int64  
 1   dteday      17379 non-null  object 
 2   season      17379 non-null  int64  
 3   yr          17379 non-null  int64  
 4   mnth        17379 non-null  int64  
 5   hr          17379 non-null  int64  
 6   holiday     17379 non-null  int64  
 7   weekday     17379 non-null  int64  
 8   workingday  17379 non-null  int64  
 9   weathersit  17379 non-null  int64  
 10  temp        17379 non-null  float64
 11  atemp       17379 non-null  float64
 12  hum         17379 non-null  float64
 13  windspeed   17379 non-null  float64
 14  casual      17379 non-null  int64  
 15  registered  17379 non-null  int64  
 16  cnt         17379 non-null  int64  
dtypes: float64(4), int64(12), object(1)
memory usage: 2.3+ MB


In [6]:
# changing the type of some columns
df['dteday'] = pd.to_datetime(df['dteday'])
df['season'] = df['season'].astype('category')
df['yr'] = df['yr'].astype('category')
df['mnth'] = df['mnth'].astype('category')
df['hr'] = df['hr'].astype('category')
df['holiday'] = df['holiday'].astype('category')
df['weekday'] = df['weekday'].astype('category')
df['workingday'] = df['workingday'].astype('category')
df['weathersit'] = df['weathersit'].astype('category')

# information about the type of data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17379 entries, 0 to 17378
Data columns (total 17 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   instant     17379 non-null  int64         
 1   dteday      17379 non-null  datetime64[ns]
 2   season      17379 non-null  category      
 3   yr          17379 non-null  category      
 4   mnth        17379 non-null  category      
 5   hr          17379 non-null  category      
 6   holiday     17379 non-null  category      
 7   weekday     17379 non-null  category      
 8   workingday  17379 non-null  category      
 9   weathersit  17379 non-null  category      
 10  temp        17379 non-null  float64       
 11  atemp       17379 non-null  float64       
 12  hum         17379 non-null  float64       
 13  windspeed   17379 non-null  float64       
 14  casual      17379 non-null  int64         
 15  registered  17379 non-null  int64         
 16  cnt         17379 non-

# Data Preprocessing

## Missing Values

In [7]:
# Checking for missing values
df.isnull().sum()

instant       0
dteday        0
season        0
yr            0
mnth          0
hr            0
holiday       0
weekday       0
workingday    0
weathersit    0
temp          0
atemp         0
hum           0
windspeed     0
casual        0
registered    0
cnt           0
dtype: int64

## Feature Engineering

Interaction variable:
- temp_humid (temp*hum): Temperature and humidity often have a combined effect on how comfortable or uncomfortable a day feels, which can influence the demand for bike rentals. For instance, a day with high temperature and high humidity might lead to lower rentals due to discomfort, even though the temperature alone might suggest it’s a good day for biking.

-   atemp_windspeed (atemp*windspeed): The feeling temperature (atemp) is influenced by wind speed. On a windy day, the perceived temperature can be lower than the actual temperature, which can affect people's decision to rent bikes. By including this interaction, the model can account for days where a high wind speed may reduce the perceived warmth, potentially decreasing bike rentals.

In [8]:
df["temp_humidity"] = df["temp"] * df["hum"]
df["atemp_windspeed"] = df["atemp"] * df["windspeed"]

In [9]:
# Spllitting the data into features and target
X = df.drop(['instant', 'dteday', 'cnt'], axis=1)
y = df['cnt']

In [10]:
# find the index of cateforical columns
categorical_features = X.select_dtypes(include=['category']).columns
categorical_features_index = [X.columns.get_loc(i) for i in categorical_features]

# find the index of numerical columns
numerical_features = X.select_dtypes(include=['float64', 'int64']).columns
numerical_features_index = [X.columns.get_loc(i) for i in numerical_features]

In [11]:
# categorical pipeline; step 1: Impute missing values, Step 2: OneHotEncode
categorical_pipeline_onehot = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

categorical_pipeline_target = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', TargetEncoder())
])

categorical_pipeline_onehot

In [12]:
categorical_pipeline_target

In [13]:
# numerical pipeline; step 1: Impute missing values, Step 2: Standardize
numerical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

numerical_pipeline

In [14]:
# Column Transformer with OneHotEncoding
preprocessor_with_onehot = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numerical_features_index),
        ('cat', categorical_pipeline_onehot, categorical_features_index)
    ])

preprocessor_with_onehot

In [15]:
# Column Transformer with TargetEncoding
preprocessor_with_target = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numerical_features_index),
        ('cat', categorical_pipeline_target, categorical_features_index)
    ])

preprocessor_with_target

# Model Integration to Data Pipeline and Model Training

## Inbulit Model Integration

In [16]:
from sklearn.linear_model import LinearRegression

In [17]:
# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
regression_model_onehot = Pipeline(steps=[('preprocessor', preprocessor_with_onehot),
                      ('classifier', LinearRegression())])

regression_model_onehot

In [18]:
# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
regression_model_target = Pipeline(steps=[('preprocessor', preprocessor_with_target),
                      ('classifier', LinearRegression())])

regression_model_target

## Linear Model from Scratch

In [19]:
class LinearRegressionScratch:
    
    def __init__(self):
        self.coef_ = None
        self.intercept_ = None
        
    def fit(self,X_train,y_train):
        X_train = X_train.toarray()
        X_train = np.insert(X_train,0,1,axis=1)
        
        
        # calcuate the coeffs
        betas = np.linalg.inv(np.dot(X_train.T,X_train)).dot(X_train.T).dot(y_train)
        self.intercept_ = betas[0]
        self.coef_ = betas[1:]
    
    def predict(self,X_test):
        X_test = X_test.toarray()
        y_pred = np.dot(X_test,self.coef_) + self.intercept_
        return y_pred

In [20]:
# Attaching LinearRegressionScratch to the pipeline
regression_model_scratch_onehot = Pipeline(steps=[('preprocessor', preprocessor_with_onehot),
                      ('classifier', LinearRegressionScratch())])
regression_model_scratch_onehot

# Model Training

In [21]:
# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [22]:
# Training the model
regression_model_onehot.fit(X_train, y_train)
regression_model_target.fit(X_train, y_train)
regression_model_scratch_onehot.fit(X_train, y_train)



# Model Evaluation

In [23]:
# Predicting the target using regression_model_onehot
y_pred_onehot = regression_model_onehot.predict(X_test)
y_pred_target = regression_model_target.predict(X_test)
y_pred_scratch_onehot = regression_model_scratch_onehot.predict(X_test)

In [24]:
# Evaluating the model
mse_onehot = mean_squared_error(y_test, y_pred_onehot)
r2_onehot = r2_score(y_test, y_pred_onehot)

print(f"Mean Squared Error: {mse_onehot}")
print(f"R^2 Score: {r2_onehot}")

Mean Squared Error: 1.3710970844406965e-06
R^2 Score: 0.9999999999567005


In [25]:
# Evaluating the model
mse_target = mean_squared_error(y_test, y_pred_target)
r2_target = r2_score(y_test, y_pred_target)

print(f"Mean Squared Error: {mse_target}")
print(f"R^2 Score: {r2_target}")

Mean Squared Error: 9.270924647536939e-10
R^2 Score: 0.9999999999999707


In [26]:
# Evaluating the model
mse_scratch_onehot = mean_squared_error(y_test, y_pred_scratch_onehot)
r2_scratch_onehot = r2_score(y_test, y_pred_scratch_onehot)

print(f"Mean Squared Error: {mse_scratch_onehot}")
print(f"R^2 Score: {r2_scratch_onehot}")

Mean Squared Error: 43952204.13866513
R^2 Score: -1387.0193650938775


In [27]:
# Export the results to a csv file
results = pd.DataFrame({
    'Model': ['RegressionModelOneHot', 'RegressionModelTarget', 'RegressionModelScratchOneHot'],
    'Mean Squared Error': [mse_onehot, mse_target, mse_scratch_onehot],
    'R^2 Score': [r2_onehot, r2_target, r2_scratch_onehot]
})