In [None]:
!wget https://archive.ics.uci.edu/static/public/275/bike+sharing+dataset.zip
!unzip bike+sharing+dataset.zip

--2024-08-27 08:31:05--  https://archive.ics.uci.edu/static/public/275/bike+sharing+dataset.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified
Saving to: ‘bike+sharing+dataset.zip.3’

bike+sharing+datase     [   <=>              ] 273.43K   529KB/s    in 0.5s    

2024-08-27 08:31:06 (529 KB/s) - ‘bike+sharing+dataset.zip.3’ saved [279992]

Archive:  bike+sharing+dataset.zip
replace Readme.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: Readme.txt              
replace day.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: day.csv                 
replace hour.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: hour.csv                


Feature Engineering

Adding two features

i) WeatherImpact : (weathersit)* temp * (1-hum)* (1-windspeed)

clear weather, high temperature, low humidity, and low wind speed suggests a potentially higher demand for bike rentals.


ii) DayHour = weekday * hr

This feature captures the interaction between the day and the hour of the day, potentially revealing patterns that are not apparent when considering these factors individually.


In [None]:
import pandas as pd
df = pd.read_csv('hour.csv')
#Feature 1
w_i= (df['weathersit'])* (df['temp']) * (1-df['hum'])* (1-df['windspeed'])
df = df.assign(weather_impact= w_i)

#Feature 2
s_t= df['season']*(24- df['hr'])
df = df.assign(season_time= s_t)

df

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt,weather_impact,season_time
0,1,2011-01-01,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0000,3,13,16,0.045600,24
1,2,2011-01-01,1,0,1,1,0,6,0,1,0.22,0.2727,0.80,0.0000,8,32,40,0.044000,23
2,3,2011-01-01,1,0,1,2,0,6,0,1,0.22,0.2727,0.80,0.0000,5,27,32,0.044000,22
3,4,2011-01-01,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0000,3,10,13,0.060000,21
4,5,2011-01-01,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0000,0,1,1,0.060000,20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17374,17375,2012-12-31,1,1,12,19,0,1,1,2,0.26,0.2576,0.60,0.1642,11,108,119,0.173846,5
17375,17376,2012-12-31,1,1,12,20,0,1,1,2,0.26,0.2576,0.60,0.1642,8,81,89,0.173846,4
17376,17377,2012-12-31,1,1,12,21,0,1,1,1,0.26,0.2576,0.60,0.1642,7,83,90,0.086923,3
17377,17378,2012-12-31,1,1,12,22,0,1,1,1,0.26,0.2727,0.56,0.1343,13,48,61,0.099036,2


Using oneHotEncoder

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

df.drop(columns=['dteday'], inplace=True)
# Separating features and target variable
X = df.drop(columns=['cnt']) # Features
y = df['cnt'] # Target

# Numerical features
numerical_features = ['temp', 'hum', 'windspeed', 'weather_impact', 'season_time']

numerical_pipeline = Pipeline([
('imputer', SimpleImputer(strategy='mean')), # Impute missing values with mean
('scaler', MinMaxScaler()) # Normalize using MinMaxScaler
])

# Transforming above
X[numerical_features] = numerical_pipeline.fit_transform(X[numerical_features])

# Categorical features
categorical_features = ['season', 'weathersit']
categorical_pipeline = Pipeline([
('imputer', SimpleImputer(strategy='most_frequent')),
('onehot', OneHotEncoder(sparse_output=False, drop='first'))
])

# Transforming above
X_encoded = categorical_pipeline.fit_transform(X[categorical_features])

# Converting it to a dataframe
X_encoded = pd.DataFrame(X_encoded, columns=categorical_pipeline.named_steps['onehot'].get_feature_names_out(categorical_features))

# Encoded categorical features + Numerical features
X = pd.concat([X.drop(columns=categorical_features), X_encoded], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=42)

#Training the model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Calculate performance metrics suitable for regression
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)
r2 = r2_score(y_test, y_pred)

print("For OneHot Encoder")
print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", rmse)
print("R-squared:", r2)

For OneHot Encoder
Mean Squared Error: 8.016552733026463
Root Mean Squared Error: 2.831351750140993
R-squared: 0.9997468356672254


Using Target Encoder

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import TargetEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Separating features and target variable
X = df.drop(columns=['cnt']) # Features
y = df['cnt'] # Target

# Numerical features
numerical_features = ['temp', 'hum', 'windspeed', 'weather_impact', 'season_time']

numerical_pipeline = Pipeline([
('imputer', SimpleImputer(strategy='mean')), # Impute missing values with mean
('scaler', MinMaxScaler()) # Normalize using MinMaxScaler
])

# Transforming above
X[numerical_features] = numerical_pipeline.fit_transform(X[numerical_features])

# Categorical features
categorical_features = ['season', 'weathersit']
categorical_pipeline = Pipeline([
('imputer', SimpleImputer(strategy='most_frequent')),
('target', TargetEncoder(target_type="continuous"))
])

# Transforming above
X_encoded = categorical_pipeline.fit_transform(X[categorical_features], y)

# Converting it to a dataframe
X_encoded = pd.DataFrame(X_encoded, columns=categorical_pipeline.named_steps['target'].get_feature_names_out(categorical_features))

# Encoded categorical features + Numerical features
X = pd.concat([X.drop(columns=categorical_features), X_encoded], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=42)

#Training the model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Calculate performance metrics suitable for regression
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)
r2 = r2_score(y_test, y_pred)

print("For Target Encoder")
print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", rmse)
print("R-squared:", r2)

For Target Encoder
Mean Squared Error: 8.317861910241659
Root Mean Squared Error: 2.8840703719295164
R-squared: 0.9997373202633668


Training Linear Regressor using package


In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

regressor = LinearRegression()
regressor.fit(X_train, y_train)

y_pred = regressor.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")


Mean Squared Error: 8.609576811712244e-22
R-squared: 1.0


Training linear regressor from scratch

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Separating features and target variable
X = df.drop(columns=['cnt']) # Features
y = df['cnt'] # Target

# Add a bias (intercept) term by adding a column of ones to X
X = np.c_[np.ones(X.shape[0]), X]

# Feature Scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Initialize weights
weights = np.zeros(X_scaled.shape[1])

# Hyperparameters
learning_rate = 0.001
iterations = 1000

# Hypothesis function
def predict(X, weights):
    return np.dot(X, weights)

# Cost function (Mean Squared Error)
def compute_cost(X, y, weights):
    m = len(y)
    y_pred = predict(X, weights)
    cost = (1/(2*m)) * np.sum((y_pred - y)**2)
    return cost

# Gradient Descent algorithm
def gradient_descent(X, y, weights, learning_rate, iterations):
    m = len(y)
    cost_history = np.zeros(iterations)

    for i in range(iterations):
        y_pred = predict(X, weights)
        gradients = (1 / m) * np.dot(X.T, (y_pred - y))
        weights -= learning_rate * gradients
        cost_history[i] = compute_cost(X, y, weights)

    return weights, cost_history

# Train the linear regressor using gradient descent
weights, cost_history = gradient_descent(X_scaled, y, weights, learning_rate, iterations)

# Predictions on the training data
y_pred = predict(X_scaled, weights)

# Calculate Mean Squared Error (MSE)
mse = np.mean((y_pred - y) ** 2)

# Calculate R-squared (R²)
ss_total = np.sum((y - np.mean(y)) ** 2)
ss_residual = np.sum((y - y_pred) ** 2)
r2 = 1 - (ss_residual / ss_total)

print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")

Mean Squared Error: 39261.654451518116
R-squared: -0.19337902959721642


Pipeline

In [None]:
from sklearn import set_config

final_pipeline = Pipeline([
('num_preprocess', numerical_pipeline),
('cat_preprocess', categorical_pipeline),
('model', LinearRegression())
])
set_config(display='diagram')# To display
final_pipeline