In [1]:
!wget https://archive.ics.uci.edu/static/public/275/bike+sharing+dataset.zip
!unzip bike+sharing+dataset.zip

--2024-08-27 10:49:10--  https://archive.ics.uci.edu/static/public/275/bike+sharing+dataset.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified
Saving to: ‘bike+sharing+dataset.zip’

bike+sharing+datase     [ <=>                ] 273.43K  1.37MB/s    in 0.2s    

2024-08-27 10:49:10 (1.37 MB/s) - ‘bike+sharing+dataset.zip’ saved [279992]

Archive:  bike+sharing+dataset.zip
  inflating: Readme.txt              
  inflating: day.csv                 
  inflating: hour.csv                


In [33]:
import pandas as pd
df = pd.read_csv('hour.csv')
df['day_night'] = df['hr'].apply(lambda x: 'day' if 6 <= x <= 18 else 'night')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17379 entries, 0 to 17378
Data columns (total 18 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   instant     17379 non-null  int64  
 1   dteday      17379 non-null  object 
 2   season      17379 non-null  int64  
 3   yr          17379 non-null  int64  
 4   mnth        17379 non-null  int64  
 5   hr          17379 non-null  int64  
 6   holiday     17379 non-null  int64  
 7   weekday     17379 non-null  int64  
 8   workingday  17379 non-null  int64  
 9   weathersit  17379 non-null  int64  
 10  temp        17379 non-null  float64
 11  atemp       17379 non-null  float64
 12  hum         17379 non-null  float64
 13  windspeed   17379 non-null  float64
 14  casual      17379 non-null  int64  
 15  registered  17379 non-null  int64  
 16  cnt         17379 non-null  int64  
 17  day_night   17379 non-null  object 
dtypes: float64(4), int64(12), object(2)
memory usage: 2.4+ MB


In [34]:
#Adding two new features
df['wind_temp'] = df['windspeed'] * df['temp']
df['hum_temp'] = df['hum'] * df['temp']

In [35]:
df['dteday'] = pd.to_datetime(df.dteday)
df['season'] = df.season.astype('category')
df['holiday'] = df.holiday.astype('category')
df['weekday'] = df.weekday.astype('category')
df['weathersit'] = df.weathersit.astype('category')
df['workingday'] = df.workingday.astype('category')
df['mnth'] = df.mnth.astype('category')
df['yr'] = df.yr.astype('category')
df['hr'] = df.hr.astype('category')
df.drop(columns=['dteday'], inplace=True)
df['cnt'] = df['cnt'].astype('float64')


In [36]:
# Separating features and target variable
X = df.drop(columns=['cnt']) # Features
y = df['cnt'] # Target

In [37]:
X


Unnamed: 0,instant,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,day_night,wind_temp,hum_temp
0,1,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0000,3,13,night,0.000000,0.1944
1,2,1,0,1,1,0,6,0,1,0.22,0.2727,0.80,0.0000,8,32,night,0.000000,0.1760
2,3,1,0,1,2,0,6,0,1,0.22,0.2727,0.80,0.0000,5,27,night,0.000000,0.1760
3,4,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0000,3,10,night,0.000000,0.1800
4,5,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0000,0,1,night,0.000000,0.1800
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17374,17375,1,1,12,19,0,1,1,2,0.26,0.2576,0.60,0.1642,11,108,night,0.042692,0.1560
17375,17376,1,1,12,20,0,1,1,2,0.26,0.2576,0.60,0.1642,8,81,night,0.042692,0.1560
17376,17377,1,1,12,21,0,1,1,1,0.26,0.2576,0.60,0.1642,7,83,night,0.042692,0.1560
17377,17378,1,1,12,22,0,1,1,1,0.26,0.2727,0.56,0.1343,13,48,night,0.034918,0.1456


In [38]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
!pip install category_encoders
import category_encoders as ce



In [39]:
# Numerical features
numerical_features = ['temp', 'hum', 'windspeed','wind_temp','hum_temp']
numerical_pipeline = Pipeline([
('imputer', SimpleImputer(strategy='mean')), # Impute missing values with mean
('scaler', MinMaxScaler()) # Normalize using MinMaxScaler
])


In [40]:
# Transforming above
X[numerical_features] = numerical_pipeline.fit_transform(X[numerical_features])

In [41]:
# Categorical features
categorical_features = ['season', 'weathersit', 'day_night']
categorical_pipeline = Pipeline([
('imputer', SimpleImputer(strategy='most_frequent')),
('targetencoder', ce.TargetEncoder())
])

In [42]:
X_tobeencoded =X[categorical_features]
X_tobeencoded

Unnamed: 0,season,weathersit,day_night
0,1,1,night
1,1,1,night
2,1,1,night
3,1,1,night
4,1,1,night
...,...,...,...
17374,1,2,night
17375,1,2,night
17376,1,1,night
17377,1,1,night


In [43]:
X_encoded = categorical_pipeline.fit_transform(X_tobeencoded, y)

In [44]:
# Converting it to a dataframe
X_encoded.columns = categorical_features


In [45]:
# Encoded categorical features + Numerical features
X = pd.concat([X.drop(columns=categorical_features), X_encoded], axis=1)

In [46]:
X

Unnamed: 0,instant,yr,mnth,hr,holiday,weekday,workingday,temp,atemp,hum,windspeed,casual,registered,wind_temp,hum_temp,season,weathersit,day_night
0,1,0,1,0,0,6,0,0.224490,0.2879,0.81,0.000000,3,13,0.000000,0.312039,111.114569,204.869272,98.894138
1,2,0,1,1,0,6,0,0.204082,0.2727,0.80,0.000000,8,32,0.000000,0.282504,111.114569,204.869272,98.894138
2,3,0,1,2,0,6,0,0.204082,0.2727,0.80,0.000000,5,27,0.000000,0.282504,111.114569,204.869272,98.894138
3,4,0,1,3,0,6,0,0.224490,0.2879,0.75,0.000000,3,10,0.000000,0.288925,111.114569,204.869272,98.894138
4,5,0,1,4,0,6,0,0.224490,0.2879,0.75,0.000000,0,1,0.000000,0.288925,111.114569,204.869272,98.894138
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17374,17375,1,12,19,0,1,1,0.244898,0.2576,0.60,0.193018,11,108,0.062731,0.250401,111.114569,175.165493,98.894138
17375,17376,1,12,20,0,1,1,0.244898,0.2576,0.60,0.193018,8,81,0.062731,0.250401,111.114569,175.165493,98.894138
17376,17377,1,12,21,0,1,1,0.244898,0.2576,0.60,0.193018,7,83,0.062731,0.250401,111.114569,204.869272,98.894138
17377,17378,1,12,22,0,1,1,0.244898,0.2727,0.56,0.157870,13,48,0.051308,0.233708,111.114569,204.869272,98.894138


In [52]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
random_state=42)

In [53]:
X_train

Unnamed: 0,instant,yr,mnth,hr,holiday,weekday,workingday,temp,atemp,hum,windspeed,casual,registered,wind_temp,hum_temp,season,weathersit,day_night
335,336,0,1,11,0,6,0,0.183673,0.1970,0.55,0.263195,18,54,0.065799,0.176565,111.114569,204.869272,265.225933
7035,7036,0,10,18,0,2,1,0.510204,0.5000,0.42,0.122840,65,453,0.079846,0.350562,198.868856,204.869272,265.225933
8051,8052,0,12,3,0,3,1,0.448980,0.4545,1.00,0.263195,0,3,0.151337,0.738363,198.868856,111.579281,98.894138
2133,2134,0,4,18,0,0,0,0.448980,0.4545,0.31,0.000000,71,101,0.000000,0.228892,208.344069,175.165493,265.225933
8485,8486,0,12,6,0,0,0,0.183673,0.2273,0.75,0.122840,0,1,0.030710,0.240770,111.114569,204.869272,265.225933
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11284,11285,1,4,9,0,5,1,0.448980,0.4545,0.88,0.105325,30,329,0.060562,0.649759,208.344069,204.869272,265.225933
11964,11965,1,5,17,0,5,1,0.653061,0.6212,0.34,0.157870,124,688,0.130243,0.360193,208.344069,204.869272,265.225933
5390,5391,0,8,12,0,3,1,0.795918,0.7273,0.43,0.333373,26,163,0.333373,0.552167,236.016237,204.869272,265.225933
860,861,0,2,7,0,2,1,0.224490,0.1970,0.65,0.491243,3,97,0.147373,0.250401,111.114569,204.869272,265.225933


In [54]:
X_test

Unnamed: 0,instant,yr,mnth,hr,holiday,weekday,workingday,temp,atemp,hum,windspeed,casual,registered,wind_temp,hum_temp,season,weathersit,day_night
12830,12831,1,6,19,0,6,0,0.795918,0.6970,0.27,0.228047,185,240,0.228047,0.346709,236.016237,204.869272,98.894138
8688,8689,1,1,20,1,1,0,0.224490,0.2273,0.41,0.263195,5,83,0.078959,0.157945,111.114569,204.869272,98.894138
7091,7092,0,10,2,0,5,1,0.306122,0.3030,0.66,0.333373,1,3,0.133349,0.339005,198.868856,204.869272,98.894138
12230,12231,1,5,19,0,2,1,0.775510,0.7121,0.52,0.421065,69,457,0.410538,0.651043,208.344069,204.869272,98.894138
431,432,0,1,0,0,4,1,0.244898,0.2273,0.56,0.456213,5,8,0.148269,0.233708,111.114569,204.869272,98.894138
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6759,6760,0,10,5,0,5,1,0.551020,0.5303,0.94,0.193018,0,17,0.135112,0.844944,198.868856,175.165493,98.894138
13989,13990,1,8,2,0,6,0,0.632653,0.5909,0.78,0.228047,19,66,0.182438,0.801284,236.016237,111.579281,98.894138
173,174,0,1,12,0,6,0,0.183673,0.1818,0.59,0.421065,8,90,0.105266,0.189406,111.114569,175.165493,265.225933
16192,16193,1,11,10,1,1,0,0.469388,0.4697,0.77,0.193018,82,184,0.115811,0.593258,198.868856,204.869272,265.225933


In [55]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [56]:
model = LinearRegression().fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Metrics
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

#Parameters
coefficients = model.coef_
intercept = model.intercept_
print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')

Mean Squared Error: 1.359273477023023e-21
R-squared: 1.0


In [57]:
final_pipeline = Pipeline([
('num_preprocess', numerical_pipeline),
('cat_preprocess', categorical_pipeline),
('model', LinearRegression())
])

# To display
final_pipeline

In [62]:
#From
import numpy as np

# Add intercept term (column of ones) to X_train and X_test
X_train_intercept = np.c_[np.ones((X_train.shape[0], 1)), X_train]
X_test_intercept = np.c_[np.ones((X_test.shape[0], 1)), X_test]

# Calculate the weights using the normal equation
theta = np.linalg.inv(X_train_intercept.T.dot(X_train_intercept)).dot(X_train_intercept.T).dot(y_train)

# Predict on test data
y_test_pred = X_test_intercept.dot(theta)

mse_test = mean_squared_error(y_test, y_test_pred)
r2_test = r2_score(y_test, y_test_pred)

print(f'Mean Squared Error: {mse_test}')
print(f'R-squared: {r2_test}')


Mean Squared Error: 3.1348747725858174e-20
R-squared: 1.0
