In [None]:
!pip install -U scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.5.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading scikit_learn-1.5.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.4/13.4 MB[0m [31m24.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.3.2
    Uninstalling scikit-learn-1.3.2:
      Successfully uninstalled scikit-learn-1.3.2
Successfully installed scikit-learn-1.5.1


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import TargetEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [None]:
!wget https://archive.ics.uci.edu/static/public/275/bike+sharing+dataset.zip
!unzip bike+sharing+dataset.zip

--2024-08-27 16:52:36--  https://archive.ics.uci.edu/static/public/275/bike+sharing+dataset.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified
Saving to: ‘bike+sharing+dataset.zip’

bike+sharing+datase     [ <=>                ] 273.43K  --.-KB/s    in 0.1s    

2024-08-27 16:52:36 (2.48 MB/s) - ‘bike+sharing+dataset.zip’ saved [279992]

Archive:  bike+sharing+dataset.zip
  inflating: Readme.txt              
  inflating: day.csv                 
  inflating: hour.csv                


In [None]:
# Read the hourly records (hour.csv, extracted from the archive) into the working frame.
df = pd.read_csv("hour.csv")

# Summarize column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17379 entries, 0 to 17378
Data columns (total 17 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   instant     17379 non-null  int64  
 1   dteday      17379 non-null  object 
 2   season      17379 non-null  int64  
 3   yr          17379 non-null  int64  
 4   mnth        17379 non-null  int64  
 5   hr          17379 non-null  int64  
 6   holiday     17379 non-null  int64  
 7   weekday     17379 non-null  int64  
 8   workingday  17379 non-null  int64  
 9   weathersit  17379 non-null  int64  
 10  temp        17379 non-null  float64
 11  atemp       17379 non-null  float64
 12  hum         17379 non-null  float64
 13  windspeed   17379 non-null  float64
 14  casual      17379 non-null  int64  
 15  registered  17379 non-null  int64  
 16  cnt         17379 non-null  int64  
dtypes: float64(4), int64(12), object(1)
memory usage: 2.3+ MB


In [None]:
# Parse the date column into real timestamps.
df['dteday'] = pd.to_datetime(df['dteday'])

# The remaining calendar / weather code columns are loaded as int64; store them
# with the 'category' dtype so they can be selected by dtype as categorical features.
for _col in ('season', 'yr', 'mnth', 'hr', 'holiday', 'weekday', 'workingday', 'weathersit'):
    df[_col] = df[_col].astype('category')

In [None]:
# Interaction features: element-wise products of two existing weather columns.
df["temp_humidity"] = df["temp"].mul(df["hum"])
df["atemp_windspeed"] = df["atemp"].mul(df["windspeed"])

In [None]:
# Feature matrix / target split.
#
# `casual` and `registered` must NOT be used as features: the target `cnt` is
# their sum (cnt = casual + registered), so keeping them lets a linear model
# reconstruct the target exactly (R^2 ~= 1.0) without learning anything useful.
# `instant` (row id) and `dteday` (raw date) are dropped as before.
X = df.drop(columns=['instant', 'dteday', 'casual', 'registered', 'cnt'])
y = df['cnt']

In [None]:
# Names of the columns stored with the 'category' dtype, and their integer
# positions inside X (the positions are what the ColumnTransformers receive).
categorical_features = X.select_dtypes(include=['category']).columns
categorical_features_index = X.columns.get_indexer(categorical_features).tolist()

# Same for the plain numeric (float64 / int64) columns.
numerical_features = X.select_dtypes(include=['float64', 'int64']).columns
numerical_features_index = X.columns.get_indexer(numerical_features).tolist()

In [None]:
# One-hot branch: categories not seen during fit are ignored at transform time
# (handle_unknown='ignore') instead of raising.
categorical_pipeline_onehot = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Target-encoding branch.
# * target_type='continuous': the target is an integer count, and the default
#   target_type='auto' would infer it as a multiclass target and encode each
#   category once per distinct count value instead of by the mean target.
# * random_state=42: TargetEncoder cross-fits on shuffled folds during
#   fit_transform, so an explicit seed is needed for repeatable results.
# * The step is named 'target' (it was previously mislabelled 'onehot').
categorical_pipeline_target = Pipeline(steps=[
    ('target', TargetEncoder(target_type='continuous', random_state=42)),
])

categorical_pipeline_onehot

In [None]:
# Display the target-encoding pipeline as the cell's output.
categorical_pipeline_target

In [None]:
# Numeric branch: a single StandardScaler step, shown as the cell's output.
numerical_pipeline = Pipeline([
    ('scaler', StandardScaler()),
])
numerical_pipeline

In [None]:
# Preprocessor A: scaled numeric columns + one-hot encoded categorical columns.
# Columns are addressed by their integer positions in X.
preprocessor_with_onehot = ColumnTransformer(
    [
        ('num', numerical_pipeline, numerical_features_index),
        ('cat', categorical_pipeline_onehot, categorical_features_index),
    ]
)
preprocessor_with_onehot

In [None]:
# Preprocessor B: scaled numeric columns + target-encoded categorical columns.
# Columns are addressed by their integer positions in X.
preprocessor_with_target = ColumnTransformer(
    [
        ('num', numerical_pipeline, numerical_features_index),
        ('cat', categorical_pipeline_target, categorical_features_index),
    ]
)
preprocessor_with_target

In [None]:
# End-to-end model A: one-hot preprocessing followed by ordinary least squares.
# LinearRegression is already imported in the notebook's import cell, so the
# duplicate import that used to live here is removed. The final step is named
# 'regressor' because it is a regression estimator, not a classifier.
regression_model_onehot = Pipeline(steps=[
    ('preprocessor', preprocessor_with_onehot),
    ('regressor', LinearRegression()),
])
regression_model_onehot

In [None]:
# End-to-end model B: target-encoding preprocessing followed by ordinary least
# squares. The final step is named 'regressor' because it is a regression
# estimator, not a classifier.
regression_model_target = Pipeline(steps=[
    ('preprocessor', preprocessor_with_target),
    ('regressor', LinearRegression()),
])
regression_model_target

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Fit both pipelines on the same training split so their scores are comparable.
# The second call stays last, so its return value remains the cell's output.
regression_model_onehot.fit(X=X_train, y=y_train)
regression_model_target.fit(X=X_train, y=y_train)



In [None]:
# Predict the held-out rows with each fitted pipeline (one-hot first, then target).
y_pred_onehot, y_pred_target = (
    fitted_model.predict(X_test)
    for fitted_model in (regression_model_onehot, regression_model_target)
)

In [None]:
# Held-out error of the one-hot pipeline.
# Sanity check: an R^2 of ~1.0 here would point to target leakage (e.g. columns
# that add up to `cnt` still present in X) rather than a genuinely perfect model.
mse, r2 = (
    mean_squared_error(y_test, y_pred_onehot),
    r2_score(y_test, y_pred_onehot),
)

print(f"Mean Squared Error: {mse}", f"R^2 Score: {r2}", sep="\n")

Mean Squared Error: 1.405474303607487e-06
R^2 Score: 0.9999999999556148


In [None]:
# Held-out error of the target-encoding pipeline.
# Sanity check: an R^2 of ~1.0 here would point to target leakage (e.g. columns
# that add up to `cnt` still present in X) rather than a genuinely perfect model.
mse1, r21 = (
    mean_squared_error(y_test, y_pred_target),
    r2_score(y_test, y_pred_target),
)

print(f"Mean Squared Error: {mse1}", f"R^2 Score: {r21}", sep="\n")

Mean Squared Error: 5.919691859945668e-11
R^2 Score: 0.9999999999999981
