In [27]:
import mlflow
import numpy as np
import pandas as pd
from category_encoders import TargetEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

In [28]:
# Load the hourly bike-sharing records; the path is relative to the notebook's
# working directory.
HOUR_CSV = "bike_sharing_dataset/hour.csv"
df = pd.read_csv(HOUR_CSV)
df.head()

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,3,13,16
1,2,2011-01-01,1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,8,32,40
2,3,2011-01-01,1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,5,27,32
3,4,2011-01-01,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,3,10,13
4,5,2011-01-01,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,0,1,1


In [29]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17379 entries, 0 to 17378
Data columns (total 17 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   instant     17379 non-null  int64  
 1   dteday      17379 non-null  object 
 2   season      17379 non-null  int64  
 3   yr          17379 non-null  int64  
 4   mnth        17379 non-null  int64  
 5   hr          17379 non-null  int64  
 6   holiday     17379 non-null  int64  
 7   weekday     17379 non-null  int64  
 8   workingday  17379 non-null  int64  
 9   weathersit  17379 non-null  int64  
 10  temp        17379 non-null  float64
 11  atemp       17379 non-null  float64
 12  hum         17379 non-null  float64
 13  windspeed   17379 non-null  float64
 14  casual      17379 non-null  int64  
 15  registered  17379 non-null  int64  
 16  cnt         17379 non-null  int64  
dtypes: float64(4), int64(12), object(1)
memory usage: 2.3+ MB


### The dataset has two temperature features (`temp` and `atemp`), so I combined them into a single `avg_temp` feature by averaging the two

### I created `heat_index` as the equally weighted mean of `avg_temp` and `hum`

In [30]:
# Feature engineering, written as one chain so no intermediate frame is
# mutated in place.
#
# 1. Derive new columns while the raw numeric inputs are still present:
#      avg_temp   - mean of the two temperature readings (`atemp`, `temp`)
#      heat_index - equally weighted mean of `avg_temp` and `hum`
#      day_night  - 'day' for hours 6..18 inclusive, otherwise 'night'
# 2. Drop the inputs that were folded into derived features, the row id
#    (`instant`), and the raw date string (`dteday`). `dteday` is dropped
#    without being parsed first: the parsed value was never used.
#    `casual` and `registered` add up to `cnt` in the rows displayed above, so
#    keeping them next to the target would leak it.
# 3. Mark the discrete calendar/weather codes as categoricals.
CATEGORICAL_COLUMNS = [
    'season', 'holiday', 'weekday', 'weathersit',
    'workingday', 'mnth', 'yr', 'hr',
]

df = (
    df
    .assign(
        avg_temp=lambda d: (d['atemp'] + d['temp']) / 2,
        heat_index=lambda d: 0.5 * d['avg_temp'] + 0.5 * d['hum'],
        # Series.between is inclusive on both ends, matching 6 <= hr <= 18.
        day_night=lambda d: np.where(d['hr'].between(6, 18), 'day', 'night'),
    )
    .drop(columns=['atemp', 'temp', 'hum', 'instant', 'casual', 'registered', 'dteday'])
    .astype({column: 'category' for column in CATEGORICAL_COLUMNS})
)

In [31]:
df.head()

Unnamed: 0,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,windspeed,cnt,avg_temp,heat_index,day_night
0,1,0,1,0,0,6,0,1,0.0,16,0.26395,0.536975,night
1,1,0,1,1,0,6,0,1,0.0,40,0.24635,0.523175,night
2,1,0,1,2,0,6,0,1,0.0,32,0.24635,0.523175,night
3,1,0,1,3,0,6,0,1,0.0,13,0.26395,0.506975,night
4,1,0,1,4,0,6,0,1,0.0,1,0.26395,0.506975,night


In [32]:
df.isnull().sum()

season        0
yr            0
mnth          0
hr            0
holiday       0
weekday       0
workingday    0
weathersit    0
windspeed     0
cnt           0
avg_temp      0
heat_index    0
day_night     0
dtype: int64

### No null values in any column

In [33]:
df.head()

Unnamed: 0,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,windspeed,cnt,avg_temp,heat_index,day_night
0,1,0,1,0,0,6,0,1,0.0,16,0.26395,0.536975,night
1,1,0,1,1,0,6,0,1,0.0,40,0.24635,0.523175,night
2,1,0,1,2,0,6,0,1,0.0,32,0.24635,0.523175,night
3,1,0,1,3,0,6,0,1,0.0,13,0.26395,0.506975,night
4,1,0,1,4,0,6,0,1,0.0,1,0.26395,0.506975,night


In [34]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17379 entries, 0 to 17378
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   season      17379 non-null  category
 1   yr          17379 non-null  category
 2   mnth        17379 non-null  category
 3   hr          17379 non-null  category
 4   holiday     17379 non-null  category
 5   weekday     17379 non-null  category
 6   workingday  17379 non-null  category
 7   weathersit  17379 non-null  category
 8   windspeed   17379 non-null  float64 
 9   cnt         17379 non-null  int64   
 10  avg_temp    17379 non-null  float64 
 11  heat_index  17379 non-null  float64 
 12  day_night   17379 non-null  object  
dtypes: category(8), float64(3), int64(1), object(1)
memory usage: 817.0+ KB


In [35]:
# Separate the target (`cnt`) from the predictors; X is an independent copy so
# later edits to it cannot touch `df`.
TARGET = 'cnt'
X = df.drop(columns=[TARGET]).copy()
y = df[TARGET]

In [36]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical across re-runs. (`train_test_split` is imported in the
# notebook's import cell together with the other scikit-learn names.)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [37]:

# Column groups routed to the two preprocessing branches.
# NOTE(review): `yr`, `mnth`, `hr`, `holiday`, `weekday` and `workingday` are
# columns of X but appear in neither list - confirm they are meant to be left
# out of the model.
numerical_features = [
    'avg_temp',
    'heat_index',
    'windspeed',
]
categorical_features = [
    'season',
    'weathersit',
    'day_night',
]

# One transformer per group: min-max scaling for the numeric columns and
# target encoding for the categorical ones.
numerical_transformer = MinMaxScaler()
categorical_transformer = TargetEncoder()

In [38]:
# Apply each transformer to its own column group. Columns of X that are in
# neither group are discarded: remainder='drop' is ColumnTransformer's default
# and is spelled out here so the behaviour is visible.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features),
    ],
    remainder='drop',
)


In [39]:

# Baseline: scikit-learn's LinearRegression fed by the preprocessing step.
model = LinearRegression()
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', model),
    ]
)




In [40]:

mlflow.set_tracking_uri("http://localhost:5000")

# Train, evaluate on the held-out split, and record everything in one MLflow run.
# The `with` block ends the run on exit, so no explicit mlflow.end_run() call
# is needed afterwards.
with mlflow.start_run():
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    # `mse` and `r2` stay module-level names because a later cell prints them.
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Record which estimator produced the metrics so runs that log the same
    # metric names can be told apart in the MLflow UI.
    mlflow.log_param("model_class", type(pipeline.named_steps["model"]).__name__)
    mlflow.log_metrics({"mse": mse, "r2": r2})

    mlflow.sklearn.log_model(pipeline, "model_pipeline")

  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
2024/08/27 13:03:30 INFO mlflow.tracking._tracking_service.client: 🏃 View run traveling-owl-181 at: http://localhost:5000/#/experiments/0/runs/3f116ea452894be99cd2474bab9f7c18.
2024/08/27 13:03:30 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http

In [41]:
print("MSE: ",mse)
print("r2: ", r2)

MSE:  19588.301053361854
r2:  0.3813975493566498


In [42]:

pipeline

In [43]:
class MultipleLinearRegression:
    """Ordinary least-squares linear regression solved directly with NumPy.

    The model is ``y = intercept_ + X @ coef_``. Both attributes are ``None``
    until :meth:`fit` has been called.
    """

    def __init__(self):
        # Learned weights, one per feature column (set by fit).
        self.coef_ = None
        # Learned bias term (set by fit).
        self.intercept_ = None

    def fit(self, X_train, y_train):
        """Estimate the intercept and coefficients from training data.

        Parameters
        ----------
        X_train : array-like of shape (n_samples, n_features)
            Feature matrix; converted to a float array.
        y_train : array-like with n_samples entries along its first axis
            Target values; converted to a float array.

        Returns
        -------
        self
            The fitted estimator, so calls can be chained.

        Raises
        ------
        ValueError
            If ``X_train`` is not two-dimensional.
        """
        features = np.asarray(X_train, dtype=float)
        targets = np.asarray(y_train, dtype=float)
        if features.ndim != 2:
            raise ValueError(
                "X_train must be 2-dimensional (n_samples, n_features); "
                f"got an array with {features.ndim} dimension(s)"
            )

        # Prepend a column of ones so the first solved coefficient is the intercept.
        design = np.column_stack([np.ones(features.shape[0]), features])

        # Solve the least-squares problem directly rather than inverting
        # X^T X: explicit inversion fails (or is numerically unreliable) when
        # feature columns are collinear, whereas lstsq returns the
        # minimum-norm solution in that case.
        betas, *_ = np.linalg.lstsq(design, targets, rcond=None)

        self.intercept_ = betas[0]
        self.coef_ = betas[1:]
        return self

    def predict(self, X_test):
        """Return predictions ``X_test @ coef_ + intercept_``.

        Raises
        ------
        ValueError
            If called before :meth:`fit`.
        """
        if self.coef_ is None or self.intercept_ is None:
            raise ValueError("MultipleLinearRegression is not fitted yet; call fit() first")

        return np.dot(X_test, self.coef_) + self.intercept_

In [44]:
# Same preprocessing, now with the hand-written least-squares estimator as the
# final step.
# NOTE(review): this rebinds `pipeline` and passes the same `preprocessor`
# object that the earlier pipeline was built with.
lr = MultipleLinearRegression()
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', lr),
    ]
)


In [45]:

mlflow.set_tracking_uri("http://localhost:5000")

# Train, evaluate on the held-out split, and record everything in one MLflow run.
# The `with` block ends the run on exit, so no explicit mlflow.end_run() call
# is needed afterwards.
with mlflow.start_run():
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    # `mse` and `r2` stay module-level names because a later cell prints them.
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Record which estimator produced the metrics so runs that log the same
    # metric names can be told apart in the MLflow UI.
    mlflow.log_param("model_class", type(pipeline.named_steps["model"]).__name__)
    mlflow.log_metrics({"mse": mse, "r2": r2})

    mlflow.sklearn.log_model(pipeline, "model_pipeline")

  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
  X[column] = X[column].astype("object").fillna(np.nan).map(col_mapping)
2024/08/27 13:03:33 INFO mlflow.tracking._tracking_service.client: 🏃 View run useful-gnu-853 at: http://localhost:5000/#/experiments/0/runs/bad66b2629d94cd8a3eb59851a2f6be4.
2024/08/27 13:03:33 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://

In [46]:

pipeline

In [47]:
print("MSE: ",mse)
print("r2: ",r2)

MSE:  19588.30105336187
r2:  0.38139754935664927
