In [3]:
import pandas as pd
import numpy as np

import plotly.express as px

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from xgboost import XGBRegressor

In [4]:
data = pd.read_csv('D:/Music/CSV/london_merged.csv')
data

Unnamed: 0,timestamp,cnt,t1,t2,hum,wind_speed,weather_code,is_holiday,is_weekend,season
0,2015-01-04 00:00:00,182,3.0,2.0,93.0,6.0,3.0,0.0,1.0,3.0
1,2015-01-04 01:00:00,138,3.0,2.5,93.0,5.0,1.0,0.0,1.0,3.0
2,2015-01-04 02:00:00,134,2.5,2.5,96.5,0.0,1.0,0.0,1.0,3.0
3,2015-01-04 03:00:00,72,2.0,2.0,100.0,0.0,1.0,0.0,1.0,3.0
4,2015-01-04 04:00:00,47,2.0,0.0,93.0,6.5,1.0,0.0,1.0,3.0
...,...,...,...,...,...,...,...,...,...,...
17409,2017-01-03 19:00:00,1042,5.0,1.0,81.0,19.0,3.0,0.0,0.0,3.0
17410,2017-01-03 20:00:00,541,5.0,1.0,81.0,21.0,4.0,0.0,0.0,3.0
17411,2017-01-03 21:00:00,337,5.5,1.5,78.5,24.0,4.0,0.0,0.0,3.0
17412,2017-01-03 22:00:00,224,5.5,1.5,76.0,23.0,4.0,0.0,0.0,3.0


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17414 entries, 0 to 17413
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   timestamp     17414 non-null  object 
 1   cnt           17414 non-null  int64  
 2   t1            17414 non-null  float64
 3   t2            17414 non-null  float64
 4   hum           17414 non-null  float64
 5   wind_speed    17414 non-null  float64
 6   weather_code  17414 non-null  float64
 7   is_holiday    17414 non-null  float64
 8   is_weekend    17414 non-null  float64
 9   season        17414 non-null  float64
dtypes: float64(8), int64(1), object(1)
memory usage: 1.3+ MB


#Preprocessing

In [58]:
def preprocess_inputs(df):
    df = df.copy()
    
    #Extracting Month, day, time from timesatmp column
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df['month'] = df['timestamp'].apply(lambda x: x.month)
    df['day'] = df['timestamp'].apply(lambda x: x.day)
    df['hour'] = df['timestamp'].apply(lambda x: x.hour)
    df = df.drop('timestamp', axis = 1)
    
    #One hot encode weather column
    weather_dummy = pd.get_dummies(df['weather_code'])
    df = pd.concat([df, weather_dummy], axis = 1)
    df = df.drop('weather_code', axis = 1)
    
    #Spittig into X and Y
    X = df.drop('cnt', axis = 1)
    Y = df['cnt']
    
    #Train-Test split
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.3, random_state = 1)
    
    #Scaling the data
    scaler = StadardScaler()
    scaler.fit(X_train)
    X_train = pd.DataFrame(scaler.transform(X_train), index = X_train.index, colums = X_train.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), index = X_test.index, columns = X_test.columns)
    
    return X, Y, X_train, X_test, Y_train, Y_test

In [50]:
data['timestamp'] = pd.to_datetime(data['timestamp'])
data['timestamp']
data['month'] = data['timestamp'].apply(lambda x: x.month)
data['month']

0        1
1        1
2        1
3        1
4        1
        ..
17409    1
17410    1
17411    1
17412    1
17413    1
Name: month, Length: 17414, dtype: int64

In [56]:
X, Y, X_train, X_test, Y_train, Y_test = preprocess_inputs(data)
print(X.shape, X_train.shape, X_test.shape)

(17414, 17) (12189, 17) (5225, 17)


In [52]:
data['weather_code'].value_counts()

1.0     6150
2.0     4034
3.0     3551
7.0     2141
4.0     1464
26.0      60
10.0      14
Name: weather_code, dtype: int64

In [53]:
X

Unnamed: 0,cnt,t1,t2,hum,wind_speed,is_holiday,is_weekend,season,month,day,hour,1.0,2.0,3.0,4.0,7.0,10.0,26.0
0,182,3.0,2.0,93.0,6.0,0.0,1.0,3.0,1,4,0,0,0,1,0,0,0,0
1,138,3.0,2.5,93.0,5.0,0.0,1.0,3.0,1,4,1,1,0,0,0,0,0,0
2,134,2.5,2.5,96.5,0.0,0.0,1.0,3.0,1,4,2,1,0,0,0,0,0,0
3,72,2.0,2.0,100.0,0.0,0.0,1.0,3.0,1,4,3,1,0,0,0,0,0,0
4,47,2.0,0.0,93.0,6.5,0.0,1.0,3.0,1,4,4,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17409,1042,5.0,1.0,81.0,19.0,0.0,0.0,3.0,1,3,19,0,0,1,0,0,0,0
17410,541,5.0,1.0,81.0,21.0,0.0,0.0,3.0,1,3,20,0,0,0,1,0,0,0
17411,337,5.5,1.5,78.5,24.0,0.0,0.0,3.0,1,3,21,0,0,0,1,0,0,0
17412,224,5.5,1.5,76.0,23.0,0.0,0.0,3.0,1,3,22,0,0,0,1,0,0,0


In [57]:
Y

0         182
1         138
2         134
3          72
4          47
         ... 
17409    1042
17410     541
17411     337
17412     224
17413     139
Name: cnt, Length: 17414, dtype: int64

In [59]:
X_train

Unnamed: 0,t1,t2,hum,wind_speed,is_holiday,is_weekend,season,month,day,hour,1.0,2.0,3.0,4.0,7.0,10.0,26.0
1930,8.0,5.5,64.0,13.0,0.0,0.0,0.0,3,25,12,0,0,1,0,0,0,0
14312,21.0,21.0,83.0,8.5,0.0,0.0,1.0,8,26,3,0,1,0,0,0,0,0
2542,5.0,4.5,87.0,5.0,0.0,0.0,0.0,4,20,6,1,0,0,0,0,0,0
16732,10.0,10.0,88.0,3.0,0.0,0.0,3.0,12,6,14,0,0,1,0,0,0,0
5815,14.0,14.0,63.0,13.0,0.0,0.0,2.0,9,3,22,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10955,6.5,2.5,73.5,26.0,0.0,0.0,0.0,4,7,7,0,0,0,0,1,0,0
17289,5.5,4.0,84.0,6.5,0.0,0.0,3.0,12,29,19,1,0,0,0,0,0,0
5192,19.0,19.0,64.0,8.0,0.0,1.0,1.0,8,8,23,1,0,0,0,0,0,0
12172,12.5,12.5,85.0,9.0,0.0,1.0,0.0,5,28,1,1,0,0,0,0,0,0


In [60]:
#Training the model
model = XGBRegressor()
model.fit(X_train, Y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.300000012,
             max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=100, n_jobs=4,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)

In [61]:
#Predicting
y_pred = model.predict(X_test)


In [67]:
rmse = np.sqrt(np.mean((Y_test - y_pred) ** 2))
print("RMSE: {:.4f}".format(rmse))

RMSE: 214.5723


In [68]:
#R2 score

In [72]:
#base model
np.sum((Y_test - Y_test.mean())**2)

6161196246.401914

In [74]:
#trained model
np.sum((Y_test - y_pred) ** 2)

240565577.5834918

In [77]:
r2 = 1 - (np.sum((Y_test - y_pred) ** 2) / np.sum((Y_test - Y_test.mean())**2))
print("R2 Score: {:.2f}%".format(r2 * 100))

R2 Score: 96.10%


In [78]:
print("RMSE: {:.4f}".format(rmse))
print("R2 Score: {:.2f}%".format(r2 * 100))

RMSE: 214.5723
R2 Score: 96.10%


In [82]:
fig = px.scatter(
    x = y_pred,
    y = Y_test,
    labels = { 'x' : 'Predicted', 'y' : 'Actual'},
    title = 'Actual vs Predicted values'
)
fig.show()