- split the data into a training and validation set (or apply cross validation)

- conduct an exploratory data analysis

- train a regression model

- iteratively optimize the model by expanding or selecting features

- regularize the model to avoid overfitting

- calculate a RMSLE for the training and validation set

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
import seaborn as sns
import numpy as np


In [2]:

# Import the relevant sklearn packages
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score  


In [3]:

# Set the default figure size of matplotlib plots to (28, 14)
plt.rcParams['figure.figsize'] = (28,14)

In [4]:
# Read the training data, using the first CSV column as the index and asking
# pandas to parse that index as dates.
df = pd.read_csv('data/train.csv', parse_dates=True, index_col=0)

#sns.lineplot(data=df, x=df.index, y='count')
# Preview the first rows.
df.head()

Unnamed: 0_level_0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40
2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32
2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13
2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1


datetime: date and hour in "mm/dd/yyyy hh:mm" format

season: Four categories-> 1 = spring, 2 = summer, 3 = fall, 4 = winter

holiday: whether the day is a holiday or not (1/0)

workingday: whether the day is neither a weekend nor holiday (1/0)

weather: Four Categories of weather

        1-> Clear, Few clouds, Partly cloudy
        2-> Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist
        3-> Light Snow and Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds
        4-> Heavy Rain + Ice Pellets + Thunderstorm + Mist, Snow + Fog

temp: hourly temperature in Celsius

atemp: "feels like" temperature in Celsius

humidity: relative humidity

windspeed: wind speed

casual + registered = count

In [5]:
# `casual` and `registered` sum to `count` (the target), so remove them
# before modelling.
df = df.drop(columns=['casual', 'registered'])
df.info()


<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 10886 entries, 2011-01-01 00:00:00 to 2012-12-19 23:00:00
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   season      10886 non-null  int64  
 1   holiday     10886 non-null  int64  
 2   workingday  10886 non-null  int64  
 3   weather     10886 non-null  int64  
 4   temp        10886 non-null  float64
 5   atemp       10886 non-null  float64
 6   humidity    10886 non-null  int64  
 7   windspeed   10886 non-null  float64
 8   count       10886 non-null  int64  
dtypes: float64(3), int64(6)
memory usage: 850.5 KB


In [6]:
# Break the DatetimeIndex into its calendar components, one new column each
# (appended in this order: hour, day, month, year).
for part in ('hour', 'day', 'month', 'year'):
    df[part] = getattr(df.index, part)

df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 10886 entries, 2011-01-01 00:00:00 to 2012-12-19 23:00:00
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   season      10886 non-null  int64  
 1   holiday     10886 non-null  int64  
 2   workingday  10886 non-null  int64  
 3   weather     10886 non-null  int64  
 4   temp        10886 non-null  float64
 5   atemp       10886 non-null  float64
 6   humidity    10886 non-null  int64  
 7   windspeed   10886 non-null  float64
 8   count       10886 non-null  int64  
 9   hour        10886 non-null  int64  
 10  day         10886 non-null  int64  
 11  month       10886 non-null  int64  
 12  year        10886 non-null  int64  
dtypes: float64(3), int64(10)
memory usage: 1.2 MB


In [7]:
# Convert the integer-coded discrete columns to the pandas `category` dtype.
cat_cols = ['season', 'holiday', 'workingday',
            'weather', 'hour', 'day', 'month','year']
df = df.astype({name: 'category' for name in cat_cols})
df.info()


<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 10886 entries, 2011-01-01 00:00:00 to 2012-12-19 23:00:00
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   season      10886 non-null  category
 1   holiday     10886 non-null  category
 2   workingday  10886 non-null  category
 3   weather     10886 non-null  category
 4   temp        10886 non-null  float64 
 5   atemp       10886 non-null  float64 
 6   humidity    10886 non-null  int64   
 7   windspeed   10886 non-null  float64 
 8   count       10886 non-null  int64   
 9   hour        10886 non-null  category
 10  day         10886 non-null  category
 11  month       10886 non-null  category
 12  year        10886 non-null  category
dtypes: category(8), float64(3), int64(2)
memory usage: 597.9 KB


In [8]:
# Summary statistics; with the discrete columns cast to `category`, only
# temp, atemp, humidity, windspeed and count appear in the table.
df.describe()


Unnamed: 0,temp,atemp,humidity,windspeed,count
count,10886.0,10886.0,10886.0,10886.0,10886.0
mean,20.23086,23.655084,61.88646,12.799395,191.574132
std,7.79159,8.474601,19.245033,8.164537,181.144454
min,0.82,0.76,0.0,0.0,1.0
25%,13.94,16.665,47.0,7.0015,42.0
50%,20.5,24.24,62.0,12.998,145.0
75%,26.24,31.06,77.0,16.9979,284.0
max,41.0,45.455,100.0,56.9969,977.0


In [12]:
### Split data ###
# The target is the rental count; every other column is a feature.
y = df['count']
X = df.drop('count', axis=1)

# Seeded split using train_test_split's default proportions.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Show the four resulting shapes as a quick sanity check.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))


((8164, 12), (2722, 12), (8164,), (2722,))

In [None]:
# Create the figure together with an explicit Axes so the heatmap is drawn on
# a known target instead of whatever the current pyplot axes happens to be.
fig, ax = plt.subplots(figsize=(28, 14))

# Pairwise Pearson correlation between the columns of `df`.
# NOTE(review): several columns are `category` dtype by this point; whether
# they are included depends on the installed pandas version — confirm.
pearsoncorr = df.corr(method='pearson')
sns.heatmap(pearsoncorr, xticklabels=pearsoncorr.columns, yticklabels=pearsoncorr.columns,
            cmap='Greens', annot=True, ax=ax)
ax.set_title('Pearson correlation between columns');



In [None]:
# Select features and target by NAME. `count` is no longer the last column
# (hour/day/month/year were appended after it), so positional slicing with
# iloc[:, :-1] / iloc[:, -1] would put `count` into X and use `year` as y.
# `temp` is left out of the features only (presumably because it duplicates
# `atemp` — confirm against the heatmap); `df` itself is not modified because
# a later cell still reads df['temp'].
X = df.drop(columns=['temp', 'count'])
y = df['count']

# Re-split so the reduced feature set is what the models below are fitted on.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Preview instead of rendering the whole frame.
X.head()

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:

# Random forest with tree depth capped at 5 and a fixed random_state.
rf = RandomForestRegressor(max_depth=5, random_state=0)

In [None]:

# Fit the forest on the training split.
rf.fit(X_train, y_train)

In [None]:

# Tabulate the fitted forest's feature importances, largest first.
(
    pd.DataFrame({'importance': rf.feature_importances_, 'feature': X_train.columns})
    .sort_values('importance', ascending=False)
)

In [None]:

from sklearn.linear_model import Lasso

In [None]:

# Lasso regression with its default settings.
# NOTE(review): the name `l` is easy to misread as `1`; consider renaming it
# consistently in every cell that uses it.
l = Lasso()

In [None]:

# Fit the Lasso model on the training split.
l.fit(X_train, y_train)

In [None]:

# Rank features by the absolute size of their Lasso coefficient, largest first.
(
    pd.DataFrame({'feature': X_train.columns, 'coefficient': np.abs(l.coef_)})
    .sort_values('coefficient', ascending=False)
)

In [None]:
# Fit an ordinary linear regression on the training split (fit() returns the
# estimator itself), then predict for the held-out split.
m = LinearRegression().fit(X_train, y_train)
ypred = m.predict(X_test)


In [None]:

# Model score on the training split.
m.score(X_train, y_train)


In [None]:
# Model score on the held-out split; a much lower value than on the training
# split would indicate overfitting.
m.score(X_test, y_test)




In [None]:
def RMSLE(y_train, y_pred):
    """Return the root mean squared logarithmic error between two arrays.

    Computed directly with NumPy as
    ``sqrt(mean((log1p(y_pred) - log1p(y_train)) ** 2))`` so the function does
    not depend on ``mean_squared_log_error`` having been imported.

    Parameters
    ----------
    y_train : array-like
        True target values (despite the name, any split's targets may be
        passed). Must be non-negative.
    y_pred : array-like
        Predicted values with the same shape as ``y_train``. Must be
        non-negative.

    Returns
    -------
    numpy.float64
        The RMSLE; 0.0 means the predictions match exactly.

    Raises
    ------
    ValueError
        If the inputs are empty, differ in shape, or contain negative values.
    """
    actual = np.asarray(y_train, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)

    if actual.shape != predicted.shape:
        raise ValueError(
            f"y_train and y_pred must have the same shape, "
            f"got {actual.shape} and {predicted.shape}"
        )
    if actual.size == 0:
        raise ValueError("RMSLE is undefined for empty input")
    # The log error is only meaningful for non-negative values.
    if (actual < 0).any() or (predicted < 0).any():
        raise ValueError("RMSLE cannot be used when values are negative")

    log_diff = np.log1p(predicted) - np.log1p(actual)
    return np.sqrt(np.mean(log_diff ** 2))

In [None]:
# `ypred` was predicted from X_test, so it has to be scored against y_test
# (y_train has a different length). A linear model can predict negative
# counts, for which the log error is undefined, so clip predictions at 0.
RMSLE(y_test, np.clip(ypred, 0, None))

In [None]:
To optimize your model against the RMSLE, you should take the logarithm of the target column (y). Because 0 is a valid target value, use the log of 𝑦+1 instead:

ylog = np.log1p(y)
Then train your model on the transformed column ylog. To bring back your log predictions to the original scale you have to apply the inverse transformation on the predictions:

ypred = np.exp(ypredlog)-1
You can then calculate the RMSLE score using sklearn:

from sklearn.metrics import mean_squared_log_error

np.sqrt(mean_squared_log_error(y, ypred))

In [None]:
# Three vertically stacked panels, one per variable.
fig, axes = plt.subplots(3, 1, figsize=(20, 12))

# Plot from `df` (no `train` frame is defined in this notebook) and pass each
# column explicitly as `x=` so it is not taken for the `data` argument.
sns.countplot(x=df["season"], ax=axes[0], palette="Set1")
sns.countplot(x=df["weather"], ax=axes[1], palette="Set1")
sns.countplot(x=df["windspeed"], ax=axes[2])

# Rotate the tick labels of the last (windspeed) panel only.
axes[2].tick_params(axis='x', labelrotation=60)

In [None]:
# Distribution of rental counts for each hour of the day.
# Plot from `df` (no `train` frame is defined in this notebook) and avoid
# assigning to `_`, which IPython uses for the last cell output.
fig, ax = plt.subplots(1, 1, figsize=(20, 12))
sns.boxplot(x=df["hour"], y=df["count"], ax=ax)

In [None]:
# Binary flags built from vectorised boolean masks rather than a row-by-row
# apply; the result is the same 0/1 integer column.
# ideal: temp above 27 and windspeed below 30.
df['ideal'] = ((df['temp'] > 27) & (df['windspeed'] < 30)).astype('int64')
# sticky: a working day with humidity of at least 60.
df['sticky'] = ((df['workingday'] == 1) & (df['humidity'] >= 60)).astype('int64')