Data Columns:
**TimeSeries**
datetime - hourly date + timestamp  

**Categorical**
season -  
1 = spring, 2 = summer, 3 = fall, 4 = winter 
holiday - 
whether the day is considered a holiday
workingday - 
whether the day is neither a weekend nor holiday
weather - 
1: Clear, Few clouds, Partly cloudy
2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist
3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds
4: Heavy Rain + Ice Pellets + Thunderstorm + Mist, Snow + Fog

**Numerical/Continuous**  
temp - temperature in Celsius
atemp - "feels like" temperature in Celsius
humidity - relative humidity
windspeed - wind speed
casual - number of non-registered user rentals initiated
registered - number of registered user rentals initiated
count - number of total rentals

In [1]:
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt

In [2]:
# Read the train and test CSVs from the local Resources folder.
train_path = Path('./Resources/train.csv')
test_path = Path('./Resources/test.csv')
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

In [3]:
train_df.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1


In [4]:
# Number of rows in the training set.
train_df.shape[0]

10886

In [6]:
# Count missing values (None/NaN) per column. Every count in the output is 0,
# so the training set has no missing data.
train_df.isna().sum()
#train_df.info()

datetime      0
season        0
holiday       0
workingday    0
weather       0
temp          0
atemp         0
humidity      0
windspeed     0
casual        0
registered    0
count         0
dtype: int64

In [7]:
# isnull() is an alias of isna(), so this repeats the missing-value check from the previous cell.
train_df.isnull().sum()

datetime      0
season        0
holiday       0
workingday    0
weather       0
temp          0
atemp         0
humidity      0
windspeed     0
casual        0
registered    0
count         0
dtype: int64

In [8]:
# get unique values for the categorical columns (workingday first)
train_df["workingday"].unique()

array([0, 1])

In [9]:
train_df["season"].unique()

array([1, 2, 3, 4])

In [10]:
train_df["holiday"].unique()

array([0, 1])

In [11]:
train_df["weather"].unique()

array([1, 2, 3, 4])

In [12]:
train_df["windspeed"].unique()

array([ 0.    ,  6.0032, 16.9979, 19.0012, 19.9995, 12.998 , 15.0013,
        8.9981, 11.0014, 22.0028, 30.0026, 23.9994, 27.9993, 26.0027,
        7.0015, 32.9975, 36.9974, 31.0009, 35.0008, 39.0007, 43.9989,
       40.9973, 51.9987, 46.0022, 50.0021, 43.0006, 56.9969, 47.9988])

In [None]:
# checking datatypes of all columns
train_df.dtypes

In [None]:
# NOTE(review): the datetime module is not referenced in any of the visible cells.
import datetime
# Convert the datetime column (read from the CSV as text, presumably) to pandas
# datetime values so that date parts can be extracted from it.
train_df["datetime"] = pd.to_datetime(train_df["datetime"])

In [None]:
# Split the timestamp into calendar features: year, month, day of month,
# day of week and hour of day.
timestamps = pd.DatetimeIndex(train_df['datetime'])
train_df['year'] = timestamps.year
train_df['month'] = timestamps.month
train_df['day'] = timestamps.day
train_df['dayofweek'] = timestamps.dayofweek
train_df['hour'] = timestamps.hour

In [None]:
train_df[train_df["hour"]==3].count()

In [None]:
# The raw timestamp has been broken out into separate date-part columns, so remove it.
train_df = train_df.drop(columns=["datetime"])

In [None]:
train_df.head()

In [None]:
# casual + registered add up to count (the target), so keeping them as features
# would leak the answer to the model; drop both before modelling.
train_df = train_df.drop(columns=["casual", "registered"])
train_df.head()

In [None]:
train_df["year"].unique()

In [None]:
# Separate the feature matrix (everything except count) from the target (count).
X = train_df.drop(columns=["count"])
y = train_df["count"]

In [None]:
X.shape

In [None]:
# use minmaxscaler to rescale the continuous weather columns (default range is 0 to 1)
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
col_names = ["temp","atemp","humidity","windspeed"]
# NOTE(review): the scaler is fitted on all of X before the train/test split below,
# so the held-out rows influence the min/max used for scaling. Consider fitting
# on the training split only.
X[col_names] = scaler.fit_transform(X[col_names])

In [None]:
X.head()

In [None]:
# Hold out 30% of the rows for evaluation. The seed is fixed so that the split,
# and every score computed from it, is the same on each run of the notebook.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
def fit(model):
    """Train ``model`` on the training split and compare its predictions with the truth.

    Uses the notebook-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``.

    Args:
        model: An estimator exposing ``fit(X, y)`` and ``predict(X)``. An
            estimator class (e.g. ``LinearRegression``) is also accepted and is
            instantiated with its default arguments.

    Returns:
        pandas.DataFrame: One row per test sample, with the model's
        ``Prediction`` next to the ``Actual`` target value.
    """
    # Calling .fit on the class itself would fail, so build an instance first.
    mod = model() if isinstance(model, type) else model
    mod.fit(X_train, y_train)
    predictions = mod.predict(X_test)
    # Return the frame: a bare expression inside a function is not displayed by
    # Jupyter, so the comparison table used to be silently discarded.
    return pd.DataFrame({"Prediction": predictions, "Actual": y_test})

In [None]:
# Evaluate the model using Root Mean Square Error (RMSE)
def rmse(predictions,y_test):
    """Return the root mean squared error between predictions and true values.

    Args:
        predictions: Predicted target values.
        y_test: True target values, same length as ``predictions``.

    Returns:
        float: ``sqrt(mean((y_test - predictions) ** 2))``; lower is better.
    """
    from sklearn.metrics import mean_squared_error
    from math import sqrt
    # mean_squared_error's documented argument order is (y_true, y_pred).
    # The value must be returned: the function used to compute it and then
    # discard it, so every call evaluated to None.
    return sqrt(mean_squared_error(y_test, predictions))

In [None]:
def feature_importances(reg):
    """Plot a fitted model's feature importances as horizontal bars, largest first.

    Args:
        reg: A fitted estimator exposing ``feature_importances_`` with one
            value per column of the notebook-level ``X_train``.
    """
    # Drawn with pandas/matplotlib only: seaborn (``sns``) is not imported in
    # this notebook, so the former ``sns.barplot`` call raised a NameError.
    ranking = pd.DataFrame(
        {"Feature": X_train.columns, "Value": reg.feature_importances_}
    ).sort_values(by="Value", ascending=False)

    fig, ax = plt.subplots(figsize=(20, 10))
    ax.barh(ranking["Feature"], ranking["Value"])
    # barh draws the first row at the bottom; flip so the top feature is on top.
    ax.invert_yaxis()
    ax.set_xlabel("Value")
    ax.set_ylabel("Feature")
    ax.set_title(f"Feature importances: {type(reg).__name__}")
    plt.show()

In [None]:
# Run Linear Regression Model
from sklearn.linear_model import LinearRegression
# Pass an estimator instance (not the class) to fit(), and keep it in `model`
# because the next cell reads model.coef_.
model = LinearRegression()
fit(model)

In [None]:
# Use the fitted model's coefficients as per-feature importance scores
importance = model.coef_
# print each score next to its positional feature index
for idx, score in enumerate(importance):
    print('Feature: %0d, Score: %.5f' % (idx, score))
# bar chart of the scores, one bar per feature index
plt.bar(range(len(importance)), importance)
plt.show()

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
# Alternative tried: GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=1)
# Fix the seed so the fitted model, and the RMSE computed from it, is repeatable between runs.
model = GradientBoostingRegressor(random_state=42)
model.fit(X_train,y_train)
predictions = model.predict(X_test)
pd.DataFrame({"Prediction": predictions, "Actual": y_test})

In [None]:
# Score the current predictions with Root Mean Square Error (RMSE)
from math import sqrt
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(y_test, predictions)
sqrt(mse)

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Random forests bootstrap rows and sample features at random; fix the seed so
# the fitted model, and the RMSE computed from it, is repeatable between runs.
model = RandomForestRegressor(random_state=42)
model.fit(X_train,y_train)
predictions = model.predict(X_test)
pd.DataFrame({"Prediction": predictions, "Actual": y_test})

In [None]:
# Score the current predictions with Root Mean Square Error (RMSE)
from math import sqrt
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(y_test, predictions)
sqrt(mse)

In [None]:
from sklearn.ensemble import AdaBoostRegressor
# AdaBoost resamples the training rows at each boosting round; fix the seed so
# the fitted model, and the RMSE computed from it, is repeatable between runs.
model = AdaBoostRegressor(random_state=42)
model.fit(X_train,y_train)
predictions = model.predict(X_test)
pd.DataFrame({"Prediction": predictions, "Actual": y_test})

In [None]:
# Score the current predictions with Root Mean Square Error (RMSE)
from math import sqrt
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(y_test, predictions)
sqrt(mse)

In [None]:
# feature selection