In [12]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import datetime
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error , r2_score, accuracy_score
from sklearn.model_selection import cross_val_score

df_train_X = pd.read_csv("train.csv", parse_dates=[0])
df_train_Y =pd.read_csv("train_label.csv", names =["Label"])
df_train_X["Month"] = df_train_X.datetime.dt.month
df_train_X["Year"] = df_train_X.datetime.dt.year
df_train_X["Hour"] = df_train_X.datetime.dt.hour
df_train_X["season"] = df_train_X["season"].map({"Fall":1,"Spring":2, "Summer":3,"Winter": 4})
df_train_X["weather"] = df_train_X["weather"].map({" Clear + Few clouds":1," Light Snow, Light Rain":2, " Mist + Cloudy ":3," Heavy Rain + Thunderstorm ": 4})
df_train_X = df_train_X.drop(["datetime"],axis=1)

df_test_X = pd.read_csv("test.csv", parse_dates=[0])
df_test_Y =pd.read_csv("test_label.csv", names =["Label"])
df_test_X["Month"] = df_test_X.datetime.dt.month
df_test_X["Year"] = df_test_X.datetime.dt.year
df_test_X["Hour"] = df_test_X.datetime.dt.hour
df_test_X["season"] = df_test_X["season"].map({"Fall":1,"Spring":2, "Summer":3,"Winter": 4})
df_test_X["weather"] = df_test_X["weather"].map({" Clear + Few clouds":1," Light Snow, Light Rain":2, " Mist + Cloudy ":3," Heavy Rain + Thunderstorm ": 4})
df_test_X = df_test_X.drop(["datetime"],axis=1)


In [11]:
slm = LinearRegression()
slm.fit(df_train_X, df_train_Y)
print("Coefficient:", slm.coef_,"\nIntercept:", slm.intercept_)

Coefficient: [[15.11462974 -9.24296102 -0.71183731  4.09654321  4.52195146  3.13529338
  -2.05214024  0.43331242  4.8967749  81.90056261  7.61805601]] 
Intercept: [-164758.99394004]


In [6]:
Train_Pred = slm.predict(df_train_X)
print("Train Accuracy: ")
print("MSE: ", mean_squared_error(df_train_Y, Train_Pred))
print("Accuracy: ", r2_score(df_train_Y, Train_Pred))
Test_Pred = slm.predict(df_test_X)    
print("Test Accuracy: ")
print("MSE: ", mean_squared_error(df_test_Y, Test_Pred))
print("Accuracy: ", r2_score(df_test_Y, Test_Pred))

Train Accuracy: 
MSE:  20006.718960518683
Accuracy:  0.3929525164293759
Test Accuracy: 
MSE:  19303.776401183724
Accuracy:  0.4001509239671054


In [8]:
scores = cross_val_score(slm, df_train_X, df_train_Y, cv=5)

print("Scores:",scores)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Scores: [0.39982683 0.40041374 0.40015202 0.39301101 0.35168608]
Accuracy: 0.39 (+/- 0.04)


In [18]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf.fit(df_train_X, df_train_Y)
rf_feat = pd.Series(rf.feature_importances_, index=df_train_X.columns)
print((rf_feat*100).sort_values(ascending=False))
print(rf.score(df_train_X, df_train_Y))
print(rf.score(df_test_X, df_test_Y))

  This is separate from the ipykernel package so we can avoid doing imports until


Hour          19.854737
humidity      17.010529
windspeed     16.815624
temp          12.650021
atemp         12.563736
Month          8.132709
weather        4.596314
workingday     3.021195
season         2.691454
Year           2.035223
holiday        0.628456
dtype: float64
0.9807073954983923
0.023415977961432508


In [34]:
# ["Hour","humidity","windspeed","temp","atemp","Month","weather","workingday","season","Year","holiday"]
rf.fit(df_train_X[["humidity","windspeed","temp","atemp","weather","workingday","season","holiday"]], df_train_Y)
print(rf.score(df_train_X[["humidity","windspeed","temp","atemp","weather","workingday","season","holiday"]], df_train_Y))
print(rf.score(df_test_X[["humidity","windspeed","temp","atemp","weather","workingday","season","holiday"]], df_test_Y))

0.859364121605941
0.008264462809917356
