In [76]:
%load_ext autoreload
%autoreload 2
%autosave 5

%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Autosaving every 5 seconds


# Import modules

In [77]:
from fastai.imports import *
from fastai.structured import *

from pandas_summary import DataFrameSummary
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier , ExtraTreesRegressor
from IPython.display import display

from sklearn import metrics

from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
import seaborn as sns

# Data importing 

In [78]:
# Data directory. The trailing slash matters: file names are appended directly, e.g. f'{PATH}Train.csv'.
PATH = './data/'

In [79]:
!ls {PATH}

sample_submission.csv  Test.csv  Train.csv


In [80]:
# Parse `date_time` as datetimes at load time; preprocessing_data() relies on its `.dt` accessor.
df_raw = pd.read_csv(f'{PATH}Train.csv',low_memory = False, parse_dates = ['date_time'])

In [81]:
# Transpose the preview so each column is shown as a row.
df_raw.head().T

Unnamed: 0,0,1,2,3,4
date_time,2012-10-02 09:00:00,2012-10-02 10:00:00,2012-10-02 11:00:00,2012-10-02 12:00:00,2012-10-02 13:00:00
is_holiday,,,,,
air_pollution_index,121,178,113,20,281
humidity,89,67,66,66,65
wind_speed,2,3,3,3,3
wind_direction,329,330,329,329,329
visibility_in_miles,1,1,2,5,7
dew_point,1,1,2,5,7
temperature,288.28,289.36,289.58,290.13,291.14
rain_p_h,0,0,0,0,0


# Data Preprocessing 

In [82]:
def display_all(df):
    """Display `df` with the pandas row/column limits temporarily raised to 1000.

    The display options are only changed inside the context manager, so the
    notebook-wide pandas settings are unaffected once the call returns.
    """
    generous_limits = ("display.max_rows", 1000, "display.max_columns", 1000)
    with pd.option_context(*generous_limits):
        display(df)

In [83]:
# NOTE(review): this de-duplicated frame is never used. `df` is reassigned by
# `df = preprocessing_data(df_raw)` further down and `df_raw` itself is left untouched,
# so rows sharing a `date_time` (if any) are still present downstream. Either assign the
# result back to `df_raw` or remove this cell.
df= df_raw.drop_duplicates(['date_time'])

In [84]:
# Columns excluded from modelling. One non-mutating drop instead of three in-place ones:
# `errors='ignore'` lets the cell be re-run without a KeyError once the columns are gone.
df_raw = df_raw.drop(columns=['air_pollution_index', 'wind_direction', 'dew_point'], errors='ignore')

# Data Preprocessing Function


In [85]:
def preprocessing_data(d_frame):
    """Return a copy of `d_frame` with hour and holiday features added.

    Steps, in order:
      1. add an `hour` column taken from `date_time`;
      2. propagate a holiday name to every row of the same calendar date: any
         row whose `is_holiday` differs from the string 'None' marks its date,
         and if a date carries several names the last one in row order wins;
      3. call `add_datepart(dframe, 'date_time')` on the result.

    Parameters
    ----------
    d_frame : pandas.DataFrame
        Needs a datetime-typed `date_time` column and an `is_holiday` column.
        The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        The processed copy.
    """
    dframe = d_frame.copy()
    dframe['hour'] = dframe['date_time'].dt.hour

    # Calendar date of every row, kept as a local Series rather than a scratch column.
    dates = dframe['date_time'].dt.date
    is_named = dframe['is_holiday'] != 'None'
    # dict() keeps the last value per key, matching "last row of the date wins".
    holiday_by_date = dict(zip(dates[is_named], dframe.loc[is_named, 'is_holiday']))

    # One vectorised lookup instead of a full-frame scan per holiday date.
    on_holiday_date = dates.isin(list(holiday_by_date))
    # .values: assign positionally so index alignment cannot interfere.
    dframe.loc[on_holiday_date, 'is_holiday'] = dates[on_holiday_date].map(holiday_by_date).values

    add_datepart(dframe, 'date_time')
    return dframe

In [86]:
# Build the working training frame from the raw data (hour column, holiday propagation, add_datepart).
df = preprocessing_data(df_raw)

<class 'pandas.core.frame.DataFrame'>


In [87]:
# NOTE(review): fastai helper, called only for its effect on `df` (the return value is unused).
# Presumably it converts string columns to pandas categoricals — confirm against the fastai docs.
train_cats(df)

# Removing outliers 

In [88]:
# Remove outlier rows with one boolean mask instead of `iterrows()` + per-row `drop`:
# the loop raised KeyError whenever a row matched two rules (the second `drop` found the
# label already gone) and scanned the frame row by row in Python.
outlier_rows = (
    (df['temperature'] == 0)      # other readings are ~288-291 in the preview, so 0 is not plausible
    | (df['rain_p_h'] > 60)
    | (df['snow_p_h'] > 0.2)
)
df = df.loc[~outlier_rows].copy()  # .copy() so the column assignments below act on a real frame

In [89]:
# Hand-made "traffic level by hour" feature; 1.5 is the default for hours no rule matches.
# The rules overlap and run in order, so the last matching line wins
# (hour 19 ends up 8, hour 22 ends up 2).
df['hourtotraffic'] = 1.5
# NOTE(review): `hour >= 22` AND `hour <= 5` can never both hold, so this line assigns
# nothing and hours 23 and 0-5 keep the 1.5 default; `|` was probably intended. The cell
# that builds this feature for `valid` uses the same mask — change both together, or the
# train and test features will disagree.
df.loc[(df.hour >= 22) & (df.hour <= 5), 'hourtotraffic'] = 0
df.loc[(df.hour >= 19) & (df.hour <= 22) , 'hourtotraffic'] = 2
df.loc[(df.hour >= 14) & (df.hour  <=19) , 'hourtotraffic'] = 8
df.loc[(df.hour>=6) & (df.hour <= 9), 'hourtotraffic'] = 1

In [90]:
# Hand-assigned priority per holiday name; 7 is the fallback for any value not listed below.
df['holidaypriority'] = 7
df.loc[df.is_holiday == 'None', 'holidaypriority'] = 50

# (priority, holiday names) pairs, applied in this order.
holiday_groups = (
    (10, ['Columbus Day','Veterans Day','Washingtons Birthday','State Fair']),
    (5, ['Thanksgiving Day','Christmas Day','New Years Day','Independence Day','Labor Day']),
    (4, ['Memorial Day','Martin Luther King Day']),
)
for priority, holiday_names in holiday_groups:
    df.loc[df.is_holiday.isin(holiday_names), 'holidaypriority'] = priority

# Train / validation split

In [91]:
# fastai's proc_df returns three values, kept here as the model inputs `dfs`, the target `y`
# and `nas`.
# NOTE(review): `y` is presumably the `traffic_volume` column and `nas` the missing-value
# fill dictionary — confirm against the fastai docs.
dfs,y,nas = proc_df(df,'traffic_volume')

In [92]:
def split_vals(a,n):
    """Split `a` at position `n` and return independent copies of both parts.

    Returns a tuple `(a[:n], a[n:])`; each part is copied, so later edits to
    either piece do not write back into `a`.
    """
    head, tail = a[:n], a[n:]
    return head.copy(), tail.copy()

# Hold out the LAST n_valid rows for validation; everything before them is the training set.
# NOTE(review): the "Kaggle" remark below may be inherited from another notebook — confirm
# that 12000 is the intended validation size for this dataset.
n_valid = 12000  # same as Kaggle's test set size
n_trn = len(dfs)-n_valid

X_train, X_valid = split_vals(dfs, n_trn)
y_train, y_valid = split_vals(y, n_trn)

# Shapes of the resulting splits, shown as the cell output.
X_train.shape, y_train.shape, X_valid.shape

((21719, 26), (21719,), (12000, 26))

# Random Forest

In [93]:
def rmse(x,y):
    """Root-mean-squared error between `x` and `y` (array-likes supporting `-`, `**`, `.mean()`)."""
    squared_error = (x - y) ** 2
    return math.sqrt(squared_error.mean())

def print_score(m):
    """Print `[train RMSE, valid RMSE, train score, valid score]` for a fitted model.

    Reads the notebook-level X_train / y_train / X_valid / y_valid splits.
    `m` must expose `predict` and `score`; if it also has an `oob_score_`
    attribute, that value is appended as a fifth entry.
    """
    train_rmse = rmse(m.predict(X_train), y_train)
    valid_rmse = rmse(m.predict(X_valid), y_valid)
    scores = [train_rmse, valid_rmse, m.score(X_train, y_train), m.score(X_valid, y_valid)]
    if hasattr(m, 'oob_score_'):
        scores.append(m.oob_score_)
    print(scores)

In [127]:
# Random forest: 1000 trees, at least 3 samples per leaf, half of the features
# considered per split, all cores, fixed seed so the scores are repeatable.
m = RandomForestRegressor(
    n_estimators=1000,
    min_samples_leaf=3,
    max_features=0.5,
    n_jobs=-1,
    random_state=18,
)
m.fit(X_train, y_train)
print_score(m)

[197.22239420132144, 451.81932208543583, 0.9903649754493682, 0.946780370302373]


# Predicting on the test set

In [113]:
# Load the test file the same way as the training file (`date_time` parsed as datetimes).
valid_raw = pd.read_csv(f'{PATH}Test.csv',low_memory = False,parse_dates=['date_time'])

In [114]:
# Keep the original timestamps for the submission file. `valid` goes through add_datepart
# below, which presumably replaces the `date_time` column — confirm.
date_time = valid_raw['date_time']

In [115]:
# Apply the same preprocessing function as used for the training frame.
valid = preprocessing_data(valid_raw)

<class 'pandas.core.frame.DataFrame'>


In [116]:
# Same three columns that are removed from the training data. Single non-mutating drop;
# `errors='ignore'` keeps the cell re-runnable once the columns are gone.
valid = valid.drop(columns=['air_pollution_index', 'wind_direction', 'dew_point'], errors='ignore')

In [117]:
# Unlike the training data, test rows must NOT be dropped: every row needs a prediction,
# and `date_time` (taken from valid_raw) has to stay the same length as the predictions or
# building the submission frame fails. Rows that match the training outlier rules are
# therefore only counted here, not removed.
valid_outliers = (
    (valid['temperature'] == 0)
    | (valid['rain_p_h'] > 60)
    | (valid['snow_p_h'] > 0.2)
)
print(f'{int(valid_outliers.sum())} test rows match the training outlier rules (kept for prediction)')

In [118]:
# Same hand-made hour feature as built for the training frame; 1.5 is the default.
# The rules overlap and run in order, so the last matching line wins
# (hour 19 ends up 8, hour 22 ends up 2).
valid['hourtotraffic'] = 1.5
# NOTE(review): `hour >= 22` AND `hour <= 5` can never both hold, so this line assigns
# nothing and hours 23 and 0-5 keep the 1.5 default; `|` was probably intended. The cell
# that builds this feature for `df` uses the same mask — change both together, or the
# train and test features will disagree.
valid.loc[(valid.hour >= 22) & (valid.hour <= 5), 'hourtotraffic'] = 0
valid.loc[(valid.hour >= 19) & (valid.hour <= 22) , 'hourtotraffic'] = 2
valid.loc[(valid.hour >= 14) & (valid.hour  <=19) , 'hourtotraffic'] = 8
valid.loc[(valid.hour>=6) & (valid.hour <= 9), 'hourtotraffic'] = 1

In [119]:
# Hand-assigned priority per holiday name; 7 is the fallback for any value not listed below.
valid['holidaypriority'] = 7
valid.loc[valid.is_holiday == 'None', 'holidaypriority'] = 50

# (priority, holiday names) pairs, applied in this order.
valid_holiday_groups = (
    (10, ['Columbus Day','Veterans Day','Washingtons Birthday','State Fair']),
    (5, ['Thanksgiving Day','Christmas Day','New Years Day','Independence Day','Labor Day']),
    (4, ['Memorial Day','Martin Luther King Day']),
)
for valid_priority, valid_holiday_names in valid_holiday_groups:
    valid.loc[valid.is_holiday.isin(valid_holiday_names), 'holidaypriority'] = valid_priority


In [120]:
# NOTE(review): fastai helper, called only for its effect on `valid`. Presumably it re-codes
# the categorical columns of `valid` using the categories learned on `df` — confirm.
apply_cats(valid,df)

In [121]:
# Pass the NA dictionary returned by the training-time proc_df call so the test frame is
# processed with the training fill values and not with statistics recomputed on the test set.
check,_,_ = proc_df(valid, na_dict=nas)

In [128]:
# Predict with the fitted forest on the processed test frame.
answer = m.predict(check)

In [129]:
# Quick look at the predictions (numpy abbreviates the middle of the array).
answer

array([ 649.15797,  648.42789,  654.10919, ..., 2465.11829, 1791.06603, 1257.56892])

In [130]:
dict = {
    'date_time':date_time,
    'traffic_volume':answer
}

In [131]:
answer_final2 = pd.DataFrame(dict)

In [132]:
answer_final2.to_csv('final2.csv',header=True,index=False)