# **Random Forest Classifier**

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # data visualization
import seaborn as sns # statistical data visualization
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

import os
import os.path

In [2]:
import modules.save_model as save

In [3]:
data = 'Datasets/Model_Final.csv'

In [4]:
df = pd.read_csv(data)

In [5]:
df.shape

(22674, 10)

In [6]:
df.head()

Unnamed: 0,arr_delay,crs_arr_time,fl_date_day_of_month,day_of_week,origin_city_avg_dep_delay,origin_day_week_avg_dep_delay,dest_day_week_avg_arr_delay,dest_city_avg_arr_delay,op_unique_carrier_day_week_avg_arr_delay,distance_group
0,40.0,1305,1,Monday,8.405941,31.363636,-8.5,1.142857,1.444444,5
1,-38.0,1305,8,Monday,8.405941,31.363636,-8.5,1.142857,1.444444,5
2,-3.0,2330,11,Thursday,3.479452,-4.307692,13.27907,-1.692182,-3.4,6
3,-1.0,500,13,Saturday,3.479452,9.75,-9.4875,-1.692182,14.0,6
4,10.0,2115,22,Monday,9.44086,9.356322,-8.5,1.142857,1.444444,6


In [7]:
X = df.drop(['arr_delay', 'day_of_week', 'distance_group', 'fl_date_day_of_month'], axis=1)
y = df['arr_delay']

In [8]:
X.head()

Unnamed: 0,crs_arr_time,origin_city_avg_dep_delay,origin_day_week_avg_dep_delay,dest_day_week_avg_arr_delay,dest_city_avg_arr_delay,op_unique_carrier_day_week_avg_arr_delay
0,1305,8.405941,31.363636,-8.5,1.142857,1.444444
1,1305,8.405941,31.363636,-8.5,1.142857,1.444444
2,2330,3.479452,-4.307692,13.27907,-1.692182,-3.4
3,500,3.479452,9.75,-9.4875,-1.692182,14.0
4,2115,9.44086,9.356322,-8.5,1.142857,1.444444


### Split the data into train and test sets

In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 42)

In [10]:
# scale data using StandardScaler
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

y_train = scaler.fit_transform(y_train.values.reshape(-1, 1))
y_test = scaler.transform(y_test.values.reshape(-1, 1))


In [11]:
X_train.shape

(15191, 6)

In [12]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import accuracy_score

In [13]:
def random_forest(X_train, X_test, y_train, y_test, random_state=99):
    '''Returns a tuple of random forest regressor object and a prediction array of the test data.'''
    
    rand_for = RandomForestRegressor(max_depth=100, min_samples_split=3, min_samples_leaf=3)
    rand_for.fit(X_train, y_train)
    y_pred = rand_for.predict(X_test)
    accuracy_score = rand_for.score(X_test, y_test)
    
    print('R2 Score: {0:0.4f}'.format(r2_score(y_test, y_pred)))
    print('MAE: {0:0.4f}'.format(mean_absolute_error(y_test, y_pred)))
    print('Accuracy score on test data:', accuracy_score)
    
    return rand_for, y_pred

In [14]:
tree, y_pred = random_forest(X_train, X_test, y_train, y_test)

R2 Score: 0.1186
MAE: 0.4392
Accuracy score on test data: 0.1185537158597707



**R2 - Coefficient of Determination is 0.1186.**\
`The goodness of fit of a set of predictions to the actual values. The value ranges between 0 and 1 for no fit (0) and perfect fit (1).`\
**MAE - Mean Absolute Error is 0.439**\
 `Got about 0.43% of test data wrong.`

In [15]:
# save model
save.jar(tree, 'random_forest_regressor')

In [16]:
df.head()

Unnamed: 0,arr_delay,crs_arr_time,fl_date_day_of_month,day_of_week,origin_city_avg_dep_delay,origin_day_week_avg_dep_delay,dest_day_week_avg_arr_delay,dest_city_avg_arr_delay,op_unique_carrier_day_week_avg_arr_delay,distance_group
0,40.0,1305,1,Monday,8.405941,31.363636,-8.5,1.142857,1.444444,5
1,-38.0,1305,8,Monday,8.405941,31.363636,-8.5,1.142857,1.444444,5
2,-3.0,2330,11,Thursday,3.479452,-4.307692,13.27907,-1.692182,-3.4,6
3,-1.0,500,13,Saturday,3.479452,9.75,-9.4875,-1.692182,14.0,6
4,10.0,2115,22,Monday,9.44086,9.356322,-8.5,1.142857,1.444444,6


In [17]:
# add column of delayed to df with values 'delayed' = arr_delay less than 0, 'not delayed' = arr_delay greater than or equal to 0
df['delayed'] = np.where(df['arr_delay'] < 0, 'delayed', 'not delayed')



In [37]:
X = df.drop(['arr_delay', 'day_of_week', 'distance_group', 'delayed'], axis=1)
y = df['delayed']

In [39]:
X.head()

Unnamed: 0,crs_arr_time,fl_date_day_of_month,origin_city_avg_dep_delay,origin_day_week_avg_dep_delay,dest_day_week_avg_arr_delay,dest_city_avg_arr_delay,op_unique_carrier_day_week_avg_arr_delay
0,1305,1,8.405941,31.363636,-8.5,1.142857,1.444444
1,1305,8,8.405941,31.363636,-8.5,1.142857,1.444444
2,2330,11,3.479452,-4.307692,13.27907,-1.692182,-3.4
3,500,13,3.479452,9.75,-9.4875,-1.692182,14.0
4,2115,22,9.44086,9.356322,-8.5,1.142857,1.444444


In [40]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 42)

In [41]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [42]:
#import RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100, random_state = 42)
clf.fit(X_train, y_train)

In [43]:
feature_imp = pd.Series(clf.feature_importances_, index=X.columns).sort_values(ascending=False)
feature_imp

crs_arr_time                                0.216104
dest_day_week_avg_arr_delay                 0.150934
fl_date_day_of_month                        0.147316
origin_day_week_avg_dep_delay               0.145489
origin_city_avg_dep_delay                   0.114639
dest_city_avg_arr_delay                     0.113280
op_unique_carrier_day_week_avg_arr_delay    0.112238
dtype: float64

`The most important feature is destination, average weekly arrival delay, origin averaage daily depature delay. All features seem to contribute almost equally`

In [44]:
from lets_plot import *
LetsPlot.setup_html()

In [45]:
# plot feature importance using lets-plot
y = feature_imp
x = feature_imp.index.tolist()
ggplot({'x': x, 'y': y}, aes(x='x', y='y')) + geom_bar(stat='identity', fill='#f68060', alpha=0.8) + \
    ggsize(500, 300) + \
    ggtitle('Feature Importance') + \
    xlab('Features') + \
    ylab('Importance') + \
    theme(axis_text_x=element_text(angle=90, hjust=1))


In [46]:
y

crs_arr_time                                0.216104
dest_day_week_avg_arr_delay                 0.150934
fl_date_day_of_month                        0.147316
origin_day_week_avg_dep_delay               0.145489
origin_city_avg_dep_delay                   0.114639
dest_city_avg_arr_delay                     0.113280
op_unique_carrier_day_week_avg_arr_delay    0.112238
dtype: float64