In [99]:
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sea
import sklearn
import sklearn.ensemble as ensemble
import sklearn.tree as tree
import sklearn.metrics as metrics
import sklearn.linear_model as linear
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import RFE

# Hackathon Shinkansen Travel Experience

In [100]:
# Load the four raw CSV files (survey answers and travel details, for the
# train and test sets). Paths are relative to the notebook's working directory.
train_s, train_t, test_s, test_t = (
    pd.read_csv(csv_name)
    for csv_name in (
        "Surveydata_train.csv",
        "Traveldata_train.csv",
        "Surveydata_test.csv",
        "Traveldata_test.csv",
    )
)

# Merge survey and travel data for train and test

In [101]:
# Join the survey answers to the travel details on the shared 'ID' column.
# An outer join keeps IDs that are present in only one of the two files;
# the columns coming from the other file are NaN for those rows.
train = pd.merge(train_s, train_t, how='outer', on='ID')
test = pd.merge(test_s, test_t, how='outer', on='ID')

# Dependent variable balance check

In [102]:
# Show how many training rows fall into each class of the target column.
print(
    'Train Data split \n',
    train.Overall_Experience.value_counts(),
)

Train Data split 
 1    51593
0    42786
Name: Overall_Experience, dtype: int64


In [103]:
## The dependent variable is split fairly evenly in the training dataset.
## The dependent variable was removed from the test set for the hackathon.

# Check categorical values

In [109]:
def valCounts(ds):
    """Print the value counts of every column of ``ds`` except ``ID``.

    Parameters
    ----------
    ds : pandas.DataFrame
        Frame to summarise. It must contain an ``ID`` column; ``drop`` raises
        ``KeyError`` otherwise.

    Returns
    -------
    None
        The counts are printed column by column, each followed by a blank line.
    """
    # Use the `columns=` keyword rather than a positional axis
    # (`ds.drop('ID', 1)`): pandas deprecated the positional form with a
    # FutureWarning and pandas 2.x no longer accepts it.
    for column in ds.drop(columns='ID').columns:
        print(ds[column].value_counts(), '\n')
# Print the per-column value counts for the test set first, then the train set.
for heading, frame in (("TEST \n", test), ("TRAIN \n", train)):
    print(heading)
    valCounts(frame)

TEST 

Acceptable           8003
Needs Improvement    7831
Good                 7808
Poor                 5755
Excellent            4854
Extremely Poor       1329
Name: Seat_Comfort, dtype: int64 

Ordinary     17860
Green Car    17742
Name: Seat_Class, dtype: int64 

Good                 7361
Excellent            6589
Acceptable           5844
Needs Improvement    5684
Poor                 5131
Extremely Poor       1668
Name: Arrival_Time_Convenient, dtype: int64 

Acceptable           7133
Needs Improvement    6701
Good                 6701
Poor                 5178
Excellent            5025
Extremely Poor       1507
Name: Catering, dtype: int64 

Manageable           9364
Convenient           8178
Needs Improvement    6703
Inconvenient         6134
Very Convenient      5211
Name: Platform_Location, dtype: int64 

Good                 8743
Excellent            7875
Acceptable           7495
Needs Improvement    7456
Poor                 3980
Extremely Poor         41
Name: Onboard_Wi

  for column in ds.drop('ID',1).columns:
  for column in ds.drop('ID',1).columns:


39.0    2688
25.0    2542
40.0    2333
44.0    2291
41.0    2248
        ... 
73.0      47
79.0      39
76.0      38
78.0      30
85.0      18
Name: Age, Length: 75, dtype: int64 

Business Travel    58617
Personal Travel    26536
Name: Type_Travel, dtype: int64 

Eco         49342
Business    45037
Name: Travel_Class, dtype: int64 

1963    72
1789    68
1894    65
1639    65
1981    64
        ..
5028     1
4312     1
5267     1
4094     1
4156     1
Name: Travel_Distance, Length: 5210, dtype: int64 

0.0      53339
1.0       2734
2.0       2096
3.0       1838
4.0       1641
         ...  
933.0        1
292.0        1
323.0        1
815.0        1
610.0        1
Name: Departure_Delay_in_Mins, Length: 437, dtype: int64 

0.0      52915
1.0       1995
2.0       1899
3.0       1757
4.0       1713
         ...  
586.0        1
399.0        1
920.0        1
267.0        1
593.0        1
Name: Arrival_Delay_in_Mins, Length: 434, dtype: int64 



In [91]:
## The categorical values look sensible, but the rating columns need ordinal label encoding (a more positive answer maps to a higher number).

# Encoding

In [52]:
# Survey columns whose answers use the six-step quality scale below.
scale_cols = [
    'Seat_Comfort',
    'Arrival_Time_Convenient',
    'Catering',
    'Onboard_Wifi_Service',
    'Onboard_Entertainment',
    'Online_Support',
    'Ease_of_Online_Booking',
    'Onboard_Service',
    'Legroom',
    'Baggage_Handling',
    'CheckIn_Service',
    'Cleanliness',
    'Online_Boarding',
]

# Ordinal codes: labels are listed from worst to best, and a label's position
# in the list is its encoded value (0..5).
scale_dic = {
    label: rank
    for rank, label in enumerate(
        ['Extremely Poor', 'Poor', 'Needs Improvement', 'Acceptable', 'Good', 'Excellent']
    )
}

# Same idea for the platform-location answers, which use their own wording.
platform_dic = {
    label: rank
    for rank, label in enumerate(
        ['Very Inconvenient', 'Inconvenient', 'Needs Improvement', 'Manageable', 'Convenient', 'Very Convenient']
    )
}

In [74]:
def scaler(ds):
    """Encode the categorical columns of ``ds`` as numbers and return the result.

    Steps, in order:
      1. every column named in ``scale_cols`` has its labels replaced using
         ``scale_dic`` (ordinal encoding);
      2. ``Platform_Location`` has its labels replaced using ``platform_dic``;
      3. ``pd.get_dummies(..., drop_first=True)`` dummy-encodes the remaining
         categorical columns, dropping the first level of each.

    ``ds`` itself is not modified; a new frame is returned. Values that have no
    entry in the mapping dictionaries are left as they are by ``replace``.
    """
    # A nested {column: {label: code}} mapping restricts each replacement to
    # its own column, so all rating columns are handled in a single call.
    rating_maps = {name: scale_dic for name in scale_cols}
    encoded = ds.replace(rating_maps)

    encoded = encoded.replace({'Platform_Location': platform_dic})

    return pd.get_dummies(encoded, drop_first=True)
    
# Encode both frames with the same mappings (test is encoded first, then train).
test, train = scaler(test), scaler(train)
# Uncomment to inspect the encoded frames:
# print(test.head(2))
# print(train.head(2))

# Fill N/A's

In [76]:
def nas(ds):
    """Fill the missing values of each column of ``ds`` with that column's median.

    Parameters
    ----------
    ds : pandas.DataFrame
        Frame to impute. It is modified in place. Any column that contains
        missing values must support ``.median()``.

    Returns
    -------
    pandas.DataFrame
        The same ``ds`` object, returned so the call can be used in an assignment.
    """
    for col in ds.columns:
        column = ds[col]
        # Columns without gaps are skipped, so no median is computed for them.
        if column.isna().any():
            # Assign the filled column back explicitly. The previous
            # `ds[col].fillna(..., inplace=True)` operated on a temporary
            # Series: pandas warns about that chained in-place pattern, and
            # with Copy-on-Write enabled it leaves `ds` unchanged.
            ds[col] = column.fillna(column.median())
    return ds

# Impute train and test independently: each frame is filled with its own medians.
train, test = nas(train), nas(test)
# Uncomment to check the non-null counts after imputation:
# print(train.info())
# print(test.info())

# Split independent and dependent variables

In [77]:
# Note kept from an earlier experiment (94.9 is presumably the accuracy in % - TODO confirm):
# 94.9 when dropping 'ID','Overall_Experience','Arrival_Delay_in_Mins','Departure_Delay_in_Mins','Seat_Class_Ordinary'

# Target column.
y = train['Overall_Experience']

# Feature matrix: everything except the identifier, the target and the two
# columns excluded from modelling.
X = train.drop(
    columns=[
        'ID',
        'Overall_Experience',
        'Departure_Delay_in_Mins',
        'Seat_Class_Ordinary',
    ]
)

# The test frame has no target column, so only the other three are dropped.
test_X = test.drop(
    columns=[
        'ID',
        'Departure_Delay_in_Mins',
        'Seat_Class_Ordinary',
    ]
)

# Split train into two data sets 

In [78]:
# Hold out 10% of the training rows as an internal validation set
# (train_X2 / train_y2). The split is stratified on the target and seeded.
train_X1, train_X2, train_y1, train_y2 = sklearn.model_selection.train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=1,
    stratify=y,
)

# Grid Search CV

In [79]:
# Hyper-parameter grid for the random forest: 200 trees with the gini
# criterion, searching over tree depth and the minimum samples needed to split.
param_grid = {
    'n_estimators': [200],
    'max_depth': [15, 18, 20],
    'min_samples_split': [2, 4, 6],
    'criterion': ['gini'],
}

# The forest is seeded so that the cross-validated scores and the selected
# parameters are the same on every run; an unseeded forest gives slightly
# different numbers each time the cell is executed.
grid_obj = GridSearchCV(
    ensemble.RandomForestClassifier(random_state=1),
    param_grid,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
)

grid_obj.fit(train_X1, train_y1)

# Best mean cross-validated accuracy and the parameter combination that produced it.
print(grid_obj.best_score_)

print(grid_obj.best_params_)

0.9501889697800346
{'criterion': 'gini', 'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 200}


# Train and test with recursive feature elimination (RFE)

In [82]:
# Recursive feature elimination around a random forest: keep the 19 most
# important features and train on those. The forest is seeded so that the
# selected features and the accuracies below are reproducible between runs.
rfe = RFE(
    estimator=ensemble.RandomForestClassifier(
        n_estimators=200,
        max_depth=20,
        min_samples_split=2,
        criterion='gini',
        random_state=1,
    ),
    n_features_to_select=19,
)
rfe.fit(train_X1, train_y1)

# Predictions on the rows the model was fitted on (1) and on the held-out rows (2).
y1_pred_train = rfe.predict(train_X1)
y2_pred_train = rfe.predict(train_X2)

# 'Accuracy 1' is the training accuracy; 'Accuracy 2' is the hold-out accuracy.
print('Accuracy 1', round(metrics.accuracy_score(train_y1, y1_pred_train), 3))
print('Accuracy 2', round(metrics.accuracy_score(train_y2, y2_pred_train), 3))

Accuracy 1 0.995
Accuracy 2 0.95


# Fit to whole dataset

In [83]:
# Refit the same RFE pipeline on all training rows (hold-out included) before
# predicting the test set. Left as the cell's last expression so the fitted
# estimator is displayed.
rfe.fit(X=X, y=y)

RFE(estimator=RandomForestClassifier(max_depth=20, n_estimators=200),
    n_features_to_select=19)

# RUN ON TEST AND DOWNLOAD CSV

In [86]:
# Predict the target for the blind test set and attach it to the test frame.
y_pred_test = rfe.predict(test_X)
test['Overall_Experience'] = y_pred_test

# Keep only the two submission columns; `test` is rebound to this narrow frame.
test = test[['ID', 'Overall_Experience']]
print(test)

# Write the submission next to the notebook (the input CSVs are read from the
# working directory too) instead of an absolute path inside one user's home
# directory, which fails on any other machine.
test.to_csv('Submit10.csv', index=False)

             ID  Overall_Experience
0      99900001                   1
1      99900002                   1
2      99900003                   1
3      99900004                   0
4      99900005                   1
...         ...                 ...
35597  99935598                   0
35598  99935599                   1
35599  99935600                   1
35600  99935601                   1
35601  99935602                   0

[35602 rows x 2 columns]


#### Outcome on blind test data in a time-limited hackathon:
0.9509859 accuracy (about 95.1%)
11th place

Potential improvements:
    tune the number of features RFE keeps (n_features_to_select) with a grid search
    train several ML algorithms and combine their predictions by majority vote