In [104]:
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sea
import sklearn
import sklearn.ensemble as ensemble
import sklearn.tree as tree
import sklearn.metrics as metrics
import sklearn.linear_model as linear
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import RFE

# Hackathon Shinkansen Travel Experience

In [58]:
train_s = pd.read_csv("Surveydata_train.csv")
train_t = pd.read_csv("Traveldata_train.csv")
test_s = pd.read_csv("Surveydata_test.csv")
test_t = pd.read_csv("Traveldata_test.csv")

# Merge survey and travel data for train and test

In [59]:
train = train_s.merge(train_t,how='outer', left_on='ID', right_on='ID')
test = test_s.merge(test_t,how='outer', left_on='ID', right_on='ID')

# Dependant variable balance check

In [2]:
#print('Train Data split \n',train['Overall_Experience'].value_counts())

In [None]:
## the dependant variable is split quite evenly in training dataset
## dependant variable not present in test set 

# Check categorical values

In [1]:
#for column in train.drop('ID',1).columns:

    #print(train[column].value_counts(),'\n')

In [97]:
## commented out after check ##

# for column in test.drop('ID',1).columns:

    #print(test[column].value_counts(),'\n')

In [None]:
## categorical variables make sense but require ordinal label encoding (more positive result = high number)

# Encoding

In [60]:
scale_cols = ['Seat_Comfort','Arrival_Time_Convenient','Catering','Onboard_Wifi_Service','Onboard_Entertainment','Online_Support','Ease_of_Online_Booking','Onboard_Service','Legroom','Baggage_Handling','CheckIn_Service','Cleanliness','Online_Boarding']
scale_dic = {'Extremely Poor':0,'Poor':1,'Needs Improvement':2,'Acceptable':3,'Good':4,'Excellent':5} 
platform_dic = {'Very Inconvenient':0,'Inconvenient':1,'Needs Improvement':2,'Manageable':3,'Convenient':4,'Very Convenient':5}

In [61]:
##TEST dataset##

for columns in scale_cols:
    test = test.replace({columns:scale_dic})

test = test.replace({'Platform_Location':platform_dic})

test = pd.get_dummies(test, drop_first = True)
#print(test.head(5))

##TRAIN dataset##

for columns in scale_cols:
    train = train.replace({columns:scale_dic})

train = train.replace({'Platform_Location':platform_dic})

train = pd.get_dummies(train, drop_first = True)
#print(train.head(5))

# Fill N/A's

In [62]:
for i in train:
    train[i].fillna(value = train[i].median(), inplace =True)
#print(train.info())

for i in test:
    test[i].fillna(value = test[i].median(), inplace =True)
#print(test.info())

# Split independant and dependant variables

In [92]:
#94.9 'ID','Overall_Experience','Arrival_Delay_in_Mins','Departure_Delay_in_Mins','Seat_Class_Ordinary'
X = train.drop(columns = ['ID','Overall_Experience','Departure_Delay_in_Mins','Seat_Class_Ordinary'], axis=1)
y = train['Overall_Experience']

test_X = test.drop(columns = ['ID','Departure_Delay_in_Mins','Seat_Class_Ordinary'], axis=1)

# Split train into two data sets 

In [93]:
train_X1, train_X2, train_y1, train_y2 = sklearn.model_selection.train_test_split(X, y, test_size = 0.1, random_state = 1, stratify = y)

# Grid Search CV

In [94]:
param_grid = {'n_estimators':[200], 'max_depth': [15,18,20],'min_samples_split': [2,4,6],'criterion':['gini']}

grid_obj = GridSearchCV(ensemble.RandomForestClassifier(), param_grid, scoring='accuracy', cv=3, n_jobs=-1)

grid_obj.fit(train_X1,train_y1)

print(grid_obj.best_score_)

print(grid_obj.best_params_)

0.9503537918689328
{'criterion': 'gini', 'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 200}


# Train and test with recursive feature extraction

In [118]:
### sklearn.feature_selection.RFE
# 18 features

rfe = RFE(estimator=ensemble.RandomForestClassifier(n_estimators=200,max_depth=20,min_samples_split=2,criterion='gini'), n_features_to_select=20)

rfe.fit(train_X1,train_y1)

y1_pred_train = rfe.predict(train_X1)
y2_pred_train = rfe.predict(train_X2)

print('Accuracy 1',round(metrics.accuracy_score(train_y1,y1_pred_train),3))
print('Accuracy 2',round(metrics.accuracy_score(train_y2,y2_pred_train),3))

Accuracy 1 0.995
Accuracy 2 0.95


# Fit to whole dataset

In [119]:
rfe.fit(X,y)

RFE(estimator=RandomForestClassifier(max_depth=20, n_estimators=200),
    n_features_to_select=20)

# Importances

In [52]:
importances = model.feature_importances_
columns = train_X1.columns
importance_df = pd.DataFrame(importances, index = columns, columns = {"Importance"})
importance_df.sort_values(['Importance'],ascending=False)

Unnamed: 0,Importance
Onboard_Entertainment,0.206368
Seat_Comfort,0.142941
Ease_of_Online_Booking,0.081308
Online_Support,0.072315
Legroom,0.045147
Travel_Class_Eco,0.044707
Online_Boarding,0.03711
Catering,0.035376
Onboard_Service,0.0336
Gender_Male,0.032443


# RUN ON TEST AND DOWNLOAD CSV

In [120]:
y_pred_test = rfe.predict(test_X)
test['Overall_Experience'] = y_pred_test
print(test[['ID','Overall_Experience']])
test = test[['ID','Overall_Experience']]
test.to_csv(r'C:\Users\VK\MIT\Hackathon\Submit9.csv', index=False)

             ID  Overall_Experience
0      99900001                   1
1      99900002                   1
2      99900003                   1
3      99900004                   0
4      99900005                   1
...         ...                 ...
35597  99935598                   0
35598  99935599                   1
35599  99935600                   1
35600  99935601                   1
35601  99935602                   0

[35602 rows x 2 columns]
