## General Imports

In [76]:
%matplotlib inline

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from datetime import datetime

from scipy import stats

from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPRegressor

from sklearn.metrics import mean_absolute_error, accuracy_score, make_scorer, precision_score, recall_score

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import f_regression
from sklearn.feature_selection import f_classif

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

# from sklearn.preprocessing import Imputer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import Normalizer

from sklearn.decomposition import PCA

# Load the seaborn look FIRST: the style sheet ships its own figure.figsize,
# so applying it after mpl.rc() would silently discard the (10, 8) override.
# Matplotlib 3.6 renamed the bundled style to 'seaborn-v0_8' (the old alias was
# removed later), so use whichever name this installation actually provides.
seaborn_style = 'seaborn' if 'seaborn' in plt.style.available else 'seaborn-v0_8'
plt.style.use(seaborn_style)
mpl.rc(group='figure', figsize=(10,8))

from warnings import filterwarnings
# NOTE: this silences *every* warning for the rest of the session, including
# pandas/scikit-learn deprecation and chained-assignment warnings.
filterwarnings('ignore')

## Load Dataset

In [88]:
filepath = '../data/'
train = pd.read_csv(filepath + 'train.csv')
test = pd.read_csv(filepath + 'test.csv')

# `test` stays untouched (its `tripid` column is needed for the submission);
# `T` is the working copy that gets reduced to model features.
T = test.copy()

# Encode the target as 1 (correct) / 0 (incorrect). Only the label column is
# mapped, so no other column can be rewritten by accident, and the explicit
# int64 cast fails loudly if an unexpected label value is present.
train['label'] = train['label'].map({'correct': 1, 'incorrect': 0}).astype('int64')

# Features: every column except the first and the last one (the last column is
# assumed to be `label` -- TODO confirm against train.csv). Take an explicit
# copy so later in-place edits of X never operate on a slice of `train`.
X = train.iloc[:, 1:-1].copy()
Y = train['label']

In [89]:
# Preview the raw feature frame (train without its first and last columns).
X

Unnamed: 0,additional_fare,duration,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,pickup_time,drop_time,pick_lat,pick_lon,drop_lat,drop_lon,fare
0,10.5,834,56.0,0.0000,64.0,11/1/2019 0:20,11/1/2019 0:34,6.86252,79.8993,6.90330,79.8783,270.32
1,10.5,791,47.0,0.0000,134.0,11/1/2019 0:56,11/1/2019 1:09,6.88589,79.8984,6.91373,79.8923,197.85
2,10.5,1087,80.0,0.0000,61.0,11/1/2019 1:08,11/1/2019 1:26,6.90839,79.8651,6.93669,79.9146,301.64
3,10.5,598,271.0,15.6638,68.0,11/1/2019 2:27,11/1/2019 2:37,6.92570,79.8895,6.92748,79.8971,82.30
4,,1020,,,,11/1/2019 3:34,11/1/2019 3:51,6.87441,79.8615,6.84478,79.9290,358.39
...,...,...,...,...,...,...,...,...,...,...,...,...
17171,10.5,838,93.0,5.4219,451.0,1/31/2020 22:07,1/31/2020 22:21,7.29073,80.6367,7.28891,80.6557,198.26
17172,10.5,2151,428.0,0.0000,39.0,1/31/2020 23:07,1/31/2020 23:43,6.90569,79.8516,6.95089,79.9389,581.23
17173,10.5,263,9.0,0.0000,110.0,1/31/2020 23:21,1/31/2020 23:25,7.09210,79.9000,7.10135,79.9017,76.20
17174,10.5,858,115.0,0.0000,317.0,1/31/2020 23:39,1/31/2020 23:53,6.94540,79.8768,6.93574,79.9010,133.31


## Filling Missing values

#### Note: missing `duration` values were already filled in Excel before loading

In [90]:
# Impute missing values column by column.
# The frame is rebound instead of calling `X[col].fillna(..., inplace=True)`:
# that chained in-place form acts on a temporary Series and is a silent no-op
# under pandas Copy-on-Write (and only warns on pandas 2.x).
X = X.fillna({
    'additional_fare': 10.5,  # the value seen in every displayed row; presumably the standard surcharge
    'meter_waiting': X['meter_waiting'].mean(),
    'meter_waiting_fare': X['meter_waiting_fare'].mean(),
    'meter_waiting_till_pickup': X['meter_waiting_till_pickup'].mean(),
})

# A missing fare falls back to additional_fare + meter_waiting_fare; this must
# run after the fill above so that both operands are already complete.
X = X.assign(fare=X['fare'].fillna(X['additional_fare'] + X['meter_waiting_fare']))

In [91]:
# Preview the feature frame again to check the result of the imputation cell.
X

Unnamed: 0,additional_fare,duration,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,pickup_time,drop_time,pick_lat,pick_lon,drop_lat,drop_lon,fare
0,10.5,834,56.000000,0.000000,64.000000,11/1/2019 0:20,11/1/2019 0:34,6.86252,79.8993,6.90330,79.8783,270.32
1,10.5,791,47.000000,0.000000,134.000000,11/1/2019 0:56,11/1/2019 1:09,6.88589,79.8984,6.91373,79.8923,197.85
2,10.5,1087,80.000000,0.000000,61.000000,11/1/2019 1:08,11/1/2019 1:26,6.90839,79.8651,6.93669,79.9146,301.64
3,10.5,598,271.000000,15.663800,68.000000,11/1/2019 2:27,11/1/2019 2:37,6.92570,79.8895,6.92748,79.8971,82.30
4,10.5,1020,629.074231,32.057666,112.466832,11/1/2019 3:34,11/1/2019 3:51,6.87441,79.8615,6.84478,79.9290,358.39
...,...,...,...,...,...,...,...,...,...,...,...,...
17171,10.5,838,93.000000,5.421900,451.000000,1/31/2020 22:07,1/31/2020 22:21,7.29073,80.6367,7.28891,80.6557,198.26
17172,10.5,2151,428.000000,0.000000,39.000000,1/31/2020 23:07,1/31/2020 23:43,6.90569,79.8516,6.95089,79.9389,581.23
17173,10.5,263,9.000000,0.000000,110.000000,1/31/2020 23:21,1/31/2020 23:25,7.09210,79.9000,7.10135,79.9017,76.20
17174,10.5,858,115.000000,0.000000,317.000000,1/31/2020 23:39,1/31/2020 23:53,6.94540,79.8768,6.93574,79.9010,133.31


## Drop datetime columns (encoding them is still a TODO)

In [92]:
# The raw timestamp strings are discarded for the time being.
# TODO: encode in pickup_year,pickup_month,pickup_date,pickup_hour,pickup_min etc

# T additionally loses `tripid`, which X's positional slice already excluded.
X = X.drop(columns=['pickup_time', 'drop_time'])
T = T.drop(columns=['tripid', 'pickup_time', 'drop_time'])

## Train/Test Split

In [93]:
# Hold out 20% of the rows for validation.
# random_state pins the shuffle so the accuracy reported below is reproducible
# across kernel restarts; stratify=Y keeps the correct/incorrect ratio the same
# in both splits, which matters because the labels look heavily imbalanced.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=0, stratify=Y
)

## Normalize

In [94]:
# NOTE(review): normalisation is currently disabled, so the PCA in the next
# section is fitted on unscaled features. Also note that sklearn's Normalizer
# rescales each *row* to unit norm; if per-column scaling is what is wanted
# before PCA, a column-wise scaler would be the tool -- TODO confirm intent.
# normalizer = Normalizer()
# normalizer.fit(X_train)

# X_train = normalizer.transform(X_train)
# X_test = normalizer.transform(X_test)

## PCA

In [95]:
# Learn a 10-component projection from the training split only, then apply the
# same fitted projection to both splits.
pca = PCA(n_components=10)
pca.fit(X_train)

X_train, X_test = pca.transform(X_train), pca.transform(X_test)

## Model

In [96]:
# Train a seeded random forest on the projected training split and report the
# hold-out accuracy as the cell's output.
model = RandomForestClassifier(random_state=0, n_jobs=2).fit(X_train, Y_train)
Y_predict = model.predict(X_test)
accuracy_score(y_true=Y_test, y_pred=Y_predict)

0.9473224679860303

## Refit on all training data and predict the test set

In [97]:
# Inspect the encoded label Series that the final model is trained on.
Y

0        1
1        1
2        1
3        1
4        1
        ..
17171    1
17172    1
17173    1
17174    1
17175    1
Name: label, Length: 17176, dtype: int64

In [98]:
# Final pass: fit the projection on the complete training features and push
# both the training features and the competition test features through it.
pca = PCA(n_components=10).fit(X)
X, T = pca.transform(X), pca.transform(T)

# Same seeded forest configuration as in the validation run, now trained on
# every labelled row.
model = RandomForestClassifier(random_state=0, n_jobs=2)
model.fit(X, Y)
T_Predict = model.predict(T)

T_Predict

array([1, 1, 1, ..., 1, 1, 1], dtype=int64)

## Submission

In [99]:
name = "randomforest_pca10_jobs2_state0"

# Assemble the submission first, so a failure here cannot leave an empty,
# timestamped file behind in ../submissions/.
submit = pd.DataFrame({"tripid": test["tripid"], "prediction": T_Predict})

# The file name is the run name plus a month-day-time stamp.
date = datetime.now().strftime('%m-%d-%H_%M_%S')
# newline="" follows the csv-module guidance for CSV file objects: it switches
# off text-mode newline translation, so the line terminator chosen by the
# writer reaches the disk unchanged (no "\r\n" rewriting on Windows).
# The handle stays open on purpose; the cell that writes the CSV consumes it.
file = open("../submissions/" + name + "_" + date + ".csv", "w+", newline="")

submit

Unnamed: 0,tripid,prediction
0,213284604,1
1,213286352,1
2,213293973,1
3,213294622,1
4,213298687,1
...,...,...
8571,222856243,1
8572,222857785,1
8573,222858416,1
8574,222858691,1


In [100]:
# Serialise to text first and normalise the row terminator to "\n" ourselves.
# This avoids the `line_terminator` keyword, which pandas renamed to
# `lineterminator` in 1.5 and removed in 2.0, so the cell runs on old and new
# pandas alike.
csv_text = submit.to_csv(header=True, index=False).replace('\r\n', '\n')

# `with` closes the handle (flushing it on the way) even if the write fails;
# the previous flush-only version left the file open for the kernel's lifetime.
with file:
    file.write(csv_text)