# Pre-processing & Training Data Development

In [1]:
from datetime import *
from datetime import timedelta

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline

In [2]:
# Read data/EDA_output.csv (path relative to the working directory) into df.
df = pd.read_csv(filepath_or_buffer='data/EDA_output.csv')

In [3]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,PULocationID,DOLocationID,payment_type,fare_amount,extra,tip_amount,tolls_amount,total_amount
0,0,2,2017-03-11 14:05:48,2017-03-11 14:12:34,1,1.26,1,262,237,2,6.5,0.0,0.0,0.0,7.3
1,1,1,2017-06-08 12:59:56,2017-06-08 13:03:49,1,0.7,1,262,140,1,5.0,0.0,1.15,0.0,6.95
2,2,2,2017-06-08 09:08:45,2017-06-08 09:12:59,6,0.81,1,237,141,2,5.0,0.0,0.0,0.0,5.8
3,3,1,2017-03-18 13:12:23,2017-03-18 13:17:35,1,0.7,1,113,234,1,5.5,0.0,10.0,0.0,16.3
4,4,2,2017-03-05 01:12:35,2017-03-05 01:26:06,3,3.18,1,114,50,1,12.5,0.5,2.76,0.0,16.56


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0.1,Unnamed: 0,VendorID,passenger_count,trip_distance,RatecodeID,PULocationID,DOLocationID,payment_type,fare_amount,extra,tip_amount,tolls_amount,total_amount
count,99952.0,99952.0,99952.0,99952.0,99952.0,99952.0,99952.0,99952.0,99952.0,99952.0,99952.0,99952.0,99952.0
mean,50002.929626,1.548553,1.62401,2.944339,1.032786,163.107091,160.888557,1.325426,13.010682,0.336831,1.878334,0.324718,16.352794
std,28867.230107,0.497639,1.264339,3.733306,0.234227,66.964611,70.386984,0.484947,11.057895,0.457072,2.528518,1.482199,13.819162
min,0.0,1.0,0.0,0.01,1.0,1.0,1.0,1.0,0.01,0.0,0.0,0.0,0.31
25%,25003.75,1.0,1.0,1.0,1.0,114.0,107.0,1.0,6.5,0.0,0.0,0.0,8.75
50%,50005.5,2.0,1.0,1.64,1.0,162.0,162.0,1.0,9.5,0.0,1.44,0.0,11.8
75%,75004.25,2.0,2.0,3.05,1.0,233.0,233.0,2.0,14.5,0.5,2.46,0.0,17.8
max,99999.0,2.0,6.0,66.4,5.0,265.0,265.0,4.0,345.0,4.5,80.0,38.0,426.96


In [60]:
# Drop the leftover index column that was added when creating the csv.
# errors='ignore' keeps this cell re-runnable: without it, executing the cell a
# second time raises KeyError because 'Unnamed: 0' has already been removed.
df = df.drop(columns='Unnamed: 0', errors='ignore')

In [61]:
# Convert the pickup and dropoff columns from strings to datetime64.
# The timestamps in this file look like '2017-03-11 14:05:48', i.e. they carry
# no fractional seconds, so the format must not end in '.%f': pandas versions
# that match the format strictly reject such values with a ValueError.
datetime_format = '%Y-%m-%d %H:%M:%S'
for datetime_col in ('tpep_pickup_datetime', 'tpep_dropoff_datetime'):
    df[datetime_col] = pd.to_datetime(df[datetime_col], format=datetime_format)

In [62]:
# Keep only the int64 / float64 columns; non-numeric columns such as the
# pickup/dropoff timestamps are left out of dfo.
numeric_dtypes = ['int64', 'float64']
dfo = df.select_dtypes(include=numeric_dtypes)

# Confirm which columns survived and that none of them contain nulls.
dfo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99952 entries, 0 to 99951
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   VendorID         99952 non-null  int64  
 1   passenger_count  99952 non-null  int64  
 2   trip_distance    99952 non-null  float64
 3   RatecodeID       99952 non-null  int64  
 4   PULocationID     99952 non-null  int64  
 5   DOLocationID     99952 non-null  int64  
 6   payment_type     99952 non-null  int64  
 7   fare_amount      99952 non-null  float64
 8   extra            99952 non-null  float64
 9   tip_amount       99952 non-null  float64
 10  tolls_amount     99952 non-null  float64
 11  total_amount     99952 non-null  float64
dtypes: float64(6), int64(6)
memory usage: 9.2 MB


In [63]:
# One-hot encode any object/categorical columns of dfo.
# NOTE(review): dfo only holds int64/float64 columns, so get_dummies returns the
# frame unchanged. The ID-like columns (VendorID, RatecodeID, PULocationID,
# DOLocationID, payment_type) therefore stay plain integers; pass columns=[...]
# if they are meant to be one-hot encoded.
dummies = pd.get_dummies(data=dfo)

dummies.head(n=5)

Unnamed: 0,VendorID,passenger_count,trip_distance,RatecodeID,PULocationID,DOLocationID,payment_type,fare_amount,extra,tip_amount,tolls_amount,total_amount
0,2,1,1.26,1,262,237,2,6.5,0.0,0.0,0.0,7.3
1,1,1,0.7,1,262,140,1,5.0,0.0,1.15,0.0,6.95
2,2,6,0.81,1,237,141,2,5.0,0.0,0.0,0.0,5.8
3,1,1,0.7,1,113,234,1,5.5,0.0,10.0,0.0,16.3
4,2,3,3.18,1,114,50,1,12.5,0.5,2.76,0.0,16.56


# Modeling

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import GridSearchCV

from sklearn.ensemble import RandomForestClassifier


In [71]:
# Set the feature matrix X and the target y.

# total_amount already contains the tip: in the rows previewed above,
# total_amount == fare_amount + extra + tolls_amount + tip_amount + 0.8.
# Keeping it in X would leak the target into the features, so it is dropped
# together with the target column itself.
X = dummies.drop(columns=['tip_amount', 'total_amount'])

# The classifiers used below need discrete labels, so the tip is truncated to
# whole dollars (e.g. 1.15 -> 1, 2.76 -> 2); each dollar bucket becomes a class.
y = dummies['tip_amount'].astype(int)


In [None]:
# Split the data into training (75%) and testing (25%) subsets.
# random_state pins the shuffle so the split, and every score computed from it,
# is the same after a kernel restart.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=123
)

In [None]:
# Fit the scaler on the training set only, then apply the same transform to the
# test set, so no test-set statistics influence the scaling.
ss = StandardScaler()

# The scaled arrays get their own names: the random-forest cell reads
# X_train_scaled / X_test_scaled, and leaving X_train / X_test untouched keeps
# the unscaled frames available and makes this cell safe to re-run.
X_train_scaled = ss.fit_transform(X_train)
X_test_scaled = ss.transform(X_test)

In [None]:
# Grid search over the number of neighbours for KNN.

# KNN is distance-based, so features must be on comparable scales. Putting the
# scaler inside the pipeline means every CV fold is standardised using only its
# own training part.
knn = KNeighborsClassifier()
knn_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', knn),
])

# Pipeline parameters are addressed as '<step name>__<parameter>'.
param_grid = {'knn__n_neighbors': np.arange(1, 20)}
knn_cv = GridSearchCV(knn_pipeline, param_grid, cv=5)

# Tune on the training split only, so the held-out test rows take no part in
# choosing k.
knn_cv.fit(X_train, y_train)

In [None]:
# Grid search for random forest.

# A small starting grid; widen it once the runtime on this data set is known.
rf_param_grid = {
    'n_estimators': [100, 300],
    'max_depth': [None, 10, 20],
}

# Seeded so the search is repeatable; n_jobs=-1 uses all cores per forest.
rf = RandomForestClassifier(random_state=1, n_jobs=-1)
rf_cv = GridSearchCV(rf, rf_param_grid, cv=5)

# Tune on the training split only, so the test rows stay unseen.
rf_cv.fit(X_train, y_train)

In [None]:
# Random forest: fit on the training split and score on the test split.

clf = RandomForestClassifier(n_estimators=300, random_state=1, n_jobs=-1)

# X_train / X_test are the arrays produced by the split/scaling cells above.
model_res = clf.fit(X_train, y_train)
y_pred = model_res.predict(X_test)
y_pred_prob = model_res.predict_proba(X_test)

# Column 1 of predict_proba is the probability of clf.classes_[1].
# NOTE(review): y has more than two classes, so this is not a "positive class"
# probability -- confirm what lr_probs is meant to be used for.
lr_probs = y_pred_prob[:, 1]

ac = accuracy_score(y_test, y_pred)
# 'weighted' averages the per-class F1 scores weighted by class support.
f1 = f1_score(y_test, y_pred, average='weighted')
cm = confusion_matrix(y_test, y_pred)

print(f'Random Forest: Accuracy={ac:.3f}')

print(f'Random Forest: f1-score={f1:.3f}')

In [None]:
# There is a strong linear correlation between tips and total amount charged

from sklearn import linear_model, preprocessing

# Subsetting our data into our dependent and independent variables.
# Double brackets keep both as single-column DataFrames (2-D), the shape
# scikit-learn estimators expect for X.
X_linear = dummies[["total_amount"]]
y_linear = dummies[["tip_amount"]]  # the target column is 'tip_amount'

# Split the data. This line uses the sklearn function train_test_split().
# The test_size parameter means we can train with 75% of the data, and test on 25%.
# NOTE: this rebinds X_train / X_test / y_train / y_test, which the classifier
# cells above also use; re-run the earlier split cell before re-running them.
X_train, X_test, y_train, y_test = train_test_split(X_linear, y_linear, test_size = 0.25, random_state = 123)

In [None]:

# Plot the relative importance of the (at most) 30 most important features of
# the fitted random forest `clf`.
top_n = 30

# Make importances relative to the max importance (most important feature = 100).
# The scaling is done over ALL features; selecting the top ones happens below.
feature_importance = clf.feature_importances_
feature_importance = 100.0 * (feature_importance / feature_importance.max())

# argsort is ascending, so the last top_n indices are the most important
# features, ordered so that barh draws the largest bar at the top.
sorted_idx = np.argsort(feature_importance)[-top_n:]

# Bar centres: one slot per plotted feature, offset by 0.5.
pos = np.arange(sorted_idx.shape[0]) + .5

fig, ax = plt.subplots(figsize=(10, 10))
ax.barh(pos, feature_importance[sorted_idx], align='center')
ax.set_yticks(pos)
ax.set_yticklabels(X.columns[sorted_idx])
ax.set_xlabel('Relative Importance')
ax.set_title('Variable Importance')
plt.show()