In [None]:
'''
We will evaluate a diverse set of nonlinear and ensemble machine learning algorithms, specifically:

Nonlinear Algorithms:

k-Nearest Neighbors
Classification and Regression Tree
Support Vector Machine
Naive Bayes
Ensemble Algorithms:

Bagged Decision Trees
Random Forest
Extra Trees
Gradient Boosting Machine

'''

In [7]:
# spot check on engineered-features
from pandas import read_csv
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier

import pandas as pd
import numpy as np
import calendar
from sklearn.cluster import MiniBatchKMeans
from sklearn import preprocessing
from tpot import TPOTRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.multiclass import type_of_target
from xgboost import XGBClassifier

In [8]:

# create a dict of standard models to evaluate {name:object}
def define_models(models=dict()):
    # nonlinear models
    models['knn'] = KNeighborsClassifier(n_neighbors=7)
    models['cart'] = DecisionTreeClassifier()
    models['svm'] = SVC()
    models['bayes'] = GaussianNB()
    # ensemble models
    models['bag'] = BaggingClassifier(n_estimators=100)
    models['rf'] = RandomForestClassifier(n_estimators=100)
    models['et'] = ExtraTreesClassifier(n_estimators=100)
    models['gbm'] = GradientBoostingClassifier(n_estimators=100)
    models['xgb'] = XGBClassifier(n_estimators=100)
    print('Defined %d models' % len(models))
    return models
 
# evaluate a single model
def evaluate_model(trainX, trainy, testX, testy, model):
    # fit the model
    model.fit(trainX, trainy)
    # make predictions
    yhat = model.predict(testX)
    # evaluate predictions
    accuracy = accuracy_score(testy, yhat)
    return accuracy * 100.0
 
# evaluate a dict of models {name:object}, returns {name:score}
def evaluate_models(trainX, trainy, testX, testy, models):
    results = dict()
    for name, model in models.items():
        # evaluate the model
        results[name] = evaluate_model(trainX, trainy, testX, testy, model)
        # show process
        print('>%s: %.3f' % (name, results[name]))
    return results
 
# print and plot the results
def summarize_results(results, maximize=True):
    # create a list of (name, mean(scores)) tuples
    mean_scores = [(k,v) for k,v in results.items()]
    # sort tuples by mean score
    mean_scores = sorted(mean_scores, key=lambda x: x[1])
    # reverse for descending order (e.g. for accuracy)
    if maximize:
        mean_scores = list(reversed(mean_scores))
    print()
    for name, score in mean_scores:
        print('Name=%s, Score=%.3f' % (name, score))

In [3]:
def clean_data(df):
    df_ = df.copy()
    # remove the 2016-01-23 data since its too less comapre others days,
    # maybe quality is not good
    # df_ = df_[(df_.pickup_date != '2016-01-23') & (df_.dropoff_date != '2016-01-23')]
    # potential passenger_count outlier
    df_ = df_[(df_['passenger_count'] <= 6) & (df_['passenger_count'] > 0)]
    return df_

### ================================================ ###


def load_data():
    df_train = pd.read_csv('../data/train.csv')
    df_test = pd.read_csv('../data/test.csv')
    # sample train data for fast job
    # df_train = df_train.sample(n=100)
    # clean train data
    df_train_ = clean_data(df_train)
    # merge train and test data for fast process and let model view test data when training as well
    df_all = pd.concat([df_train_, df_test], axis=0)
    return df_all, df_train_, df_test

df_all_, df_train, df_test = load_data()

features = ['VendorID',
            'RatecodeID',
            'PULocationID',
            'DOLocationID',
            'passenger_count',
            'trip_distance',
            'fare_amount',
            'extra',
            'tolls_amount',
            'total_amount',
            'payment_type',
            'duration',
            'day_of_week',
            'speed',
            'tip_amount']
      
X_train = df_all_[features].values
y_train = X_train[:, X_train.shape[1]-1]

X_test = df_test[features].values
y_test = X_test[:, X_test.shape[1]-1]

y_train = np.ravel(y_train)
y_test = np.ravel(y_test)
print (df_all_[features].head())

   VendorID  RatecodeID  PULocationID  DOLocationID  passenger_count  \
0         2           1            40           141              1.0   
1         2           1            74            74              1.0   
2         2           1            65           143              2.0   
3         2           1           165            40              1.0   
4         2           1            94           120              1.0   

   trip_distance  fare_amount  extra  tolls_amount  total_amount  \
0           2.91         13.0    0.0           0.0         16.56   
1           0.36          3.5    0.0           0.0          4.30   
2           2.31         12.5    0.0           0.0         15.96   
3           0.96          6.5    1.0           0.0          9.96   
4           2.86         15.0    0.0           0.0         18.96   

   payment_type  duration  day_of_week      speed  tip_amount  
0             1    1194.0            5   8.773869        2.76  
1             2     113.0     

In [9]:
sample_size = 5000

X_train = X_train[:sample_size]
y_train = y_train[:sample_size]

X_test = X_test[:sample_size]
y_test = y_test[:sample_size]

In [10]:
print('type of y_train: %s' % type_of_target(y_train))
print('type of y_test: %s' % type_of_target(y_test))

label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_test = label_encoder.fit_transform(y_test)

type of y_train: multiclass
type of y_test: multiclass


In [11]:
# get model list
models = define_models()
# evaluate models
results = evaluate_models(X_train, y_train, X_test, y_test, models)
# summarize results
summarize_results(results)

Defined 9 models
>knn: 57.900
>svm: 59.060
>gbm: 0.000


  if diff:


>xgb: 59.720
>cart: 59.960
>bag: 60.000
>rf: 59.480
>bayes: 52.660
>et: 59.420
()
Name=bag, Score=60.000
Name=cart, Score=59.960
Name=xgb, Score=59.720
Name=rf, Score=59.480
Name=et, Score=59.420
Name=svm, Score=59.060
Name=knn, Score=57.900
Name=bayes, Score=52.660
Name=gbm, Score=0.000
