In [13]:
#packages
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from haversine import haversine
from sklearn import linear_model,svm
from sklearn import preprocessing,metrics
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler

%matplotlib inline
pd.set_option("display.max_columns", 100)
warnings.filterwarnings('ignore')

In [2]:
# Read the raw train and test CSVs from the working directory,
# then preview five randomly chosen training rows.
train_data = pd.read_csv(filepath_or_buffer='train.csv')
test_data = pd.read_csv(filepath_or_buffer='test.csv')
train_data.sample(n=5)

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
1448256,id1389846,1,2016-06-08 14:18:26,2016-06-08 14:47:05,1,-73.973389,40.743607,-73.989967,40.749077,N,1719
1234146,id0753766,2,2016-02-10 17:38:14,2016-02-10 17:44:21,2,-73.977432,40.76363,-73.962143,40.773762,N,367
997408,id1929945,2,2016-01-25 13:43:20,2016-01-25 13:49:26,1,-73.98143,40.746868,-73.974159,40.74387,N,366
1346728,id3884998,2,2016-05-21 18:39:00,2016-05-21 18:41:58,1,-73.965942,40.770321,-73.959465,40.767609,N,178
355944,id1269156,2,2016-02-27 22:42:27,2016-02-27 22:50:37,2,-73.984245,40.74247,-74.00296,40.734035,N,490


In [3]:
#check for missing column values
# Fraction of NaN entries per column, sorted so the columns with the most gaps come first.
missing_train = train_data.isnull().mean().sort_values(ascending=False)
# The test summary is derived from test_data so each split is profiled from its own frame.
missing_test = test_data.isnull().mean().sort_values(ascending=False)
missing_train.head(5)

trip_duration         0.0
store_and_fwd_flag    0.0
dropoff_latitude      0.0
dropoff_longitude     0.0
pickup_latitude       0.0
dtype: float64

In [4]:
print(train_data.shape)
print(test_data.shape)

(1458644, 11)
(625134, 9)


In [5]:
#drop unnecessary features and add haversine distance
def _trip_distances(frame):
    """Return the pickup-to-dropoff haversine distance for every row of ``frame``.

    Each entry is ``haversine((pickup_lat, pickup_lon), (dropoff_lat, dropoff_lon))``
    in row order, in whatever unit the ``haversine`` package uses by default.

    The four coordinate columns are zipped together instead of using
    ``DataFrame.iterrows``, which materialises a Series per row and is very
    slow on frames with over a million rows.
    """
    return [
        haversine((pickup_lat, pickup_lon), (dropoff_lat, dropoff_lon))
        for pickup_lat, pickup_lon, dropoff_lat, dropoff_lon in zip(
            frame['pickup_latitude'],
            frame['pickup_longitude'],
            frame['dropoff_latitude'],
            frame['dropoff_longitude'],
        )
    ]

# Identifier and timestamp columns are not used as model inputs.
# dropoff_datetime is dropped from train only; the test drop list does not name it.
train_data.drop(labels=['id','pickup_datetime','dropoff_datetime'], axis=1, inplace=True)
test_data.drop(labels=['id','pickup_datetime'], axis=1, inplace=True)

train_dist = _trip_distances(train_data)
test_dist = _trip_distances(test_data)
train_data['dist'] = train_dist
test_data['dist'] = test_dist

In [6]:
# Encode the store-and-forward flag as an integer: "N" -> 0, "Y" -> 1.
flag_encoding = {"store_and_fwd_flag": {"N":0, "Y":1}}
train_data = train_data.replace(flag_encoding)
test_data = test_data.replace(flag_encoding)
# Only the last expression (the test preview) is rendered by the notebook;
# the train sample is still drawn.
train_data.sample(n=5)
test_data.sample(n=5)

Unnamed: 0,vendor_id,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,dist
184411,2,1,-73.96006,40.782242,-73.977997,40.773602,0,1.789997
547614,2,1,-73.970352,40.758801,-73.975319,40.752041,0,0.86022
302521,1,1,-73.955086,40.820297,-73.900467,40.903721,0,10.351188
169893,1,1,-73.99279,40.739166,-73.95623,40.768822,0,4.511913
179249,1,2,-74.006378,40.733227,-74.00103,40.746464,0,1.539314


In [7]:
# Average haversine distance over the test trips.
print(test_data['dist'].mean())

3.4334155628450507


In [8]:
# Keep a seeded 10% random subset of each frame to shrink the dataset.
subsample_args = dict(frac=0.1, random_state=4)
train_data = train_data.sample(**subsample_args)
test_data = test_data.sample(**subsample_args)

In [9]:
#normalizing train and test data
# train_labels includes the target column (trip_duration); test_labels are the
# feature columns of the test frame.
train_labels = train_data.columns
test_labels = test_data.columns

# Impute any remaining NaNs with each frame's own column means.
train_filled = train_data.fillna(train_data.mean())
test_filled = test_data.fillna(test_data.mean())

# Standardise every training column, target included, so the rebuilt
# train_data has the same columns and scaling as before.
scaled_train_data = preprocessing.StandardScaler().fit_transform(train_filled[train_labels])

# Fit the feature scaler on the training features only and apply it to the test
# features. This scales the actual test rows (not a copy of the training rows)
# and puts both splits on the same scale without using test-set statistics.
scaler = preprocessing.StandardScaler().fit(train_filled[test_labels])
scaled_test_data = scaler.transform(test_filled[test_labels])

# Rebuilding the frames resets their indexes to 0..n-1.
train_data = pd.DataFrame(scaled_train_data, columns = train_labels)
test_data = pd.DataFrame(scaled_test_data, columns = test_labels)
train_data.head(5)

Unnamed: 0,vendor_id,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,dist
0,0.931173,-0.505519,-0.753195,-0.037354,-0.544689,0.068901,-0.073887,-0.233449,-0.604926
1,-1.073914,-0.505519,0.460681,2.133139,0.320428,1.508949,-0.073887,-0.237574,-0.582023
2,0.931173,1.780026,-0.557484,-1.022496,-0.656768,-0.308482,-0.073887,-0.134768,-0.238624
3,0.931173,-0.505519,2.021462,0.43813,2.008436,0.331202,-0.073887,0.002941,-0.803817
4,0.931173,0.256329,-0.415114,0.178882,-0.12186,0.059327,-0.073887,-0.014828,-0.572558


In [10]:
# Check both frames for leftover NaNs; the notebook renders only the last
# expression, i.e. the test summary.
train_data.isna().any()
test_data.isna().any()

vendor_id             False
passenger_count       False
pickup_longitude      False
pickup_latitude       False
dropoff_longitude     False
dropoff_latitude      False
store_and_fwd_flag    False
dist                  False
dtype: bool

In [11]:
#split train data into train and cross_val
# Separate the target column from the features; train_data keeps only features afterwards.
train_target = pd.DataFrame(train_data['trip_duration'])
train_data.drop('trip_duration', axis = 1, inplace = True)

# NOTE(review): LabelEncoder replaces each distinct trip_duration value with an
# integer code, so y_train / y_test hold ordinal codes rather than durations and
# any error computed on them is in code units. It is kept here because the
# classifier cells below need discrete labels -- confirm whether the regressors
# should be trained on the raw target instead.
lab_enc = preprocessing.LabelEncoder()
# LabelEncoder expects a 1-D array, so the single-column frame is flattened first.
train_target_encoded = lab_enc.fit_transform(train_target.values.ravel())

# Hold out 30% for validation; the fixed seed makes the split (and every metric
# computed from it) repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    np.array(train_data),
    np.array(train_target_encoded),
    test_size=0.30,
    random_state=4,
)
eval_set=[(X_test, y_test)]
print("train_target: ", train_target_encoded.shape)
print('train_set: ', X_train.shape, y_train.shape)
print('test_set: ', X_test.shape, y_test.shape)

train_target:  (145864,)
train_set:  (102104, 8) (102104,)
test_set:  (43760, 8) (43760,)


In [21]:
#Random Forest
# Fit a 100-tree forest on the training split; random_state pins the bootstrap and
# feature sampling so the reported error is repeatable for a given split.
random_forest_regressor = RandomForestRegressor(n_estimators = 100, n_jobs = 4, random_state = 4)
random_forest_regressor.fit(X_train, y_train)
rf_y_pred = random_forest_regressor.predict(X_test)
# "Test error" is the mean squared error on the held-out split, in units of the
# (encoded) target supplied in y_test.
random_forest_error =  mean_squared_error(y_test, rf_y_pred)
print("Test error: ", random_forest_error)
# Show the first ten targets next to their predictions as a quick sanity check.
print(y_test[0:10])
print(rf_y_pred[0:10])

Test error:  143359.58032379524
[ 680  525  558 3907 2877  990  570  338  959  939]
[ 725.08  452.92  429.9  2283.46 2065.01  437.96  879.08  345.66  866.31
  670.23]


In [None]:
#Logistic Regression Model
# NOTE(review): y_train holds label-encoded trip durations, so presumably every
# distinct code is treated as its own class here -- confirm a classifier is intended.
logistic_regression_classifier = LogisticRegression(
    C = 10,
    tol = 0.0001,
    random_state = 51,
    solver = 'liblinear',
    class_weight = 'balanced',
)
# fit() hands back the fitted estimator, which is then scored on its own training data.
train_results = logistic_regression_classifier.fit(X_train, y_train)
train_score = train_results.score(X_train, y_train)
print("Train accuracy: ", train_score)

# Evaluate on the held-out split.
lr_y_pred = logistic_regression_classifier.predict(X_test)
logistic_regression_accuracy = accuracy_score(y_test, lr_y_pred)
print("Test Accuracy: ", logistic_regression_accuracy)
print(classification_report(y_test, lr_y_pred))

In [None]:
#Feature importance
# Use the forest fitted in the Random Forest cell and the feature frame it was
# trained from. train_data holds only feature columns at this point, because the
# target was dropped before the split.
feature_names = train_data.columns
importances = random_forest_regressor.feature_importances_

fig, ax = plt.subplots()
width = 0.6
# Table of importances, most important feature first.
feature_importances = pd.DataFrame(importances, index = feature_names, columns = ['Importance']).sort_values('Importance', ascending = False)
print(feature_importances)
# Bars are drawn in column order (not sorted) so tick labels line up with feature_names.
ax.bar(np.arange(len(feature_names)), importances, width, color='b')
ax.set_xticks(np.arange(len(importances)))
ax.set_xticklabels(feature_names.values, rotation = 90, horizontalalignment='right')
plt.title('Feature Importances')
# NOTE(review): the model is a regressor, so "Gini" may not describe its split
# criterion -- confirm the axis label.
ax.set_ylabel('Normalized Gini Importance')

In [None]:
#Multi-layer perceptron
# Small two-hidden-layer network (5 and 2 units) trained with the L-BFGS solver.
multi_layer_perceptron_classifier = MLPClassifier(
    solver='lbfgs',
    alpha = 1e-5,
    hidden_layer_sizes = (5, 2),
    random_state = 1,
)
multi_layer_perceptron_classifier.fit(X_train, y_train)

# Score the held-out split.
mlp_y_pred = multi_layer_perceptron_classifier.predict(X_test)
mlp_accuracy = accuracy_score(y_test, mlp_y_pred)
print("Test Accuracy: ", mlp_accuracy)
# The confusion matrix is restricted to the labels 0 and 1.
print("Confusion Matrix: ", confusion_matrix(y_test, mlp_y_pred, labels=[0, 1]))
print(classification_report(y_test, mlp_y_pred))

In [None]:
#Multi-layer perceptron with Adam solver + changed hyperparams
# Single hidden layer of 100 ReLU units trained with Adam; every hyperparameter is
# spelled out so it can be tuned in place.
multi_layer_perceptron_classifier_2 = MLPClassifier(
    hidden_layer_sizes=(100,),
    activation='relu',
    solver='adam',
    alpha=0.0001,
    batch_size='auto',
    learning_rate='constant',
    learning_rate_init=0.001,
    power_t=0.5,
    max_iter=200,
    shuffle=True,
    random_state=None,
    tol=0.0001,
    verbose=False,
    warm_start=False,
    momentum=0.9,
    nesterovs_momentum = True,
    early_stopping=False,
    validation_fraction=0.1,
    beta_1=0.9,
    beta_2=0.999,
    epsilon=1e-08,
    n_iter_no_change = 10,
)
multi_layer_perceptron_classifier_2.fit(X_train, y_train)
# Predict with the model fitted in this cell (classifier_2), so the metrics below
# describe the Adam configuration rather than the earlier L-BFGS network.
mlp_y_pred_2 = multi_layer_perceptron_classifier_2.predict(X_test)
mlp_accuracy_2 = accuracy_score(y_test, mlp_y_pred_2)
print("Test Accuracy: ", mlp_accuracy_2)
# The confusion matrix is restricted to the labels 0 and 1.
print("Confusion Matrix: ", confusion_matrix(y_test, mlp_y_pred_2, labels=[0, 1]))
print(classification_report(y_test, mlp_y_pred_2))

In [None]:
#svm
