In [1]:
#packages
import os
import pandas as pd
import numpy as np
from sklearn import preprocessing,metrics 
from haversine import haversine
pd.set_option("display.max_columns", 100)
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import linear_model,svm
from sklearn.metrics import average_precision_score, classification_report,confusion_matrix

In [2]:
#loading data 
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')
train_data.sample(5)

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
628529,id2967222,1,2016-02-07 17:24:28,2016-02-07 17:34:14,1,-73.98159,40.759583,-73.971283,40.782703,N,586
267393,id1266947,1,2016-03-19 16:29:35,2016-03-19 16:44:47,1,-73.96244,40.763844,-73.952049,40.74295,N,912
734798,id3624594,2,2016-06-26 09:34:04,2016-06-26 09:45:50,1,-73.953835,40.766987,-73.980835,40.734661,N,706
567035,id1711908,2,2016-05-02 14:51:40,2016-05-02 15:01:06,1,-73.976021,40.733349,-73.978333,40.734329,N,566
1370562,id1384127,1,2016-04-16 14:53:00,2016-04-16 14:59:36,2,-73.991615,40.754646,-73.987717,40.760323,N,396


In [3]:
#check for missing column values 
missing_train = train_data.isnull().mean().sort_values(ascending=False)
missing_test = train_data.isnull().mean().sort_values(ascending=False)
missing_train.head(5)

trip_duration         0.0
store_and_fwd_flag    0.0
dropoff_latitude      0.0
dropoff_longitude     0.0
pickup_latitude       0.0
dtype: float64

In [4]:
print(train_data.shape)
print(test_data.shape)

(1458644, 11)
(625134, 9)


In [5]:
#drop uneceessary features and add haversine distance 
train_data.drop(labels=['id','pickup_datetime','dropoff_datetime'], axis=1, inplace=True)
test_data.drop(labels=['id','pickup_datetime'], axis=1, inplace=True)
train_dist = []
test_dist = []
for index, row in train_data.iterrows():
    dist_train = haversine([row['pickup_latitude'],row['pickup_longitude']],[row['dropoff_latitude'],row['dropoff_longitude']])
    train_dist.append(dist_train)
for index, row in test_data.iterrows():
    dist_test = haversine([row['pickup_latitude'],row['pickup_longitude']],[row['dropoff_latitude'],row['dropoff_longitude']])
    test_dist.append(dist_test)
train_data['dist'] = train_dist
test_data['dist'] = test_dist

In [6]:
#encoding necessary features
train_data = train_data.replace({"store_and_fwd_flag": {"N":0, "Y":1}})
test_data = test_data.replace({"store_and_fwd_flag": {"N":0, "Y":1}})
train_data.sample(5)
test_data.sample(5)

Unnamed: 0,vendor_id,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,dist
457781,1,2,-73.956306,40.76738,-73.956604,40.708244,0,6.575617
491905,2,1,-73.994072,40.732384,-73.992371,40.734699,0,0.294692
214477,1,2,-73.98307,40.748093,-73.976051,40.757301,0,1.182397
207592,2,1,-73.985168,40.759689,-73.9496,40.831451,0,8.522831
94268,1,1,-73.970596,40.75481,-74.004791,40.729649,0,4.015873


In [7]:
print(np.mean(test_data['dist']))

3.4334155628450507


In [8]:
#randomly sample train and test data to reduce dataset
train_data = train_data.sample(frac=0.01,random_state=4)
test_data = test_data.sample(frac=0.01,random_state=4)

In [9]:
#normalizing train and test data 
train_data.fillna(train_data.mean(), inplace = True)
train_labels = train_data.columns
scaler = preprocessing.StandardScaler()
scaled_train_data = scaler.fit_transform(train_data[train_labels])
train_data = pd.DataFrame(scaled_train_data, columns = train_labels)
test_data.fillna(test_data.mean(), inplace = True)
test_labels = test_data.columns
scaler = preprocessing.StandardScaler()
scaled_test_data = scaler.fit_transform(train_data[test_labels])
test_data = pd.DataFrame(scaled_test_data, columns = test_labels)
test_data.head(5)

Unnamed: 0,vendor_id,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,dist
0,0.938278,-0.511769,-0.721672,-0.019974,-0.568992,0.037908,-0.073324,-0.543043
1,-1.065782,-0.511769,0.417687,1.196327,0.316058,0.779388,-0.073324,-0.522492
2,0.938278,1.744553,-0.537976,-0.572028,-0.683654,-0.156406,-0.073324,-0.21435
3,0.938278,-0.511769,1.882657,0.246477,2.04296,0.172967,-0.073324,-0.721513
4,0.938278,0.240338,-0.404345,0.1012,-0.13642,0.032979,-0.073324,-0.513999


In [10]:
train_target = pd.DataFrame(train_data['trip_duration'])
train_target.drop('trip_duration', axis = 1, inplace = True)
lab_enc = preprocessing.LabelEncoder()
train_target_encoded = lab_enc.fit_transform(train_target)

X_train, X_test, y_train, y_test = train_test_split(np.array(train_data), np.array(train_target_encoded), test_size=0.30)
eval_set=[(X_test, y_test)]
print("train_target: ", train_target_encoded.shape)
print('train_set: ', X_train.shape, y_train.shape)
print('test_set: ', X_test.shape, y_test.shape)

ValueError: bad input shape (14586, 0)

In [None]:
#Logistic Regression Model
logistic_regression_classifier = LogisticRegression(C = 10, tol = 0.0001, random_state = 51, solver = 'liblinear', class_weight = 'balanced')
train_results = logistic_regression_classifier.fit(X_train, y_train)
train_score = train_results.score(X_train, y_train)
print("Train accuracy: ", train_score)
# Run on Test Data
lr_y_pred = logistic_regression_classifier.predict(X_test)
logistic_regression_accuracy = accuracy_score(y_test, lr_y_pred)
print("Test Accuracy: ", logistic_regression_accuracy)
print(classification_report(y_test, lr_y_pred))