In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.linear_model import LogisticRegression 
from sklearn.svm import SVC 
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

In [2]:
taxi_df = pd.read_csv("taxi_training.csv")
taxi_testing  = pd.read_csv("taxi_testing.csv")

In [3]:
print(taxi_df.head())
print(taxi_df.info())
print(taxi_df.columns)

   booking_id  user_id  vehicle_model_id  package_id  travel_type_id  \
0           1    17712                12         NaN               2   
1           2    17037                12         NaN               2   
2           3      761                12         NaN               2   
3           4      868                12         NaN               2   
4           5    21716                28         NaN               2   

   from_area_id  to_area_id  from_city_id  to_city_id       from_date  \
0        1021.0      1323.0           NaN         NaN  1/1/2013 22:33   
1         455.0      1330.0           NaN         NaN  1/1/2013 12:43   
2         814.0       393.0           NaN         NaN   1/2/2013 0:28   
3         297.0       212.0           NaN         NaN  1/1/2013 13:12   
4        1237.0       330.0           NaN         NaN  1/1/2013 16:33   

         to_date  online_booking  mobile_site_booking booking_created  \
0            NaN               0                    0  

In [4]:
# filling nan values with forwad fill method
taxi_df['to_area_id'] = taxi_df['to_area_id'].fillna(method='ffill')
taxi_df['from_area_id'] = taxi_df['from_area_id'].fillna(method='ffill')

In [5]:
#filling nan values with backward fill method
taxi_testing['to_area_id'] = taxi_testing['to_area_id'].fillna(method='bfill')
taxi_testing['from_area_id'] = taxi_testing['from_area_id'].fillna(method='ffill')

In [36]:
X = taxi_df[['booking_id', 'user_id','from_area_id', 'to_area_id', 'online_booking', 'mobile_site_booking']]
y = taxi_df[['Car_Cancellation']]

In [38]:
taxi_testing

Unnamed: 0,booking_id,user_id,from_area_id,to_area_id,online_booking,mobile_site_booking
0,10093,23775,1217.0,87.0,0,0
1,10094,38447,393.0,87.0,0,0
2,10095,28359,58.0,1021.0,0,0
3,10096,2844,1337.0,571.0,0,0
4,10097,46947,852.0,393.0,1,0
...,...,...,...,...,...,...
2862,12955,36363,689.0,585.0,0,0
2863,12956,34749,49.0,393.0,1,0
2864,12957,45609,105.0,393.0,0,0
2865,12958,868,585.0,571.0,0,0


In [39]:
taxi_testing = taxi_testing[['booking_id', 'user_id', 'from_area_id', 'to_area_id', 'online_booking', 'mobile_site_booking']]




In [40]:
X.isna().sum()

booking_id                0
user_id                   0
from_area_id              0
to_area_id                0
online_booking            0
mobile_site_booking       0
from_lat                 10
to_lat                 1472
dtype: int64

In [41]:
taxi_testing.isna().sum()

booking_id             0
user_id                0
from_area_id           0
to_area_id             0
online_booking         0
mobile_site_booking    0
dtype: int64

In [27]:
#splitting data frame with 75 and 25 split for trainin and testing
X_training, X_test, y_training, y_test = train_test_split(X, y, test_size=0.25, random_state=32)

In [28]:
model_pipeline = []
model_pipeline.append(LogisticRegression(solver='liblinear'))
model_pipeline.append(SVC())
model_pipeline.append(KNeighborsClassifier())
model_pipeline.append(DecisionTreeClassifier())
model_pipeline.append(RandomForestClassifier(n_estimators=15))
model_pipeline.append(GaussianNB())

In [29]:
models_list = ['Logistic Regression', 'SVM', 'KNN', 'Decision Tree', 'Random Forest', 'Navie Bayes']
acc_list = []
for model in model_pipeline:
    model.fit(X_training, y_training)
    y_pred = model.predict(X_test)
    acc_list.append(metrics.accuracy_score(y_test, y_pred))

In [30]:
#accuracy list for all classification models
model_df = pd.DataFrame({'Model': models_list, 'Accuracy':acc_list})
model_df

Unnamed: 0,Model,Accuracy
0,Logistic Regression,0.931345
1,SVM,0.931908
2,KNN,0.92628
3,Decision Tree,0.893641
4,Random Forest,0.93247
5,Navie Bayes,0.931908


In [31]:
predicted = pd.DataFrame({"booking_id":X_test['booking_id'],"Car_Cancellation":model_pipeline[-1].predict(X_test)})
print(predicted)

      booking_id  Car_Cancellation
4445        4446                 0
3329        3330                 0
1893        1894                 0
2972        2973                 0
6649        6650                 0
...          ...               ...
809          810                 0
3049        3050                 0
5580        5581                 0
577          578                 0
2221        2222                 0

[1777 rows x 2 columns]


In [32]:
#predictions of traing dataframe
predicted = pd.DataFrame({"booking_id":X_test['booking_id'],"Car_Cancellation":model.predict(X_test)})

In [33]:
#predictions of the test dataframe
test_predictions = pd.DataFrame({"booking_id":taxi_testing['booking_id'], "Car_Cancellation":model.predict(taxi_testing)}).set_index('booking_id')
test_predictions

Unnamed: 0_level_0,Car_Cancellation
booking_id,Unnamed: 1_level_1
10093,0
10094,0
10095,0
10096,0
10097,0
...,...
12955,0
12956,0
12957,0
12958,0


In [34]:
test_predictions.head()

Unnamed: 0_level_0,Car_Cancellation
booking_id,Unnamed: 1_level_1
10093,0
10094,0
10095,0
10096,0
10097,0


In [35]:
#saving predicted test data file in csv
test_predictions.to_csv('test_predictions.csv')