**Loaded packages**

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import KNNImputer

**Load Data**

In [2]:
# Read the raw train and test splits from the project's raw-data folder
# (paths are relative to the notebook's working directory).
train = pd.read_csv("../data/raw/train.csv")
test = pd.read_csv("../data/raw/test.csv")

In [3]:
# Preview the first rows of the training set (includes the Surge_Pricing_Type target).
train.head()

Unnamed: 0,Trip_ID,Trip_Distance,Type_of_Cab,Customer_Since_Months,Life_Style_Index,Confidence_Life_Style_Index,Destination_Type,Customer_Rating,Cancellation_Last_1Month,Var1,Var2,Var3,Gender,Surge_Pricing_Type
0,T0005689460,6.77,B,1.0,2.42769,A,A,3.905,0,40.0,46,60,Female,2
1,T0005689461,29.47,B,10.0,2.78245,B,A,3.45,0,38.0,56,78,Male,2
2,T0005689464,41.58,,10.0,,,E,3.50125,2,,56,77,Male,2
3,T0005689465,61.56,C,10.0,,,A,3.45375,0,,52,74,Male,3
4,T0005689467,54.95,C,10.0,3.03453,B,A,3.4025,4,51.0,49,102,Male,2


In [4]:
# Preview the first rows of the test set (same features, no target column).
test.head()

Unnamed: 0,Trip_ID,Trip_Distance,Type_of_Cab,Customer_Since_Months,Life_Style_Index,Confidence_Life_Style_Index,Destination_Type,Customer_Rating,Cancellation_Last_1Month,Var1,Var2,Var3,Gender
0,T0005689459,9.44,A,10.0,2.57438,A,B,3.68,2,,46,63,Male
1,T0005689462,32.15,B,10.0,2.85143,A,A,1.59375,0,65.0,49,80,Female
2,T0005689463,10.38,C,4.0,2.7053,A,D,4.505,0,,47,74,Male
3,T0005689466,14.94,,6.0,2.48159,C,E,4.53,0,63.0,43,54,Male
4,T0005689468,32.03,B,7.0,2.81598,A,K,4.60125,3,96.0,44,56,Male


Count the null values in each column of the train and test sets.

In [5]:
# Map every column name to the number of missing entries it contains,
# once for the training frame and once for the test frame.
null_dict_train = {
    column: train[column].isnull().sum() for column in train.columns
}

null_dict_test = {
    column: test[column].isnull().sum() for column in test.columns
}

In [6]:
# Show the per-column missing-value counts for the training set.
print(null_dict_train)

{'Trip_ID': 0, 'Trip_Distance': 0, 'Type_of_Cab': 20210, 'Customer_Since_Months': 5920, 'Life_Style_Index': 20193, 'Confidence_Life_Style_Index': 20193, 'Destination_Type': 0, 'Customer_Rating': 0, 'Cancellation_Last_1Month': 0, 'Var1': 71030, 'Var2': 0, 'Var3': 0, 'Gender': 0, 'Surge_Pricing_Type': 0}


In [7]:
# Show the per-column missing-value counts for the test set.
print(null_dict_test)

{'Trip_ID': 0, 'Trip_Distance': 0, 'Type_of_Cab': 13158, 'Customer_Since_Months': 3966, 'Life_Style_Index': 13327, 'Confidence_Life_Style_Index': 13327, 'Destination_Type': 0, 'Customer_Rating': 0, 'Cancellation_Last_1Month': 0, 'Var1': 46789, 'Var2': 0, 'Var3': 0, 'Gender': 0}


### Preprocess Data

We first label-encode the categorical columns manually, because the KNN imputer only accepts numeric (int/float) values.

In [8]:
# Cab-type letters and the digit strings that stand in for them. Digit strings
# (not ints) are used so the column can be cast through float afterwards,
# which keeps missing entries missing when converting to nullable Int64.
cab_codes = (("A", '1'), ("B", '2'), ("C", '3'), ("D", '4'), ("E", '5'))

# Apply the identical encoding to both splits, train first, then test.
for frame in (train, test):
    cab_column = frame['Type_of_Cab']
    for letter, digit in cab_codes:
        cab_column = cab_column.replace(letter, digit)
    frame['Type_of_Cab'] = cab_column.astype(float).astype('Int64')


In [9]:
# Confidence grades and the digit strings that stand in for them. The values
# go through float before Int64 so that missing grades remain missing.
confidence_codes = (("A", '1'), ("B", '2'), ("C", '3'))

# Apply the identical encoding to both splits, train first, then test.
for frame in (train, test):
    confidence_column = frame['Confidence_Life_Style_Index']
    for grade, digit in confidence_codes:
        confidence_column = confidence_column.replace(grade, digit)
    frame['Confidence_Life_Style_Index'] = confidence_column.astype(float).astype('Int64')

In [10]:
# Destination labels are encoded by alphabetical position: A -> 0, B -> 1, ... N -> 13.
#
# The labels are listed explicitly instead of being taken from
# train['Destination_Type'].unique(): unique() returns labels in order of first
# appearance, so pairing it with a hand-typed list of codes silently produces a
# different (wrong) mapping whenever the row order of the training file changes.
dst_type = list("ABCDEFGHIJKLMN")
dest_vals = list(range(len(dst_type)))
dest_map = dict(zip(dst_type, dest_vals))

# Same mapping for both splits. A label outside dest_map is left untouched by
# replace() and will then fail loudly at the float cast rather than being
# mis-encoded.
train['Destination_Type'] = train['Destination_Type'].replace(dest_map)
train['Destination_Type'] = train['Destination_Type'].astype(float).astype('Int64')

test['Destination_Type'] = test['Destination_Type'].replace(dest_map)
test['Destination_Type'] = test['Destination_Type'].astype(float).astype('Int64')

In [11]:
# Learn the gender label -> integer mapping from the training split only,
# then reuse that one fitted encoder for both splits so the codes agree.
le = LabelEncoder()
trained_le = le.fit(train["Gender"])

for frame in (train, test):
    frame["Gender"] = trained_le.transform(frame["Gender"])

# Inspect the frame now that every categorical column is numeric.
train.head(10)

Unnamed: 0,Trip_ID,Trip_Distance,Type_of_Cab,Customer_Since_Months,Life_Style_Index,Confidence_Life_Style_Index,Destination_Type,Customer_Rating,Cancellation_Last_1Month,Var1,Var2,Var3,Gender,Surge_Pricing_Type
0,T0005689460,6.77,2.0,1.0,2.42769,1.0,0,3.905,0,40.0,46,60,0,2
1,T0005689461,29.47,2.0,10.0,2.78245,2.0,0,3.45,0,38.0,56,78,1,2
2,T0005689464,41.58,,10.0,,,4,3.50125,2,,56,77,1,2
3,T0005689465,61.56,3.0,10.0,,,0,3.45375,0,,52,74,1,3
4,T0005689467,54.95,3.0,10.0,3.03453,2.0,0,3.4025,4,51.0,49,102,1,2
5,T0005689469,19.06,5.0,10.0,,,0,2.5975,1,72.0,63,91,1,3
6,T0005689470,29.72,5.0,10.0,2.83958,3.0,1,2.975,1,83.0,50,75,1,2
7,T0005689472,18.44,2.0,2.0,2.81871,2.0,0,3.5825,0,103.0,46,63,1,2
8,T0005689473,106.8,3.0,3.0,,,0,3.14625,0,,58,92,1,2
9,T0005689474,107.19,4.0,5.0,3.04467,2.0,0,2.44375,1,,58,83,1,3


In [12]:
# Target and row identifiers are set aside; neither should take part in imputation.
y = train["Surge_Pricing_Type"]
Trip_ID = test["Trip_ID"]

# Feature matrices: everything except the identifier (and, for train, the target).
X = train.drop(columns=["Surge_Pricing_Type", "Trip_ID"])
X_test = test.drop(columns=["Trip_ID"])

Fit the KNN imputer on the training features only, then use it to fill the missing values in both the train and test sets.

In [13]:
# Fill each missing value from its 10 nearest rows, all weighted equally.
# Note that the encoded categorical columns are treated as plain numbers here,
# so imputed entries can be fractional (e.g. a Type_of_Cab of 2.8).
imputer = KNNImputer(weights='uniform', n_neighbors=10)

# Fit on the training features only; the same fitted imputer is then applied
# to train and test. The column labels are re-attached in the next cell.
fitted_imputer = imputer.fit(X)
X, X_test = (fitted_imputer.transform(features) for features in (X, X_test))

In [23]:
# Wrap the imputed values back into DataFrames, restoring the feature column
# names (all original columns minus the identifier and, for train, the target).
train_feature_cols = train.columns.drop(["Surge_Pricing_Type", "Trip_ID"])
test_feature_cols = test.columns.drop(["Trip_ID"])

X_imputed = pd.DataFrame(X, columns=train_feature_cols)
X_test_imputed = pd.DataFrame(X_test, columns=test_feature_cols)

In [25]:
# Re-attach the identifier (first column) and the target (last column) to the
# imputed training features.
#
# DataFrame.insert raises ValueError if the column already exists, so each
# insert is guarded: the cell can be re-run without failing. The target
# position is computed from the current width instead of a hard-coded 13, so
# it stays "last" even if the feature set changes.
if 'Trip_ID' not in X_imputed.columns:
    X_imputed.insert(loc=0, column='Trip_ID', value=train['Trip_ID'])
if 'Surge_Pricing_Type' not in X_imputed.columns:
    X_imputed.insert(
        loc=len(X_imputed.columns),
        column='Surge_Pricing_Type',
        value=train['Surge_Pricing_Type'],
    )
X_imputed

Unnamed: 0,Trip_ID,Trip_Distance,Type_of_Cab,Customer_Since_Months,Life_Style_Index,Confidence_Life_Style_Index,Destination_Type,Customer_Rating,Cancellation_Last_1Month,Var1,Var2,Var3,Gender,Surge_Pricing_Type
0,T0005689460,6.77,2.0,1.0,2.427690,1.0,0.0,3.90500,0.0,40.0,46.0,60.0,0.0,2
1,T0005689461,29.47,2.0,10.0,2.782450,2.0,0.0,3.45000,0.0,38.0,56.0,78.0,1.0,2
2,T0005689464,41.58,2.8,10.0,2.838005,1.7,4.0,3.50125,2.0,48.6,56.0,77.0,1.0,2
3,T0005689465,61.56,3.0,10.0,2.887663,2.4,0.0,3.45375,0.0,63.5,52.0,74.0,1.0,3
4,T0005689467,54.95,3.0,10.0,3.034530,2.0,0.0,3.40250,4.0,51.0,49.0,102.0,1.0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
131657,T0005908509,11.72,4.0,1.0,2.742290,1.0,0.0,3.28500,0.0,61.0,47.0,76.0,1.0,3
131658,T0005908510,74.81,3.0,7.0,2.810590,3.0,0.0,0.44500,0.0,65.1,63.0,88.0,1.0,2
131659,T0005908512,40.17,3.0,10.0,2.995650,2.0,0.0,3.33625,0.0,64.1,48.0,75.0,0.0,2
131660,T0005908513,46.88,2.0,4.0,3.047440,1.0,1.0,4.15750,1.0,47.0,54.0,79.0,1.0,2


In [26]:
# Re-attach the identifier as the first column of the imputed test features.
# DataFrame.insert raises ValueError if the column already exists, so the
# insert is guarded to keep the cell re-runnable.
if 'Trip_ID' not in X_test_imputed.columns:
    X_test_imputed.insert(loc=0, column='Trip_ID', value=test['Trip_ID'])
X_test_imputed

Unnamed: 0,Trip_ID,Trip_Distance,Type_of_Cab,Customer_Since_Months,Life_Style_Index,Confidence_Life_Style_Index,Destination_Type,Customer_Rating,Cancellation_Last_1Month,Var1,Var2,Var3,Gender
0,T0005689459,9.44,1.0,10.0,2.574380,1.0,1.0,3.68000,2.0,68.3,46.0,63.0,1.0
1,T0005689462,32.15,2.0,10.0,2.851430,1.0,0.0,1.59375,0.0,65.0,49.0,80.0,0.0
2,T0005689463,10.38,3.0,4.0,2.705300,1.0,3.0,4.50500,0.0,59.0,47.0,74.0,1.0
3,T0005689466,14.94,3.0,6.0,2.481590,3.0,4.0,4.53000,0.0,63.0,43.0,54.0,1.0
4,T0005689468,32.03,2.0,7.0,2.815980,1.0,10.0,4.60125,3.0,96.0,44.0,56.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
87390,T0005908503,29.43,3.0,3.0,2.407200,3.0,0.0,0.99750,0.0,74.0,48.0,72.0,0.0
87391,T0005908504,14.83,3.0,10.0,2.570090,1.0,3.0,4.74375,0.0,109.0,42.0,60.0,0.0
87392,T0005908505,37.86,1.0,6.0,3.082270,3.0,0.0,1.63500,0.0,57.8,57.0,77.0,1.0
87393,T0005908511,10.63,3.0,1.0,2.776749,2.2,9.0,2.59875,1.0,107.0,51.0,77.0,1.0


In [27]:
# Persist both imputed frames; the row index is not written to the files.
outputs = (
    (X_imputed, '../data/preprocessed/train_process3.csv'),
    (X_test_imputed, '../data/preprocessed/test_process3.csv'),
)
for frame, destination in outputs:
    frame.to_csv(destination, index=False)