**Loaded packages**

In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

**Load Data**

In [5]:
# Read the raw train and test splits; both paths are relative to the
# directory the notebook is run from.
train, test = (
    pd.read_csv(raw_path)
    for raw_path in ("../data/raw/train.csv", "../data/raw/test.csv")
)

In [None]:
# Preview the first rows of the raw training frame.
train.head()

Unnamed: 0,Trip_ID,Trip_Distance,Type_of_Cab,Customer_Since_Months,Life_Style_Index,Confidence_Life_Style_Index,Destination_Type,Customer_Rating,Cancellation_Last_1Month,Var1,Var2,Var3,Gender,Surge_Pricing_Type
0,T0005689460,6.77,B,1.0,2.42769,A,A,3.905,0,40.0,46,60,Female,2
1,T0005689461,29.47,B,10.0,2.78245,B,A,3.45,0,38.0,56,78,Male,2
2,T0005689464,41.58,,10.0,,,E,3.50125,2,,56,77,Male,2
3,T0005689465,61.56,C,10.0,,,A,3.45375,0,,52,74,Male,3
4,T0005689467,54.95,C,10.0,3.03453,B,A,3.4025,4,51.0,49,102,Male,2


In [None]:
# Preview the first rows of the raw test frame.
test.head()

Unnamed: 0,Trip_ID,Trip_Distance,Type_of_Cab,Customer_Since_Months,Life_Style_Index,Confidence_Life_Style_Index,Destination_Type,Customer_Rating,Cancellation_Last_1Month,Var1,Var2,Var3,Gender
0,T0005689459,9.44,A,10.0,2.57438,A,B,3.68,2,,46,63,Male
1,T0005689462,32.15,B,10.0,2.85143,A,A,1.59375,0,65.0,49,80,Female
2,T0005689463,10.38,C,4.0,2.7053,A,D,4.505,0,,47,74,Male
3,T0005689466,14.94,,6.0,2.48159,C,E,4.53,0,63.0,43,54,Male
4,T0005689468,32.03,B,7.0,2.81598,A,K,4.60125,3,96.0,44,56,Male


Count the null values in each column of the train and test sets.

In [None]:
# Map every column name to its number of missing entries, one dict per dataset.
null_dict_train = {name: train[name].isnull().sum() for name in train.columns}

null_dict_test = {name: test[name].isnull().sum() for name in test.columns}

In [None]:
# Show the per-column missing-value counts of the training set.
print(null_dict_train)

{'Trip_ID': 0, 'Trip_Distance': 0, 'Type_of_Cab': 20210, 'Customer_Since_Months': 5920, 'Life_Style_Index': 20193, 'Confidence_Life_Style_Index': 20193, 'Destination_Type': 0, 'Customer_Rating': 0, 'Cancellation_Last_1Month': 0, 'Var1': 71030, 'Var2': 0, 'Var3': 0, 'Gender': 0, 'Surge_Pricing_Type': 0}


In [None]:
# Show the per-column missing-value counts of the test set.
print(null_dict_test)

{'Trip_ID': 0, 'Trip_Distance': 0, 'Type_of_Cab': 13158, 'Customer_Since_Months': 3966, 'Life_Style_Index': 13327, 'Confidence_Life_Style_Index': 13327, 'Destination_Type': 0, 'Customer_Rating': 0, 'Cancellation_Last_1Month': 0, 'Var1': 46789, 'Var2': 0, 'Var3': 0, 'Gender': 0}


### Preprocess Data

We replace missing categorical values with the most frequent category, and missing numeric values with the column median. The imputed numeric columns are then cast to integers.

In [9]:
# Most frequent category of the two categorical columns that contain missing
# values; these modes are the values used to fill the gaps in the next cell.
# Each column is printed once (the original repeated the first print).
print(train['Confidence_Life_Style_Index'].mode())
print(train['Type_of_Cab'].mode())

0    B
dtype: object
0    B
dtype: object
0    B
dtype: object


In [11]:
# Treat NA values.
#
# Every column is reassigned (`df[col] = df[col].fillna(...)`) instead of calling
# `fillna(..., inplace=True)` on the selected column: the in-place form operates
# on an intermediate object, is deprecated in pandas 2.x, and leaves the frame
# unchanged under Copy-on-Write.

# Categorical columns: fill both frames with the most frequent training category
# (derived here rather than hard-coded, so it cannot drift from the data).
for col in ['Type_of_Cab', 'Confidence_Life_Style_Index']:
    most_frequent = train[col].mode()[0]
    train[col] = train[col].fillna(most_frequent)
    test[col] = test[col].fillna(most_frequent)

# Numeric columns: fill each frame with that frame's own column median, then
# cast the column to int.
# NOTE(review): the cast truncates fractional values, and Life_Style_Index holds
# fractional values in the raw data (e.g. 2.42769) -- confirm this is intended.
for frame in (train, test):
    for col in ['Customer_Since_Months', 'Var1', 'Life_Style_Index']:
        frame[col] = frame[col].fillna(frame[col].median()).astype('int')

In [12]:
# Preview the training frame after the NA treatment.
train.head()

Unnamed: 0,Trip_ID,Trip_Distance,Type_of_Cab,Customer_Since_Months,Life_Style_Index,Confidence_Life_Style_Index,Destination_Type,Customer_Rating,Cancellation_Last_1Month,Var1,Var2,Var3,Gender,Surge_Pricing_Type
0,T0005689460,6.77,B,1,2,A,A,3.905,0,40,46,60,Female,2
1,T0005689461,29.47,B,10,2,B,A,3.45,0,38,56,78,Male,2
2,T0005689464,41.58,B,10,2,B,E,3.50125,2,61,56,77,Male,2
3,T0005689465,61.56,C,10,2,B,A,3.45375,0,61,52,74,Male,3
4,T0005689467,54.95,C,10,3,B,A,3.4025,4,51,49,102,Male,2


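We label-encode the categorical columns using scikit-learn's `LabelEncoder`, fitting each encoder on the training column and applying it to both the train and test sets.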

In [13]:
# Columns whose string categories are replaced by integer codes.
categorical_cols = [
    'Type_of_Cab',
    'Confidence_Life_Style_Index',
    'Destination_Type',
    'Gender',
]

for col in categorical_cols:
    print(col)
    # The encoder is fitted on the training column only; the same fitted
    # encoder then transforms that column in both frames.
    le = LabelEncoder()
    trained_le = le.fit(train[col])
    for frame in (train, test):
        frame[col] = trained_le.transform(frame[col])

train.head(10)

Type_of_Cab
Confidence_Life_Style_Index
Destination_Type
Gender


Unnamed: 0,Trip_ID,Trip_Distance,Type_of_Cab,Customer_Since_Months,Life_Style_Index,Confidence_Life_Style_Index,Destination_Type,Customer_Rating,Cancellation_Last_1Month,Var1,Var2,Var3,Gender,Surge_Pricing_Type
0,T0005689460,6.77,1,1,2,0,0,3.905,0,40,46,60,0,2
1,T0005689461,29.47,1,10,2,1,0,3.45,0,38,56,78,1,2
2,T0005689464,41.58,1,10,2,1,4,3.50125,2,61,56,77,1,2
3,T0005689465,61.56,2,10,2,1,0,3.45375,0,61,52,74,1,3
4,T0005689467,54.95,2,10,3,1,0,3.4025,4,51,49,102,1,2
5,T0005689469,19.06,4,10,2,1,0,2.5975,1,72,63,91,1,3
6,T0005689470,29.72,4,10,2,2,1,2.975,1,83,50,75,1,2
7,T0005689472,18.44,1,2,2,1,0,3.5825,0,103,46,63,1,2
8,T0005689473,106.8,2,3,2,1,0,3.14625,0,61,58,92,1,2
9,T0005689474,107.19,3,5,3,1,0,2.44375,1,61,58,83,1,3


In [None]:
# Write both processed frames to CSV; index=False leaves the row index out of
# the files.
for frame, out_path in (
    (train, '../data/preprocessed/train_process2.csv'),
    (test, '../data/preprocessed/test_process2.csv'),
):
    frame.to_csv(out_path, index=False)