In [236]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix

In [237]:
train = pd.read_csv('./data/backfilled_data.csv')
test = pd.read_csv('./data/backfilled_test.csv')

# Model Training

In [238]:
train.head()

Unnamed: 0,ID,Policy_Start_Date,Policy_End_Date,Gender,Age,First_Transaction_Date,No_Pol,Car_Category,Subject_Car_Colour,Subject_Car_Make,LGA_Name,State,ProductName,target
0,ID_O51ZQ1B,2001-12-11,2011-12-10,Female,37.0,2001-12-11,1,Saloon,Black,Honda,Ekiti,Benue,Car Vintage,0
1,ID_VJ1FAVO,2002-03-25,2011-03-24,Male,37.0,2002-03-25,1,Saloon,Black,TOYOTA,Ekiti,Benue,Car Classic,0
2,ID_ULWS8VL,2003-04-13,2011-04-12,Male,41.0,2003-04-13,2,Saloon,Black,TOYOTA,Ikeja,Lagos,Car Vintage,0
3,ID_ZYKGSP7,2003-12-21,2034-05-20,Male,48.0,2003-12-21,2,Saloon,Gold,BMW,Ajah,Lagos,Car Vintage,0
4,ID_OEWBKGF,2005-08-05,2011-09-29,Female,44.0,2005-08-05,1,Saloon,Gold,Tata,Ajah,Lagos,CVTP,0


In [239]:
test.head()

Unnamed: 0,ID,Policy_Start_Date,Policy_End_Date,Gender,Age,First_Transaction_Date,No_Pol,Car_Category,Subject_Car_Colour,Subject_Car_Make,LGA_Name,State,ProductName,StartMonth,EndMonth,FirstMonth,StartDay,EndDay,FirstDay
0,ID_6JEYVLP,2010-01-01,2010-12-31,Male,35.0,2010-01-01,1,JEEP,As Attached,ACURA,Lagos Island,Lagos,CarSafe,1,12,1,1,31,1
1,ID_BX1PNZ6,2010-01-01,2011-05-31,Other,93.0,2010-01-01,7,JEEP,As Attached,Honda,Lagos Island,Lagos,CarSafe,1,5,1,1,31,1
2,ID_83NJU2D,2010-01-01,2010-12-31,Male,41.0,2010-01-01,4,JEEP,White,Ford,Victoria Island,Lagos,Muuve,1,12,1,1,31,1
3,ID_Q6GKOAQ,2010-01-02,2011-01-01,Female,45.0,2010-01-02,1,JEEP,White,Ford,Victoria Island,Lagos,Car Classic,1,1,1,2,1,2
4,ID_WB3E64W,2010-01-02,2011-01-01,Male,66.0,2010-01-02,1,Saloon,As Attached,TOYOTA,Victoria Island,Lagos,CarSafe,1,1,1,2,1,2


In [240]:
train['Policy_Start_Date'] = pd.to_datetime(train['Policy_Start_Date'])

In [241]:
test['Policy_Start_Date'] = pd.to_datetime(test['Policy_Start_Date'])

In [242]:
train

Unnamed: 0,ID,Policy_Start_Date,Policy_End_Date,Gender,Age,First_Transaction_Date,No_Pol,Car_Category,Subject_Car_Colour,Subject_Car_Make,LGA_Name,State,ProductName,target
0,ID_O51ZQ1B,2001-12-11,2011-12-10,Female,37.0,2001-12-11,1,Saloon,Black,Honda,Ekiti,Benue,Car Vintage,0
1,ID_VJ1FAVO,2002-03-25,2011-03-24,Male,37.0,2002-03-25,1,Saloon,Black,TOYOTA,Ekiti,Benue,Car Classic,0
2,ID_ULWS8VL,2003-04-13,2011-04-12,Male,41.0,2003-04-13,2,Saloon,Black,TOYOTA,Ikeja,Lagos,Car Vintage,0
3,ID_ZYKGSP7,2003-12-21,2034-05-20,Male,48.0,2003-12-21,2,Saloon,Gold,BMW,Ajah,Lagos,Car Vintage,0
4,ID_OEWBKGF,2005-08-05,2011-09-29,Female,44.0,2005-08-05,1,Saloon,Gold,Tata,Ajah,Lagos,CVTP,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12074,ID_MLGO8DZ,2010-12-31,2011-06-30,Male,51.0,2010-12-31,4,Saloon,Black,Honda,Victoria Island,Lagos,Car Classic,0
12075,ID_62LBOWI,2010-12-31,2011-12-30,Male,44.0,2010-12-31,1,JEEP,Silver,Hyundai,Victoria Island,Lagos,Car Classic,0
12076,ID_XR8F115,2010-12-31,2011-12-30,Male,37.0,2010-12-31,1,JEEP,Silver,Hyundai,Anthony Village,Lagos,CarSafe,0
12077,ID_8P2UGYO,2010-12-31,2011-12-30,Male,20.0,2010-12-31,1,Saloon,Silver,Kia,Anthony Village,Lagos,CarSafe,0


In [245]:
categorical_columns = ['Gender', 'Car_Category', 'Subject_Car_Colour', 'Subject_Car_Make', 'LGA_Name', 'State', 'ProductName']

le = LabelEncoder()
for col in categorical_columns:
    train[col] = le.fit_transform(train[col].astype(str))
    test[col] = le.fit_transform(test[col].astype(str))

In [246]:
train

Unnamed: 0,ID,Policy_Start_Date,Policy_End_Date,Gender,Age,First_Transaction_Date,No_Pol,Car_Category,Subject_Car_Colour,Subject_Car_Make,LGA_Name,State,ProductName,target
0,ID_O51ZQ1B,2001-12-11,2011-12-10,0,37.0,2001-12-11,1,14,34,18,230,27,3,0
1,ID_VJ1FAVO,2002-03-25,2011-03-24,1,37.0,2002-03-25,1,14,34,63,230,27,1,0
2,ID_ULWS8VL,2003-04-13,2011-04-12,1,41.0,2003-04-13,2,14,34,63,19,82,3,0
3,ID_ZYKGSP7,2003-12-21,2034-05-20,1,48.0,2003-12-21,2,14,15,56,135,82,3,0
4,ID_OEWBKGF,2005-08-05,2011-09-29,0,44.0,2005-08-05,1,14,15,64,135,82,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12074,ID_MLGO8DZ,2010-12-31,2011-06-30,1,51.0,2010-12-31,4,14,34,18,165,82,1,0
12075,ID_62LBOWI,2010-12-31,2011-12-30,1,44.0,2010-12-31,1,8,31,21,165,82,1,0
12076,ID_XR8F115,2010-12-31,2011-12-30,1,37.0,2010-12-31,1,8,31,21,190,82,5,0
12077,ID_8P2UGYO,2010-12-31,2011-12-30,1,20.0,2010-12-31,1,14,31,31,190,82,5,0


In [247]:
test

Unnamed: 0,ID,Policy_Start_Date,Policy_End_Date,Gender,Age,First_Transaction_Date,No_Pol,Car_Category,Subject_Car_Colour,Subject_Car_Make,LGA_Name,State,ProductName,StartMonth,EndMonth,FirstMonth,StartDay,EndDay,FirstDay
0,ID_6JEYVLP,2010-01-01,2010-12-31,1,35.0,2010-01-01,1,1,0,0,72,29,4,1,12,1,1,31,1
1,ID_BX1PNZ6,2010-01-01,2011-05-31,2,93.0,2010-01-01,7,1,0,10,72,29,4,1,5,1,1,31,1
2,ID_83NJU2D,2010-01-01,2010-12-31,1,41.0,2010-01-01,4,1,15,8,109,29,7,1,12,1,1,31,1
3,ID_Q6GKOAQ,2010-01-02,2011-01-01,0,45.0,2010-01-02,1,1,15,8,109,29,1,1,1,1,2,1,2
4,ID_WB3E64W,2010-01-02,2011-01-01,1,66.0,2010-01-02,1,6,0,33,109,29,4,1,1,1,2,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1197,ID_3VAANJU,2010-12-30,2011-12-29,0,41.0,2010-12-30,1,6,14,11,85,32,1,12,12,12,30,29,30
1198,ID_A1YSYCA,2010-12-30,2011-12-29,1,45.0,2010-12-30,1,6,14,11,85,32,4,12,12,12,30,29,30
1199,ID_X7F8710,2010-12-31,2011-12-30,1,38.0,2010-12-31,3,6,14,11,85,32,1,12,12,12,31,30,31
1200,ID_MZPS3W5,2010-12-31,2011-07-10,1,34.0,2010-12-31,1,10,0,33,92,6,1,12,7,12,31,10,31


In [231]:
# Feature Selection
X = train.drop(columns=['ID', 'target', 'Policy_Start_Date', 'Policy_End_Date', 'First_Transaction_Date'])
y = train['target']
X_test = test.drop(columns=['ID', 'Policy_Start_Date', 'Policy_End_Date', 'First_Transaction_Date'])

In [232]:
X

Unnamed: 0,Gender,Age,No_Pol,Car_Category,Subject_Car_Colour,Subject_Car_Make,LGA_Name,State,ProductName,Gender_Female,...,State_Warri-South,ProductName_CVTP,ProductName_Car Classic,ProductName_Car Plus,ProductName_Car Vintage,ProductName_CarFlex,ProductName_CarSafe,ProductName_Customized Motor,ProductName_Motor Cycle,ProductName_Muuve
0,0,37.0,1,8,4,25,74,23,3,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,1,37.0,1,8,4,66,74,23,1,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,41.0,2,8,4,66,115,73,3,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,1,48.0,2,8,22,6,22,73,3,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0,44.0,1,8,22,67,22,73,0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12074,1,51.0,4,8,4,25,247,73,1,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12075,1,44.0,1,2,37,28,247,73,1,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12076,1,37.0,1,2,37,28,38,73,5,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
12077,1,20.0,1,8,37,37,38,73,5,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [233]:
scaler = StandardScaler()
numerical_columns = ['Age', 'No_Pol', 'Policy Duration', 'Customer Tenure', 'Recency']
X[numerical_columns] = scaler.fit_transform(X[numerical_columns])

KeyError: "['Policy Duration', 'Customer Tenure', 'Recency'] not in index"

In [None]:
X

Unnamed: 0,Age,No_Pol,StartMonth,EndMonth,FirstMonth,StartDay,EndDay,FirstDay,Policy Duration,Customer Tenure,...,State_Warri-South,ProductName_CVTP,ProductName_Car Classic,ProductName_Car Plus,ProductName_Car Vintage,ProductName_CarFlex,ProductName_CarSafe,ProductName_Customized Motor,ProductName_Motor Cycle,ProductName_Muuve
0,-0.299943,-0.419106,12,12,12,11,10,11,15.373899,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,-0.299943,-0.419106,3,3,3,25,24,25,13.674195,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.010773,0.945049,4,4,4,13,12,13,11.974491,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,0.495274,0.945049,12,5,12,21,20,21,50.099079,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.206105,-0.419106,8,9,8,5,29,5,8.831204,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12074,0.712152,3.673359,12,6,12,31,30,31,-0.784928,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12075,0.206105,-0.419106,12,12,12,31,30,31,0.067252,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12076,-0.299943,-0.419106,12,12,12,31,30,31,0.067252,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
12077,-1.528915,-0.419106,12,12,12,31,30,31,0.067252,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [None]:
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X, y)

In [None]:
train_predictions = model.predict(X)

f1 = f1_score(y, train_predictions)
precision = precision_score(y, train_predictions)
recall = recall_score(y, train_predictions)
conf_matrix = confusion_matrix(y, train_predictions)

print(f"F1 Score: {f1:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print("Confusion Matrix:")
print(conf_matrix)

F1 Score: 0.8731
Precision: 0.9328
Recall: 0.8206
Confusion Matrix:
[[10538    86]
 [  261  1194]]


In [None]:
train_predictions

array([0, 0, 0, ..., 0, 0, 1])

# Fitting Model to Test

In [None]:
X_test[numerical_columns] = scaler.transform(X_test[numerical_columns])

In [None]:
test_predictions = model.predict(X_test)
pd.set_option('display.max_rows', 20)

pd.DataFrame(test_predictions).value_counts()

0
0    1192
1      10
Name: count, dtype: int64

In [None]:
pd.DataFrame([test['ID'], test_predictions]).T

Unnamed: 0,ID,Unnamed 0
0,ID_6JEYVLP,0
1,ID_BX1PNZ6,0
2,ID_83NJU2D,1
3,ID_Q6GKOAQ,0
4,ID_WB3E64W,0
...,...,...
1197,ID_3VAANJU,0
1198,ID_A1YSYCA,0
1199,ID_X7F8710,0
1200,ID_MZPS3W5,0


In [None]:
# sample_submission['target'] = test_predictions
pd.DataFrame([test['ID'], test_predictions]).T.to_csv('submission.csv', index=False)