In [64]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix

In [65]:
train = pd.read_csv('./data/backfilled_data.csv')
test = pd.read_csv('./data/backfilled_test.csv')
sample_submission = pd.read_csv('./data/SampleSubmission.csv')

# Model Training

In [66]:
train.head()

Unnamed: 0,ID,Policy_Start_Date,Policy_End_Date,Gender,Age,First_Transaction_Date,No_Pol,Car_Category,Subject_Car_Colour,Subject_Car_Make,LGA_Name,State,ProductName,target
0,ID_O51ZQ1B,2001-12-11,2011-12-10,Female,37.0,2001-12-11,1,Saloon,Black,Honda,Ekiti,Benue,Car Vintage,0
1,ID_VJ1FAVO,2002-03-25,2011-03-24,Male,37.0,2002-03-25,1,Saloon,Black,TOYOTA,Ekiti,Benue,Car Classic,0
2,ID_ULWS8VL,2003-04-13,2011-04-12,Male,41.0,2003-04-13,2,Saloon,Black,TOYOTA,Ikeja,Lagos,Car Vintage,0
3,ID_ZYKGSP7,2003-12-21,2034-05-20,Male,48.0,2003-12-21,2,Saloon,Gold,BMW,Ajah,Lagos,Car Vintage,0
4,ID_OEWBKGF,2005-08-05,2011-09-29,Female,44.0,2005-08-05,1,Saloon,Gold,Tata,Ajah,Lagos,CVTP,0


In [67]:
test.head()

Unnamed: 0,ID,Policy_Start_Date,Policy_End_Date,Gender,Age,First_Transaction_Date,No_Pol,Car_Category,Subject_Car_Colour,Subject_Car_Make,LGA_Name,State,ProductName,StartMonth,EndMonth,FirstMonth,StartDay,EndDay,FirstDay
0,ID_6JEYVLP,2010-01-01,2010-12-31,Male,35.0,2010-01-01,1,JEEP,As Attached,ACURA,Lagos Island,Lagos,CarSafe,1,12,1,1,31,1
1,ID_BX1PNZ6,2010-01-01,2011-05-31,Other,93.0,2010-01-01,7,JEEP,As Attached,Honda,Lagos Island,Lagos,CarSafe,1,5,1,1,31,1
2,ID_83NJU2D,2010-01-01,2010-12-31,Male,41.0,2010-01-01,4,JEEP,White,Ford,Victoria Island,Lagos,Muuve,1,12,1,1,31,1
3,ID_Q6GKOAQ,2010-01-02,2011-01-01,Female,45.0,2010-01-02,1,JEEP,White,Ford,Victoria Island,Lagos,Car Classic,1,1,1,2,1,2
4,ID_WB3E64W,2010-01-02,2011-01-01,Male,66.0,2010-01-02,1,Saloon,As Attached,TOYOTA,Victoria Island,Lagos,CarSafe,1,1,1,2,1,2


In [68]:
sample_submission.head()

Unnamed: 0,ID,target
0,ID_01QM0NU,0
1,ID_024NJLZ,0
2,ID_02NOVWQ,0
3,ID_02VSP68,0
4,ID_02YB37K,0


In [69]:
train['Policy_Start_Date'] = pd.to_datetime(train['Policy_Start_Date'])
train['Policy_End_Date'] = pd.to_datetime(train['Policy_End_Date'])
train['First_Transaction_Date'] = pd.to_datetime(train['First_Transaction_Date'])

train['StartMonth'] = train['Policy_Start_Date'].dt.month
train['EndMonth'] = train['Policy_End_Date'].dt.month
train['FirstMonth'] = train['First_Transaction_Date'].dt.month

train['StartDay'] = train['Policy_Start_Date'].dt.day
train['EndDay'] = train['Policy_End_Date'].dt.day
train['FirstDay'] = train['First_Transaction_Date'].dt.day

In [70]:
test['Policy_Start_Date'] = pd.to_datetime(train['Policy_Start_Date'])
test['Policy_End_Date'] = pd.to_datetime(train['Policy_End_Date'])
test['First_Transaction_Date'] = pd.to_datetime(train['First_Transaction_Date'])

test['StartMonth'] = test['Policy_Start_Date'].dt.month
test['EndMonth'] = test['Policy_End_Date'].dt.month
test['FirstMonth'] = test['First_Transaction_Date'].dt.month

test['StartDay'] = test['Policy_Start_Date'].dt.day
test['EndDay'] = test['Policy_End_Date'].dt.day
test['FirstDay'] = test['First_Transaction_Date'].dt.day

In [71]:
train['Policy_Start_Date'] = pd.to_datetime(train['Policy_Start_Date'])
train['Policy_End_Date'] = pd.to_datetime(train['Policy_End_Date'])
train['First_Transaction_Date'] = pd.to_datetime(train['First_Transaction_Date'])

test['Policy_Start_Date'] = pd.to_datetime(test['Policy_Start_Date'])
test['Policy_End_Date'] = pd.to_datetime(test['Policy_End_Date'])
test['First_Transaction_Date'] = pd.to_datetime(test['First_Transaction_Date'])

train['Policy Duration'] = (train['Policy_End_Date'] - train['Policy_Start_Date']).dt.days
train['Customer Tenure'] = (train['Policy_Start_Date'] - train['First_Transaction_Date']).dt.days
test['Policy Duration'] = (test['Policy_End_Date'] - test['Policy_Start_Date']).dt.days
test['Customer Tenure'] = (test['Policy_Start_Date'] - test['First_Transaction_Date']).dt.days

today = pd.Timestamp.today()
train['Recency'] = (today - train['Policy_End_Date']).dt.days
test['Recency'] = (today - test['Policy_End_Date']).dt.days

In [72]:
train['Policy Duration'] = (train['Policy_End_Date'] - train['Policy_Start_Date']).dt.days
train['Customer Tenure'] = (train['Policy_Start_Date'] - train['First_Transaction_Date']).dt.days
test['Policy Duration'] = (test['Policy_End_Date'] - test['Policy_Start_Date']).dt.days
test['Customer Tenure'] = (test['Policy_Start_Date'] - test['First_Transaction_Date']).dt.days

today = pd.Timestamp.today()
train['Recency'] = (today - train['Policy_End_Date']).dt.days
test['Recency'] = (today - test['Policy_End_Date']).dt.days

In [73]:
categorical_columns = ['Gender', 'Car_Category', 'Subject_Car_Colour', 'Subject_Car_Make', 'LGA_Name', 'State', 'ProductName']
for col in categorical_columns:
    mode_value = train[col].mode()[0]
    train[col].fillna(mode_value, inplace=True)
    test[col].fillna(mode_value, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train[col].fillna(mode_value, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test[col].fillna(mode_value, inplace=True)


In [74]:
def clean_age(age):
    if age < 0 or age > 100:
        return np.nan  
    return age

In [75]:
max = train['Age'].min()
max

0.0

In [76]:
train

Unnamed: 0,ID,Policy_Start_Date,Policy_End_Date,Gender,Age,First_Transaction_Date,No_Pol,Car_Category,Subject_Car_Colour,Subject_Car_Make,...,target,StartMonth,EndMonth,FirstMonth,StartDay,EndDay,FirstDay,Policy Duration,Customer Tenure,Recency
0,ID_O51ZQ1B,2001-12-11,2011-12-10,Female,37.0,2001-12-11,1,Saloon,Black,Honda,...,0,12,12,12,11,10,11,3651,0,4867
1,ID_VJ1FAVO,2002-03-25,2011-03-24,Male,37.0,2002-03-25,1,Saloon,Black,TOYOTA,...,0,3,3,3,25,24,25,3286,0,5128
2,ID_ULWS8VL,2003-04-13,2011-04-12,Male,41.0,2003-04-13,2,Saloon,Black,TOYOTA,...,0,4,4,4,13,12,13,2921,0,5109
3,ID_ZYKGSP7,2003-12-21,2034-05-20,Male,48.0,2003-12-21,2,Saloon,Gold,BMW,...,0,12,5,12,21,20,21,11108,0,-3330
4,ID_OEWBKGF,2005-08-05,2011-09-29,Female,44.0,2005-08-05,1,Saloon,Gold,Tata,...,0,8,9,8,5,29,5,2246,0,4939
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12074,ID_MLGO8DZ,2010-12-31,2011-06-30,Male,51.0,2010-12-31,4,Saloon,Black,Honda,...,0,12,6,12,31,30,31,181,0,5030
12075,ID_62LBOWI,2010-12-31,2011-12-30,Male,44.0,2010-12-31,1,JEEP,Silver,Hyundai,...,0,12,12,12,31,30,31,364,0,4847
12076,ID_XR8F115,2010-12-31,2011-12-30,Male,37.0,2010-12-31,1,JEEP,Silver,Hyundai,...,0,12,12,12,31,30,31,364,0,4847
12077,ID_8P2UGYO,2010-12-31,2011-12-30,Male,20.0,2010-12-31,1,Saloon,Silver,Kia,...,0,12,12,12,31,30,31,364,0,4847


In [77]:
train['Age'] = train['Age'].apply(clean_age)
test['Age'] = test['Age'].apply(clean_age)


age_median = train['Age'].mean()
train['Age'].fillna(age_median, inplace=True)
test['Age'].fillna(age_median, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train['Age'].fillna(age_median, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test['Age'].fillna(age_median, inplace=True)


In [78]:

encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
encoded_train = encoder.fit_transform(train[categorical_columns])
encoded_test = encoder.transform(test[categorical_columns])

encoded_train_df = pd.DataFrame(encoded_train, columns=encoder.get_feature_names_out(categorical_columns))
encoded_test_df = pd.DataFrame(encoded_test, columns=encoder.get_feature_names_out(categorical_columns))

train = train.drop(columns=categorical_columns).reset_index(drop=True)
test = test.drop(columns=categorical_columns).reset_index(drop=True)

train = pd.concat([train, encoded_train_df], axis=1)
test = pd.concat([test, encoded_test_df], axis=1)

In [79]:
train

Unnamed: 0,ID,Policy_Start_Date,Policy_End_Date,Age,First_Transaction_Date,No_Pol,target,StartMonth,EndMonth,FirstMonth,...,State_Warri-South,ProductName_CVTP,ProductName_Car Classic,ProductName_Car Plus,ProductName_Car Vintage,ProductName_CarFlex,ProductName_CarSafe,ProductName_Customized Motor,ProductName_Motor Cycle,ProductName_Muuve
0,ID_O51ZQ1B,2001-12-11,2011-12-10,37.0,2001-12-11,1,0,12,12,12,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,ID_VJ1FAVO,2002-03-25,2011-03-24,37.0,2002-03-25,1,0,3,3,3,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,ID_ULWS8VL,2003-04-13,2011-04-12,41.0,2003-04-13,2,0,4,4,4,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,ID_ZYKGSP7,2003-12-21,2034-05-20,48.0,2003-12-21,2,0,12,5,12,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,ID_OEWBKGF,2005-08-05,2011-09-29,44.0,2005-08-05,1,0,8,9,8,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12074,ID_MLGO8DZ,2010-12-31,2011-06-30,51.0,2010-12-31,4,0,12,6,12,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12075,ID_62LBOWI,2010-12-31,2011-12-30,44.0,2010-12-31,1,0,12,12,12,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12076,ID_XR8F115,2010-12-31,2011-12-30,37.0,2010-12-31,1,0,12,12,12,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
12077,ID_8P2UGYO,2010-12-31,2011-12-30,20.0,2010-12-31,1,0,12,12,12,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [80]:
# Feature Selection
X = train.drop(columns=['ID', 'target', 'Policy_Start_Date', 'Policy_End_Date', 'First_Transaction_Date'])
y = train['target']
X_test = test.drop(columns=['ID', 'Policy_Start_Date', 'Policy_End_Date', 'First_Transaction_Date'])

In [86]:
X

Unnamed: 0,Age,No_Pol,StartMonth,EndMonth,FirstMonth,StartDay,EndDay,FirstDay,Policy Duration,Customer Tenure,...,State_Warri-South,ProductName_CVTP,ProductName_Car Classic,ProductName_Car Plus,ProductName_Car Vintage,ProductName_CarFlex,ProductName_CarSafe,ProductName_Customized Motor,ProductName_Motor Cycle,ProductName_Muuve
0,-0.299943,-0.419106,12,12,12,11,10,11,15.373899,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,-0.299943,-0.419106,3,3,3,25,24,25,13.674195,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.010773,0.945049,4,4,4,13,12,13,11.974491,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,0.495274,0.945049,12,5,12,21,20,21,50.099079,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.206105,-0.419106,8,9,8,5,29,5,8.831204,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12074,0.712152,3.673359,12,6,12,31,30,31,-0.784928,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12075,0.206105,-0.419106,12,12,12,31,30,31,0.067252,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12076,-0.299943,-0.419106,12,12,12,31,30,31,0.067252,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
12077,-1.528915,-0.419106,12,12,12,31,30,31,0.067252,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [None]:
scaler = StandardScaler()
numerical_columns = ['Age', 'No_Pol', 'Policy Duration', 'Customer Tenure', 'Recency']
X[numerical_columns] = scaler.fit_transform(X[numerical_columns])

In [82]:
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X, y)

In [83]:
train_predictions = model.predict(X)

f1 = f1_score(y, train_predictions)
precision = precision_score(y, train_predictions)
recall = recall_score(y, train_predictions)
conf_matrix = confusion_matrix(y, train_predictions)

print(f"F1 Score: {f1:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print("Confusion Matrix:")
print(conf_matrix)

F1 Score: 0.8731
Precision: 0.9328
Recall: 0.8206
Confusion Matrix:
[[10538    86]
 [  261  1194]]


# Fitting Model to Test

In [None]:
X_test[numerical_columns] = scaler.transform(X_test[numerical_columns])

In [87]:
test_predictions = model.predict(X_test)
test_predictions

array([0, 0, 0, ..., 0, 0, 0])

In [85]:
sample_submission['target'] = test_predictions
sample_submission.to_csv('submission.csv', index=False)