In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold,train_test_split,cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import lightgbm as lgb
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import f1_score,classification_report

In [2]:
train = pd.read_csv('./Train.csv')
test = pd.read_csv('./Test.csv')
ss = pd.read_csv('./SampleSubmission.csv')
variable_def = pd.read_csv('./VariableDefinitions_-_Sheet1.csv')

In [3]:
train.head()

Unnamed: 0,ID,Policy Start Date,Policy End Date,Gender,Age,First Transaction Date,No_Pol,Car_Category,Subject_Car_Colour,Subject_Car_Make,LGA_Name,State,ProductName,target
0,0040R73,2010-05-14,2011-05-13,Male,30,2010-05-14,1,Saloon,Black,TOYOTA,,,Car Classic,0
1,0046BNK,2010-11-29,2011-11-28,Female,79,2010-11-29,1,JEEP,Grey,TOYOTA,,,Car Classic,1
2,005QMC3,2010-03-21,2011-03-20,Male,43,2010-03-21,1,Saloon,Red,TOYOTA,,,Car Classic,0
3,0079OHW,2010-08-21,2011-08-20,Male,2,2010-08-21,1,,,,,,CarSafe,0
4,00BRP63,2010-08-29,2010-12-31,Entity,20,2010-08-29,3,,,,Lagos,Lagos,Muuve,1


In [4]:
test.head()

Unnamed: 0,ID,Policy Start Date,Policy End Date,Gender,Age,First Transaction Date,No_Pol,Car_Category,Subject_Car_Colour,Subject_Car_Make,LGA_Name,State,ProductName
0,009D84L,2010-04-24,2011-03-27,NO GENDER,25,2010-04-24,1,,As Attached,Iveco,Victoria Island,Lagos,CVTP
1,01DO2EQ,2010-01-01,2010-12-31,,120,2010-01-01,4,,As Attached,TOYOTA,Victoria Island,Lagos,Muuve
2,01QM0NU,2010-10-23,2011-10-22,Female,46,2010-10-23,1,,,Ford,Abuja Municipal,Abuja-Municipal,Car Classic
3,024NJLZ,2010-10-14,2011-10-13,Male,32,2010-10-14,1,,,,Kosofe,Benue,Car Classic
4,02BYET3,2010-09-16,2010-12-31,,120,2010-09-16,4,,,TOYOTA,Victoria Island,Lagos,Muuve


In [5]:
train.shape, test.shape

((12079, 14), (5177, 13))

In [6]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12079 entries, 0 to 12078
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   ID                      12079 non-null  object
 1   Policy Start Date       12079 non-null  object
 2   Policy End Date         12079 non-null  object
 3   Gender                  11720 non-null  object
 4   Age                     12079 non-null  int64 
 5   First Transaction Date  12079 non-null  object
 6   No_Pol                  12079 non-null  int64 
 7   Car_Category            8341 non-null   object
 8   Subject_Car_Colour      5117 non-null   object
 9   Subject_Car_Make        9603 non-null   object
 10  LGA_Name                5603 non-null   object
 11  State                   5591 non-null   object
 12  ProductName             12079 non-null  object
 13  target                  12079 non-null  int64 
dtypes: int64(3), object(11)
memory usage: 1.3+ MB


In [7]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5177 entries, 0 to 5176
Data columns (total 13 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   ID                      5177 non-null   object
 1   Policy Start Date       5177 non-null   object
 2   Policy End Date         5177 non-null   object
 3   Gender                  5021 non-null   object
 4   Age                     5177 non-null   int64 
 5   First Transaction Date  5177 non-null   object
 6   No_Pol                  5177 non-null   int64 
 7   Car_Category            3539 non-null   object
 8   Subject_Car_Colour      2172 non-null   object
 9   Subject_Car_Make        4116 non-null   object
 10  LGA_Name                2395 non-null   object
 11  State                   2389 non-null   object
 12  ProductName             5177 non-null   object
dtypes: int64(2), object(11)
memory usage: 525.9+ KB


In [8]:
train.isnull().sum()

ID                           0
Policy Start Date            0
Policy End Date              0
Gender                     359
Age                          0
First Transaction Date       0
No_Pol                       0
Car_Category              3738
Subject_Car_Colour        6962
Subject_Car_Make          2476
LGA_Name                  6476
State                     6488
ProductName                  0
target                       0
dtype: int64

In [9]:
train_df = train.shape[0]
test_df = test.shape[0]

df = pd.concat((train, test)).reset_index(drop=True)
print("all_data size is : {}".format(df.shape))

all_data size is : (17256, 14)


In [10]:
df.tail()

Unnamed: 0,ID,Policy Start Date,Policy End Date,Gender,Age,First Transaction Date,No_Pol,Car_Category,Subject_Car_Colour,Subject_Car_Make,LGA_Name,State,ProductName,target
17251,ZYXX5AF,2010-07-18,2011-07-17,Male,48,2010-07-18,1,Saloon,,Honda,,,CarSafe,
17252,ZYYOZ5L,2010-12-04,2011-12-03,,50,2010-12-04,1,,,,Victoria Island,Lagos,Car Classic,
17253,ZZ1GTKD,2010-09-24,2011-09-23,Male,1,2010-09-24,1,,,,,,CarSafe,
17254,ZZDXQSI,2010-07-17,2011-07-16,Female,52,2010-07-17,1,Saloon,Grey,TOYOTA,,,Car Classic,
17255,ZZYTLV1,2010-07-17,2011-07-16,Male,68,2010-07-17,1,Saloon,,Peugeot,,,CarSafe,


In [11]:
df.describe()

Unnamed: 0,Age,No_Pol,target
count,17256.0,17256.0,12079.0
mean,42.677272,1.307545,0.120457
std,82.48988,0.726804,0.325509
min,-6099.0,1.0,0.0
25%,34.0,1.0,0.0
50%,40.0,1.0,0.0
75%,50.0,1.0,0.0
max,320.0,10.0,1.0


In [12]:
train['target'].value_counts()

0    10624
1     1455
Name: target, dtype: int64

In [13]:
cat_cols=df.select_dtypes(include='object').columns
cat_cols

Index(['ID', 'Policy Start Date', 'Policy End Date', 'Gender',
       'First Transaction Date', 'Car_Category', 'Subject_Car_Colour',
       'Subject_Car_Make', 'LGA_Name', 'State', 'ProductName'],
      dtype='object')

In [14]:
df['Gender'].unique()

array(['Male', 'Female', 'Entity', 'Joint Gender', nan, 'NO GENDER',
       'NOT STATED', 'SEX'], dtype=object)

In [15]:
mapper = {"Male":1,"Female":2,'Entity':0,'Joint Gender':0,'NO GENDER':0,'NOT STATED':0,'SEX':0 ,'nan':-999999}
df['Gender']=df['Gender'].map(mapper)

In [16]:
df['Gender'].unique()

array([ 1.,  2.,  0., nan])

In [17]:
date_col = ['Policy Start Date','Policy End Date','First Transaction Date']

In [18]:
for feat in date_col:
    df[feat] = pd.to_datetime(df[feat])

In [19]:
def date_info(dataframe,cols,):
    for feat in cols:
        dataframe[feat +'_year'] = dataframe[feat].dt.year
        dataframe[feat +'_day'] = dataframe[feat].dt.day
        dataframe[feat +'_month'] = dataframe[feat].dt.month
        dataframe[feat +'_quarter'] = dataframe[feat].dt.quarter
        dataframe[feat + '_weekofyear']=dataframe[feat].dt.weekofyear
    dataframe.drop(columns=date_col,axis=1,inplace=True)

In [20]:
date_info(df,date_col)

In [21]:
df.head()

Unnamed: 0,ID,Gender,Age,No_Pol,Car_Category,Subject_Car_Colour,Subject_Car_Make,LGA_Name,State,ProductName,...,Policy End Date_year,Policy End Date_day,Policy End Date_month,Policy End Date_quarter,Policy End Date_weekofyear,First Transaction Date_year,First Transaction Date_day,First Transaction Date_month,First Transaction Date_quarter,First Transaction Date_weekofyear
0,0040R73,1.0,30,1,Saloon,Black,TOYOTA,,,Car Classic,...,2011,13,5,2,19,2010,14,5,2,19
1,0046BNK,2.0,79,1,JEEP,Grey,TOYOTA,,,Car Classic,...,2011,28,11,4,48,2010,29,11,4,48
2,005QMC3,1.0,43,1,Saloon,Red,TOYOTA,,,Car Classic,...,2011,20,3,1,11,2010,21,3,1,11
3,0079OHW,1.0,2,1,,,,,,CarSafe,...,2011,20,8,3,33,2010,21,8,3,33
4,00BRP63,0.0,20,3,,,,Lagos,Lagos,Muuve,...,2010,31,12,4,52,2010,29,8,3,34


In [23]:
# Creating columns for age categories
# young < 18, adolescent 18 - 27, adult 27 - 45, old > 45
df['Age_Cat'] = ['Young' if Age<18.0 else 'Adolescent' \
                      if 17.9<Age<27.0 else 'Adult' if 26.9<Age<45.0 \
                      else 'Old' if 44.9<Age<321 else None \
                      for Age in list(df['Age'].values)]

In [None]:
df.A

In [24]:
df.head()

Unnamed: 0,ID,Gender,Age,No_Pol,Car_Category,Subject_Car_Colour,Subject_Car_Make,LGA_Name,State,ProductName,...,Policy End Date_day,Policy End Date_month,Policy End Date_quarter,Policy End Date_weekofyear,First Transaction Date_year,First Transaction Date_day,First Transaction Date_month,First Transaction Date_quarter,First Transaction Date_weekofyear,Age_Cat
0,0040R73,1.0,30,1,Saloon,Black,TOYOTA,,,Car Classic,...,13,5,2,19,2010,14,5,2,19,Adult
1,0046BNK,2.0,79,1,JEEP,Grey,TOYOTA,,,Car Classic,...,28,11,4,48,2010,29,11,4,48,Old
2,005QMC3,1.0,43,1,Saloon,Red,TOYOTA,,,Car Classic,...,20,3,1,11,2010,21,3,1,11,Adult
3,0079OHW,1.0,2,1,,,,,,CarSafe,...,20,8,3,33,2010,21,8,3,33,Young
4,00BRP63,0.0,20,3,,,,Lagos,Lagos,Muuve,...,31,12,4,52,2010,29,8,3,34,Adolescent


In [25]:
df=df.drop(['ID','State','LGA_Name','Subject_Car_Colour','Subject_Car_Make'],axis=1)

In [26]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17256 entries, 0 to 17255
Data columns (total 22 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Gender                             16741 non-null  float64
 1   Age                                17256 non-null  int64  
 2   No_Pol                             17256 non-null  int64  
 3   Car_Category                       11880 non-null  object 
 4   ProductName                        17256 non-null  object 
 5   target                             12079 non-null  float64
 6   Policy Start Date_year             17256 non-null  int64  
 7   Policy Start Date_day              17256 non-null  int64  
 8   Policy Start Date_month            17256 non-null  int64  
 9   Policy Start Date_quarter          17256 non-null  int64  
 10  Policy Start Date_weekofyear       17256 non-null  int64  
 11  Policy End Date_year               17256 non-null  int

In [27]:
Age_Cat_map={'Young':1,'Adolescent':2,'Adult':3,'Old':4}
df['Age_Cat']=df['Age_Cat'].map(Age_Cat_map)

In [28]:
productname_map={'Car Classic':1, 'CarSafe':2, 'Muuve':3, 'CVTP':4, 'Car Plus':5,
       'Motor Cycle':6, 'Customized Motor':7, 'CarFlex':8, 'Car Vintage':9}
df['ProductName']=df['ProductName'].map(productname_map)

In [29]:
car_category_map={'Saloon':1, 'JEEP':2, 'Motorcycle':3, 'Truck':4, 'Bus':5, 'Mini Bus':6,
       'Pick Up':7, 'Mini Van':8, 'Van':9, 'Pick Up > 3 Tons':10, 'CAMRY CAR HIRE':11,
       'Wagon':12, 'Shape Of Vehicle Chasis':13, 'Sedan':14, 'Station 4 Wheel':15,
       'Tipper Truck':16}
df['Car_Category']=df['Car_Category'].map(car_category_map)

In [30]:
df.head()

Unnamed: 0,Gender,Age,No_Pol,Car_Category,ProductName,target,Policy Start Date_year,Policy Start Date_day,Policy Start Date_month,Policy Start Date_quarter,...,Policy End Date_day,Policy End Date_month,Policy End Date_quarter,Policy End Date_weekofyear,First Transaction Date_year,First Transaction Date_day,First Transaction Date_month,First Transaction Date_quarter,First Transaction Date_weekofyear,Age_Cat
0,1.0,30,1,1.0,1,0.0,2010,14,5,2,...,13,5,2,19,2010,14,5,2,19,3
1,2.0,79,1,2.0,1,1.0,2010,29,11,4,...,28,11,4,48,2010,29,11,4,48,4
2,1.0,43,1,1.0,1,0.0,2010,21,3,1,...,20,3,1,11,2010,21,3,1,11,3
3,1.0,2,1,,2,0.0,2010,21,8,3,...,20,8,3,33,2010,21,8,3,33,1
4,0.0,20,3,,3,1.0,2010,29,8,3,...,31,12,4,52,2010,29,8,3,34,2


In [31]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17256 entries, 0 to 17255
Data columns (total 22 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Gender                             16741 non-null  float64
 1   Age                                17256 non-null  int64  
 2   No_Pol                             17256 non-null  int64  
 3   Car_Category                       11880 non-null  float64
 4   ProductName                        17256 non-null  int64  
 5   target                             12079 non-null  float64
 6   Policy Start Date_year             17256 non-null  int64  
 7   Policy Start Date_day              17256 non-null  int64  
 8   Policy Start Date_month            17256 non-null  int64  
 9   Policy Start Date_quarter          17256 non-null  int64  
 10  Policy Start Date_weekofyear       17256 non-null  int64  
 11  Policy End Date_year               17256 non-null  int

In [32]:
df=df.fillna(-999999)

In [33]:
df.isnull().sum()

Gender                               0
Age                                  0
No_Pol                               0
Car_Category                         0
ProductName                          0
target                               0
Policy Start Date_year               0
Policy Start Date_day                0
Policy Start Date_month              0
Policy Start Date_quarter            0
Policy Start Date_weekofyear         0
Policy End Date_year                 0
Policy End Date_day                  0
Policy End Date_month                0
Policy End Date_quarter              0
Policy End Date_weekofyear           0
First Transaction Date_year          0
First Transaction Date_day           0
First Transaction Date_month         0
First Transaction Date_quarter       0
First Transaction Date_weekofyear    0
Age_Cat                              0
dtype: int64

In [34]:
#Get the new dataset
train_n = df[:train_df]
test_n = df[train_df:]

In [35]:
train_n.head()

Unnamed: 0,Gender,Age,No_Pol,Car_Category,ProductName,target,Policy Start Date_year,Policy Start Date_day,Policy Start Date_month,Policy Start Date_quarter,...,Policy End Date_day,Policy End Date_month,Policy End Date_quarter,Policy End Date_weekofyear,First Transaction Date_year,First Transaction Date_day,First Transaction Date_month,First Transaction Date_quarter,First Transaction Date_weekofyear,Age_Cat
0,1.0,30,1,1.0,1,0.0,2010,14,5,2,...,13,5,2,19,2010,14,5,2,19,3
1,2.0,79,1,2.0,1,1.0,2010,29,11,4,...,28,11,4,48,2010,29,11,4,48,4
2,1.0,43,1,1.0,1,0.0,2010,21,3,1,...,20,3,1,11,2010,21,3,1,11,3
3,1.0,2,1,-999999.0,2,0.0,2010,21,8,3,...,20,8,3,33,2010,21,8,3,33,1
4,0.0,20,3,-999999.0,3,1.0,2010,29,8,3,...,31,12,4,52,2010,29,8,3,34,2


In [36]:
test_n.drop("target",axis = 1,inplace = True)

In [37]:
test_n.shape,train_n.shape

((5177, 21), (12079, 22))

In [38]:
train_n['target'].value_counts()

0.0    10624
1.0     1455
Name: target, dtype: int64

In [39]:
train_n.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12079 entries, 0 to 12078
Data columns (total 22 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Gender                             12079 non-null  float64
 1   Age                                12079 non-null  int64  
 2   No_Pol                             12079 non-null  int64  
 3   Car_Category                       12079 non-null  float64
 4   ProductName                        12079 non-null  int64  
 5   target                             12079 non-null  float64
 6   Policy Start Date_year             12079 non-null  int64  
 7   Policy Start Date_day              12079 non-null  int64  
 8   Policy Start Date_month            12079 non-null  int64  
 9   Policy Start Date_quarter          12079 non-null  int64  
 10  Policy Start Date_weekofyear       12079 non-null  int64  
 11  Policy End Date_year               12079 non-null  int

In [40]:
x=np.array(train_n.drop(['target'],axis=1))
y=np.array(train_n['target'])

In [41]:
sm=SMOTE()

In [42]:
x,y=sm.fit_resample(x,y)

In [43]:
x.shape,y.shape

((21248, 21), (21248,))

In [44]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.25,random_state=42)

In [45]:
x_train.shape,x_test.shape,y_train.shape,y_test.shape

((15936, 21), (5312, 21), (15936,), (5312,))

In [51]:
tot_rf=[]
fold=StratifiedKFold(n_splits=10,shuffle=False,random_state=42)

for train_index,test_index in fold.split(x,y):
    x_train,x_test=x[train_index], x[test_index]
    y_train,y_test=y[train_index], y[test_index]
    estimator=RandomForestClassifier(n_estimators=185, criterion="entropy", min_samples_split=10,max_depth=19, n_jobs = -1)
    
    estimator.fit(x_train,y_train)
    pred=estimator.predict_proba(x_test)[:,1]
    predictions=estimator.predict_proba(test_n)[:,1]
    
    tot_rf.append(predictions)

In [52]:
y_pred=np.mean(tot_rf,0)
y_pred

array([7.46901785e-01, 9.32679882e-01, 2.07235703e-02, ...,
       6.10181193e-04, 3.21503436e-01, 5.37073659e-03])

In [53]:
l=[]
for i in y_pred:
    if i >0.4:
        l.append(1)
    else:
        l.append(0)

In [639]:
y_pred=estimator.predict(test_n)
y_pred

array([0., 1., 0., ..., 0., 0., 0.])

In [56]:
ss.head()

Unnamed: 0,ID,target
0,009D84L,0
1,01DO2EQ,0
2,01QM0NU,0
3,024NJLZ,0
4,02BYET3,0


In [57]:
ss['target']=l

In [58]:
ss.to_csv('rf.csv',index=False)

In [534]:
y_pred=estimator.predict(x_test)

In [535]:
scores=f1_score(y_pred,y_test)
scores

0.9641360037261294

In [536]:
preds=estimator.predict(test_n)
preds

array([1., 1., 0., ..., 0., 0., 0.])

## SMOTE FUNCTION

In [14]:
##Spot-Checking Algorithms

models = []

models.append(('LR', LogisticRegression()))
models.append(('CAT', CatBoostClassifier()))
models.append(('LGB', LGBMClassifier()))
models.append(('XGB', XGBClassifier()))
models.append(('RF', RandomForestClassifier()))
models.append(('GBC', GradientBoostingClassifier()))
#testing models

results = []
names = []

In [None]:
for name, model in models:
    fold = StratifiedKFold(n_splits=10, random_state=42)
    cv_results = cross_val_score(model, x_train, y_train, cv=kfold, scoring='f1_score')
    results.append(cv_results)
    names.append(name)
    msg = '%s: %f (%f)' % (name, cv_results.mean(), cv_results.std())
    print(msg)