In [1]:
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt

import seaborn as sns           

import numpy as np

import math

pd.set_option('display.max_rows',1600)
pd.set_option('display.max_columns',1600)


In [2]:
porto_seguro_df=pd.read_csv('porto_seguro_train.csv')
porto_seguro_df.shape
porto_seguro_df.replace(-1,np.nan,inplace=True)
y=porto_seguro_df['target']
porto_seguro_df.drop(columns='target',inplace=True)
porto_seguro_df.shape
porto_seguro_df
porto_seguro_df.isnull().sum()

id                    0
ps_ind_01             0
ps_ind_02_cat        17
ps_ind_03             0
ps_ind_04_cat         8
ps_ind_05_cat       507
ps_ind_06_bin         0
ps_ind_07_bin         0
ps_ind_08_bin         0
ps_ind_09_bin         0
ps_ind_10_bin         0
ps_ind_11_bin         0
ps_ind_12_bin         0
ps_ind_13_bin         0
ps_ind_14             0
ps_ind_15             0
ps_ind_16_bin         0
ps_ind_17_bin         0
ps_ind_18_bin         0
ps_reg_01             0
ps_reg_02             0
ps_reg_03         10118
ps_car_01_cat         8
ps_car_02_cat         0
ps_car_03_cat     38117
ps_car_04_cat         0
ps_car_05_cat     24582
ps_car_06_cat         0
ps_car_07_cat      1009
ps_car_08_cat         0
ps_car_09_cat        53
ps_car_10_cat         0
ps_car_11_cat         0
ps_car_11             0
ps_car_12             0
ps_car_13             0
ps_car_14          3987
ps_car_15             0
ps_calc_01            0
ps_calc_02            0
ps_calc_03            0
ps_calc_04      

In [3]:
missing_value_df=pd.DataFrame({'Column_Name':porto_seguro_df.columns,
                             'missing_value_count':porto_seguro_df.isnull().sum(),
                             'missing_value_percent':porto_seguro_df.isnull().sum()/porto_seguro_df.shape[0]*100})

missing_value_df.sort_values(by='missing_value_percent',ascending=False)

cols_drop=list(missing_value_df[missing_value_df['missing_value_percent']>50]['Column_Name'])
cols_drop

porto_seguro_df.shape
cols_drop

['ps_car_03_cat']

In [4]:
for col in porto_seguro_df.columns:
    if porto_seguro_df[col].nunique()==porto_seguro_df.shape[0] or porto_seguro_df[col].nunique()==1:
        cols_drop.append(col)
cols_drop

['ps_car_03_cat', 'id']

In [5]:
porto_seguro_df.drop(columns=cols_drop,inplace=True)
porto_seguro_df.shape

(55299, 56)

In [6]:
from sklearn.model_selection import train_test_split

X_train,X_test,Y_train,Y_test=train_test_split(porto_seguro_df,y,test_size=0.2,random_state=43)

X_train.shape

(44239, 56)

In [7]:
for col in X_train.columns:
    if X_train[col].dtype=='object':
        X_train[col]=X_train[col].fillna(X_train[col].mode()[0]).astype(object)
        X_test[col]=X_test[col].fillna(X_train[col].mode()[0]).astype(object)
    
    else:
        X_train[col]=X_train[col].fillna(X_train[col].mean())
        X_test[col]=X_test[col].fillna(X_train[col].mean())
        
X_train.dtypes

ps_ind_01           int64
ps_ind_02_cat     float64
ps_ind_03           int64
ps_ind_04_cat     float64
ps_ind_05_cat     float64
ps_ind_06_bin       int64
ps_ind_07_bin       int64
ps_ind_08_bin       int64
ps_ind_09_bin       int64
ps_ind_10_bin       int64
ps_ind_11_bin       int64
ps_ind_12_bin       int64
ps_ind_13_bin       int64
ps_ind_14           int64
ps_ind_15           int64
ps_ind_16_bin       int64
ps_ind_17_bin       int64
ps_ind_18_bin       int64
ps_reg_01         float64
ps_reg_02         float64
ps_reg_03         float64
ps_car_01_cat     float64
ps_car_02_cat       int64
ps_car_04_cat       int64
ps_car_05_cat     float64
ps_car_06_cat       int64
ps_car_07_cat     float64
ps_car_08_cat       int64
ps_car_09_cat     float64
ps_car_10_cat       int64
ps_car_11_cat       int64
ps_car_11           int64
ps_car_12         float64
ps_car_13         float64
ps_car_14         float64
ps_car_15         float64
ps_calc_01        float64
ps_calc_02        float64
ps_calc_03  

In [8]:
from imblearn.over_sampling import SMOTE
sm=SMOTE(sampling_strategy='auto',k_neighbors=3)

X_train,Y_train=sm.fit_resample(X_train,Y_train)

In [9]:
from sklearn.preprocessing import LabelEncoder,MinMaxScaler
import numpy as np


class LabelEncoderExt(object):
    def __init__(self):
        """
        It differs from LabelEncoder by handling new classes and providing a value for it [Unknown]
        Unknown will be added in fit and transform will take care of new item. It gives unknown class id
        """     
        self.label_encoder = LabelEncoder()
        # self.classes_ = self.label_encoder.classes_

    def fit(self, data_list):
        """
        This will fit the encoder for all the unique values and introduce unknown value
        :param data_list: A list of string
        :return: self
        """
        self.label_encoder = self.label_encoder.fit(list(data_list) + ['Unknown'])
        self.classes_ = self.label_encoder.classes_

        return self

    def transform(self, data_list):
        """
        This will transform the data_list to id list where the new values get assigned to Unknown class
        :param data_list:
        :return:
        """
        new_data_list = list(data_list)
        for unique_item in np.unique(data_list):
            if unique_item not in self.label_encoder.classes_:
                new_data_list = ['Unknown' if x==unique_item else x for x in new_data_list]

        return self.label_encoder.transform(new_data_list)
    
le=LabelEncoderExt()
mm=MinMaxScaler()

for col in X_train.columns:
    if X_train[col].dtype=='object':
        le.fit(X_train[col])
        X_train[col]=le.transform(X_train[col])
        X_test[col]=le.transform(X_test[col])
    else:
        mm.fit([X_train[col]])
        X_train[col]=mm.fit_transform(np.array(X_train[col]).reshape(-1,1))
        X_test[col]=mm.transform(np.array(X_test[col]).reshape(-1,1))



In [10]:
from sklearn.linear_model import LogisticRegression

logreg=LogisticRegression()

logreg.fit(X_train,Y_train)

Y_pred=logreg.predict(X_test)


In [11]:
comparison_df=pd.DataFrame({'Actuals':Y_test,'Predictions':Y_pred})

from sklearn.metrics import confusion_matrix,precision_score,recall_score,f1_score,roc_curve,roc_auc_score

print(recall_score(Y_test,Y_pred))

print(precision_score(Y_test,Y_pred))

print(confusion_matrix(Y_test,Y_pred))

print(f1_score(Y_test,Y_pred))

0.07228915662650602
0.03994673768308921
[[9924  721]
 [ 385   30]]
0.05145797598627787


In [12]:
from sklearn.tree import DecisionTreeClassifier

DTC = DecisionTreeClassifier(random_state=0)

DTC.fit(X_train,Y_train)

Y_DTC_predict=DTC.predict(X_test)

print(confusion_matrix(Y_DTC_predict,Y_test))

print(recall_score(Y_DTC_predict,Y_test))

print(precision_score(Y_DTC_predict,Y_test))

print(f1_score(Y_DTC_predict,Y_test))

[[10037   387]
 [  608    28]]
0.0440251572327044
0.06746987951807229
0.0532825880114177


In [13]:
from sklearn.ensemble import AdaBoostClassifier

ABC=AdaBoostClassifier()

ABC.fit(X_train,Y_train)
ada_pred=ABC.predict(X_test)

ada_pred

print('Confusion matrix --------',confusion_matrix(ada_pred,Y_test))

print('recall_score-------------',recall_score(ada_pred,Y_test))

print('precision_score----------',precision_score(ada_pred,Y_test))

print('f1_score-----------------',f1_score(ada_pred,Y_test))

Confusion matrix -------- [[10282   390]
 [  363    25]]
recall_score------------- 0.06443298969072164
precision_score---------- 0.060240963855421686
f1_score----------------- 0.062266500622665005


In [18]:
from sklearn.ensemble import RandomForestClassifier

RFC = RandomForestClassifier(n_estimators=50,max_depth=10)

#cross_val_score(clf, mic, iris.target, cv=10)

RFC.fit(X_train,Y_train)

Y_RFC_predict=RFC.predict(X_test)

print('Confusion matrix --------',confusion_matrix(Y_RFC_predict,Y_test))

print('recall_score-------------',recall_score(Y_RFC_predict,Y_test))

print('precision_score----------',precision_score(Y_RFC_predict,Y_test))

print('f1_score-----------------',f1_score(Y_RFC_predict,Y_test))

Y_pred_proba = RFC.predict_proba(X_test)[::,1]

fpr,tpr,thresholds=roc_curve(Y_test,Y_pred_proba) # the function roc_curve returns three values, the False Positive Rate,True Positive rate
                                                # threshold values it used to calculate the TPR and FPR

auc = roc_auc_score(Y_test, Y_pred_proba)

print(auc)

Confusion matrix -------- [[10470   407]
 [  175     8]]
recall_score------------- 0.04371584699453552
precision_score---------- 0.01927710843373494
f1_score----------------- 0.026755852842809368
0.5395557617977783
