# Porto Seguro DataSet

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline


import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 65000)

from statsmodels import stats
from scipy import stats

from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import NearestNeighbors

from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold

from sklearn.metrics import confusion_matrix, recall_score, precision_score, f1_score, roc_auc_score, roc_curve, make_scorer
score_fun = make_scorer('roc_auc')

In [2]:
#df = pd.read_csv("Porto_Seguro_Sample_58k.csv")

In [3]:
df = pd.read_csv("TRAIN.csv")

In [4]:
df['target'].value_counts(normalize=True)

0    0.963486
1    0.036514
Name: target, dtype: float64

In [5]:
df['target'].value_counts()

0    480895
1     18225
Name: target, dtype: int64

In [6]:
df = df.replace(-1, np.NAN)

In [7]:
missing_values_per = df.isna().sum()/df.shape[0]*100
missing_values_per[missing_values_per>40]

ps_car_03_cat    69.165932
ps_car_05_cat    44.814473
dtype: float64

In [8]:
df.drop(columns='ps_car_03_cat', inplace=True)

In [9]:
df_id = df['id']
y = df['target']
df.drop(columns=['id', 'target'], inplace=True)

In [10]:
df.shape

(499120, 56)

In [11]:
columns = df.columns.to_list()

In [12]:
cat = []
reg = []
for i in columns:
    if 'cat' in i:
        cat.append(i)
    elif 'bin' in i:
        cat.append(i)        
    elif 'reg' in i:
        reg.append(i)
    elif 'ind' in i:
        cat.append(i)
    elif df[i].dtype=='float64':
        reg.append(i)
    else:
        cat.append(i)

In [13]:
df[reg].head()

Unnamed: 0,ps_reg_01,ps_reg_02,ps_reg_03,ps_car_11,ps_car_12,ps_car_13,ps_car_14,ps_car_15,ps_calc_01,ps_calc_02,ps_calc_03
0,0.7,0.2,0.71807,2.0,0.4,0.883679,0.37081,3.605551,0.6,0.5,0.2
1,0.8,0.4,0.766078,3.0,0.316228,0.618817,0.388716,2.44949,0.3,0.1,0.3
2,0.0,0.0,,1.0,0.316228,0.641586,0.347275,3.316625,0.5,0.7,0.1
3,0.9,0.2,0.580948,1.0,0.374166,0.542949,0.294958,2.0,0.6,0.9,0.1
4,0.7,0.6,0.840759,3.0,0.31607,0.565832,0.365103,2.0,0.4,0.6,0.0


In [14]:
df[reg] = df[reg].astype('float64')
df[cat] = df[cat].astype('O')

In [15]:
df[cat].describe()

Unnamed: 0,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,ps_ind_09_bin,ps_ind_10_bin,ps_ind_11_bin,ps_ind_12_bin,ps_ind_13_bin,ps_ind_14,ps_ind_15,ps_ind_16_bin,ps_ind_17_bin,ps_ind_18_bin,ps_car_01_cat,ps_car_02_cat,ps_car_04_cat,ps_car_05_cat,ps_car_06_cat,ps_car_07_cat,ps_car_08_cat,ps_car_09_cat,ps_car_10_cat,ps_car_11_cat,ps_calc_04,ps_calc_05,ps_calc_06,ps_calc_07,ps_calc_08,ps_calc_09,ps_calc_10,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
count,499120,498935.0,499120,499049.0,494230.0,499120,499120,499120,499120,499120,499120,499120,499120,499120,499120,499120,499120,499120,499028.0,499116.0,499120,275442.0,499120,489467.0,499120,498651.0,499120,499120,499120,499120,499120,499120,499120,499120,499120,499120,499120,499120,499120,499120,499120,499120,499120,499120,499120
unique,8,4.0,12,2.0,7.0,2,2,2,2,2,2,2,2,5,14,2,2,2,12.0,2.0,10,2.0,18,2.0,2,5.0,3,104,6,7,11,10,11,8,26,19,11,14,24,2,2,2,2,2,2
top,0,1.0,2,0.0,0.0,0,0,0,0,0,0,0,0,0,7,1,0,0,11.0,1.0,0,1.0,11,1.0,1,2.0,1,104,2,2,8,3,9,2,8,5,1,2,7,0,1,1,0,0,0
freq,157091,361934.0,80545,290735.0,442648.0,302625,370927,417321,406487,498937,498280,494400,498657,493770,54797,330006,438623,422624,173931.0,414249.0,416525,144739.0,110352,463814.0,415347,296380.0,494879,71138,162831,163652,146665,136352,127277,153185,68798,85914,170372,116790,72819,437850,313346,276760,355759,324886,422517


In [16]:
rows, columns = df.shape
drop_nunique_col = []
for col in df.columns:
    if df[col].nunique == rows or df[col].nunique == 1:
        drop_nunique_col.append(col)
drop_nunique_col    

[]

In [17]:
X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.25, random_state=0)

In [18]:
#X_train[cat].iloc[:400, :]

In [19]:
#X_test[reg].iloc[:400, :]

In [20]:
for col in X_train.columns:
    if X_train[col].dtype == 'object':
        X_train[col] = X_train[col].fillna(X_train[col].mode()[0])
        X_test[col] = X_test[col].fillna(X_train[col].mode()[0])
    else:
        X_train[col] = X_train[col].fillna(X_train[col].mean())
        X_test[col] = X_test[col].fillna(X_train[col].mean())

## Data Preprocessing -Lable Encoding, MinMax Scaling, Chi2Test
### Apply SMOTE - Synthetic Minority OverSampling Technique

In [21]:
class LabelEncoderExt(object):
    def __init__(self):
        """
        It differs from LabelEncoder by handling new classes and providing a value for it [Unknown]
        Unknown will be added in fit and transform will take care of new item. It gives unknown class id
        """
        self.label_encoder = LabelEncoder()
        # self.classes_ = self.label_encoder.classes_

    def fit(self, data_list):
        """
        This will fit the encoder for all the unique values and introduce unknown value
        :param data_list: A list of string
        :return: self
        """
        self.label_encoder = self.label_encoder.fit(list(data_list) + ['Unknown'])
        self.classes_ = self.label_encoder.classes_

        return self

    def transform(self, data_list):
        """
        This will transform the data_list to id list where the new values get assigned to Unknown class
        :param data_list:
        :return:
        """
        new_data_list = list(data_list)
        for unique_item in np.unique(data_list):
            if unique_item not in self.label_encoder.classes_:
                new_data_list = ['Unknown' if x==unique_item else x for x in new_data_list]

        return self.label_encoder.transform(new_data_list)

In [22]:
Le = LabelEncoderExt()
for col in X_train.columns:
    if X_train[col].dtype == 'object':
        Le.fit(X_train[col])
        X_train[col] = Le.transform(X_train[col]).asdtype('int64')
        X_test[col] = Le.transform(X_test[col]).asdtype('int64')

In [23]:
mMs = MinMaxScaler()
for col in reg:
        #mMs.fit([X_train[col]])
        X_train[col] = mMs.fit_transform(np.array(X_train[col]).reshape(-1, 1))
        X_test[col] = mMs.transform(np.array(X_test[col]).reshape(-1, 1))

In [24]:
ch2_Value = []
pValue = []
for col in cat:
    ct = pd.crosstab(X_train[col], y_train)   
    ch2_Value.append(stats.chi2_contingency(ct)[0])
    pValue.append(stats.chi2_contingency(ct)[1])
    
ch2_df = pd.DataFrame()
ch2_df['cat_columns'] = cat
ch2_df['ch2_value'] = ch2_Value
ch2_df['pValue'] = pValue



print("Before Feature Selection[Ch2_Test] No of Categorical Columns: =======>", len(cat))
ch2_test_af_col = ch2_df[ch2_df['pValue']<0.06]['cat_columns'].tolist()
print("After Feature Selection[Ch2_Test] No of Categorical Columns: ========>", len(ch2_test_af_col))

final_col = reg + ch2_test_af_col     ## combing continous and ch2 test outcome columns 

X_train = X_train[final_col]
X_test = X_test[final_col]


print("Final Number of Columns : ========>", len(final_col))



In [25]:
'''
print(y_train.value_counts())
print(y_train.value_counts(normalize=True))
print(y_train.shape)

smort = SMOTE(sampling_strategy=0.3, k_neighbors=8)  ## SMOTE Parameters
X_train, y_train = smort.fit_resample(X_train, y_train)

print(y_train.value_counts())
print(y_train.value_counts(normalize=True))
print(y_train.shape)
'''

'\nprint(y_train.value_counts())\nprint(y_train.value_counts(normalize=True))\nprint(y_train.shape)\n\nsmort = SMOTE(sampling_strategy=0.3, k_neighbors=8)  ## SMOTE Parameters\nX_train, y_train = smort.fit_resample(X_train, y_train)\n\nprint(y_train.value_counts())\nprint(y_train.value_counts(normalize=True))\nprint(y_train.shape)\n'

## Checking Feature Importance

In [27]:
rf = RandomForestClassifier()


In [28]:
features_importance = rf.feature_importances_
features_importance[::-1].sort()

feature_imp_df = pd.DataFrame()
feature_imp = []
for i, col in enumerate(X_train.columns):
    feature_imp.append(features_importance[i])
    #print("{}. {} ({})".format(i + 1, col, features_importance[i]))
feature_imp_df['cName'] = X_train.columns
feature_imp_df['feature_imp_Val'] = feature_imp

#feature_imp_df

NotFittedError: This RandomForestClassifier instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [None]:
plt.figure(figsize=(15, 15))
plt.plot(np.arange(1, features_importance.shape[0]+1), np.cumsum(features_importance))

## HyperParameter Tuning

In [None]:
rf_hyp = RandomForestClassifier(n_jobs=-1)

In [None]:
rf_params = {
    'n_estimators': [50, 75, 100],
    'criterion' : ['gini', 'entropy'], 
    'max_depth' : [4, 5, 6, 7, 8], 
    'max_leaf_nodes': [20, 30, 40, 50],
    'min_samples_leaf': [5, 10, 15, 20]
}

In [None]:
rsCV = RandomizedSearchCV(estimator=rf_hyp, param_distributions=rf_params, scoring='roc_auc')

In [None]:
rsCV.fit(X_train, y_train)

In [None]:
best_params = rsCV.best_params_

In [None]:
best_params

In [None]:
rf_best_Model = rsCV.best_estimator_

In [None]:
rf_best_Model.fit(X_train, y_train)

In [None]:
scoresCV = cross_val_score(X=X_train, y=y_train, estimator=rf_best_Model, cv=3, scoring='roc_auc')
for i, score in enumerate(scoresCV):
    print("Fit", i+1, "Model RF : =====>",score)
print(str(rf).split("(")[0], "CV Mean Score", scoresCV.mean())

In [None]:
y_predict = rf_best_Model.predict(X_test)
y_predict_proba = rf_best_Model.predict_proba(X_test)[::,1]

In [None]:
train_acc = rf_best_Model.score(X_train, y_train)
test_acc =  rf_best_Model.score(X_test, y_test)

recallScore = recall_score(y_test, y_predict)
precisionScore = precision_score(y_test, y_predict)

f1Score = f1_score(y_test, y_predict)
auc = roc_auc_score(y_test, y_predict_proba)
fpr, tpr, thrshould = roc_curve(y_test, y_predict_proba)

In [None]:
print("\n\n")
print("Model Name: ", str(rf_best_Model).split("(")[0])
print("ConfusionMatrix: \n", confusion_matrix(y_test, y_predict))    
print("TrainAcc: ====> {}".format(train_acc))
print("TestAccuracy : ====> {}".format(test_acc))
print("recall: ====> {}".format(recallScore))
print("Precision: ====> {}".format(precisionScore))
print("F1Score: ====> {}".format(f1Score))
print("AUC: ====> {}".format(auc))

In [None]:
plt.figure(figsize=(8,8))
plt.plot(fpr, tpr, label="Model Name: "+mName+"\n"+"auc="+str(auc))
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Area Under The Curve AUC-ROC')

plt.legend(loc= 7)
plt.show()