In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot

import os

## Import Data

In [2]:
# Folder holding the Kaggle Prudential CSV files. Set the PRUDENTIAL_DATA_DIR
# environment variable to run the notebook on another machine; the original
# author's local folder is kept as the fallback so existing setups still work.
data_path = os.environ.get(
    'PRUDENTIAL_DATA_DIR',
    r'C:\Users\Reljod\Desktop\Study Materials\kaggle\dataset\prudential-insurance',
)

In [3]:
# Resolve the three competition files inside the data folder.
train_path, test_path, sample_sub_path = (
    os.path.join(data_path, file_name)
    for file_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [4]:
# Load the train and test CSVs and print each frame's (rows, columns) shape.
df_train_raw = pd.read_csv(train_path)
print(df_train_raw.shape)
df_test_raw = pd.read_csv(test_path)
print(df_test_raw.shape)

(59381, 128)
(19765, 127)


## Inspect Data

In [5]:
# Work on copies so the raw frames are not touched by the in-place edits
# made in the following cells.
df_train = df_train_raw.copy()
df_test = df_test_raw.copy()

In [6]:
# Use 'Id' as the row index (in place). Re-running this cell on the same
# frame raises KeyError because 'Id' is then no longer a column.
df_train.set_index('Id', inplace=True)

In [7]:
# Same re-indexing for the test frame (also in place, also not re-runnable).
df_test.set_index('Id', inplace=True)

In [8]:
# Snapshot the column names of both frames as plain Python lists.
df_train_cols = list(df_train.columns)
df_test_cols = list(df_test.columns)

### Use the Train only then split it.
As we can see, the Test dataset has no target (output label/response), so we cannot use it to evaluate the model. Instead, we will split the Train dataset into a training part and a test part.

In [9]:
# Remove both test-frame names from the namespace; only the training data
# is used from here on.
del(df_test)
del(df_test_raw)

In [10]:
# Detach the target: pop() removes 'Response' from df_train in place and
# returns it, so this cell cannot be re-run without re-creating df_train.
y = df_train.pop('Response')

In [11]:
# X is another name for df_train (no copy is made).
X = df_train

In [12]:
from sklearn.model_selection import train_test_split

In [13]:
# Hold out 30% of the rows for evaluation; random_state makes the split repeatable.
# No `stratify` argument is passed, so class proportions may differ between the splits.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=10)

In [14]:
# Sanity check: feature and label row counts must match within each split.
print("Train Input data shape:", X_train.shape)
print("Train Label data shape:", y_train.shape)
print("Test Input data shape:", X_test.shape)
print("Test Label data shape:", y_test.shape)

Train Input data shape: (41566, 126)
Train Label data shape: (41566,)
Test Input data shape: (17815, 126)
Test Label data shape: (17815,)


## Try using Raw Data

In [15]:
from sklearn.ensemble import RandomForestClassifier

In [16]:
# Baseline random forest: 200 trees, depth capped at 8, seeded for repeatability.
rfc = RandomForestClassifier(n_estimators=200,
                            max_depth=8,
                            random_state=10)

In [17]:
# Intentional failure demo: X_train still holds string values (e.g. 'D4'),
# so this fit raises the ValueError recorded below.
rfc.fit(X_train, y_train)

ValueError: could not convert string to float: 'D4'

## Why is there an Error?
<i>ValueError: could not convert string to float: 'D4'</i> means that the data contains some <b>object or string type data</b>. The model cannot process that type of data, so it needs to be converted to numerical data.
So, we first need to convert the <b>categorical data</b> into <b>numerical data</b>.
<br><br>
Looking at the data again, we can see the categorical data in <i>Product_Info_2.</i>
<br><br>
So what do we need to do? <b>Clean the Data</b>

In [18]:
# Preview the first rows; Product_Info_2 shows string codes such as 'D4' and 'A1'.
X_train.head()

Unnamed: 0_level_0,Product_Info_1,Product_Info_2,Product_Info_3,Product_Info_4,Product_Info_5,Product_Info_6,Product_Info_7,Ins_Age,Ht,Wt,...,Medical_Keyword_39,Medical_Keyword_40,Medical_Keyword_41,Medical_Keyword_42,Medical_Keyword_43,Medical_Keyword_44,Medical_Keyword_45,Medical_Keyword_46,Medical_Keyword_47,Medical_Keyword_48
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
59759,1,D4,29,0.230769,2,3,1,0.119403,0.745455,0.320084,...,0,0,0,1,0,0,0,0,0,0
25815,1,D4,26,0.076923,2,3,1,0.402985,0.672727,0.246862,...,0,0,0,0,0,0,0,0,0,0
8183,1,A1,26,0.102564,2,1,1,0.059701,0.672727,0.1841,...,0,0,0,0,0,0,0,0,0,0
77468,1,A8,29,0.025641,2,3,1,0.462687,0.781818,0.299163,...,0,0,0,0,0,0,0,0,0,0
76832,1,E1,26,0.230769,2,3,1,0.134328,0.654545,0.205021,...,0,0,0,1,0,0,0,0,0,0


## Data Cleaning

In [19]:
X_train.info()
# The dtype summary shows a single object column, so only one column needs
# to be converted from categorical (string) to numerical.

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41566 entries, 59759 to 23552
Columns: 126 entries, Product_Info_1 to Medical_Keyword_48
dtypes: float64(18), int64(107), object(1)
memory usage: 40.3+ MB


### Convert Categorical Data to Numerical Data

In [20]:
from sklearn.preprocessing import LabelEncoder

In [21]:
# List every distinct value of Product_Info_2 with its frequency to see how
# many categories there are.
print("Unique Value counts:\n",X_train["Product_Info_2"].value_counts())

Unique Value counts:
 D3    10082
D4     7471
A8     4802
D1     4642
D2     4399
E1     1857
A1     1638
A6     1486
A2     1342
A7      969
B2      786
A3      688
A5      547
C3      215
C1      199
C4      152
A4      147
C2      106
B1       38
Name: Product_Info_2, dtype: int64


In [22]:
# Keep a handle on the categorical column for inspection.
prod_info2 = X_train["Product_Info_2"]

In [23]:
# Same frequency table as the previous print, shown as the cell's rich output.
prod_info2.value_counts()

D3    10082
D4     7471
A8     4802
D1     4642
D2     4399
E1     1857
A1     1638
A6     1486
A2     1342
A7      969
B2      786
A3      688
A5      547
C3      215
C1      199
C4      152
A4      147
C2      106
B1       38
Name: Product_Info_2, dtype: int64

## Create a function for converting categorical data to numerical

In [24]:
def categorical_to_numerical(dataframe, name, encoder=None):
    """Replace a categorical column with integer label codes.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding the column to encode. It is NOT modified, so the
        calling cell can be re-run on the same frame.
    name : str
        Name of the column to encode.
    encoder : object with a ``transform`` method, optional
        An already-fitted encoder (for example a ``LabelEncoder`` fitted on
        the training split). Passing the same encoder for train and test
        guarantees that both use the same category-to-code mapping. When
        omitted, a new ``LabelEncoder`` is fitted on this column alone.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index: every column except ``name`` in the
        original order, followed by ``"le_" + name`` holding the codes.

    Raises
    ------
    KeyError
        If ``name`` is not a column of ``dataframe``.
    """
    # Read the column instead of pop()-ing it, so the input stays intact.
    series = dataframe[name]
    if encoder is None:
        codes = LabelEncoder().fit_transform(series)
    else:
        codes = encoder.transform(series)
    # drop() returns a new frame; the codes are assigned positionally, so the
    # original index is preserved without any re-alignment.
    encoded = dataframe.drop(name, axis=1)
    encoded["le_" + name] = np.asarray(codes)
    return encoded

In [25]:
# Encode Product_Info_2 in both splits.
# NOTE(review): each call fits its own LabelEncoder, so the train and test
# codes only agree if both splits contain the same set of categories — confirm.
X_train1 = categorical_to_numerical(X_train, 'Product_Info_2')
X_test1 = categorical_to_numerical(X_test, 'Product_Info_2')

In [26]:
# The column count should be unchanged: one column removed, one encoded column added.
print("Input train shape:",X_train1.shape)
print("Input test shape:", X_test1.shape)

Input train shape: (41566, 126)
Input test shape: (17815, 126)


## Try inputting the data to the model

In [27]:
# Fresh, unfitted forest with the same settings as the first attempt.
rfc = RandomForestClassifier(n_estimators=200,
                            max_depth=8,
                            random_state=10)

In [28]:
# Intentional failure demo: the encoded frame still contains missing values,
# so this fit raises the ValueError recorded below.
rfc.fit(X_train1, y_train)

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

## There's still an Error?? Why?
<br>
<i>ValueError: Input contains NaN, infinity or a value too large for dtype('float32').</i><br>
The error means that the input contains NaN (Not a Number) values, infinite values, or values that are too large for the dtype.<br> 
So, we must first convert those values into suitable ones.

We can drop, fill, or replace the value to eliminate those undesired values 

In [29]:
# Show the non-null count of every column; max_cols is set to the column
# count so that no column is omitted from the listing.
X_train1.info(max_cols=len(X_train1.columns))

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41566 entries, 59759 to 23552
Data columns (total 126 columns):
Product_Info_1         41566 non-null int64
Product_Info_3         41566 non-null int64
Product_Info_4         41566 non-null float64
Product_Info_5         41566 non-null int64
Product_Info_6         41566 non-null int64
Product_Info_7         41566 non-null int64
Ins_Age                41566 non-null float64
Ht                     41566 non-null float64
Wt                     41566 non-null float64
BMI                    41566 non-null float64
Employment_Info_1      41553 non-null float64
Employment_Info_2      41566 non-null int64
Employment_Info_3      41566 non-null int64
Employment_Info_4      36820 non-null float64
Employment_Info_5      41566 non-null int64
Employment_Info_6      34053 non-null float64
InsuredInfo_1          41566 non-null int64
InsuredInfo_2          41566 non-null int64
InsuredInfo_3          41566 non-null int64
InsuredInfo_4          41566 non-n

In [30]:
# Names of the columns with at least one missing value, a sub-frame holding
# just those columns, and the number of missing entries in each of them.
na_cols = X_train1.columns[X_train1.isnull().any()].tolist()
df_na_cols = X_train1.loc[:,na_cols]
df_na_cols.isnull().sum()

Employment_Info_1         13
Employment_Info_4       4746
Employment_Info_6       7513
Insurance_History_5    17728
Family_Hist_2          20106
Family_Hist_3          23911
Family_Hist_4          13490
Family_Hist_5          29206
Medical_History_1       6222
Medical_History_10     41185
Medical_History_15     31124
Medical_History_24     38892
Medical_History_32     40770
dtype: int64

In [31]:
# Non-null counts and dtypes for the columns that contain missing values.
df_na_cols.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41566 entries, 59759 to 23552
Data columns (total 13 columns):
Employment_Info_1      41553 non-null float64
Employment_Info_4      36820 non-null float64
Employment_Info_6      34053 non-null float64
Insurance_History_5    23838 non-null float64
Family_Hist_2          21460 non-null float64
Family_Hist_3          17655 non-null float64
Family_Hist_4          28076 non-null float64
Family_Hist_5          12360 non-null float64
Medical_History_1      35344 non-null float64
Medical_History_10     381 non-null float64
Medical_History_15     10442 non-null float64
Medical_History_24     2674 non-null float64
Medical_History_32     796 non-null float64
dtypes: float64(13)
memory usage: 4.4 MB


### So many Null Values!!!
<br>
What can we do about it?<br>
My decision is to just <b>drop the columns that have too many Null values</b>. These columns might hurt the overall performance of the model, so it is better to drop them.

This is a function that chooses what columns are needed to be dropped:

In [32]:
def choose_columns_to_drop(*dataframes, drop_ratio=0.6):
    """Collect the columns that are too sparse in at least one dataframe.

    A column is selected when its share of non-null values in any of the
    given frames is strictly below ``drop_ratio``.

    Parameters
    ----------
    *dataframes : pandas.DataFrame
        Frames to inspect.
    drop_ratio : float, default 0.6
        Minimum share of non-null values a column needs in order to be kept.

    Returns
    -------
    list
        Selected column names, each listed once, in the order they were
        first encountered (frame by frame, column by column).
    """
    # A dict is used as an insertion-ordered set of column names.
    flagged = {}
    for frame in dataframes:
        filled_share = frame.notnull().sum() / len(frame)
        too_sparse = filled_share.index[filled_share < drop_ratio]
        flagged.update(dict.fromkeys(too_sparse))
    return list(flagged)

This is a function that drops the chosen columns from every dataframe passed to it. Here, the chosen columns are the features whose share of non-null values is below 0.6.

In [33]:
def drop_columns(*dataframes, columns):
    """Remove ``columns`` from every given dataframe, in place.

    Parameters
    ----------
    *dataframes : pandas.DataFrame
        Frames to modify. Each one is mutated directly.
    columns : label or list of labels
        Column name(s) to remove from every frame.

    Returns
    -------
    tuple
        The same dataframe objects that were passed in, in the same order.
    """
    for frame in dataframes:
        frame.drop(labels=columns, axis="columns", inplace=True)
    # *dataframes is already a tuple of the (now modified) input objects.
    return tuple(dataframes)

In [34]:
# Columns whose non-null share is below the default 0.6 in either split.
drop_cols = choose_columns_to_drop(X_train1, X_test1)

In [35]:
# The drop happens in place, so X_train2/X_test2 are the same objects as
# X_train1/X_test1 rather than independent copies.
X_train2, X_test2 = drop_columns(X_train1, X_test1, columns=drop_cols)

In [36]:
def fill_null(*dataframes):
    """Fill missing values with each column's median rounded up, in place.

    Only numeric columns are filled; non-numeric columns are left untouched
    rather than making the median computation fail.

    Parameters
    ----------
    *dataframes : pandas.DataFrame
        Frames to fill. Each one is mutated directly and uses its own medians.

    Returns
    -------
    tuple
        The same dataframe objects that were passed in, in the same order.
    """
    for df in dataframes:
        # numeric_only=True: without it, current pandas raises TypeError as
        # soon as the frame contains a string/object column.
        # NOTE(review): np.ceil rounds the median up to the next integer; for
        # fractional-valued columns this moves the fill value away from the
        # median — confirm this is intended.
        fill_values = np.ceil(df.median(numeric_only=True))
        df.fillna(fill_values, inplace=True)
    return tuple(dataframes)

In [37]:
# Filling is done in place, so X_train3/X_test3 are the same objects as
# X_train2/X_test2. Each split is filled from its own column medians.
X_train3, X_test3 = fill_null(X_train2, X_test2)

In [38]:
#Check if there's still a null value in X_train3
# max_cols must cover the number of columns (not the number of rows) so that
# every column is listed.
X_train3.info(max_cols=len(X_train3.columns))

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41566 entries, 59759 to 23552
Data columns (total 118 columns):
Product_Info_1         41566 non-null int64
Product_Info_3         41566 non-null int64
Product_Info_4         41566 non-null float64
Product_Info_5         41566 non-null int64
Product_Info_6         41566 non-null int64
Product_Info_7         41566 non-null int64
Ins_Age                41566 non-null float64
Ht                     41566 non-null float64
Wt                     41566 non-null float64
BMI                    41566 non-null float64
Employment_Info_1      41566 non-null float64
Employment_Info_2      41566 non-null int64
Employment_Info_3      41566 non-null int64
Employment_Info_4      41566 non-null float64
Employment_Info_5      41566 non-null int64
Employment_Info_6      41566 non-null float64
InsuredInfo_1          41566 non-null int64
InsuredInfo_2          41566 non-null int64
InsuredInfo_3          41566 non-null int64
InsuredInfo_4          41566 non-n

In [39]:
#Check if there's still a null value in X_test3
# max_cols must cover the number of columns (not the number of rows) so that
# every column is listed.
X_test3.info(max_cols=len(X_test3.columns))

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17815 entries, 64443 to 24740
Data columns (total 118 columns):
Product_Info_1         17815 non-null int64
Product_Info_3         17815 non-null int64
Product_Info_4         17815 non-null float64
Product_Info_5         17815 non-null int64
Product_Info_6         17815 non-null int64
Product_Info_7         17815 non-null int64
Ins_Age                17815 non-null float64
Ht                     17815 non-null float64
Wt                     17815 non-null float64
BMI                    17815 non-null float64
Employment_Info_1      17815 non-null float64
Employment_Info_2      17815 non-null int64
Employment_Info_3      17815 non-null int64
Employment_Info_4      17815 non-null float64
Employment_Info_5      17815 non-null int64
Employment_Info_6      17815 non-null float64
InsuredInfo_1          17815 non-null int64
InsuredInfo_2          17815 non-null int64
InsuredInfo_3          17815 non-null int64
InsuredInfo_4          17815 non-n

### Try putting the data in into the model again

In [53]:
# Larger forest than the earlier attempts: 400 trees, depth capped at 10, same seed.
rfc = RandomForestClassifier(n_estimators=400,
                            max_depth=10,
                            random_state=10)

In [54]:
# Fit on the cleaned training features (encoded, sparse columns dropped, nulls filled).
rfc.fit(X_train3, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=400, n_jobs=None,
            oob_score=False, random_state=10, verbose=0, warm_start=False)

In [55]:
# Predict the response class for the held-out rows.
y_pred = rfc.predict(X_test3)

In [56]:
from sklearn.metrics import accuracy_score

In [57]:
# Share of held-out rows whose predicted class equals the true class.
acc = accuracy_score(y_test, y_pred)

In [82]:
# Report the held-out accuracy as a percentage with three decimals.
print("The Accuracy is {:.3f}%".format(acc*100))

The Accuracy is 50.772%


## The Accuracy is pretty low, what do we need to do?
### First: Modify the Data

In [67]:
# Feature groups taken from the data documentation. Each group is stored as a
# comma-plus-space separated string of column names (split into lists later):
#   dummy       - prefix of the 0/1 indicator columns (Medical_Keyword_1 ... _48)
#   discrete    - numeric columns that take values on a fixed grid
#   continuous  - numeric columns that can take any value within a range
#   categorical - columns whose values are category codes, not quantities

dummy = "Medical_Keyword_"
discrete = "Medical_History_1, Medical_History_10, Medical_History_15, Medical_History_24, Medical_History_32"
# The list previously ended in a truncated "Em" entry, which is not a column
# name; the full set of 13 continuous columns is restored here.
continuous = "Product_Info_4, Ins_Age, Ht, Wt, BMI, Employment_Info_1, Employment_Info_4, Employment_Info_6, Insurance_History_5, Family_Hist_2, Family_Hist_3, Family_Hist_4, Family_Hist_5"
categorical_cols = "Product_Info_1, Product_Info_2, Product_Info_3, Product_Info_5, Product_Info_6, Product_Info_7, Employment_Info_2, Employment_Info_3, Employment_Info_5, InsuredInfo_1, InsuredInfo_2, InsuredInfo_3, InsuredInfo_4, InsuredInfo_5, InsuredInfo_6, InsuredInfo_7, Insurance_History_1, Insurance_History_2, Insurance_History_3, Insurance_History_4, Insurance_History_7, Insurance_History_8, Insurance_History_9, Family_Hist_1, Medical_History_2, Medical_History_3, Medical_History_4, Medical_History_5, Medical_History_6, Medical_History_7, Medical_History_8, Medical_History_9, Medical_History_11, Medical_History_12, Medical_History_13, Medical_History_14, Medical_History_16, Medical_History_17, Medical_History_18, Medical_History_19, Medical_History_20, Medical_History_21, Medical_History_22, Medical_History_23, Medical_History_25, Medical_History_26, Medical_History_27, Medical_History_28, Medical_History_29, Medical_History_30, Medical_History_31, Medical_History_33, Medical_History_34, Medical_History_35, Medical_History_36, Medical_History_37, Medical_History_38, Medical_History_39, Medical_History_40, Medical_History_41"

In [68]:
# Turn each comma-separated group into a list of column names.
category_cols, discrete_cols, continuous_cols = (
    names.split(", ") for names in (categorical_cols, discrete, continuous)
)
# Indicator columns are the shared prefix followed by 1 through 48.
dummy_cols = ["{}{}".format(dummy, number) for number in range(1, 49)]

### Change label into One Hot Encoding

In [91]:
from sklearn.preprocessing import OneHotEncoder

In [92]:
# Encoder used to turn the class labels into one indicator column per class.
ohe = OneHotEncoder()

In [96]:
# reshape(-1, 1) turns the 1-D label vector into a single-column 2-D array
# before it is passed to the encoder.
ohe_y_train = ohe.fit_transform(y_train.values.reshape(-1,1))

In [105]:
# Convert the encoder output into a regular dense array.
y_train1 = ohe_y_train.toarray()

In [111]:
def convert_to_one_hot(*series):
    """One-hot encode several label series with one shared set of categories.

    The encoder is fitted once on the values of all given series together, so
    every returned array has the same number of columns and column ``j`` means
    the same category in each of them, even when a category is absent from one
    of the series.

    Parameters
    ----------
    *series : pandas.Series
        Label vectors to encode (for example the train and test targets).

    Returns
    -------
    tuple of numpy.ndarray
        One dense 2-D indicator array per input series, in input order.
        An empty tuple when called without arguments.
    """
    if not series:
        return ()
    # The encoder expects 2-D input: one column holding the labels.
    columns = [ser.values.reshape(-1, 1) for ser in series]
    ohe = OneHotEncoder(categories="auto")
    # Fit once on everything instead of re-fitting per series, which could
    # give the outputs different (and misaligned) column layouts.
    ohe.fit(np.concatenate(columns, axis=0))
    return tuple(ohe.transform(column).toarray() for column in columns)

In [112]:
# One-hot versions of both label vectors (this overwrites the y_train1
# computed by hand above). Neither result is referenced by the later cells
# shown in this notebook.
y_train1, y_test1 = convert_to_one_hot(y_train, y_test)

In [152]:
from sklearn.linear_model import LogisticRegression

In [172]:
# Logistic regression baseline; the multi-class strategy is left to
# scikit-learn ("auto") and n_jobs=-1 requests all available cores.
# NOTE(review): the warning fragment recorded when this model is fitted
# mentions n_jobs, which suggests the setting is ignored by the solver in
# use — confirm.
lr = LogisticRegression(n_jobs=-1, multi_class="auto")

In [173]:
# Fit on the cleaned features. X_train3 is used explicitly: X_train1 only
# worked here because the drop/fill helpers had modified it in place.
lr.fit(X_train3, y_train)

  " = {}.".format(effective_n_jobs(self.n_jobs)))


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='auto', n_jobs=-1,
          penalty='l2', random_state=None, solver='warn', tol=0.0001,
          verbose=0, warm_start=False)

In [177]:
# Predict on the cleaned held-out features. X_test3 is used explicitly:
# X_test1 only worked here because the drop/fill helpers had modified it in place.
y_pred = lr.predict(X_test3)

In [178]:
# Held-out accuracy of the logistic regression (overwrites the forest's `acc`).
acc = accuracy_score(y_test, y_pred)

In [179]:
# Report the held-out accuracy as a percentage with three decimals.
print("The Accuracy is {:.3f}%".format(acc*100))

The Accuracy is 47.662%


In [180]:
# Accuracy on the training rows, to compare against the held-out score.
# X_train3 is used explicitly: X_train1 only worked here because the
# drop/fill helpers had modified it in place.
y_pred_train = lr.predict(X_train3)
acc_train = accuracy_score(y_train, y_pred_train)
print("The Accuracy is {:.3f}%".format(acc_train*100))

The Accuracy is 48.123%
