- In this notebook we compare categorical-encoding techniques (count encoding, target mean encoding and one-hot encoding) to see which one works best for our categorical features.

In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import ADASYN 
import random
from sklearn.preprocessing import MinMaxScaler
import pickle
import Data_Ingestion_And_Preprocessing as dip

In [2]:
# Empty scoreboard for the random-forest runs: one row per categorical-encoding method.
results = pd.DataFrame(
    columns=[
        'categoric_method',
        'train_score',
        'test_score',
        'difference',
    ]
)

In [3]:
def fit_rf_model(df_train,df_test,results,categoric_method):
    """Fit a random forest on one encoding variant and record its ROC-AUC scores.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Train / test frames. Both must contain the target column
        'default.payment.next.month'; every other column is used as a feature.
    results : pandas.DataFrame
        Scoreboard collected so far. It is not modified.
    categoric_method : str
        Label stored in the 'categoric_method' column of the new row.

    Returns
    -------
    pandas.DataFrame
        A new frame made of `results` plus one row holding the train AUC, the
        test AUC and their difference (train - test).
    """
    target = 'default.payment.next.month'

    X_train = df_train.drop([target], axis=1)
    X_test = df_test.drop([target], axis=1)

    y_train = df_train[target]
    y_test = df_test[target]

    rf = RandomForestClassifier(max_depth=6, random_state=0)
    rf = rf.fit(X_train, y_train)

    # Probability of the positive class (second column of predict_proba).
    y_train_prob = rf.predict_proba(X_train)[:, 1]
    y_test_prob = rf.predict_proba(X_test)[:, 1]

    train_score = roc_auc_score(y_train, y_train_prob)
    test_score = roc_auc_score(y_test, y_test_prob)

    new_row = pd.DataFrame([{
        'categoric_method': categoric_method,
        'train_score': train_score,
        'test_score': test_score,
        'difference': train_score - test_score,
    }])

    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported replacement.
    if len(results) == 0:
        # Skip concatenating an empty frame (deprecated in recent pandas) while
        # keeping any columns the caller declared up front.
        all_columns = results.columns.union(new_row.columns, sort=False)
        return new_row.reindex(columns=all_columns)

    return pd.concat([results, new_row], ignore_index=True)
    

    

In [4]:
# Class imbalance handled in train
# NOTE(review): presumably load_data_and_preprocess rebalances only the training split - confirm in Data_Ingestion_And_Preprocessing.

df_train_d , df_test_d=dip.load_data_and_preprocess('credit_fraud.csv')

In [18]:
# Baseline: random forest on the frames exactly as loaded, before any categorical feature engineering.

results = fit_rf_model(
    df_train=df_train_d,
    df_test=df_test_d,
    results=results,
    categoric_method='None',
)
results

Unnamed: 0,categoric_method,train_score,test_score,difference
0,,0.798849,0.786854,0.011995


### CONVERTING TO CATEGORIC FEATURES 

In [19]:
# Training frame: swap the integer codes of the demographic columns for readable labels.
df_train_d['MARRIAGE'] = df_train_d['MARRIAGE'].replace(
    {1: 'married', 2: 'single', 3: 'others'}
)
df_train_d['EDUCATION'] = df_train_d['EDUCATION'].replace(
    {1: 'graduate school', 2: 'university', 3: 'high school', 4: 'others'}
)
df_train_d['SEX'] = df_train_d['SEX'].replace(
    {1: 'male', 2: 'female'}
)

In [20]:
# Training frame: treat the six PAY_* columns as categorical.
df_train_d[['PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']] = (
    df_train_d[['PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']]
    .astype('category')
)

In [21]:
# Training frame: the relabelled demographic columns become categorical as well.
df_train_d[['SEX', 'MARRIAGE', 'EDUCATION']] = (
    df_train_d[['SEX', 'MARRIAGE', 'EDUCATION']]
    .astype('category')
)

In [22]:
# Test frame: same relabelling of the demographic codes as applied to the training frame.
df_test_d['MARRIAGE'] = df_test_d['MARRIAGE'].replace(
    {1: 'married', 2: 'single', 3: 'others'}
)
df_test_d['EDUCATION'] = df_test_d['EDUCATION'].replace(
    {1: 'graduate school', 2: 'university', 3: 'high school', 4: 'others'}
)
df_test_d['SEX'] = df_test_d['SEX'].replace(
    {1: 'male', 2: 'female'}
)

In [23]:
# Test frame: treat the six PAY_* columns as categorical.
df_test_d[['PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']] = (
    df_test_d[['PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']]
    .astype('category')
)

In [24]:
# Test frame: the relabelled demographic columns become categorical as well.
df_test_d[['SEX', 'MARRIAGE', 'EDUCATION']] = (
    df_test_d[['SEX', 'MARRIAGE', 'EDUCATION']]
    .astype('category')
)

In [25]:
# Needed at deployment time: for every categorical column, the mean of the
# target per category, computed on the training frame.

categoric_columns = df_train_d.select_dtypes('category').columns
target_categoric_encoder = {
    col: df_train_d.groupby([col])['default.payment.next.month'].mean().to_dict()
    for col in categoric_columns
}

In [26]:
# Persist the target-mean lookup tables. The `with` block guarantees the file
# handle is flushed and closed (the bare open() call left it open).
with open("target_encoder_dict.pickle", "wb") as encoder_file:
    pickle.dump(target_categoric_encoder, encoder_file)

In [27]:
def _apply_category_mapping(column,mapping):
    """Replace each value of `column` by mapping[value]; values without an entry are kept as they are."""
    # Going through object dtype avoids Series.replace on a categorical column,
    # whose category-changing behaviour is deprecated in recent pandas releases.
    # The lookup is done in a single pass, so a category label that happens to
    # equal another category's encoded value is never replaced twice.
    return column.astype(object).map(lambda value: mapping.get(value, value))


def handle_categoric_column(method,df_train,df_test):
    """Encode the categorical columns of a train/test pair with the chosen strategy.

    All statistics (counts, target means) are learned on `df_train` only and
    then applied to both frames. The input frames are not modified.

    Parameters
    ----------
    method : str
        'count'                : each category is replaced by its frequency in `df_train`.
        'target_mean_encoding' : each category is replaced by the mean of
                                 'default.payment.next.month' for that category in `df_train`.
        'one_hot_encoding'     : SEX, MARRIAGE, EDUCATION and the six PAY_* columns are
                                 expanded into dummy columns (first level dropped).
        Any other value returns unmodified copies of the inputs.
    df_train, df_test : pandas.DataFrame
        Frames whose columns to encode have dtype 'category' in `df_train`.
        'target_mean_encoding' also needs the target column in `df_train`.

    Returns
    -------
    tuple of pandas.DataFrame
        (encoded train frame, encoded test frame). For 'one_hot_encoding' the
        train rows are re-indexed from 0 and the test rows continue after them.

    Notes
    -----
    For 'count' and 'target_mean_encoding', a category that appears only in
    `df_test` has no learned value and is left unchanged.
    """
    target = 'default.payment.next.month'
    pay_columns = ['PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']
    one_hot_columns = ['SEX', 'MARRIAGE', 'EDUCATION'] + pay_columns

    # Work on copies so the caller's frames are never altered.
    df_train = df_train.copy()
    df_test = df_test.copy()

    # Take the categorical columns from the frame that was passed in,
    # not from a notebook-level global.
    categoric_columns = df_train.select_dtypes('category').columns

    if method == 'count':
        for col in categoric_columns:
            replace_dict = df_train[col].value_counts().to_dict()

            df_train[col] = _apply_category_mapping(df_train[col], replace_dict)
            df_test[col] = _apply_category_mapping(df_test[col], replace_dict)

    elif method == 'target_mean_encoding':
        for col in categoric_columns:
            replace_dict = df_train.groupby(col, observed=False)[target].mean().to_dict()

            df_train[col] = _apply_category_mapping(df_train[col], replace_dict)
            df_test[col] = _apply_category_mapping(df_test[col], replace_dict)

    elif method == 'one_hot_encoding':
        # Stack both frames so they end up with the same dummy columns; the
        # temporary 'role' column remembers which row came from where.
        df = pd.concat(
            [df_train.assign(role='train'), df_test.assign(role='test')],
            ignore_index=True,
        )

        # After stacking, the PAY_* columns may no longer be categorical:
        # normalise to int first, then back to category.
        df = df.astype({col: int for col in pay_columns})
        df = df.astype({col: 'category' for col in pay_columns})

        categoric_dummies = pd.get_dummies(df[one_hot_columns], drop_first=True)
        df = pd.concat([categoric_dummies, df.drop(one_hot_columns, axis=1)], axis=1)

        # A non-inplace drop returns new frames, which avoids SettingWithCopyWarning.
        df_train = df[df['role'] == 'train'].drop(['role'], axis=1)
        df_test = df[df['role'] == 'test'].drop(['role'], axis=1)

    return df_train,df_test


### REPLACING THE CATEGORY WITH ITS COUNT

In [28]:
# Independent copies so count encoding leaves the original frames untouched.
df_train_count = df_train_d.copy(deep=True)
df_test_count = df_test_d.copy(deep=True)

In [29]:
# Replace every category by its frequency in the training frame.
df_train_count, df_test_count = handle_categoric_column(
    method='count',
    df_train=df_train_count,
    df_test=df_test_count,
)

In [30]:
# Score the random forest on the count-encoded frames.
results = fit_rf_model(
    df_train=df_train_count,
    df_test=df_test_count,
    results=results,
    categoric_method='Count',
)

In [31]:
# Show the random-forest scores collected so far.
results

Unnamed: 0,categoric_method,train_score,test_score,difference
0,,0.798849,0.786854,0.011995
1,Count,0.797779,0.782674,0.015104


### TARGET MEAN ENCODING

In [32]:
# Independent copies so target mean encoding leaves the original frames untouched.
df_train_TEM = df_train_d.copy(deep=True)
df_test_TEM = df_test_d.copy(deep=True)

In [33]:
# Replace every category by the mean target value it has in the training frame.
df_train_TEM, df_test_TEM = handle_categoric_column(
    method='target_mean_encoding',
    df_train=df_train_TEM,
    df_test=df_test_TEM,
)

In [34]:
# Score the random forest on the target-mean-encoded frames.
results = fit_rf_model(
    df_train=df_train_TEM,
    df_test=df_test_TEM,
    results=results,
    categoric_method='target_mean_encoding',
)

In [35]:
# Show the random-forest scores collected so far.
results

Unnamed: 0,categoric_method,train_score,test_score,difference
0,,0.798849,0.786854,0.011995
1,Count,0.797779,0.782674,0.015104
2,target_mean_encoding,0.797412,0.786448,0.010965


### ONE HOT ENCODING

In [36]:
# Independent copies so one-hot encoding leaves the original frames untouched.
df_train_OHE = df_train_d.copy(deep=True)
df_test_OHE = df_test_d.copy(deep=True)


In [37]:
# Expand the categorical columns into dummy columns.
df_train_OHE, df_test_OHE = handle_categoric_column(
    method='one_hot_encoding',
    df_train=df_train_OHE,
    df_test=df_test_OHE,
)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [38]:
# Score the random forest on the one-hot-encoded frames.
results = fit_rf_model(
    df_train=df_train_OHE,
    df_test=df_test_OHE,
    results=results,
    categoric_method='one_hot_encoding',
)

In [39]:
# Show the random-forest scores for all encodings tried.
results

Unnamed: 0,categoric_method,train_score,test_score,difference
0,,0.798849,0.786854,0.011995
1,Count,0.797779,0.782674,0.015104
2,target_mean_encoding,0.797412,0.786448,0.010965
3,one_hot_encoding,0.787904,0.783069,0.004835


### XG-BOOST RESULTS

In [40]:
# Empty scoreboard for the XGBoost runs: one row per categorical-encoding method.
results_xg = pd.DataFrame(
    columns=[
        'categoric_method',
        'train_score',
        'test_score',
        'difference',
    ]
)

In [41]:
import xgboost as xgb

In [42]:
def fit_xg_model(df_train,df_test,results,categoric_method):
    """Fit an XGBoost classifier on one encoding variant and record its ROC-AUC scores.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Train / test frames. Both must contain the target column
        'default.payment.next.month'; every other column is used as a feature.
    results : pandas.DataFrame
        Scoreboard collected so far. It is not modified.
    categoric_method : str
        Label stored in the 'categoric_method' column of the new row.

    Returns
    -------
    pandas.DataFrame
        A new frame made of `results` plus one row holding the train AUC, the
        test AUC and their difference (train - test).
    """
    target = 'default.payment.next.month'

    X_train = df_train.drop([target], axis=1)
    X_test = df_test.drop([target], axis=1)

    y_train = df_train[target]
    y_test = df_test[target]

    xg = xgb.XGBClassifier(max_depth=2, gamma=10, colsample_bytree=0.4)
    xg = xg.fit(X_train, y_train)

    # Probability of the positive class (second column of predict_proba).
    y_train_prob = xg.predict_proba(X_train)[:, 1]
    y_test_prob = xg.predict_proba(X_test)[:, 1]

    train_score = roc_auc_score(y_train, y_train_prob)
    test_score = roc_auc_score(y_test, y_test_prob)

    new_row = pd.DataFrame([{
        'categoric_method': categoric_method,
        'train_score': train_score,
        'test_score': test_score,
        'difference': train_score - test_score,
    }])

    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported replacement.
    if len(results) == 0:
        # Skip concatenating an empty frame (deprecated in recent pandas) while
        # keeping any columns the caller declared up front.
        all_columns = results.columns.union(new_row.columns, sort=False)
        return new_row.reindex(columns=all_columns)

    return pd.concat([results, new_row], ignore_index=True)

### XGBOOST: REPLACING THE CATEGORY WITH ITS COUNT

In [43]:
# Score XGBoost on the count-encoded frames.
results_xg = fit_xg_model(
    df_train=df_train_count,
    df_test=df_test_count,
    results=results_xg,
    categoric_method='Count',
)





In [44]:
# Show the XGBoost scores collected so far.
results_xg

Unnamed: 0,categoric_method,train_score,test_score,difference
0,Count,0.809358,0.780666,0.028692


In [45]:
# Score XGBoost on the target-mean-encoded frames.
results_xg = fit_xg_model(
    df_train=df_train_TEM,
    df_test=df_test_TEM,
    results=results_xg,
    categoric_method='target_mean_encoding',
)





In [46]:
# Show the XGBoost scores collected so far.
results_xg

Unnamed: 0,categoric_method,train_score,test_score,difference
0,Count,0.809358,0.780666,0.028692
1,target_mean_encoding,0.804496,0.784769,0.019727


In [47]:
# Score XGBoost on the one-hot-encoded frames.
results_xg = fit_xg_model(
    df_train=df_train_OHE,
    df_test=df_test_OHE,
    results=results_xg,
    categoric_method='one_hot_encoding',
)





In [48]:
# Show the XGBoost scores for all encodings tried.
results_xg

Unnamed: 0,categoric_method,train_score,test_score,difference
0,Count,0.809358,0.780666,0.028692
1,target_mean_encoding,0.804496,0.784769,0.019727
2,one_hot_encoding,0.805175,0.782574,0.022601


In [49]:
# Random-forest scores again, for side-by-side reading with the XGBoost table.
results

Unnamed: 0,categoric_method,train_score,test_score,difference
0,,0.798849,0.786854,0.011995
1,Count,0.797779,0.782674,0.015104
2,target_mean_encoding,0.797412,0.786448,0.010965
3,one_hot_encoding,0.787904,0.783069,0.004835


In [50]:
# Stack the XGBoost rows on top of the random-forest rows and renumber the index.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported replacement.
compare_results = pd.concat([results_xg, results], ignore_index=True)

In [51]:
# Smallest train/test gap first.
compare_results.sort_values('difference')

Unnamed: 0,categoric_method,train_score,test_score,difference
6,one_hot_encoding,0.787904,0.783069,0.004835
5,target_mean_encoding,0.797412,0.786448,0.010965
3,,0.798849,0.786854,0.011995
4,Count,0.797779,0.782674,0.015104
1,target_mean_encoding,0.804496,0.784769,0.019727
2,one_hot_encoding,0.805175,0.782574,0.022601
0,Count,0.809358,0.780666,0.028692


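- The smallest train/test gap (least overfitting) is given by one-hot encoding with the random forest.
- Target mean encoding is a close second, has a slightly higher test AUC, and keeps the dimensionality of the data unchanged, whereas one-hot encoding adds a column for every category.
- Hence we go with target mean encoding.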

### Train And Test Sets

In [52]:
# The target-mean-encoded frames are the unscaled train / test sets.
df_unscaled_train = df_train_TEM.copy(deep=True)
df_unscaled_test = df_test_TEM.copy(deep=True)

In [53]:
# Split the training frame into target and features.
y_train = df_unscaled_train['default.payment.next.month']
X_train = df_unscaled_train.drop(columns='default.payment.next.month')

In [54]:
# Split the test frame into target and features.
y_test = df_unscaled_test['default.payment.next.month']
X_test = df_unscaled_test.drop(columns='default.payment.next.month')

In [55]:
# Min-max scaler; it is fitted on the training features in the next cell.
scaler = MinMaxScaler()

In [56]:
# Fit the scaler on the training features and rebuild a frame with the original labels.
X_train = pd.DataFrame(
    scaler.fit_transform(X_train),
    index=X_train.index,
    columns=X_train.columns,
)

In [57]:
# Apply the already-fitted scaler to the test features (no refitting on test data).
X_test = pd.DataFrame(
    scaler.transform(X_test),
    index=X_test.index,
    columns=X_test.columns,
)

In [58]:
# Re-attach the target to the scaled training features.
df_train_scaled = pd.concat([X_train, y_train], axis='columns')

In [59]:
# Re-attach the target to the scaled test features.
df_test_scaled = pd.concat([X_test, y_test], axis='columns')

### SAVING THE PREPROCESSED DATA

In [None]:
# Write the unscaled training frame to disk (the index is written as the first column).
df_unscaled_train.to_csv(path_or_buf='train.csv')

In [None]:
# Write the unscaled test frame to disk (the index is written as the first column).
df_unscaled_test.to_csv(path_or_buf='test.csv')

In [None]:
# Write the min-max-scaled training frame to disk (the index is written as the first column).
df_train_scaled.to_csv(path_or_buf='train_scaled.csv')

In [None]:
# Write the min-max-scaled test frame to disk (the index is written as the first column).
df_test_scaled.to_csv(path_or_buf='test_scaled.csv')