In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.pyplot import xticks
%matplotlib inline
from sklearn.metrics import precision_score, recall_score
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import statsmodels.api as sm
from sklearn.metrics import precision_recall_curve
from sklearn.tree import DecisionTreeClassifier
from IPython.display import Image  
from six import StringIO  
# from sklearn.tree import export_graphviz
# import pydotplus, graphviz
import os
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import RobustScaler

# To ignore warnings
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import StratifiedKFold

pd.set_option('display.max_columns', None)

# Load demographic data

In [2]:
# Read the raw demographic CSV from the Kaggle input directory and preview
# the first rows.
demographic = pd.read_csv("/kaggle/input/demo-data-eda/Demographic data.csv")
demographic.head()

Unnamed: 0,Application ID,Age,Gender,Marital Status (at the time of application),No of dependents,Income,Education,Profession,Type of residence,No of months in current residence,No of months in current company,Performance Tag
0,954457215,48,F,Married,2.0,40.0,Bachelor,SAL,Rented,113,56,0.0
1,432830445,31,M,Married,4.0,55.0,Professional,SE_PROF,Rented,112,46,0.0
2,941387308,32,M,Single,2.0,46.0,Bachelor,SE_PROF,Rented,104,49,0.0
3,392161677,43,M,Married,1.0,53.0,Bachelor,SE,Rented,94,53,0.0
4,182011211,35,F,Married,5.0,44.0,Professional,SAL,Rented,112,43,0.0


The Age and Income variables contain negative values; we will treat them later.

In [3]:
def rename_column(data):
    """Normalise column names and index the frame by ``Application ID``.

    Renames ``"Profession "`` (trailing space) to ``"Profession"`` and
    ``"Marital Status (at the time of application)"`` to ``"Marital Status"``,
    then makes ``Application ID`` the index.

    The caller's frame is not modified; a new frame is returned. The function
    is idempotent: a frame that is already indexed by ``Application ID`` is
    returned with the same index, so re-running the calling cell is safe.

    Raises:
        KeyError: if ``Application ID`` is neither a column nor the index name.
    """
    data = data.rename(columns={
        "Profession ": "Profession",
        "Marital Status (at the time of application)": "Marital Status",
    })
    # Skip the re-index when it has already happened; set_index would raise
    # KeyError because the column no longer exists once it is the index.
    if data.index.name != 'Application ID':
        data = data.set_index('Application ID')
    return data

In [4]:
# Rebind `demographic` to the frame returned by rename_column (cleaned column
# names, indexed by 'Application ID') and preview the first rows.
demographic = rename_column(demographic)
demographic.head()

Unnamed: 0_level_0,Age,Gender,Marital Status,No of dependents,Income,Education,Profession,Type of residence,No of months in current residence,No of months in current company,Performance Tag
Application ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
954457215,48,F,Married,2.0,40.0,Bachelor,SAL,Rented,113,56,0.0
432830445,31,M,Married,4.0,55.0,Professional,SE_PROF,Rented,112,46,0.0
941387308,32,M,Single,2.0,46.0,Bachelor,SE_PROF,Rented,104,49,0.0
392161677,43,M,Married,1.0,53.0,Bachelor,SE,Rented,94,53,0.0
182011211,35,F,Married,5.0,44.0,Professional,SAL,Rented,112,43,0.0


In [5]:
# Keep only the applications whose 'Performance Tag' is known, then separate
# the label (y) from the remaining columns (X).
Demo_tag_clean = demographic.dropna(subset=['Performance Tag'])
X, y = (
    Demo_tag_clean.drop(columns=['Performance Tag']),
    Demo_tag_clean['Performance Tag'],
)

In [6]:
# Doubt 1
# What if a column has no missing values in the training data but a few missing values
# turn up in the test data? If we use Logistic Regression or other weak learners, the model
# will not work in production, because these algorithms have no built-in way of
# dealing with missing values.

# Doubt 2
#  What if the test data distribution is different than train data distribution. For example:-
#  in the train data age column is in range of 25 to 60 but in test data we found few ages 
#  which are either ineligible/corrupt. 

In [7]:
# Applications without a 'Performance Tag' form the unlabelled set.
demo_tag_na = demographic[pd.isna(demographic['Performance Tag'])]
# 'Application ID' lives only in the index at this point, so the index must be
# written out; with index=False every exported row would lose its identifier.
demo_tag_na.to_csv('Demographic_Unlabelled_Test_Set.csv', index=True)

In [8]:
# First split: 70 % of the labelled rows for training, 30 % held out.
X_train, testx, y_train, testy = train_test_split(X, y, test_size=0.3, random_state=42)

# Second split: the held-out 30 % is halved into a test set and a CV set.
X_test, X_cv, y_test, y_cv = train_test_split(testx, testy, test_size=0.5, random_state=42)

In [9]:
# Re-attach the label to each split. This adds 'Performance Tag' to the
# feature frames in place, so the column is present in X_train from here on.
X_train['Performance Tag'] = y_train
X_test['Performance Tag'] = y_test
X_cv['Performance Tag'] = y_cv

# 'Application ID' is the index of these frames; index=True keeps it in the
# exported files (index=False would drop the only row identifier).
X_train.to_csv('Demographic_Labelled_Train_Set.csv', index=True)
X_test.to_csv('Demographic_Labelled_Test_Set.csv', index=True)
X_cv.to_csv('Demographic_Labelled_CV_Set.csv', index=True)

In [10]:
# print(Demo_tag_clean.shape)
# print(X_train.shape)
# print(X_test.shape)
# print(X_cv.shape)

# (69870, 12)
# (48909, 11)
# (10480, 11)
# (10481, 11)

# Imputing Missing Values and Feature Engineering in the Demographic dataset

In [11]:
def impute_age(data, data2):
    """Replace ages below 18 with the median age of ``data``.

    Ages under 18 are blanked in both frames. ``data`` is then filled with its
    own median, and ``data2`` is filled with the median of the already-filled
    ``data``. Both frames are modified in place and returned.
    """
    def _blank_underage(frame):
        underage = frame.Age < 18
        frame.loc[underage, 'Age'] = None

    _blank_underage(data)
    reference_median = data['Age'].median()
    data['Age'] = data['Age'].fillna(reference_median)

    _blank_underage(data2)
    # Median is taken again, now from the filled reference column.
    data2['Age'] = data2['Age'].fillna(data['Age'].median())

    return data, data2

    
def impute_income(data,data2):
    """Replace non-positive incomes with the median income of ``data``.

    Incomes of zero or less are blanked in both frames. ``data`` is then
    filled with its own median, and ``data2`` is filled with the median of the
    already-filled ``data``. Both frames are modified in place and returned.
    """
    def _blank_non_positive(frame):
        invalid = frame.Income <= 0
        frame.loc[invalid, 'Income'] = None

    _blank_non_positive(data)
    reference_median = data['Income'].median()
    data['Income'] = data['Income'].fillna(reference_median)

    _blank_non_positive(data2)
    # Median is taken again, now from the filled reference column.
    data2['Income'] = data2['Income'].fillna(data['Income'].median())

    return data, data2

def impute_education(data,data2):
    """Fill missing ``Education`` in both frames from group statistics of ``data``.

    Rows of ``data`` without any missing value are grouped by Marital Status,
    Gender, Profession and PCI_Flag, and Education levels are counted per
    group. Each count is divided by that level's total across all groups
    (column-wise normalisation); the level with the largest resulting share is
    the fill value of the group, which is not necessarily the group's most
    frequent level. Rows whose key combination has no fill value stay missing.

    Both frames are modified in place and returned.
    """
    group_keys = ["Marital Status", "Gender", "Profession", "PCI_Flag"]
    level_counts = data.dropna().groupby(group_keys)["Education"].value_counts().unstack()
    level_shares = level_counts.divide(level_counts.sum(axis=0), axis=1)
    fill_by_group = level_shares.idxmax(axis=1)

    def _fill(frame, group, level):
        marital, gender, profession, pci = group
        target_rows = (
            frame.Education.isnull()
            & (frame['Marital Status'] == marital)
            & (frame.Gender == gender)
            & (frame.Profession == profession)
            & (frame.PCI_Flag == pci)
        )
        frame.loc[target_rows, "Education"] = level

    for group, level in zip(fill_by_group.index, fill_by_group.values):
        _fill(data, group, level)
        _fill(data2, group, level)

    return data, data2

def impute_dependents(dataset,data2):
    """Fill missing ``No of dependents`` in both frames from ``dataset`` statistics.

    Rows of ``dataset`` without any missing value are grouped by Marital
    Status and Gender, and dependent counts are tallied per group. Each tally
    is divided by that value's total across all groups (column-wise
    normalisation); the value with the largest resulting share is the fill
    value of the group. Rows whose key combination has no fill value stay
    missing.

    Both frames are modified in place and returned.
    """
    group_keys = ["Marital Status", "Gender"]
    value_counts = dataset.dropna().groupby(group_keys)["No of dependents"].value_counts().unstack()
    value_shares = value_counts.divide(value_counts.sum(axis=0), axis=1)
    fill_by_group = value_shares.idxmax(axis=1)

    def _fill(frame, marital, gender, value):
        target_rows = (
            frame["No of dependents"].isnull()
            & (frame['Marital Status'] == marital)
            & (frame.Gender == gender)
        )
        frame.loc[target_rows, "No of dependents"] = value

    for (marital, gender), value in zip(fill_by_group.index, fill_by_group.values):
        _fill(dataset, marital, gender, value)
        _fill(data2, marital, gender, value)

    return dataset, data2

def impute_profession(dataset,data2):
    """Fill missing ``Profession`` in both frames from ``dataset`` statistics.

    Rows of ``dataset`` without any missing value are grouped by Education and
    PCI_Flag, and professions are counted per group. Each count is divided by
    that profession's total across all groups (column-wise normalisation); the
    profession with the largest resulting share is the fill value of the
    group. Rows whose key combination has no fill value stay missing.

    Both frames are modified in place and returned.
    """
    group_keys = ["Education", "PCI_Flag"]
    prof_counts = dataset.dropna().groupby(group_keys)["Profession"].value_counts().unstack()
    prof_shares = prof_counts.divide(prof_counts.sum(axis=0), axis=1)
    fill_by_group = prof_shares.idxmax(axis=1)

    def _fill(frame, education, pci, profession):
        target_rows = (
            frame["Profession"].isnull()
            & (frame['Education'] == education)
            & (frame.PCI_Flag == pci)
        )
        frame.loc[target_rows, "Profession"] = profession

    for (education, pci), profession in zip(fill_by_group.index, fill_by_group.values):
        _fill(dataset, education, pci, profession)
        _fill(data2, education, pci, profession)

    return dataset, data2

def impute_residence(dataset, data2):
    """Fill missing ``Type of residence`` in both frames from ``dataset`` statistics.

    Rows of ``dataset`` without any missing value are grouped by PCI_Flag,
    'No of years in current residence' and Profession, and residence types are
    counted per group. Each count is divided by that type's total across all
    groups (column-wise normalisation); the type with the largest resulting
    share is the fill value of the group. Rows whose key combination has no
    fill value stay missing.

    Both frames are modified in place and returned.
    """
    group_keys = ["PCI_Flag", 'No of years in current residence', 'Profession']
    type_counts = dataset.dropna().groupby(group_keys)["Type of residence"].value_counts().unstack()
    type_shares = type_counts.divide(type_counts.sum(axis=0), axis=1)
    fill_by_group = type_shares.idxmax(axis=1)

    def _fill(frame, group, residence):
        pci, years, profession = group
        target_rows = (
            frame["Type of residence"].isnull()
            & (frame['PCI_Flag'] == pci)
            & (frame['No of years in current residence'] == years)
            & (frame.Profession == profession)
        )
        frame.loc[target_rows, "Type of residence"] = residence

    for group, residence in zip(fill_by_group.index, fill_by_group.values):
        _fill(dataset, group, residence)
        _fill(data2, group, residence)

    return dataset,data2

def impute_marital(dataset,data2):
    """Fill missing ``Marital Status`` in both frames from ``dataset`` statistics.

    Rows of ``dataset`` without any missing value are grouped by
    'No of dependents' and Gender, and marital statuses are counted per group.
    Each count is divided by that status's total across all groups
    (column-wise normalisation); the status with the largest resulting share
    is the fill value of the group. Rows whose key combination has no fill
    value stay missing.

    Both frames are modified in place and returned.
    """
    group_keys = ["No of dependents", "Gender"]
    status_counts = dataset.dropna().groupby(group_keys)["Marital Status"].value_counts().unstack()
    status_shares = status_counts.divide(status_counts.sum(axis=0), axis=1)
    fill_by_group = status_shares.idxmax(axis=1)

    def _fill(frame, dependents, gender, status):
        target_rows = (
            frame["Marital Status"].isnull()
            & (frame['No of dependents'] == dependents)
            & (frame.Gender == gender)
        )
        frame.loc[target_rows, "Marital Status"] = status

    for (dependents, gender), status in zip(fill_by_group.index, fill_by_group.values):
        _fill(dataset, dependents, gender, status)
        _fill(data2, dependents, gender, status)

    return dataset,data2


def impute_gender(data,data2):
    """Fill missing ``Gender`` in both frames with the most frequent value of ``data``.

    The mode is read from ``data`` immediately before each fill, so ``data2``
    is filled with the mode of the already-filled ``data``. Both frames are
    modified in place and returned.
    """
    for frame in (data, data2):
        most_common = data['Gender'].mode()[0]
        frame['Gender'] = frame['Gender'].fillna(most_common)
    return data, data2


In [12]:
def Feature_engineering(data,data2):
    """Add per-capita income, tenure-in-years and an income band to both frames.

    Columns written to each frame:
      * ``Per_Capita_Income`` - Income divided by (No of dependents + 1).
      * ``No of years in current residence`` / ``No of years in current company``
        - the month counts floor-divided by 12.
      * ``PCI_Flag`` - 'Low' below the median per-capita income of ``data``,
        'Medium' from that median up to and including the mean of ``data``,
        'High' otherwise.

    The median and mean thresholds always come from ``data``, also when
    flagging ``data2``. Both frames are modified in place and returned.
    """
    for frame in (data, data2):
        frame['Per_Capita_Income'] = frame['Income'] / (frame['No of dependents'] + 1)
        frame["No of years in current residence"] = frame["No of months in current residence"] // 12
        frame["No of years in current company"] = frame["No of months in current company"] // 12

    reference_pci = data['Per_Capita_Income']
    lower_cut = reference_pci.median()
    upper_cut = reference_pci.mean()

    def _pci_band(value):
        # A NaN value fails both comparisons and therefore ends up as 'High'.
        if value < lower_cut:
            return 'Low'
        if lower_cut <= value <= upper_cut:
            return 'Medium'
        return 'High'

    data['PCI_Flag'] = data['Per_Capita_Income'].apply(_pci_band)
    data2['PCI_Flag'] = data2['Per_Capita_Income'].apply(_pci_band)

    return data, data2

In [13]:
def pre_process(df,df2):
    """Run the imputation pipeline on a reference frame and a companion frame.

    Each step takes both frames and returns both. Feature_engineering runs
    first because later imputers group on the columns it creates (PCI_Flag,
    'No of years in current residence'), and runs again at the end so the
    derived columns are recomputed from the imputed values.
    """
    pipeline = (
        Feature_engineering,
        impute_age,
        impute_income,
        impute_education,
        impute_dependents,
        impute_profession,
        impute_residence,
        impute_marital,
        impute_gender,
        Feature_engineering,
    )
    for step in pipeline:
        df, df2 = step(df, df2)

    return df, df2

In [14]:
# Display the training frame; it includes 'Performance Tag' because the label
# was attached to X_train in an earlier cell.
X_train

Unnamed: 0_level_0,Age,Gender,Marital Status,No of dependents,Income,Education,Profession,Type of residence,No of months in current residence,No of months in current company,Performance Tag
Application ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
349360553,58,M,Married,5.0,4.5,Professional,SAL,Owned,58,3,0.0
214585044,65,F,Single,5.0,14.0,Professional,SAL,Rented,105,28,0.0
337084207,46,M,Married,2.0,30.0,Masters,SE,Rented,66,41,0.0
834400147,48,F,Married,2.0,20.0,Bachelor,SAL,Rented,6,55,0.0
823236964,28,F,Single,1.0,14.0,Professional,SE_PROF,Rented,6,54,0.0
...,...,...,...,...,...,...,...,...,...,...,...
994849870,40,M,Married,1.0,45.0,Masters,SAL,Rented,6,24,0.0
378628099,41,M,Married,3.0,49.0,Professional,SAL,Rented,36,28,0.0
705580754,53,M,Single,4.0,4.5,Professional,SE_PROF,Rented,32,75,0.0
134951209,16,M,Single,2.0,29.0,Masters,SAL,Living with Parents,67,54,0.0


In [15]:
# X_train,X_test = pre_process(X_train,X_test)
# X_train.isna().sum()

In [16]:
# X_test.isna().sum()

In [17]:
def rd(X_train1):
    """Summarise rows per full demographic profile.

    Rows are grouped by Gender, Marital Status, No of dependents, Education,
    Profession and Type of residence.

    Returns a 7-tuple of Series indexed by that profile, in this order:
    group size, mean and median of 'No of years in current company',
    median Age, median Income, mean Age, mean Income.
    """
    profile_keys = ["Gender", "Marital Status", 'No of dependents',
                    'Education', 'Profession', 'Type of residence']
    by_profile = X_train1.groupby(profile_keys)
    tenure = by_profile["No of years in current company"]
    age = by_profile["Age"]
    income = by_profile["Income"]

    return (by_profile.size(), tenure.mean(), tenure.median(),
            age.median(), income.median(), age.mean(), income.mean())

In [18]:
def d1(X_train1):
    """Summarise rows per ``Marital Status``.

    Returns a 7-tuple of Series indexed by marital status, in this order:
    group size, mean and median of 'No of years in current company',
    median Age, median Income, mean Age, mean Income.
    """
    by_marital = X_train1.groupby(["Marital Status"])
    tenure = by_marital["No of years in current company"]
    age = by_marital["Age"]
    income = by_marital["Income"]

    return (by_marital.size(), tenure.mean(), tenure.median(),
            age.median(), income.median(), age.mean(), income.mean())

In [19]:
def d2(X_train1):
    """Summarise rows per ``No of dependents``.

    Returns a 7-tuple of Series indexed by dependent count, in this order:
    group size, mean and median of 'No of years in current company',
    median Age, median Income, mean Age, mean Income.
    """
    by_dependents = X_train1.groupby(["No of dependents"])
    tenure = by_dependents["No of years in current company"]
    age = by_dependents["Age"]
    income = by_dependents["Income"]

    return (by_dependents.size(), tenure.mean(), tenure.median(),
            age.median(), income.median(), age.mean(), income.mean())

In [20]:
def d3(X_train1):
    """Summarise rows per ``Education`` level.

    Returns a 7-tuple of Series indexed by education level, in this order:
    group size, mean and median of 'No of years in current company',
    median Age, median Income, mean Age, mean Income.
    """
    by_education = X_train1.groupby(["Education"])
    tenure = by_education["No of years in current company"]
    age = by_education["Age"]
    income = by_education["Income"]

    return (by_education.size(), tenure.mean(), tenure.median(),
            age.median(), income.median(), age.mean(), income.mean())

In [21]:
def d4(X_train1):
    """Summarise rows per ``Profession``.

    Returns a 7-tuple of Series indexed by profession, in this order:
    group size, mean and median of 'No of years in current company',
    median Age, median Income, mean Age, mean Income.
    """
    by_profession = X_train1.groupby(["Profession"])
    tenure = by_profession["No of years in current company"]
    age = by_profession["Age"]
    income = by_profession["Income"]

    return (by_profession.size(), tenure.mean(), tenure.median(),
            age.median(), income.median(), age.mean(), income.mean())

In [22]:
def d5(X_train1):
    """Summarise rows per ``Type of residence``.

    Returns a 7-tuple of Series indexed by residence type, in this order:
    group size, mean and median of 'No of years in current company',
    median Age, median Income, mean Age, mean Income.
    """
    by_residence = X_train1.groupby(["Type of residence"])
    tenure = by_residence["No of years in current company"]
    age = by_residence["Age"]
    income = by_residence["Income"]

    return (by_residence.size(), tenure.mean(), tenure.median(),
            age.median(), income.median(), age.mean(), income.mean())

In [23]:
def d6(X_train1):
    """Summarise rows per ``Gender``.

    Returns a 7-tuple of Series indexed by gender, in this order:
    group size, mean and median of 'No of years in current company',
    median Age, median Income, mean Age, mean Income.
    """
    by_gender = X_train1.groupby(["Gender"])
    tenure = by_gender["No of years in current company"]
    age = by_gender["Age"]
    income = by_gender["Income"]

    return (by_gender.size(), tenure.mean(), tenure.median(),
            age.median(), income.median(), age.mean(), income.mean())

In [24]:
def _attach_group_stats(X_train1, X_test1, keys, names):
    """Write per-group statistics of ``X_train1`` onto both frames.

    ``keys`` lists the grouping columns. ``names`` gives the seven output
    column names, in this order: group size, mean and median of
    'No of years in current company', median Age, median Income, mean Age,
    mean Income.

    Statistics are computed from ``X_train1`` only and looked up for every row
    of both frames by a left join on ``keys``. Rows whose key combination is
    absent from the grouped ``X_train1`` (including rows with a missing key)
    receive NaN. Both frames are modified in place.
    """
    grouped = X_train1.groupby(keys)
    tenure = grouped["No of years in current company"]
    stats = pd.DataFrame({
        # Stored as float so the column has the same dtype whether or not
        # some rows end up unmatched (NaN).
        names[0]: grouped.size().astype(float),
        names[1]: tenure.mean(),
        names[2]: tenure.median(),
        names[3]: grouped["Age"].median(),
        names[4]: grouped["Income"].median(),
        names[5]: grouped["Age"].mean(),
        names[6]: grouped["Income"].mean(),
    })

    for frame in (X_train1, X_test1):
        # Join only the key columns so existing columns can never clash with
        # the statistic names; a left join keeps row order and row count.
        looked_up = frame[keys].join(stats, on=keys)
        for name in names:
            # Positional assignment: safe even if the frame index has duplicates.
            frame[name] = looked_up[name].to_numpy()


def feature_engineering(X_train1,X_test1):
    """Add group-level aggregate features, learned from ``X_train1``, to both frames.

    Three families of columns are added, in this order:

    1. Statistics per full demographic profile (Gender, Marital Status,
       No of dependents, Education, Profession, Type of residence):
       Customer_count, Company_mean_year, Company_median_year, Median_age,
       Median_income, Mean_age, Mean_income.
    2. PCI_mean_Income2 / PCI_median_Income2: the profile's mean / median
       income divided by (No of dependents + 1).
    3. The same seven statistics per single attribute, prefixed with
       Marital_, Dependent_, Edu_, Prof_, Residence_ and Gender_
       (for example Marital_Cust_Count, Edu_Mean_income).

    All statistics come from ``X_train1``; ``X_test1`` only receives looked-up
    values, and rows with an unseen key get NaN. Group values are matched by
    key through a join rather than by integer position in the grouped Series,
    so the result does not depend on pandas' deprecated positional fallback
    for integer keys.

    Both frames are modified in place and returned.
    """
    profile_keys = ["Gender", "Marital Status", 'No of dependents',
                    'Education', 'Profession', 'Type of residence']
    profile_names = ["Customer_count", "Company_mean_year", "Company_median_year",
                     "Median_age", "Median_income", "Mean_age", "Mean_income"]
    _attach_group_stats(X_train1, X_test1, profile_keys, profile_names)

    for frame in (X_train1, X_test1):
        household_size = frame['No of dependents'] + 1
        frame['PCI_mean_Income2'] = frame['Mean_income'] / household_size
        frame['PCI_median_Income2'] = frame['Median_income'] / household_size

    single_keys = (
        ("Marital Status", "Marital"),
        ("No of dependents", "Dependent"),
        ("Education", "Edu"),
        ("Profession", "Prof"),
        ("Type of residence", "Residence"),
        ("Gender", "Gender"),
    )
    suffixes = ("Cust_Count", "Company_mean_year", "Company_median_year",
                "Median_age", "Median_income", "Mean_age", "Mean_income")
    for key, prefix in single_keys:
        names = [prefix + "_" + suffix for suffix in suffixes]
        _attach_group_stats(X_train1, X_test1, [key], names)

    return X_train1, X_test1

In [25]:
# rd11,rd12,rd13,rd14,rd15,rd16,rd17 = d1(X_train)
# rd21,rd22,rd23,rd24,rd25,rd26,rd27 = d2(X_train)
# rd31,rd32,rd33,rd34,rd35,rd36,rd37 = d3(X_train)
# rd41,rd42,rd43,rd44,rd45,rd46,rd47 = d4(X_train)
# rd51,rd52,rd53,rd54,rd55,rd56,rd57 = d5(X_train)
# rd61,rd62,rd63,rd64,rd65,rd66,rd67 = d6(X_train)

In [26]:
def calc_smooth_mean(X_train1, X_test1,X_train1_1, groupby_col, target_col, m):
    """Smoothed (additive) target encoding of one categorical column.

    For every category ``c`` observed in ``X_train1_1`` the encoded value is

        (count_c * mean_c + m * global_mean) / (count_c + m)

    where ``count_c`` / ``mean_c`` are the number of rows and the mean of
    ``target_col`` inside the category, and ``global_mean`` is the mean of
    ``target_col`` over the whole of ``X_train1_1``.

    Parameters
    ----------
    X_train1, X_test1 : pandas.DataFrame
        Frames whose ``groupby_col`` values are replaced by the encoding.
    X_train1_1 : pandas.DataFrame
        Frame containing both ``groupby_col`` and ``target_col``; it is the
        only frame statistics are computed from.
    groupby_col : str
        Name of the categorical column to encode.
    target_col : str
        Name of the target column in ``X_train1_1``.
    m : number
        Smoothing weight: the larger it is, the more each category is pulled
        towards the global mean.

    Returns
    -------
    tuple of pandas.Series
        Encoded ``groupby_col`` for ``X_train1`` and for ``X_test1``, each
        indexed like the frame it was derived from.  Values that have no
        entry in the fitted table (categories absent from ``X_train1_1``,
        or missing values) are encoded as the global mean.
    """
    global_mean = X_train1_1[target_col].mean()
    stats = X_train1_1.groupby(groupby_col)[target_col].agg(['count', 'mean'])

    counts = stats['count']
    means = stats['mean']

    smooth = (counts * means + m * global_mean) / (counts + m)

    # A category with no fitted entry has count 0, for which the formula above
    # reduces to the global mean; .map() alone would leave NaN in its place.
    train_encoded = X_train1[groupby_col].map(smooth).fillna(global_mean)
    test_encoded = X_test1[groupby_col].map(smooth).fillna(global_mean)

    return train_encoded, test_encoded

In [27]:
def get_dummies(df,col):
    """Replace column ``col`` of ``df`` by its indicator (dummy) columns.

    The first category level is dropped (``drop_first=True``); the remaining
    indicator columns are appended to the right of ``df`` and the original
    column is removed.  A new frame is returned.
    """
    column_name = str(col)
    indicator_cols = pd.get_dummies(df[column_name], drop_first = True)
    widened = pd.concat([df, indicator_cols], axis = 1)
    return widened.drop(columns = [column_name])

In [28]:
def scaling(X_train, X_test):
    """Scale both frames with a RobustScaler fitted on ``X_train`` only.

    Returns the scaled train and test data as DataFrames.  Both carry the
    column labels of ``X_train``; each keeps the row index of the frame it
    was computed from.
    """
    robust = RobustScaler()
    robust.fit(X_train)

    def _to_frame(values, row_index):
        # Re-wrap the transformed array with the training column labels.
        return pd.DataFrame(values, columns=X_train.columns, index=row_index)

    X_train_scaled = _to_frame(robust.transform(X_train), X_train.index)
    X_test_scaled = _to_frame(robust.transform(X_test), X_test.index)

    return X_train_scaled, X_test_scaled

In [29]:
# Count the missing values in each column of X_train
X_train.isna().sum()

Age                                   0
Gender                                2
Marital Status                        4
No of dependents                      3
Income                                0
Education                            85
Profession                           12
Type of residence                     5
No of months in current residence     0
No of months in current company       0
Performance Tag                       0
dtype: int64

In [30]:
# Materialise 10 stratified folds of X_train / y_train.  Each fold is
# pre-processed, target-encoded, dummy-encoded and scaled on its own and then
# written to X_train{i}.csv / X_test{i}.csv together with its labels.
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)

# Smoothing weight passed to calc_smooth_mean and the columns it encodes
SMOOTHING_WEIGHT = 300
TARGET_ENCODED_COLS = ["Education", "Profession", "Type of residence", "PCI_Flag"]

i = 1
# Perform k-fold cross-validation 
for train_index, test_index in kfold.split(X_train, y_train):
    # Explicit copies: the per-fold steps below assign into these frames
    X_train1, X_test1 = X_train.iloc[train_index].copy(), X_train.iloc[test_index].copy()
    y_train1, y_test1 = y_train.iloc[train_index], y_train.iloc[test_index]

    # Impute Missing values in train
    X_train1,X_test1 = pre_process(X_train1,X_test1)

    # Feature Engineering
    X_train1, X_test1 = feature_engineering(X_train1, X_test1)

    # Smoothening : Function defined above
    # https://maxhalford.github.io/blog/target-encoding/
    # Encoding statistics are computed from a snapshot of the training part
    # with its labels attached, taken before any column is overwritten.
    X_train1_1 = X_train1.copy()
    X_train1_1['Performance Tag'] = y_train1

    for col in TARGET_ENCODED_COLS:
        X_train1[col], X_test1[col] = calc_smooth_mean(
            X_train1, X_test1, X_train1_1, col, "Performance Tag", SMOOTHING_WEIGHT
        )

    # Dummy variable
    X_train1 = get_dummies(X_train1, "Gender")
    X_train1 = get_dummies(X_train1, "Marital Status")

    X_test1 = get_dummies(X_test1, "Gender")
    X_test1 = get_dummies(X_test1, "Marital Status")

    # The dummies are derived separately for the two parts, so a category
    # missing from one of them would leave the frames with different columns,
    # while scaling() labels the held-out output with the training columns.
    # Align the held-out part to the training layout (absent dummy -> 0).
    X_test1 = X_test1.reindex(columns=X_train1.columns, fill_value=0)

    # Scaling
    X_train1, X_test1 = scaling(X_train1, X_test1)
    X_train1['Performance Tag'] = y_train1
    X_test1['Performance Tag'] = y_test1

    X_train1.to_csv(f'X_train{i}.csv',index = True)
    X_test1.to_csv(f'X_test{i}.csv',index = True)
    print(i,"th Iteration done")
    
    i+=1

1 th Iteration done
2 th Iteration done
3 th Iteration done
4 th Iteration done
5 th Iteration done
6 th Iteration done
7 th Iteration done
8 th Iteration done
9 th Iteration done
10 th Iteration done


In [31]:
# QC of train sets and test sets
# Read the first fold back through the same relative path the k-fold cell
# wrote it to, instead of a hard-coded absolute directory, and check the
# class balance of its labels.
X_train1 = pd.read_csv('X_train1.csv')
X_train1['Performance Tag'].value_counts()

Performance Tag
0.0    42188
1.0     1830
Name: count, dtype: int64

In [32]:
# Impute the test set and the cross-validation set on the basis of the train set