## Home Credit Default Risk - Team 3 (Kahsai, Nichols, Pellerito)

### Import packages

In [None]:
# standard Python tools
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as ss   # need this for chi-squared function

# special tools for working in Kaggle
import joblib   # save and load ML models
import gc       # garbage collection
import os 

# preprocessing steps
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

# machine learning models and tools
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# surely there will be a lot more packages loaded by the time we are done!

# First look at training data set

### Read the training data

In [None]:
# Directory that holds the competition CSV files.
MainDir = "../input/../input/home-credit-default-risk"
print(os.listdir(MainDir))


def _read_table(file_name):
    """Read one CSV file located directly under MainDir into a DataFrame."""
    return pd.read_csv(f'{MainDir}/{file_name}')


# Main table
train = _read_table('application_train.csv')

# Supplemental data - we can create additional feature sets by analyzing these.
# Note that `bureau` is loaded from bureau_balance.csv.
bureau = _read_table('bureau_balance.csv')
cc = _read_table('credit_card_balance.csv')
# and so on - not going to worry about these just yet

In [None]:
# bureau - separate data set with history records. Count how often each STATUS value
# occurs per SK_ID_BUREAU, giving one row per id and one column per status.
# The plan is to left-join this table into the training data.
# NOTE(review): this table is keyed by SK_ID_BUREAU, while the credit card table is
# grouped by SK_ID_CURR - confirm which key links it to `train` before joining.

#bureau.shape           # 27299925, 3
#bureau.nunique()       # 817,395 unique SK_ID_BUREAU values, 8 unique statuses
bureau_table = pd.crosstab(index=bureau['SK_ID_BUREAU'], columns=bureau['STATUS'])
bureau_table.head()

In [None]:
# cc - credit card data. Let's get average balance by customer
#cc.shape           # 3840312, 23
#cc.nunique()
# Select AMT_BALANCE *before* aggregating: averaging every column and then keeping one
# wastes work on the other columns, and recent pandas versions raise when a grouped
# mean is requested for a non-numeric column.
# Result: one row per SK_ID_CURR with a single AMT_BALANCE column (the mean balance).
cc_table = cc.groupby('SK_ID_CURR')['AMT_BALANCE'].mean().to_frame()
cc_table.head()


### View the training data set

In [None]:
# over 300,000 records in the training set, and 121 features (plus whatever other features we end up importing
# from the supplemental tables.) Target variable is "TARGET."
rows_and_cols = train.shape
print('Shape of training data:', rows_and_cols)
pd.options.display.max_columns = None   # makes it scrollable horizontally instead of suppressing columns
train.head()                            # first five rows

### Proportion Table for target variable

In [None]:
# Proportion - about 91.9% are zero (not default) and 8.1% are one (default.)
# Project is being scored as AUC (area under curve) i.e. confidence matters. The best scores of all time are around 80-81%
# so you can't score high by making a model that naively guesses that everything is a no-default.
target_counts = train['TARGET'].value_counts()
# Divide each class count by the total number of rows to get its share.
target_counts.div(len(train)).to_frame()

### How many categorical variables do we have, and how many levels in each?

In [None]:
# Count how many columns there are of each dtype.
# There are 16 categorical variables in our model.
# We should also take a look at the 41 integer variables - some could be counting statistics (e.g. family size) but others could be integer-encoded categorical variables
column_dtypes = train.dtypes
column_dtypes.value_counts()

In [None]:
# Number of distinct levels in each object-typed (categorical) column.
# These are the sixteen categorical variables: there would be 140 dummy variables in our model if we one-hot encoded all of these.
categorical_cols = train.select_dtypes(include='object')
categorical_cols.nunique().to_frame()

### Missing Data

In [None]:
# Count missing values per column and show the 50 columns with the most.
# Lots of variables have missing values. We need to come up with a strategy for imputing missing values.
missing_counts = train.isna().sum().to_frame()
missing_counts.sort_values(by=0, ascending=False).head(50)

### Some Visualizations

In [None]:
# Work on a 1000-row random sample of the training data so that the graphics below
# can render in a reasonable amount of time. The fixed seed keeps the sample reproducible.
SAMPLE_ROWS = 1000
SAMPLE_SEED = 1
train_1K = train.sample(n=SAMPLE_ROWS, random_state=SAMPLE_SEED)

In [None]:
# Income by occupation type: one horizontal box per occupation, drawn from the 1K sample.
fig, ax = plt.subplots(figsize=(15, 6))
ax = sns.boxplot(data=train_1K, x="AMT_INCOME_TOTAL", y="OCCUPATION_TYPE", orient="h")
ax.set_xlim(0, 1e6)   # cap the income axis at 1,000,000
plt.show()

In [None]:
# I like two-sided violin plots for categorical classification problems. They can help you see whether different groups
# have different sensitivities.
# Each weekday gets one violin, split by TARGET.

fig, ax = plt.subplots(figsize=(12, 6))
ax = sns.violinplot(
    data=train_1K,
    x="WEEKDAY_APPR_PROCESS_START",
    y="AMT_CREDIT",
    hue="TARGET",
    split=True,
)
plt.show()

In [None]:
# Pairplot - I just picked a few continuous variables to show. Blue = 0 (not default), orange = 1 (default)
# Columns are picked by position: index 1 must be TARGET (it is used as the hue below).
NumRows = train_1K.iloc[:, [1, 7, 8, 9]]
# sns.pairplot builds and sizes its own figure, so no plt.figure() call is made here:
# a figure created beforehand would stay empty and show up as a blank extra output.
# Use pairplot's `height` argument if the panels need resizing.
ax = sns.pairplot(NumRows, hue="TARGET", plot_kws={'s': 20})
plt.tight_layout()
plt.show()

### Correlations

In [None]:
# Pearson correlation is only defined for numeric columns. `train` also holds object
# (categorical) columns, and recent pandas versions raise on those instead of silently
# skipping them, so restrict the frame to numeric columns explicitly.
correl = train.select_dtypes(include='number').corr()
fig, ax = plt.subplots(figsize=(24, 24))
sns.heatmap(correl, annot = False, cmap = "BuPu", label = 'small', cbar = False, ax = ax)
ax.set_title('Correlation Matrix');
plt.show()

# Big ol' correlation matrix shows that there are some highly correlated variables. This data set could be
# a candidate for feature reduction using Principal Component Analysis.

### Categorical association - Cramer's V

In [None]:
# https://towardsdatascience.com/the-search-for-categorical-correlation-a1cf7f1888c9

# Cramer's V (sometimes written φc) is similar to Pearson's r (correlation coefficient) but it works with categorical data.
# While r has a range from -1 to +1, V has a range of 0 to +1. We are going to use this to build a
# heatmap that will help us evaluate whether the categorical variables are independent.

def cramers_v(x, y):
    """Return the bias-corrected Cramer's V association between two categorical variables.

    Parameters
    ----------
    x, y : array-like (e.g. pandas Series)
        Category labels of equal length. They are cross-tabulated with
        ``pd.crosstab`` to build the contingency table.

    Returns
    -------
    numpy.float64
        0 means no measured association; values near 1 mean strong association.
        0.0 is returned when the statistic is undefined: nothing to tabulate,
        fewer than two observations, a variable with a single level, or a
        correction term that collapses to zero. The original formula produced
        NaN or raised in those cases.
    """
    contingency = pd.crosstab(x, y)
    if contingency.size == 0:
        return np.float64(0.0)

    n = contingency.sum().sum()
    r, k = contingency.shape
    # A single level on either side (or a single observation) carries no association.
    if n <= 1 or r < 2 or k < 2:
        return np.float64(0.0)

    chi2 = ss.chi2_contingency(contingency)[0]
    phi2 = chi2 / n
    # Bias correction of phi^2 and of the table dimensions.
    phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
    rcorr = r - ((r - 1) ** 2) / (n - 1)
    kcorr = k - ((k - 1) ** 2) / (n - 1)

    denominator = min(kcorr - 1, rcorr - 1)
    # The denominator reaches zero when a variable has as many levels as observations.
    if denominator <= 0:
        return np.float64(0.0)
    # Keep a numpy scalar so callers can still use numpy scalar methods on the result.
    return np.float64(np.sqrt(phi2corr / denominator))

In [None]:
# I don't see a big opportunity for feature reduction here, but it was worth taking a look. And the heatmap looks cool.

CatFeatures = [
    'NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'NAME_TYPE_SUITE',
    'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'WEEKDAY_APPR_PROCESS_START',
    'OCCUPATION_TYPE', 'ORGANIZATION_TYPE', 'FONDKAPREMONT_MODE', 'HOUSETYPE_MODE', 'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE',
]
LL = len(CatFeatures)

# Evaluate Cramer's V for every ordered pair of features (row feature first, then
# column feature), rounded to 4 decimals, and fold the flat list into an LL x LL grid.
cramers_outputs = [
    round(cramers_v(train[row_feature], train[col_feature]), 4).tolist()
    for row_feature in CatFeatures
    for col_feature in CatFeatures
]
reshaped = np.array(cramers_outputs).reshape(LL, LL)

fig = plt.figure(figsize=(8, 8))  # instantiate figure for heat map
ax = sns.heatmap(reshaped, annot=True, cmap="BuPu", fmt=".0%", cbar=False)
# Label both axes with the feature names, in the same order as the grid.
ax.set_xticklabels(CatFeatures)
ax.set_yticklabels(CatFeatures)
ax.tick_params(axis='x', labelrotation=90)
ax.tick_params(axis='y', labelrotation=0)
ax.set_title("Heatmap of Cramer's V on categorical variables");

### Some barplots

In [None]:
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

def DisplayBreakdown(vars) :
    """Show, for each variable, a stacked horizontal bar chart of its category mix per TARGET class.

    Parameters
    ----------
    vars : sequence of str
        Column names of ``train_1K`` to break down. One chart is drawn per name.
        (The parameter name is kept for existing callers even though it shadows
        the ``vars`` builtin.)

    Each chart has one bar per TARGET value; the bar segments are the shares of
    the variable's categories within that TARGET value (each bar sums to 100%).
    """
    for var in vars :
        # Row-normalised crosstab: shares of each category within a TARGET class.
        breakdown = pd.crosstab(train_1K['TARGET'], train_1K[var], normalize='index')
        # Plot exactly the categories the crosstab produced. Taking them from
        # Series.unique() instead would include NaN, which crosstab drops, and
        # px.bar would then be asked for a column that does not exist.
        categories = list(breakdown.columns)

        fig = px.bar(breakdown.reset_index(), y='TARGET', x=categories, orientation='h')
        fig.update_layout(height=275, width=800, xaxis_tickformat = '.0%', title_text=var, legend_title='', xaxis_title='', yaxis_type='category', legend=dict(orientation='h'))
        fig.show()

DisplayBreakdown(['NAME_EDUCATION_TYPE', 'NAME_HOUSING_TYPE', 'NAME_FAMILY_STATUS', 'WEEKDAY_APPR_PROCESS_START'])

### Principal Component Analysis

In [None]:
# from sklearn.preprocessing import StandardScaler    <- already loaded
from sklearn.decomposition import PCA                 

# Take columns 41-84 (by position) of the 1K sample and treat missing values as 0.
# fillna returns a new frame, so nothing is written in place into a slice of train_1K
# (the in-place replace on a sliced frame is what triggers pandas' chained-assignment warning).
PCAFeatures = train_1K.iloc[:, 41:85].fillna(0)
# Standardise each column before PCA.
PCAFeaturesScaled = preprocessing.scale(PCAFeatures)

# display(round(pd.DataFrame(PCAFeaturesScaled).describe(),4))  # check to see if these columns have mean 0 and std 1: looks good

pca = PCA()
pca.fit(PCAFeaturesScaled)
pca_data = pca.transform(PCAFeaturesScaled)

# Plot component importance - maybe keep the top 10 or so?
fig = plt.figure(figsize = (22, 3))
pcvar = np.round(pca.explained_variance_ratio_ * 100,1)    # explained variance per component, in percent
labels = ['PC'+str(x) for x in range(1,len(pcvar)+1)]      # PC1, PC2, ...
plt.bar(x=range(1,len(pcvar)+1), height = pcvar, tick_label = labels)
plt.ylabel('%age of explained variance')
plt.xlabel('Principal Component')
plt.title('PCA Components')
plt.show()

In [None]:
# The principal components should be uncorrelated with each other; this heatmap
# of their pairwise correlations is a visual check of that.
pca_df = pd.DataFrame(data=pca_data, columns=labels)
component_corr = pca_df.corr()
fig = plt.figure(figsize=(26, 10))
sns.heatmap(component_corr, cmap='BuPu', annot=True, fmt=".0%")
plt.title('Correlation Heatmap for principal components')
plt.show()

In [None]:
pca_df = pd.DataFrame(pca_data, columns = labels)
# train_1K keeps the row labels it had in `train`, while pca_df is indexed 0..n-1.
# Assigning the Series directly would align on those labels and leave TARGET as NaN
# wherever they do not match, so attach the values by position instead.
pca_df['TARGET'] = train_1K['TARGET'].to_numpy()

pca_df.head(5)

# Scatterplot of PC1 and PC2 components.
#Yes = pca_df.loc[(pca_df.TARGET == 1),:]
#No = pca_df.loc[(pca_df.TARGET == 0),:]
#plt.scatter('PC1','PC2', data = No, s=4, label = '0')
#plt.scatter('PC1','PC2', data = Yes, s=4, label = '1')
#plt.xlabel('PC1')
#plt.ylabel('PC2')
#plt.title('Scatter Plot - LeftUnion by PC1 and PC2')
#plt.legend()

#plt.show()


### Build Model Scoreboard

In [None]:
# set up table: one row per model run
results = pd.DataFrame(columns = ['Model Type','Accuracy','Hyperparameters'])

# each time you run a model, run this code
# DataFrame.append no longer exists in current pandas, so the new row is added by
# assigning to the next integer label. Values are listed in column order.
results.loc[len(results)] = [
    'Logistic Regression',                        # Model Type: logistic regression, random forest, etc
    0.7243,                                       # Accuracy: variable that contains best model run
    "{'max_depth': 9, 'min_samples_leaf': 1}",    # Hyperparameters: variable that contains hyperparameters from best model run
]
results

# Appendix - data descriptions

In [None]:
# The column-description table is opened as ISO-8859-1 rather than UTF-8: per the
# original note, it contains characters that cannot be handled as UTF-8.

desc_path = f'{MainDir}/HomeCredit_columns_description.csv'
with open(desc_path, mode='r', encoding='ISO-8859-1') as desc_file:
    desc = pd.read_csv(desc_file)

pd.options.display.max_rows = None              # print entire thing, not just first and last rows
pd.set_option("display.max_colwidth", 100)      # description column
desc