
## Libraries

In [104]:
import pandas as pd
import seaborn as sns
import missingno as msno
import matplotlib.pyplot as plt
plt.style.use('ggplot')
from sklearn import preprocessing
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import classification_report
from imblearn.over_sampling import RandomOverSampler
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.dummy import DummyClassifier

In [None]:
# Read the application records from the CSV in the working directory into
# the notebook-wide DataFrame `df`.
df = pd.read_csv(filepath_or_buffer='application_data.csv')

### Data

In [None]:
# Dump the frame's default text representation, followed by two blank lines.
print(df, end='\n\n\n')

# Report the (rows, columns) shape on a single line.
print(f'Dataset Dimension: {df.shape}')

## Columns

In [None]:
# List every column name of the dataset as a plain Python list.
column_names = df.columns.to_list()
print(column_names)

# Exploratory Data Analysis

### Missing Values

In [None]:
# Share of non-missing values per column, expressed in percent (0-100) so the
# bar heights agree with the y-axis label below. The previous version plotted
# 0-1 fractions under a "Percentage" label and wrapped them in a no-op .abs().
non_missing_pct = df.notnull().mean() * 100

fig, ax = plt.subplots(figsize=(16,14))
non_missing_pct.plot.bar(ax=ax, color='cadetblue', alpha=0.9)
plt.title('Missing Values', fontsize=16,fontfamily='serif', fontweight='bold')
plt.ylabel('Percentage of Non-missing Values', fontsize=20, fontfamily='serif')
plt.tight_layout()
plt.show()

In [None]:
# Number of null entries in each column, printed under a header and a
# blank-line separator.
missing_per_column = df.isnull().sum()
print('Missing Values Count: ', '\n', missing_per_column, sep='\n')

### Default Summary

In [None]:
# Class counts ordered by label; the same order is handed to countplot so the
# annotations below are guaranteed to sit on the matching bar.
target_counts = df['TARGET'].value_counts().sort_index()

plt.figure(figsize=(8,6))
sns.countplot(x='TARGET', data=df, order=target_counts.index,
              color="palevioletred", alpha=0.5)
plt.xlabel('Target', fontsize=16, fontfamily='serif', fontweight='bold')
# Two separate legend entries. The original list was missing a comma, so the
# two adjacent string literals were concatenated into a single label.
plt.legend(['Good Standing = 0',
            'Defaulted = 1'])
# Annotation text comes from the data rather than from hard-coded numbers, so
# it stays correct if the CSV changes.
text_1 = str(target_counts.get(0, 0))
text_2 = str(target_counts.get(1, 0))
plt.text(0.0,0.0, text_1, horizontalalignment='center', verticalalignment='bottom',fontsize=16, fontfamily='serif')
plt.text(0.98,-550.0, text_2, horizontalalignment='center', verticalalignment='bottom',fontsize=16, fontfamily='serif')
plt.title('Loan Defaulter', fontsize=16, fontfamily='serif')
plt.tight_layout()
plt.show()

print('Target Summary: ')
print(df['TARGET'].value_counts())

# Financial Products

### Cash Loans

In [None]:
# TARGET values grouped by contract type; `revolving` is computed here and
# consumed by the Revolving Loans cell.
loan = df['TARGET'].groupby(df['NAME_CONTRACT_TYPE'])

cash = loan.get_group('Cash loans').value_counts()
revolving = loan.get_group('Revolving loans').value_counts()

# Percentage of cash loans per TARGET value, computed from the data instead of
# being typed in by hand.
cash_pct = cash / cash.sum() * 100
text1 = f'{cash_pct.get(1, 0):.2f}% Defaulted'
text2 = f'{cash_pct.get(0, 0):.2f}% Current'

plt.figure(figsize=(8,6))
# One bar per TARGET value, each carrying its own legend label. The original
# drew the whole series twice just to obtain two legend entries, which
# overlaid identical bars; alpha=0.5 approximates the look of those two
# stacked alpha=0.3 layers.
for status, label in ((0, '0 = Good Standing'), (1, '1 = Defaulted')):
    plt.bar(str(status), cash.get(status, 0), width=0.5, label=label,
            color='palevioletred', alpha=0.5)
plt.xticks(rotation=0)
plt.text(.789, -5, text1, fontsize=13, fontfamily='serif', verticalalignment='bottom')
plt.text(-.21, 16.1, text2, fontsize=13, fontfamily='serif', verticalalignment='bottom')
plt.legend()
plt.title('Cash Loans', fontsize=16, fontfamily='serif')
plt.tight_layout()
plt.show()

print('Cash Loans: ')
print(cash)

### Revolving Loans

In [None]:
# Percentage of revolving loans per TARGET value, computed from the
# `revolving` counts instead of being typed in by hand.
revolving_pct = revolving / revolving.sum() * 100
text = f'{revolving_pct.get(1, 0):.2f}% Defaulted'
text_ = f'{revolving_pct.get(0, 0):.2f}% Current'

plt.figure(figsize=(8,6))
# One bar per TARGET value, each carrying its own legend label. The original
# drew the whole series twice just to obtain two legend entries, which
# overlaid identical bars; alpha=0.5 approximates the look of those two
# stacked alpha=0.3 layers.
for status, label in ((0, '0 = Good Standing'), (1, '1 = Defaulted')):
    plt.bar(str(status), revolving.get(status, 0), width=0.5, label=label,
            color='palevioletred', alpha=0.5)
plt.xticks(rotation=0)
plt.text(.752, -5, text, fontsize=15, fontfamily='serif', verticalalignment='bottom')
plt.text(-.241, -5, text_, fontsize=15, fontfamily='serif', verticalalignment='bottom')
plt.legend()
plt.title('Revolving Loans', fontsize=16, fontfamily='serif')
plt.tight_layout()
plt.show()

print('Revolving Loans: ')
print(revolving)

### Defaults vs Good Standing on Financial Products

In [None]:
# Count every (TARGET, NAME_CONTRACT_TYPE) combination and draw one bar per
# combination on the figure created here.
serif_16 = dict(fontsize=16, fontfamily='serif')

plt.figure(figsize=(10,8))
product_status_counts = df[['TARGET','NAME_CONTRACT_TYPE']].value_counts()
product_status_counts.plot.bar(color=['skyblue', 'black'], alpha=0.5)
plt.xticks(rotation=1)
plt.title('Financial Products on Defaults and Good Standing', **serif_16)
plt.xlabel('Financial Products', **serif_16)
plt.tight_layout()
plt.show()

# Consumers

### Total Consumers by Gender

In [None]:
# Drop the 'XNA' gender category; rows carrying it become NaN in CODE_GENDER.
# The membership check keeps the cell re-runnable: remove_categories raises
# ValueError when asked to remove a category that is not present.
df['CODE_GENDER'] = df['CODE_GENDER'].astype('category')
if 'XNA' in df['CODE_GENDER'].cat.categories:
    df['CODE_GENDER'] = df['CODE_GENDER'].cat.remove_categories('XNA')

# Counts and percentage shares per gender, derived from the data so the bar
# annotations cannot drift from what is plotted.
gender_counts = df['CODE_GENDER'].value_counts()
gender_pct = gender_counts / gender_counts.sum() * 100

plt.figure(figsize=(8,6))
gender_counts.plot(kind='bar', color=['thistle', 'lightskyblue'], alpha=0.70)
# Annotate each bar at its own x position, in the same order the bars are drawn.
for position, share in enumerate(gender_pct):
    plt.text(position - 0.08, 0.0, f'{share:.2f}%', fontsize=14)
plt.title('Consumers by Gender', fontsize=16, fontfamily='serif')
plt.xticks(rotation=0)
plt.tight_layout()
plt.show()

print('Gender Summary: ')
print(df['CODE_GENDER'].value_counts())

### Default Summary by Gender
#### [ 0 = Good Standing, 1 = Defaulted ]

In [None]:
# Split the TARGET column by gender and tabulate how often each TARGET value
# occurs for women and for men.
gender = df['TARGET'].groupby(df['CODE_GENDER'])

female_code, male_code = (gender.get_group(code) for code in ('F', 'M'))

# Columns are added one at a time, so the first column fixes the row index.
default_gender = pd.DataFrame()
for column_name, targets in (('female', female_code), ('male', male_code)):
    default_gender[column_name] = targets.value_counts()

In [None]:
# Create the axes explicitly and hand them to DataFrame.plot. Without `ax`,
# DataFrame.plot opens a figure of its own, which left the figure created by
# plt.figure(figsize=(12,10)) empty and ignored the requested size.
fig, ax = plt.subplots(figsize=(12,10))
default_gender.plot(kind='bar', ax=ax, color=['thistle', 'lightskyblue'], alpha=0.7)
plt.title('Default Summary by Gender', fontsize=16, fontfamily='serif')
plt.xticks(rotation=0)
plt.tight_layout()
plt.show()

print(default_gender)

# Correlation for Defaulting

In [None]:
# Candidate columns for the correlation matrix against TARGET.
df_mx = df[['TARGET', 'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'HOUR_APPR_PROCESS_START', 'REGION_RATING_CLIENT_W_CITY','CNT_FAM_MEMBERS', 'DAYS_EMPLOYED', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_INCOME_TYPE', 'NAME_HOUSING_TYPE']]

# Correlate numeric columns only. Any NAME_* column that still holds strings
# at this point would make DataFrame.corr() raise on pandas >= 2.0 (older
# versions silently dropped such columns); select_dtypes makes the exclusion
# explicit and works on every pandas version.
corr = df_mx.select_dtypes(include='number').corr()

cmap = sns.diverging_palette(230, 20, as_cmap=True)

plt.figure(figsize=(12,8))
sns.heatmap(corr, annot=True, alpha=0.8, cmap=cmap, vmin=-1, vmax=1)
plt.title('Correlation for Defaulting', fontsize=16, fontfamily='serif')
plt.tight_layout()
plt.show()

# Creating Classification Model

In [None]:
# Replace each categorical text column with integer codes, in place.
# NOTE: despite its name, `one_hot` is a LabelEncoder (one integer per
# category), and it is re-fitted for every column, so after the loop it only
# remembers the mapping of the last column.
one_hot = preprocessing.LabelEncoder()
for categorical_column in ('NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS',
                           'NAME_INCOME_TYPE', 'NAME_HOUSING_TYPE'):
    df[categorical_column] = one_hot.fit_transform(df[categorical_column])

# Feature matrix and target vector for the classifiers below.
feature_columns = [
    'AMT_INCOME_TOTAL',
    'AMT_CREDIT',
    'HOUR_APPR_PROCESS_START',
    'REGION_RATING_CLIENT_W_CITY',
    'DAYS_EMPLOYED',
    'NAME_EDUCATION_TYPE',
    'NAME_FAMILY_STATUS',
    'NAME_HOUSING_TYPE',
]
X = df[feature_columns]
y = df['TARGET']

In [None]:
# Hold out the test set BEFORE any resampling or scaling. Oversampling first
# puts copies of the same minority-class rows into both train and test, and
# fitting the scaler on all rows leaks test-set statistics into training;
# both inflate the reported scores.
# The test set therefore keeps the original class balance of `y`.
x_train_raw, x_test_raw, y_train_raw, y_test = train_test_split(
    X, y, test_size=0.20, random_state=0, shuffle=True)

# Balance the classes on the training portion only.
ros = RandomOverSampler(random_state=1)
x_ros, y_ros = ros.fit_resample(x_train_raw, y_train_raw)

# Fit the scaler on the training data, then apply that same transform to the
# training and the held-out features.
scaler = StandardScaler()
scaler.fit(x_ros)
scaled_x = scaler.transform(x_ros)

x_train, y_train = scaled_x, y_ros
x_test = scaler.transform(x_test_raw)

### Logistic Regression Model

In [None]:
# Logistic regression with an L2 penalty on the liblinear solver; fit on the
# training split and evaluate on the held-out split.
logistic_settings = dict(
    C=1,
    dual=False,
    fit_intercept=True,
    intercept_scaling=1,
    max_iter=100,
    n_jobs=1,
    penalty='l2',
    random_state=42,
    solver='liblinear',
)
model = LogisticRegression(**logistic_settings)
model.fit(x_train, y_train)

model_predict = model.predict(x_test)

# Confusion matrix, per-class report, then accuracy as a percentage.
print(confusion_matrix(y_test, model_predict))
print(classification_report(y_test, model_predict))
logistic_accuracy = model.score(x_test, y_test) * 100
print('Logistic Regression Accuracy: ', logistic_accuracy, '%')

### Gaussian Naive Bayes Model

In [None]:
# Gaussian Naive Bayes with explicit priors=None and var_smoothing=1e-09;
# fit on the training split and evaluate on the held-out split.
nb = GaussianNB(priors=None, var_smoothing=1e-09)
nb.fit(x_train, y_train)

model_predict2 = nb.predict(x_test)

# Confusion matrix, per-class report, then accuracy as a percentage.
print(confusion_matrix(y_test, model_predict2))
print(classification_report(y_test, model_predict2))
naive_bayes_accuracy = nb.score(x_test, y_test) * 100
print('Naive Bayes Accuracy: ', naive_bayes_accuracy, '%')

### Dummy Model

In [None]:
# Baseline for comparison: a DummyClassifier with the 'stratified' strategy
# and a fixed random_state, evaluated on the same held-out split as the
# models above.
dm = DummyClassifier(strategy='stratified', random_state=2)
dm.fit(x_train, y_train)
model_predict3 = dm.predict(x_test)
print(confusion_matrix(y_test, model_predict3))
print(classification_report(y_test, model_predict3))
# Print the '%' unit like the Logistic Regression and Naive Bayes cells do, so
# the three accuracy lines read consistently.
print('DummyClassifier Accuracy: ', dm.score(x_test, y_test)*100, '%')