The "Credit Score" problem solved with a Random Forest classifier

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import confusion_matrix

%matplotlib inline

Importing the dataset

In [None]:
# Relative path to the training CSV file.
arquivo = '../../../Dados/cs-training.csv'
# index_col=0 makes the first CSV column the row index of the DataFrame.
df = pd.read_csv(arquivo, index_col=0)

Checking the dataset

In [None]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

In [None]:
# Preview the first five rows.
df.head()

In [None]:
# Column dtypes, non-null counts and memory usage.
df.info()

Univariate analysis

In [None]:
# Relative frequency of each target class, drawn as a bar chart.
proporcao_alvo = df['SeriousDlqin2yrs'].value_counts(normalize=True)
proporcao_alvo.plot(kind='bar')

In [None]:
# Absolute and relative frequency of each target class in a single table.
alvo = df['SeriousDlqin2yrs']
contagens = alvo.value_counts()
proporcoes = alvo.value_counts(normalize=True)

tabela_alvo = pd.DataFrame({'Contagens': contagens, 'pct': proporcoes},
                           index=contagens.index)

# Display formatting only: 'pct' uses the '{:.1%}' format string, while
# decimal=',' and thousands='.' choose the number separators.
tabela_alvo.style.format(formatter={'pct': '{:.1%}'},
                         precision=2,
                         decimal=',',
                         thousands='.')

In [None]:
# Distribution of the 'age' column: step histogram with a KDE curve overlaid.
variavel = 'age'
opcoes_hist = dict(bins=110, alpha=.2, kde=True, element='step')
sns.displot(data=df, x=variavel, **opcoes_hist)
plt.show()

In [None]:
# Histogram of the revolving-utilization column, drawn twice: once over all
# rows and once restricted to the rows whose value is below 2.
variavel = 'RevolvingUtilizationOfUnsecuredLines'
abaixo_de_2 = df[df[variavel] < 2]
for amostra in (df, abaixo_de_2):
    sns.displot(amostra, x=variavel, bins=50, aspect=3, height=3)
plt.show()

In [None]:
# Preview the first five rows again before cleaning.
df.head(5)

Cleaning the Dataset

In [None]:
# Count the rows that are exact duplicates of an earlier row
# (all columns are compared together, not column by column).
df.duplicated().sum()

In [None]:
# Remove the duplicated rows, keeping the first occurrence of each.
# inplace=True mutates df directly, so every later cell sees the
# de-duplicated frame.
df.drop_duplicates(inplace=True)

In [None]:
# Number of missing values in each column.
df.isna().sum()

Handling missing values for the Random Forest algorithm

In [None]:
# Fill the missing values of these two columns with negative sentinel values.
sentinelas = {'MonthlyIncome': -10000, 'NumberOfDependents': -1}
for coluna, valor in sentinelas.items():
    df[coluna] = df[coluna].fillna(valor)

In [None]:
# Re-check the missing values per column, shown as a one-column DataFrame.
df.isna().sum().to_frame()

Bivariate Analysis

In [None]:
# Bad rate of the response variable per quantile bin of revolving utilization.
var_resp = 'SeriousDlqin2yrs'
var = 'RevolvingUtilizationOfUnsecuredLines'
num_cat = 4

# Cut the explanatory variable into num_cat quantile bins; duplicated bin
# edges are dropped, so fewer bins than requested may come back.
cat_srs, bins = pd.qcut(x=df[var], q=num_cat, retbins=True, duplicates='drop')

# One point per bin for the response variable.
ax = sns.pointplot(data=df, x=cat_srs, y=var_resp)

# NOTE(review): the tick labels are hard-coded for four bins — confirm they
# still match the bin edges returned in `bins`.
labels = ['0', '0 a 15%', '15% a 55%', '55% ou mais']
ax.set_title('Bivariada - bad rate e IC de 95% de confiança')
ax.set_xlabel('Índice de utilização do rotativo')
ax.set_xticklabels(labels, rotation=30, fontsize='large')
ax.set_ylabel('Bad Rate em 2 anos')



Starting the Random Forest Classifier Algorithm

In [None]:
# Separate the feature matrix X from the target vector y.
# drop(columns=...) already selects the axis, so no axis argument is needed.
X = df.drop(columns=['SeriousDlqin2yrs']).copy()
y = df['SeriousDlqin2yrs'].copy()

# Hold out a test partition (default split proportions).
# random_state makes the split reproducible on Restart & Run All;
# stratify=y keeps the class proportions of the target the same in the
# train and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, stratify=y)

In [None]:
# Preview the first five rows of the training features.
X_train.head()

In [None]:
# Random forest with 5 trees.
#   ccp_alpha     - cost-complexity pruning strength applied to each tree
#   class_weight  - "balanced" weights classes inversely to their frequency
#   random_state  - fixes the randomness of the forest so that the fitted
#                   model (and every metric below) is reproducible
clf = RandomForestClassifier(n_estimators=5,
                             ccp_alpha=.005,
                             class_weight="balanced",
                             random_state=42)

clf.fit(X_train, y_train)

Variable Importance

In [None]:
# Importance of every feature as reported by the fitted forest, plotted
# from the most to the least important.
importances = pd.Series(data=clf.feature_importances_, index=X.columns)
ordenadas = importances.sort_values(ascending=False)
ordenadas.plot(kind='bar')

Performance Metrics

In [None]:
# Predictions on the hold-out set. These were previously never defined, so
# this cell raised NameError on a fresh kernel.
#   y_pred - hard class labels, used for the accuracy
#   y_prob - probability of the second class in clf.classes_, used as the
#            score for the ROC curve (assumes a binary 0/1 target, 1 = bad)
y_pred = clf.predict(X_test)
y_prob = clf.predict_proba(X_test)[:, 1]

# Accuracy
acc = metrics.accuracy_score(y_test, y_pred)
# AUC: area under the ROC curve built from the predicted probabilities
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_prob)
auc = metrics.auc(fpr, tpr)
# Gini coefficient derived from the AUC
gini = 2*auc -1

print('Acurácia: {0:.2f}% \nAUC: {1:.2f}% \nGINI: {2:.2f}%'
      .format(acc*100, auc*100, gini*100))

In [None]:
# Confusion matrix of the fitted classifier on the hold-out set.
# plot_confusion_matrix was never imported in this notebook and was removed
# from scikit-learn in 1.2; ConfusionMatrixDisplay.from_estimator is its
# replacement and is reachable through the already-imported `metrics` module.
metrics.ConfusionMatrixDisplay.from_estimator(
    clf, X_test, y_test, display_labels=['Bom', 'Mau'])