<a href="https://colab.research.google.com/github/Pedro-hn/Random-Forest/blob/main/Credit%20Score%20-%20Random%20Forest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##Importando as bibliotecas necessárias

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix

%matplotlib inline

##Importando o banco de dados e analisando superficialmente o seu conteúdo

In [2]:
dados = pd.read_csv('/content/healthcare-dataset-stroke-data.csv')

In [None]:
dados.head()

Unnamed: 0.1,Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,1,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
1,2,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
2,3,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0
3,4,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0
4,5,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0


In [None]:
dados.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83832 entries, 0 to 83831
Data columns (total 12 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   Unnamed: 0                            83832 non-null  int64  
 1   SeriousDlqin2yrs                      83832 non-null  int64  
 2   RevolvingUtilizationOfUnsecuredLines  83831 non-null  float64
 3   age                                   83831 non-null  float64
 4   NumberOfTime30-59DaysPastDueNotWorse  83831 non-null  float64
 5   DebtRatio                             83831 non-null  float64
 6   MonthlyIncome                         67212 non-null  float64
 7   NumberOfOpenCreditLinesAndLoans       83831 non-null  float64
 8   NumberOfTimes90DaysLate               83831 non-null  float64
 9   NumberRealEstateLoansOrLines          83831 non-null  float64
 10  NumberOfTime60-89DaysPastDueNotWorse  83831 non-null  float64
 11  NumberOfDepende

In [None]:
dados.shape

(83832, 12)

In [None]:
dados.isna().sum()

Unnamed: 0                                  0
SeriousDlqin2yrs                            0
RevolvingUtilizationOfUnsecuredLines        0
age                                         0
NumberOfTime30-59DaysPastDueNotWorse        0
DebtRatio                                   0
MonthlyIncome                           29731
NumberOfOpenCreditLinesAndLoans             0
NumberOfTimes90DaysLate                     0
NumberRealEstateLoansOrLines                0
NumberOfTime60-89DaysPastDueNotWorse        0
NumberOfDependents                       3924
dtype: int64

In [None]:
dados.duplicated().sum()
dados.drop_duplicates(inplace=True)

In [None]:
media_um = dados['MonthlyIncome'].mean()
media_um

6670.221237392844

In [None]:
media_dois = dados['NumberOfDependents'].mean()
media_dois

0.7572222678605657

In [None]:
dados['MonthlyIncome'] = dados['MonthlyIncome'].fillna(media_um)

In [None]:
dados['NumberOfDependents'] = dados['NumberOfDependents'].fillna(media_dois)

In [None]:
dados.isna().sum()

Unnamed: 0                              0
SeriousDlqin2yrs                        0
RevolvingUtilizationOfUnsecuredLines    0
age                                     0
NumberOfTime30-59DaysPastDueNotWorse    0
DebtRatio                               0
MonthlyIncome                           0
NumberOfOpenCreditLinesAndLoans         0
NumberOfTimes90DaysLate                 0
NumberRealEstateLoansOrLines            0
NumberOfTime60-89DaysPastDueNotWorse    0
NumberOfDependents                      0
dtype: int64

##Separando a base de dados em treino e teste

In [None]:
X = dados.drop(columns='SeriousDlqin2yrs', axis= 1)

In [None]:
y = dados['SeriousDlqin2yrs']

In [None]:
print(f'dados : {dados.shape}')
print(f'X : {X.shape}')
print(f'y : {y.shape}')

dados : (150000, 12)
X : (150000, 11)
y : (150000,)


In [None]:
from numpy.matrixlib import test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .3)

##Estimando o modelo de Random Forest

In [None]:
params = {'max_depth':[2,3,4],
          'class_weight':[None, 'balance']}

model = RandomForestClassifier()
grid = GridSearchCV(estimator = model, param_grid = params, cv= 3, scoring= 'roc_auc')

grid.fit(X_train, y_train)
predit = grid.predict_proba(X_test)

auc = roc_auc_score(y_test, predit[:, 1])


##Encontrando os melhores Hiper Parâmetros e avaliando o valor da AUC

In [None]:
print(f'Grid : {grid.best_params_}')
print(f'AUC Train : {grid.best_score_:.2f}')
print(f'AUC Test : {auc:.2f}')

Grid : {'class_weight': None, 'max_depth': 3}
AUC Train : 0.85
AUC Test : 0.86


##Modelo Final com Hiper Parâmetros otimizados pelo Cross Validation

In [None]:
params = {'max_depth':[3],
          'class_weight':[None]}
model = RandomForestClassifier()
grid = GridSearchCV(estimator = model, param_grid = params, cv= 10, scoring= 'roc_auc')

grid.fit(X_train, y_train)
predit = grid.predict_proba(X_test)

auc = roc_auc_score(y_test, predit[:, 1])


##Avaliação das métricas do modelo final

In [None]:
print(f'Grid : {grid.best_params_}')
print(f'AUC Train : {grid.best_score_:.2f}')
print(f'AUC Test : {auc:.2f}')

Grid : {'class_weight': None, 'max_depth': 3}
AUC Train : 0.85
AUC Test : 0.85
