# Machine Learning - Random Forest (Floresta Aleatória)

#### Importação das bibliotecas necessárias

In [21]:
from sklearn.tree import export_graphviz
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#### Carregando a base de dados.

In [5]:
df_edu = pd.read_csv('xAPI-Edu-Data.csv')

In [6]:
df_edu.head()

Unnamed: 0,gender,NationalITy,PlaceofBirth,StageID,GradeID,SectionID,Topic,Semester,Relation,raisedhands,VisITedResources,AnnouncementsView,Discussion,ParentAnsweringSurvey,ParentschoolSatisfaction,StudentAbsenceDays,Class
0,M,KW,KuwaIT,lowerlevel,G-04,A,IT,F,Father,15,16,2,20,Yes,Good,Under-7,M
1,M,KW,KuwaIT,lowerlevel,G-04,A,IT,F,Father,20,20,3,25,Yes,Good,Under-7,M
2,M,KW,KuwaIT,lowerlevel,G-04,A,IT,F,Father,10,7,0,30,No,Bad,Above-7,L
3,M,KW,KuwaIT,lowerlevel,G-04,A,IT,F,Father,30,25,5,35,No,Bad,Above-7,L
4,M,KW,KuwaIT,lowerlevel,G-04,A,IT,F,Father,40,50,12,50,No,Bad,Above-7,M


#### Verificando as distribuições de classes.

In [7]:
df_edu['Class'].value_counts()

M    211
H    142
L    127
Name: Class, dtype: int64

#### Verificando os registros nulos

In [8]:
df_edu.isnull().sum()

gender                      0
NationalITy                 0
PlaceofBirth                0
StageID                     0
GradeID                     0
SectionID                   0
Topic                       0
Semester                    0
Relation                    0
raisedhands                 0
VisITedResources            0
AnnouncementsView           0
Discussion                  0
ParentAnsweringSurvey       0
ParentschoolSatisfaction    0
StudentAbsenceDays          0
Class                       0
dtype: int64

#### Codificando os atributos numéricos.

In [9]:
Features = df_edu
Cat_Colums = Features.dtypes.pipe(lambda Features: Features[Features=='object']).index
for col in Cat_Colums:
    label = LabelEncoder()
    Features[col] = label.fit_transform(Features[col])

In [10]:
Features.head()

Unnamed: 0,gender,NationalITy,PlaceofBirth,StageID,GradeID,SectionID,Topic,Semester,Relation,raisedhands,VisITedResources,AnnouncementsView,Discussion,ParentAnsweringSurvey,ParentschoolSatisfaction,StudentAbsenceDays,Class
0,1,4,4,2,1,0,7,0,0,15,16,2,20,1,1,1,2
1,1,4,4,2,1,0,7,0,0,20,20,3,25,1,1,1,2
2,1,4,4,2,1,0,7,0,0,10,7,0,30,0,0,0,1
3,1,4,4,2,1,0,7,0,0,30,25,5,35,0,0,0,1
4,1,4,4,2,1,0,7,0,0,40,50,12,50,0,0,0,2


#### Separando os dados e classes

In [11]:
dataset = df_edu.drop('Class',axis=1)

In [12]:
classes = df_edu['Class']

# Random Forest vs Árvore de Decisão

#### Resultados Random Forest

In [13]:
random_clf = RandomForestClassifier(random_state=1,n_estimators=100)

In [14]:
resultados_random = cross_val_predict(random_clf, dataset, classes, cv=5)

In [15]:
print(classification_report(classes,resultados_random))

              precision    recall  f1-score   support

           0       0.65      0.64      0.65       142
           1       0.77      0.78      0.77       127
           2       0.63      0.63      0.63       211

    accuracy                           0.67       480
   macro avg       0.68      0.68      0.68       480
weighted avg       0.67      0.67      0.67       480



#### Resultados Decision Tree

In [16]:
tree_clf = DecisionTreeClassifier(random_state=1)

In [17]:
resultados_tree = cross_val_predict(tree_clf,dataset,classes,cv=5)

In [18]:
print(classification_report(classes,resultados_tree))

              precision    recall  f1-score   support

           0       0.50      0.61      0.55       142
           1       0.74      0.68      0.70       127
           2       0.54      0.49      0.52       211

    accuracy                           0.57       480
   macro avg       0.59      0.59      0.59       480
weighted avg       0.58      0.57      0.58       480



#### Verificando Overfitting

In [22]:
X_train, X_test, y_train, y_test = train_test_split(df_edu.drop('Class',axis=1),df_edu['Class'],test_size=0.3,random_state=1)

In [26]:
def compara_modelos_random_forest(maxdepth):
    if maxdepth == 0:
        rf = RandomForestClassifier(n_estimators=100,random_state=1)
    else: 
        rf = RandomForestClassifier(n_estimators=100,random_state=1, max_depth=maxdepth)
    rf.fit(X_train, y_train)
    train_score = rf.score(X_train, y_train)*100
    test_score = rf.score(X_test, y_test)*100
    return train_score,test_score

In [27]:
print('{:10} {:20} {:20}'.format('depth', 'Training score','Testing score'))
print('{:10} {:20} {:20}'.format('-----', '--------------','-------------'))
print('{:1}         {} '.format(2,str(compara_modelos_random_forest(2))))
print('{:1}         {} '.format(3,str(compara_modelos_random_forest(3))))
print('{:1}         {} '.format(4,str(compara_modelos_random_forest(4))))
print('{:1}         {} '.format(10,str(compara_modelos_random_forest(10))))
print('{:1}         {} '.format(15,str(compara_modelos_random_forest(15))))
print('{:1}         {} '.format('Full',str(compara_modelos_random_forest(0))))

depth      Training score       Testing score       
-----      --------------       -------------       
2         (75.0, 61.80555555555556) 
3         (82.44047619047619, 68.05555555555556) 
4         (87.20238095238095, 71.52777777777779) 
10         (100.0, 75.69444444444444) 
15         (100.0, 79.86111111111111) 
Full         (100.0, 79.86111111111111) 


In [28]:
def compara_modelos_decision_tree(maxdepth):
    if maxdepth == 0:
        df = DecisionTreeClassifier(random_state=1)
    else: 
        df = DecisionTreeClassifier(random_state=1, max_depth=maxdepth)
    df.fit(X_train, y_train)
    train_score = df.score(X_train, y_train)*100
    test_score = df.score(X_test, y_test)*100
    return train_score,test_score

In [29]:
print('{:10} {:20} {:20}'.format('depth', 'Training score','Testing score'))
print('{:10} {:20} {:20}'.format('-----', '--------------','-------------'))
print('{:1}         {} '.format(2,str(compara_modelos_decision_tree(2))))
print('{:1}         {} '.format(3,str(compara_modelos_decision_tree(3))))
print('{:1}         {} '.format(4,str(compara_modelos_decision_tree(4))))
print('{:1}         {} '.format(10,str(compara_modelos_decision_tree(10))))
print('{:1}         {} '.format(15,str(compara_modelos_decision_tree(15))))
print('{:1}         {} '.format('Full',str(compara_modelos_decision_tree(0))))

depth      Training score       Testing score       
-----      --------------       -------------       
2         (63.988095238095234, 68.05555555555556) 
3         (73.21428571428571, 70.13888888888889) 
4         (79.16666666666666, 74.30555555555556) 
10         (99.10714285714286, 68.75) 
15         (100.0, 69.44444444444444) 
Full         (100.0, 69.44444444444444) 


# Tunning do Modelo para Garantir o Melhor Desempenho

#### Como encontrar os melhores valores para os parametros do modelo?

RandomForestClassifier(
n_estimators=?,
criterion='gini' ou 'entropy',
max_depth=?,
min_samples_split=?,
min_samples_leaf=?
) ...

#### GridSearchCV para testes de Hyperparametros

In [30]:
from sklearn.model_selection import GridSearchCV

#### Lista de possíveis valores de estimators ou quantidade de árvores da floresta.

In [31]:
valores_estimators = [10, 20, 50, 100, 150]

#### Lista de possíveis valores para o critério de divisão.

In [32]:
valores_criterion = ['gini','entropy']

#### Lista de possíveis valores para a profundidade máxima de cada árvore

In [33]:
valores_max_depth = [10, 20, 50, 100]

#### Lista de possíveis valores para os parametros min_samples_split e min_samples_leaf.

In [34]:
valores_min_samples_split = [2, 5, 10,15]
valores_min_samples_leaf = [1, 5, 10,15]

#### Define um dicionário que recebe as listas de parâmetros e valores.

In [35]:
parametros_grid = dict(n_estimators=valores_estimators,
                       criterion=valores_criterion,
                       max_depth=valores_max_depth,
                       min_samples_split=valores_min_samples_split,
                       min_samples_leaf=valores_min_samples_leaf 
                      )

#### Dicionário com os parametros que serão utilizados no grid.

In [36]:
parametros_grid

{'n_estimators': [10, 20, 50, 100, 150],
 'criterion': ['gini', 'entropy'],
 'max_depth': [10, 20, 50, 100],
 'min_samples_split': [2, 5, 10, 15],
 'min_samples_leaf': [1, 5, 10, 15]}

#### Instancia o GridSearch com o modelo a ser utilizado, parametros, número de folds e scoring.

In [37]:
rf = RandomForestClassifier()

In [38]:
grid = GridSearchCV(rf, parametros_grid, cv=5, scoring='accuracy')

#### Aplica o GridSearch passando as features e classes

In [39]:
grid.fit(df_edu.drop('Class',axis=1),df_edu['Class'])

GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              rando

#### Imprime os scores por combinações.

In [41]:
grid.cv_results_

{'mean_fit_time': array([0.01221099, 0.02302084, 0.05565062, 0.11050034, 0.16514997,
        0.0120111 , 0.02282081, 0.05525041, 0.11190186, 0.16294789,
        0.01221166, 0.02242026, 0.05404921, 0.10663853, 0.15861363,
        0.01120429, 0.02202001, 0.05304832, 0.10459151, 0.16574521,
        0.01161075, 0.0220202 , 0.05304809, 0.10469518, 0.15654216,
        0.01161056, 0.02222323, 0.05304823, 0.10469489, 0.1567421 ,
        0.0120111 , 0.02161956, 0.05304823, 0.10429435, 0.1550313 ,
        0.01121011, 0.02141938, 0.05205302, 0.10283527, 0.15472255,
        0.01201077, 0.02201996, 0.05204682, 0.100491  , 0.14991403,
        0.01118374, 0.0212193 , 0.05102053, 0.10098243, 0.1498455 ,
        0.01123538, 0.02117109, 0.0508153 , 0.10082121, 0.15217257,
        0.01181064, 0.02121944, 0.0516468 , 0.10129209, 0.15093699,
        0.01121001, 0.02101927, 0.05004554, 0.09929032, 0.14913578,
        0.01101012, 0.02041869, 0.05004559, 0.09868946, 0.1467011 ,
        0.01120429, 0.02081866,

#### Verificando os melhores parâmetros.

In [42]:
grid.best_params_

{'criterion': 'gini',
 'max_depth': 10,
 'min_samples_leaf': 1,
 'min_samples_split': 10,
 'n_estimators': 20}

#### Verificando o melhor score.

In [43]:
grid.best_score_

0.7270833333333333