## Random Forest

In [1]:
# import bibliotecas

import pandas as pd
import numpy  as np

from sklearn import metrics as mt
from sklearn.ensemble import RandomForestClassifier


### Datasets

In [2]:
X_train = pd.read_csv('X_training.csv')
y_train = pd.read_csv('y_training.csv') 
X_test = pd.read_csv('X_test.csv')
y_test = pd.read_csv('y_test.csv')
X_val = pd.read_csv('X_validation.csv')
y_val = pd.read_csv('y_validation.csv') 

**Remover a coluna do ID dos datasets pois não é uma coluna categórica**

In [3]:
X_train = X_train.drop(['id'] , axis =1)
X_test = X_test.drop(['id'] , axis =1)
X_val = X_val.drop(['id'] , axis =1)

In [4]:
X_val.head(2)

Unnamed: 0,customer_type,age,class,flight_distance,inflight_wifi_service,departure_arrival_time_convenient,ease_of_online_booking,gate_location,food_and_drink,online_boarding,...,baggage_handling,checkin_service,inflight_service,cleanliness,departure_delay_in_minutes,arrival_delay_in_minutes,gender_Female,gender_Male,type_of_travel_business_travel,type_of_travel_personal_travel
0,1,0.525641,1.0,0.714055,1.0,1.0,1.0,1.0,0.8,0.8,...,0.75,0.75,0.8,0.8,0.0,0.0,0.0,1.0,1.0,0.0
1,1,0.615385,1.0,0.054725,0.4,1.0,1.0,1.0,0.4,0.4,...,0.25,0.0,0.4,0.6,0.004916,0.004946,0.0,1.0,1.0,0.0


In [5]:
# preparação dos dados 

y_train = y_train.values.ravel()
y_val = y_val.values.ravel()

### Dados de Treino

In [6]:
md_list = np.arange(2, 25, 2)
acc_scores = []
precision_scores = []
recall_scores = []
f1score_scores = []

max_acc = 0
max_prec = 0
max_rec = 0
max_f1 = 0

for i in md_list:
    #Definition
    model = RandomForestClassifier(n_estimators=20, max_depth=i)

    #training 
    model.fit(X_train, y_train)

    #predict
    y_pred = model.predict( X_train)

    #Acurracy
    acc = mt.accuracy_score (y_train , y_pred)
    acc_scores.append(acc)
    if acc > max_acc:
        max_acc = acc

    #Precision
    precision = mt.precision_score (y_train , y_pred)
    precision_scores.append(precision)
    if precision > max_prec:
        max_prec = precision

    #Recall 
    recall = mt.recall_score(y_train , y_pred)
    recall_scores.append(recall)
    if recall > max_rec:
        max_rec = recall

    #F1 Score
    f1score = mt.f1_score(y_train , y_pred)
    f1score_scores.append(f1score)
    if f1score > max_f1:
        max_f1 = f1score

print('Accuracy: {}'.format(max_acc))
print( 'Precision = {}'.format( max_prec ))
print( 'Recall = {}'.format( max_rec ))
print( 'f1-score = {}'.format( max_f1 ))

Accuracy: 0.9978211404536992
Precision = 0.9994250303456207
Recall = 0.9955453735522464
f1-score = 0.9974814295278477


**melhor max_depth:**

In [7]:
best_md = acc_scores.index(max(acc_scores))
best_md

11

### Dados de Validação

**Agora com o melhor parametro de max_depth sobre os dados de validação**

In [8]:
#Definition
model = RandomForestClassifier(n_estimators=20, max_depth=best_md)

#training 
model.fit(X_train, y_train)

#predict
yhat_val = model.predict( X_val)

#Acurácia
acc_val = mt.accuracy_score(y_val , yhat_val)
print('Accuracy: {}'.format(acc_val))

#Precision
prec_val = mt.precision_score(y_val , yhat_val)
print('Precision: {}'.format(prec_val))

#Recall 
recall_val = mt.recall_score (y_val , yhat_val)
print('Recall: {}'.format(recall_val))

#F1Score
f1 = mt.f1_score(y_val ,yhat_val)
print ('F1 Score: {}'.format(f1))


Accuracy: 0.9476173622059911
Precision: 0.946055902960898
Recall: 0.9322889598336922
F1 Score: 0.9391219804053549


### Dados de Teste

**Modelo treinado e validado. Agora com o melhor parametro de max_depth sobre os dados de teste**

In [9]:
#Definition
model = RandomForestClassifier(n_estimators=20, max_depth=best_md)

#Training 
model.fit (np.concatenate((X_train,X_val)),
           np.concatenate((y_train,y_val)))

# predict
ypred_test = model.predict( X_test)

#Acurácia
acc_test = mt.accuracy_score(y_test , ypred_test)
print('Accuracy: {}'.format(acc_test))

#Precision
prec_test  = mt.precision_score(y_test , ypred_test)
print('Precision: {}'.format(prec_test))

#Recall 
recall_test  = mt.recall_score (y_test , ypred_test)
print('Recall: {}'.format(recall_test))

#F1Score
f1test  = mt.f1_score(y_test ,ypred_test)
print ('F1 Score: {}'.format(f1test))



Accuracy: 0.9501409647395049
Precision: 0.9511823719097098
Recall: 0.9343598768147823
F1 Score: 0.9426960806072173
