# Enunciado
1. Implemente os algoritmos Árvore de Decisão, Random Forest, AdaBoost e Perceptron usando uma interface semelhante à do Scikit-Learn. Cada algoritmo deve ser uma classe Python em uma biblioteca externa.

Dica: veja as implementações disponíveis em: https://github.com/python-engineer/MLfromscratch/tree/master/mlfromscratch 
2. Treine e avalie (de acordo com a métrica F1-Score), usando as suas próprias implementações, o problema de classificação binária a seguir: [Heart Disease Dataset](https://www.kaggle.com/datasets/johnsmith88/heart-disease-dataset)

3. Compare o resultado de sua implementação com as implementações do scikit-learn (Decision Tree, Random Forest, AdaBoost, Gradient Boosting, Perceptron) em um grid search. Varie os hiperparâmetros das implementações do scikit-learn.

4. Crie e use um modelo de [Bagging](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingClassifier.html) usando o scikit-learn com 100 árvores de decisão. Compare os resultados de suas predições sobre o conjunto de teste com os resultados do melhor Random Forest obtido na Questão 3.

In [1]:
from google.colab import drive
# Mount Google Drive at /content/drive so files stored there can be read by path.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


#1.Implementação e lib externa

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

#minhas libs
from libs import decision_tree
from libs import random_forest
from libs import adaboost
from libs import perceptron
from libs import ranking_gs

#2.Treinar e avaliar

In [3]:
# Heart dataset CSV, read from the mounted Google Drive folder.
HEART_CSV_PATH = '/content/drive/MyDrive/machine-learning/heart.csv'
df = pd.read_csv(HEART_CSV_PATH)

In [4]:
# The last column is used as the target; every column before it is a feature.
n_columns = df.shape[1]
x = df.iloc[:, :n_columns - 1].values
y = df.iloc[:, n_columns - 1].values

In [5]:
# Keep 80% of the rows for training and hold out the rest for testing;
# the fixed random_state makes the split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(x, y, train_size=.8, random_state=42)

In [6]:
# From-scratch estimators from the local `libs` package, with default settings.
models = [
    decision_tree.DecisionTree(),
    random_forest.RandomForest(),
    adaboost.Adaboost(),
    perceptron.Perceptron(),
]
df_test = pd.DataFrame(columns=['model', 'f1_micro', 'f1_macro', 'f1_weighted'])

# Fit every model on the training split and record its test-split F1 under
# the three averaging modes; one result row per model, indexed by position.
for row, estimator in enumerate(models):
    estimator.fit(X_train, y_train)
    pred = estimator.predict(X_test)
    scores = [
        f1_score(y_test, pred, average=mode)
        for mode in ('micro', 'macro', 'weighted')
    ]
    df_test.loc[row, :] = [estimator, *scores]

In [7]:
# Rank the from-scratch models by micro-averaged F1, best first.
df_test.sort_values(by='f1_micro', ascending=False)

Unnamed: 0,model,f1_micro,f1_macro,f1_weighted
0,<libs.decision_tree.DecisionTree object at 0x7...,0.985366,0.985364,0.985364
1,<libs.random_forest.RandomForest object at 0x7...,0.965854,0.96585,0.965849
3,<libs.perceptron.Perceptron object at 0x7f49fc...,0.580488,0.498235,0.497244
2,<libs.adaboost.Adaboost object at 0x7f49fc186350>,0.502439,0.334416,0.336047


In [8]:
# Rank the from-scratch models by macro-averaged F1, best first.
df_test.sort_values(by='f1_macro', ascending=False)

Unnamed: 0,model,f1_micro,f1_macro,f1_weighted
0,<libs.decision_tree.DecisionTree object at 0x7...,0.985366,0.985364,0.985364
1,<libs.random_forest.RandomForest object at 0x7...,0.965854,0.96585,0.965849
3,<libs.perceptron.Perceptron object at 0x7f49fc...,0.580488,0.498235,0.497244
2,<libs.adaboost.Adaboost object at 0x7f49fc186350>,0.502439,0.334416,0.336047


In [9]:
# Rank the from-scratch models by weighted-average F1, best first.
df_test.sort_values(by='f1_weighted', ascending=False)

Unnamed: 0,model,f1_micro,f1_macro,f1_weighted
0,<libs.decision_tree.DecisionTree object at 0x7...,0.985366,0.985364,0.985364
1,<libs.random_forest.RandomForest object at 0x7...,0.965854,0.96585,0.965849
3,<libs.perceptron.Perceptron object at 0x7f49fc...,0.580488,0.498235,0.497244
2,<libs.adaboost.Adaboost object at 0x7f49fc186350>,0.502439,0.334416,0.336047


#3.Comparação

In [10]:
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Perceptron
from sklearn.ensemble import AdaBoostClassifier

In [11]:
# scikit-learn reference models, all seeded with the same random_state.
clf1, clf2, clf3, clf4 = (
    tree.DecisionTreeClassifier(random_state=42),
    RandomForestClassifier(random_state=42),
    Perceptron(random_state=42),
    AdaBoostClassifier(random_state=42),
)

# From-scratch counterparts from the local `libs` package, default settings.
clf5, clf6, clf7, clf8 = (
    decision_tree.DecisionTree(),
    random_forest.RandomForest(),
    adaboost.Adaboost(),
    perceptron.Perceptron(),
)

# Order matters to later cells: scikit-learn models first, then the local ones.
classifiers = [clf1, clf2, clf3, clf4, clf5, clf6, clf7, clf8]

In [12]:
# One search grid per classifier. The 'classifier' key selects the estimator
# used as the pipeline's 'classifier' step; 'classifier__<name>' keys list the
# hyper-parameter values to try on that estimator.

# scikit-learn decision tree
param1 = {
    'classifier__max_depth': [5, 10, 25, None],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__class_weight': [None, {0: 1, 1: 5}, {0: 1, 1: 10}, {0: 1, 1: 25}],
    'classifier': [clf1],
}

# scikit-learn random forest
param2 = {
    'classifier__max_depth': [5, 10, 25, None],
    'classifier__n_estimators': [10, 50, 100, 250],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__class_weight': [None, {0: 1, 1: 5}, {0: 1, 1: 10}, {0: 1, 1: 25}],
    'classifier': [clf2],
}

# scikit-learn perceptron
param3 = {
    'classifier__penalty': ['l1', 'l2'],
    'classifier__alpha': [10 ** exponent for exponent in range(3)],  # 1, 10, 100
    'classifier__class_weight': [None, {0: 1, 1: 5}, {0: 1, 1: 10}, {0: 1, 1: 25}],
    'classifier__max_iter': [500, 1000, 2000],
    'classifier': [clf3],
}

# scikit-learn AdaBoost
param4 = {
    'classifier__n_estimators': [10, 50, 100, 250],
    'classifier': [clf4],
}

# From-scratch models: no hyper-parameters are varied, so each grid
# contributes a single candidate.
param5 = {'classifier': [clf5]}
param6 = {'classifier': [clf6]}
param7 = {'classifier': [clf7]}
param8 = {'classifier': [clf8]}

In [13]:
# Collect the per-classifier grids into one list so a single search covers all of them.
params = [param1, param2, param3, param4, param5, param6, param7, param8]

In [14]:
# Standardize the features, then classify. clf1 only fills the 'classifier'
# step initially; every grid in `params` supplies its own estimator for it.
pipeline = Pipeline([('standard', StandardScaler()), ('classifier', clf1)])

In [15]:
# 3-fold cross-validated search over every grid, scored by micro-averaged F1,
# using all available cores; train scores are kept for the later rankings.
gs = GridSearchCV(
    estimator=pipeline,
    param_grid=params,
    cv=3,
    n_jobs=-1,
    scoring='f1_micro',
    return_train_score=True,
)
gs.fit(X_train, y_train)

In [16]:
# Predict on the held-out test split with the fitted search, then print the
# per-class F1 (average=None) followed by the micro, macro and weighted averages.
pred = gs.predict(X_test)
for averaging in (None, 'micro', 'macro', 'weighted'):
    print("Test F1-Score:", f1_score(y_test, pred, average=averaging))

Test F1-Score: [0.98550725 0.98522167]
Test F1-Score: 0.9853658536585366
Test F1-Score: 0.9853644606268295
Test F1-Score: 0.9853637641109759


In [17]:
# Put the grid search's cross-validation results into a DataFrame for inspection.
df_results_gs = pd.DataFrame(gs.cv_results_)

In [18]:
# Build two ranking tables with the project helper ranking_gs.rankings.
# NOTE(review): presumably one row per entry in `classifiers`, ordered by train
# score and by test score respectively -- confirm against libs/ranking_gs.
ranking_train, ranking_test = ranking_gs.rankings(df_results_gs, classifiers)

In [19]:
# Display the ranking table named for the train score.
ranking_train

Unnamed: 0,model,mean_train_score,mean_test_score
0,DecisionTreeClassifier(random_state=42),1.0,0.975616
1,"RandomForestClassifier(class_weight={0: 1, 1: ...",1.0,0.982928
4,<libs.decision_tree.DecisionTree object at 0x7...,1.0,0.96829
5,<libs.random_forest.RandomForest object at 0x7...,0.997561,0.963415
3,AdaBoostClassifier(random_state=42),0.985364,0.947559
7,<libs.perceptron.Perceptron object at 0x7f49fb...,0.815846,0.797572
2,Perceptron(random_state=42),0.550672,0.57056
6,<libs.adaboost.Adaboost object at 0x7f49fb2b7b50>,0.515854,0.515855


In [20]:
# Display the ranking table named for the cross-validation test score.
ranking_test

Unnamed: 0,model,mean_train_score,mean_test_score
1,"RandomForestClassifier(class_weight={0: 1, 1: ...",1.0,0.982928
0,DecisionTreeClassifier(random_state=42),1.0,0.975616
4,<libs.decision_tree.DecisionTree object at 0x7...,1.0,0.96829
5,<libs.random_forest.RandomForest object at 0x7...,0.997561,0.963415
3,AdaBoostClassifier(random_state=42),0.985364,0.947559
7,<libs.perceptron.Perceptron object at 0x7f49fb...,0.815846,0.797572
2,Perceptron(random_state=42),0.550672,0.57056
6,<libs.adaboost.Adaboost object at 0x7f49fb2b7b50>,0.515854,0.515855


#4.Bagging vs Random Forest

In [21]:
from sklearn.ensemble import BaggingClassifier

In [22]:
# Show the parameter combination selected by the grid search.
gs.best_params_

{'classifier': RandomForestClassifier(class_weight={0: 1, 1: 25}, max_depth=25,
                        n_estimators=50, random_state=42),
 'classifier__class_weight': {0: 1, 1: 25},
 'classifier__max_depth': 25,
 'classifier__min_samples_split': 2,
 'classifier__n_estimators': 50}

In [23]:
# Random forest rebuilt with the settings of the grid-search winner.
best_rf_settings = {
    'class_weight': {0: 1, 1: 25},
    'max_depth': 25,
    'n_estimators': 50,
    'random_state': 42,
}
random_f = RandomForestClassifier(**best_rf_settings)

# Bagging ensembles with the default base estimator (a decision tree in
# scikit-learn): one matching the forest's 50 members, one with 100.
bagging = BaggingClassifier(n_estimators=50, random_state=42)
bagging100 = BaggingClassifier(n_estimators=100, random_state=42)

In [24]:
# Pair each estimator with the label shown in the results table, then expose
# the two parallel lists the evaluation loop expects.
labelled_models = {
    'random_f': random_f,
    'bagging': bagging,
    'bagging100': bagging100,
}
models_text = list(labelled_models.keys())
models = list(labelled_models.values())

In [25]:
df_test2 = pd.DataFrame(columns=['model', 'f1_micro', 'f1_macro', 'f1_weighted'])

# Fit each ensemble on the training split and record its test-split F1 under
# the three averaging modes, labelled with the matching entry of models_text.
for row, estimator in enumerate(models):
    estimator.fit(X_train, y_train)
    pred = estimator.predict(X_test)
    scores = [
        f1_score(y_test, pred, average=mode)
        for mode in ('micro', 'macro', 'weighted')
    ]
    df_test2.loc[row, :] = [models_text[row], *scores]

In [26]:
# Rank random forest vs. bagging by micro-averaged F1, best first.
df_test2.sort_values(by='f1_micro', ascending=False)

Unnamed: 0,model,f1_micro,f1_macro,f1_weighted
0,random_f,0.985366,0.985364,0.985364
1,bagging,0.985366,0.985364,0.985364
2,bagging100,0.985366,0.985364,0.985364


In [27]:
# Rank random forest vs. bagging by macro-averaged F1, best first.
df_test2.sort_values(by='f1_macro', ascending=False)

Unnamed: 0,model,f1_micro,f1_macro,f1_weighted
0,random_f,0.985366,0.985364,0.985364
1,bagging,0.985366,0.985364,0.985364
2,bagging100,0.985366,0.985364,0.985364


In [28]:
# Rank random forest vs. bagging by weighted-average F1, best first.
df_test2.sort_values(by='f1_weighted', ascending=False)

Unnamed: 0,model,f1_micro,f1_macro,f1_weighted
0,random_f,0.985366,0.985364,0.985364
1,bagging,0.985366,0.985364,0.985364
2,bagging100,0.985366,0.985364,0.985364


#Fim