<a href="https://colab.research.google.com/github/Rogerio-mack/Modelos_de_Linguagem_e_Generativos/blob/main/MLG_02_ML_Supervisionado_review.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Árvores de Decisão, Seleção de Atributos e de Modelos**

---



# Caso: **Propensão de Compra de Clientes por Telemarketing**

https://archive.ics.uci.edu/ml/datasets/Bank+Marketing

Os dados acima estão relacionados com campanhas de marketing direto de uma instituição bancária portuguesa. As campanhas de marketing foram baseadas em telefonemas. Freqüentemente, era necessário mais de um contato para o mesmo cliente, para acessar se o produto (depósito bancário) seria ('yes') ou não ('no') assinado.






In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns


In [None]:
# Download the Bank Marketing CSV and preview its first rows.
DATA_URL = 'http://meusite.mackenzie.br/rogerio/TIC/bank-full.csv'
df = pd.read_csv(DATA_URL)
display(df.head())

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


# Exercício 1. Label Encode e Conjuntos de Treinamento e Teste

Você vai preparar os dados para fazer um modelo de Árvore de Decisão. Faça o Label Encode e separe os Conjuntos de Treinamento e Teste.



1. O `LabelEncoder()` é projetado para funcionar em **um único atributo por vez**. Há, entretanto, um método `ColumnTransformer` que pode ser aplicado a vários atributos, mas não empregaremos aqui.

2. A rigor, para evitarmos *data leak* deveríamos empregar o estimador sobre os dados de treino para somente depois aplicar sobre os dados de teste. O mesmo ocorrendo com a normalização. Por isso, a separação de dados de treinamento e teste deve ocorrer antes.

In [None]:
from sklearn.model_selection import train_test_split

# The target is the 'y' column; every other column is a predictor.
y = df['y']
X = df.drop(columns='y')

# Stratified 70/30 hold-out split with a fixed seed so it can be reproduced.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)


In [None]:
%%script echo skipping
#
# Disabled cell: the `%%script echo skipping` magic on the first line hands this
# body to `echo`, so the cell only prints "skipping" and none of the code runs.
#
# This version has the drawback of not saving the transformations for future
# use: a single LabelEncoder is refit on every column, so after the loop only
# the encoder fitted on the last column remains available.
#
from sklearn.preprocessing import LabelEncoder

categorical_features = X_train.select_dtypes(include=['object']).columns

le = LabelEncoder()
for col in categorical_features:
    X_train[col] = le.fit_transform(X_train[col])
    X_test[col] = le.transform(X_test[col])

X_train.head()

skipping


In [None]:
from sklearn.preprocessing import LabelEncoder

# Columns holding strings (dtype object) are the categorical predictors.
categorical_features = X_train.select_dtypes(include=['object']).columns

# Maps each categorical column name to ITS OWN fitted encoder, so that the
# mapping can be reused later (e.g. `le_list[col].inverse_transform(...)` or
# to encode new data). A fresh LabelEncoder must be created per column:
# sharing one instance would make every dictionary entry point to the same
# object, which ends up fitted only on the last column processed.
le_list = {}
for col in categorical_features:
    le = LabelEncoder()
    le_list[col] = le
    # Fit on the training data only, then apply the learned mapping to the
    # test data, so no information from the test set leaks into the encoding.
    X_train[col] = le.fit_transform(X_train[col])
    X_test[col] = le.transform(X_test[col])
    X_train[col] = X_train[col].astype('int')
    X_test[col] = X_test[col].astype('int')

X_train.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
13382,31,7,1,1,0,1,1,0,0,9,5,82,1,-1,0,3
32641,35,7,1,1,0,195,1,0,0,17,0,203,1,-1,0,3
3991,24,1,2,1,0,77,1,0,2,16,8,236,2,-1,0,3
8068,35,1,1,1,0,80,1,1,2,2,6,579,2,-1,0,3
27484,37,7,2,1,0,105,0,1,0,21,9,197,2,157,4,0


# Exercício 2. Decision Tree

Aplique um modelo de Árvore de Decisão. Qual a acuracidade sobre o conjunto de teste?

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Gini tree with no depth limit, seeded so the result is repeatable.
clf = DecisionTreeClassifier(random_state=42, max_depth=None, criterion='gini')
clf.fit(X_train, y_train)

# Keep the hold-out predictions (reused by later cells) and show test accuracy.
y_pred = clf.predict(X_test)
test_accuracy = clf.score(X_test, y_test)
print(f'{test_accuracy:.4f}')

0.8730


In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Per-class precision, recall and F1 for the hold-out predictions.
report = classification_report(y_test, y_pred)
print(report)

# Confusion matrix of true labels versus predicted labels.
cm = confusion_matrix(y_test, y_pred)
print("Matriz de Confusão:", cm, sep="\n")

              precision    recall  f1-score   support

          no       0.93      0.93      0.93     11977
         yes       0.46      0.48      0.47      1587

    accuracy                           0.87     13564
   macro avg       0.69      0.70      0.70     13564
weighted avg       0.87      0.87      0.87     13564

Matriz de Confusão:
[[11087   890]
 [  833   754]]


# Exercício 3. Normalize os dados

Normalize os dados e Aplique novamente o mesmo modelo de Árvore de Decisão. Qual a acuracidade sobre o conjunto de teste agora?

In [None]:
from sklearn.preprocessing import StandardScaler

# All numeric columns are standardized; at this point that includes the
# label-encoded categorical columns, which are now integers.
numerical_features = X_train.select_dtypes(include=np.number).columns

# Learn mean/std from the training data only, then apply the same
# transformation to both sets.
scaler = StandardScaler()
scaler.fit(X_train[numerical_features])
X_train[numerical_features] = scaler.transform(X_train[numerical_features])
X_test[numerical_features] = scaler.transform(X_test[numerical_features])

display(X_train.head())

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
13382,-0.930461,0.81498,-0.277213,-0.294147,-0.134703,-0.443813,0.89195,-0.436973,-0.712156,-0.817039,-0.172516,-0.680152,-0.56685,-0.410038,-0.234774,0.442673
32641,-0.553871,0.81498,-0.277213,-0.294147,-0.134703,-0.380625,0.89195,-0.436973,-0.712156,0.143236,-1.835109,-0.214545,-0.56685,-0.410038,-0.234774,0.442673
3991,-1.589493,-1.014195,1.367509,-0.294147,-0.134703,-0.419059,0.89195,-0.436973,1.51598,0.023201,0.825039,-0.087562,-0.244783,-0.410038,-0.234774,0.442673
8068,-0.553871,-1.014195,-0.277213,-0.294147,-0.134703,-0.418082,0.89195,2.288469,1.51598,-1.65728,0.160002,1.232298,-0.244783,-0.410038,-0.234774,0.442673
27484,-0.365576,0.81498,1.367509,-0.294147,-0.134703,-0.409939,-1.121139,2.288469,-0.712156,0.623373,1.157558,-0.237633,-0.244783,1.175153,1.371081,-2.605694


In [None]:
from sklearn.tree import DecisionTreeClassifier

# Same tree configuration as before, now trained on the standardized data.
tree_settings = {'criterion': 'gini', 'max_depth': None, 'random_state': 42}
clf = DecisionTreeClassifier(**tree_settings)
clf.fit(X_train, y_train)

# Hold-out predictions and test accuracy.
y_pred = clf.predict(X_test)
scaled_accuracy = clf.score(X_test, y_test)
print(f'{scaled_accuracy:.4f}')

0.8731


# Exercício 4. Cross-Validation

Estime a acuracidade *média* do modelo empregando o cross-validation.

Se você emprega,

```
cv_scores = cross_val_score(clf, X_train, y_train, cv=5)
```

Serão empregados sempre os mesmos dados fazendo partições contíguas dos dados. Isso pode não ser muito adequado em grande número de casos!


In [None]:
from sklearn.model_selection import cross_val_score, KFold

# Shuffled 5-fold splitter; the fixed seed keeps the folds reproducible.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# One accuracy score per fold, computed on the training set only.
cv_scores = cross_val_score(estimator=clf, X=X_train, y=y_train, cv=kf)
mean_cv_accuracy = cv_scores.mean()

print("Cross-validation scores:", cv_scores)
print("Average cross-validation accuracy:", mean_cv_accuracy)


Cross-validation scores: [0.87598736 0.86666667 0.8724917  0.86869964 0.86885764]
Average cross-validation accuracy: 0.8705406018635349


# Exercício 5. Feature Selection

Empregue o ganho de informação (*mutual information*) para selecionar os 5 atributos preditores com maior ganho e estime novamente a acuracidade do mesmo modelo de árvore, agora somente com esses preditores.

In [None]:
from sklearn.feature_selection import mutual_info_classif

# Estimate the mutual information between each predictor and the target.
# The estimator is randomized, so without a seed the scores (and therefore
# the ranking used to pick the top features) can change from run to run.
# The seed is fixed to 42, like every other random_state in this notebook.
mutual_info = mutual_info_classif(X_train, y_train, random_state=42)

# Pair each feature name with its mutual information score.
feature_importance_df = pd.DataFrame({'Feature': X_train.columns, 'Mutual Information': mutual_info})

# Rank the features from most to least informative.
feature_importance_df = feature_importance_df.sort_values('Mutual Information', ascending=False)

# Show the full ranking.
print(feature_importance_df)

In [None]:
# Training data restricted to the five highest-ranked features.
top_features = feature_importance_df['Feature'].iloc[:5].values
X_train[top_features]

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Names of the five features with the highest mutual information.
selected_features = feature_importance_df[0:5].Feature.values
X_train_selected = X_train[selected_features]
X_test_selected = X_test[selected_features]

# Same tree configuration as before, trained only on the selected features.
clf = DecisionTreeClassifier(criterion='gini', max_depth=None, random_state=42)
clf.fit(X_train_selected, y_train)

# Hold-out predictions and test accuracy using the reduced feature set.
y_pred = clf.predict(X_test_selected)
selected_accuracy = clf.score(X_test_selected, y_test)
print(f'{selected_accuracy:.4f}')

# Exercício 6. GridSearchCV

Empregue o Grid search para selecionar, dentre diferentes modelos com diferentes parâmetros, um melhor modelo para este caso de dados de telemarketing. Garanta incluir nos seus modelos um modelo de Random Forest (um *ensemble model*, aliás o que é isso?) e um modelo de Rede Neural variando alguns parâmetros.

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score

# The triple-quoted string below is NOT executed: it is a bare string
# expression that keeps a larger candidate grid (MLP, decision tree, random
# forest, logistic regression, SVC) for reference. The active, much smaller
# grid is the `model_params` dictionary assigned right after it.
# NOTE(review): if this grid is re-enabled, the 'l1' penalty listed for
# LogisticRegression is not supported by its default 'lbfgs' solver; confirm
# against the installed scikit-learn version and set a compatible solver.
"""
model_params = {
    'MLP': {
        'model': MLPClassifier(),
        'params': {
            'hidden_layer_sizes': [(50,), (100,), (50, 50)],
            'activation': ['relu', 'tanh'],
            'alpha': [0.0001, 0.001, 0.01]
         }
    },
    'DecisionTreeClassifier': {
        'model': DecisionTreeClassifier(),
        'params': {
            'criterion': ['gini', 'entropy'],
            'max_depth': [None, 5, 10, 15],
            'min_samples_split': [2, 5, 10]
        }
    },
    'RandomForestClassifier': {
        'model': RandomForestClassifier(),
        'params': {
            'n_estimators': [100, 200, 300],
            'max_depth': [None, 5, 10, 15],
            'min_samples_split': [2, 5, 10]
        }
    },
    'LogisticRegression': {
        'model': LogisticRegression(),
        'params': {
            'C': [0.1, 1, 10],
            'penalty': ['l1', 'l2']
        }
    },
    'SVC': {
        'model': SVC(),
        'params': {
            'C': [0.1, 1, 10],
            'kernel': ['linear', 'rbf']
        }
    }
}
"""

# Candidate models for the grid search. Each entry maps a display name to:
#   'model'  - the unfitted estimator to tune;
#   'params' - the hyper-parameter grid to explore (an empty dict means the
#              estimator is evaluated once with its default settings).
# The tree-based estimators are randomized, so they are seeded with 42, like
# every other random_state in this notebook, to keep the comparison between
# models reproducible from run to run.
model_params = {
    'DecisionTreeClassifier': {
        'model': DecisionTreeClassifier(random_state=42),
        'params': {
            'criterion': ['gini', 'entropy'],
            'max_depth': [None, 5]
        }
    },
    'RandomForestClassifier': {
        'model': RandomForestClassifier(random_state=42),
        'params': {}
    },
    'LogisticRegression': {
        'model': LogisticRegression(),
        'params': {}
    },
    'SVC': {
        'model': SVC(),
        'params': {}
    }
}

# best_models[name]    -> (best estimator refit on the training set, test accuracy)
# best_cv_scores[name] -> mean cross-validated accuracy of that best estimator
best_models = {}
best_cv_scores = {}

for model_name, mp in model_params.items():
    print(f'Model {model_name}...')
    # 5-fold grid search over this model's grid, using the training set only.
    clf = GridSearchCV(mp['model'], mp['params'], cv=5, scoring='accuracy')
    clf.fit(X_train, y_train)
    best_cv_scores[model_name] = clf.best_score_
    # The test accuracy is recorded for reporting, NOT for choosing the model.
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    best_models[model_name] = (clf.best_estimator_, accuracy)

# Pick the winner by cross-validated accuracy. Choosing by test accuracy would
# turn the test set into part of the selection procedure and make the reported
# test figure optimistically biased.
best_model_name = max(best_cv_scores, key=best_cv_scores.get)
best_model, best_accuracy = best_models[best_model_name]

print(f"Melhor modelo: {best_model_name}")
print(f"Acurácia média na validação cruzada: {best_cv_scores[best_model_name]}")
print(f"Acurácia no conjunto de teste: {best_accuracy}")
print(f"Melhor modelo encontrado: {best_model}")


# Exercício 7. E agora?

Selecionado o melhor modelo, o que faço? Pronto, basta usar? Ou deveria fazer algo antes?


In [None]:
# Stack the training and test partitions back into one labelled data set.
X_all = pd.concat([X_train, X_test])
y_all = pd.concat([y_train, y_test])

# Refit the selected model on all the data. The accuracy printed below is
# measured on the same rows used for fitting, so it is only a sanity check.
best_model.fit(X_all, y_all)
y_pred = best_model.predict(X_all)
accuracy = accuracy_score(y_all, y_pred)
print(f"Acurácia (just check...): {accuracy}")