<a href="https://colab.research.google.com/github/Rogerio-mack/IMT_Ciencia_de_Dados/blob/main/IMT_CV_GridSearch_Lab_solucao.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<head>
  <meta name="author" content="Rogério de Oliveira">
  <meta name="institution" content="IMT">
</head>

<img src="https://maua.br/images/selo-60-anos-maua.svg" width="300" align="right">
<!-- <h1 align=left><font size = 6, style="color:rgb(200,0,0)"> optional title </font></h1> -->


# Lab: Validação Cruzada e GridSearch





# Caso: **Classificação de Tipos de Vidro para Reciclagem**

Nossa base de dados classifica vidros industrializados em 7 categorias conforme suas características químicas:

* Classe 1: janelas de construção (processadas com flutuação)
* Classe 2: janelas de construção (processadas sem flutuação)
* Classe 3: janelas do veículo (processadas com flutuação)
* Classe 4: janelas do veículo (processadas sem flutuação)
* Classe 5: recipientes
* Classe 6: talheres
* Classe 7: faróis

(*algumas dessas classes podem não estar presentes no data-set*).

Os dados estão na URL: https://github.com/Rogerio-mack/IMT_Ciencia_de_Dados/raw/main/data/glasses.csv

Aqui vai nos interessar classificar os vidros para efeito de reciclagem em 3 categorias:

* **C = Vidros de Construção**
* **V = Vidros de Veículos**
* **O = Outros**

E para isso vamos empregar uma seleção de hiperparâmetros de modelos com o GridSearch que você aprendeu na aula teórica.




In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# Aquisição dos dados

In [None]:
# Read the glass dataset directly from the course repository (requires network access)
# and preview its first rows.
df = pd.read_csv('https://github.com/Rogerio-mack/IMT_Ciencia_de_Dados/raw/main/data/glasses.csv')
df.head()

Unnamed: 0,Id number,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type of glass
0,1,1.52101,13.64,D,1.1,71.78,0.06,8.75,0.0,0.0,1
1,2,1.51761,13.89,D,1.36,72.73,0.48,7.83,0.0,0.0,1
2,3,1.51618,13.53,C,1.54,72.99,0.39,7.78,0.0,0.0,1
3,4,1.51766,13.21,D,1.29,72.61,0.57,8.22,0.0,0.0,1
4,5,1.51742,13.27,D,1.24,73.08,0.55,8.07,0.0,0.0,1


# Tratamento de Nulos

In [None]:
# Fraction of missing values per column: the mean of the boolean null mask.
df.isna().mean()

Id number        0.0
RI               0.0
Na               0.0
Mg               0.0
Al               0.0
Si               0.0
K                0.0
Ca               0.0
Ba               0.0
Fe               0.0
Type of glass    0.0
dtype: float64

# Tratamento do Atributo Target

In [None]:
# Collapse the numeric glass classes into the three recycling groups:
#   C = construction windows (classes 1 and 2)
#   V = vehicle glass (classes 3 and 4, plus headlamps, class 7)
#   O = others (classes 5 and 6)
df['Type of glass'] = df['Type of glass'].replace(
    {1: 'C', 2: 'C', 3: 'V', 4: 'V', 5: 'O', 6: 'O', 7: 'V'}
)
df['Type of glass'].value_counts()


C    146
V     46
O     13
Name: Type of glass, dtype: int64

In [None]:
#@markdown check
# True only when the class counts, in value_counts() order, are exactly 146, 46 and 13.
all(df['Type of glass'].value_counts() == [146, 46, 13])

True

# Exclusão de atributos

In [None]:
# Flag attribute pairs whose correlation exceeds 0.9 (candidates for exclusion).
# numeric_only=True restricts the computation to the numeric columns: 'Mg' and
# 'Type of glass' hold strings at this point, which makes a plain df.corr() raise
# on pandas >= 2.0 (older versions only emitted a warning and dropped them).
df.corr(numeric_only=True) > 0.9

  df.corr() > 0.9


Unnamed: 0,Id number,RI,Na,Al,Si,K,Ca,Ba,Fe
Id number,True,False,False,False,False,False,False,False,False
RI,False,True,False,False,False,False,False,False,False
Na,False,False,True,False,False,False,False,False,False
Al,False,False,False,True,False,False,False,False,False
Si,False,False,False,False,True,False,False,False,False
K,False,False,False,False,False,True,False,False,False
Ca,False,False,False,False,False,False,True,False,False
Ba,False,False,False,False,False,False,False,True,False
Fe,False,False,False,False,False,False,False,False,True


In [None]:
# 'Id number' appears to be just a row identifier, so it is removed from the attributes.
# errors='ignore' keeps the cell re-runnable: executing it a second time no longer
# raises KeyError because the column is already gone.
df = df.drop(columns='Id number', errors='ignore')
df.head()

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type of glass
0,1.52101,13.64,D,1.1,71.78,0.06,8.75,0.0,0.0,C
1,1.51761,13.89,D,1.36,72.73,0.48,7.83,0.0,0.0,C
2,1.51618,13.53,C,1.54,72.99,0.39,7.78,0.0,0.0,C
3,1.51766,13.21,D,1.29,72.61,0.57,8.22,0.0,0.0,C
4,1.51742,13.27,D,1.24,73.08,0.55,8.07,0.0,0.0,C


In [None]:
#@markdown check
# After dropping 'Id number' the frame is expected to have 205 rows and 10 columns.
df.shape == (205,10)

True

# Hot Encode

In [None]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the non-numeric predictors (the target is excluded from the encoding).
# drop='first' removes the first category of each attribute; handle_unknown='ignore'
# lets the fitted encoder transform categories it has not seen.
hot_encode = OneHotEncoder(handle_unknown='ignore',sparse_output=False,drop='first')

# fit_transform learns the categories and encodes in one pass. index=df.index keeps the
# encoded rows aligned with df, so the column-wise concat below cannot introduce NaN
# rows when df does not carry a default 0..n-1 index.
df_hot_encode = pd.DataFrame(
    hot_encode.fit_transform(df.drop(columns='Type of glass').select_dtypes(exclude='number')),
    columns=hot_encode.get_feature_names_out(),
    index=df.index,
)

# Final column order: encoded columns first, then the numeric attributes, then the target.
df = pd.concat([df_hot_encode,df.select_dtypes('number'),df[['Type of glass']]],axis=1)
df.head()

Unnamed: 0,Mg_B,Mg_C,Mg_D,RI,Na,Al,Si,K,Ca,Ba,Fe,Type of glass
0,0.0,0.0,1.0,1.52101,13.64,1.1,71.78,0.06,8.75,0.0,0.0,C
1,0.0,0.0,1.0,1.51761,13.89,1.36,72.73,0.48,7.83,0.0,0.0,C
2,0.0,1.0,0.0,1.51618,13.53,1.54,72.99,0.39,7.78,0.0,0.0,C
3,0.0,0.0,1.0,1.51766,13.21,1.29,72.61,0.57,8.22,0.0,0.0,C
4,0.0,0.0,1.0,1.51742,13.27,1.24,73.08,0.55,8.07,0.0,0.0,C


In [None]:
#@markdown check
# The one-hot 'Mg_' columns must contain 156 ones in total.
df[[col for col in df.columns if col.startswith('Mg_')]].sum().sum() == 156

True

In [None]:
#@markdown check
# The one-hot columns must come first in the DataFrame. This only safeguards the
# reproducibility of the results: per the original note, a different column order
# may lead to a different train/test outcome downstream.
print('True' if list(df.columns[:3]) == ['Mg_B', 'Mg_C', 'Mg_D']
      else "Coloque os atributos 'Mg_B', 'Mg_C', 'Mg_D' primeiro no seu DataFrame")

True


# Select Features?

In [None]:
from functools import partial

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import mutual_info_classif

X = df.drop(columns=['Type of glass'])
y = df['Type of glass']

# mutual_info_classif adds random noise to continuous features, so without a seed the
# ranking (and therefore the selected columns) can change between runs. Fixing
# random_state makes the selection reproducible; 123 matches the seed used in the
# rest of the notebook.
select_features = SelectKBest(partial(mutual_info_classif, random_state=123), k=10).fit(X, y)

# Names of the 10 attributes kept by the selector.
print( list(X.columns[select_features.get_support()]))

['Mg_B', 'Mg_C', 'Mg_D', 'RI', 'Na', 'Al', 'K', 'Ca', 'Ba', 'Fe']


# Normalização

Empregue `StandardScaler`.

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Standardize every predictor (zero mean, unit variance); the target is set aside
# and re-attached afterwards.
# NOTE(review): the scaler is fit on the whole dataset, i.e. before the train/test
# split performed later in the notebook, so test rows influence the scaling statistics.
scaler = StandardScaler()

df_scaled = df.drop(columns='Type of glass')
df_scaled = pd.DataFrame(scaler.fit_transform(df_scaled), columns=df_scaled.columns)
df_scaled = pd.concat([df_scaled, df[['Type of glass']]], axis=1)

df = df_scaled
df.head()

Unnamed: 0,Mg_B,Mg_C,Mg_D,RI,Na,Al,Si,K,Ca,Ba,Fe,Type of glass
0,-0.53033,-0.60553,1.63117,0.86033,0.377325,-0.702388,-1.127508,-0.69927,-0.133436,-0.361529,-0.603761,C
1,-0.53033,-0.60553,1.63117,-0.262715,0.706536,-0.17813,0.137804,-0.059245,-0.781542,-0.361529,-0.603761,C
2,-0.53033,1.651446,-0.613057,-0.735055,0.232472,0.184818,0.484099,-0.196393,-0.816765,-0.361529,-0.603761,C
3,-0.53033,-0.60553,1.63117,-0.2462,-0.188919,-0.319276,-0.022025,0.077903,-0.506801,-0.361529,-0.603761,C
4,-0.53033,-0.60553,1.63117,-0.325474,-0.109908,-0.420095,0.603971,0.047426,-0.61247,-0.361529,-0.603761,C


In [None]:
# Column sums of the scaled predictors (every column except the target).
df.loc[:, df.columns != 'Type of glass'].sum()

Mg_B   -4.440892e-16
Mg_C   -7.105427e-15
Mg_D   -5.329071e-15
RI      3.229417e-12
Na     -1.989520e-13
Al     -7.105427e-14
Si      3.481659e-12
K      -5.329071e-15
Ca     -6.750156e-14
Ba      1.421085e-14
Fe      9.769963e-15
dtype: float64

In [None]:
#@markdown check
# Standardized columns sum to zero up to floating-point rounding. The exact residue
# (around 1e-12) depends on platform and library versions, so it is compared against
# zero with a tolerance instead of with '=='. An unscaled or min-max scaled frame
# still fails this check.
bool(np.isclose(df[['Na','Si']].sum().sum(), 0.0, atol=1e-8))

True

# Treinando o Modelo

Você vai treinar um modelo de Árvore de Decisão buscando os melhores hiperparâmetros de 'max_depth' e 'criterion' (pesquise os possíveis valores na documentação do scikit-learn). Entretanto, no lugar da acuracidade, você vai empregar o F1 score (`f1_macro`), que é uma métrica que balanceia os resultados de precisão e recall.

Aqui um checklist do que precisa ser feito...

1. Separe os dados de Treinamento e Teste empregando 0.3 dos dados para teste, estratificados, e não deixe de empregar o seed 123.

2. Defina uma DecisionTree como Estimador Base. Não deixe de empregar o parâmetro `random_state=123` no estimador base para a reprodutibilidade dos resultados.

3. Especifique o range dos valores 'max_depth'  de 3 a 10, e 'criterion' que você deseja empregar pesquisando os valores na documentação do scikit-learn.

4. Configure o `GridSearchCV` para empregar 5 partições e empregar o score de `f1_macro` para a seleção dos melhores hiperparâmetros.

5. Verifique os Resultados gerando um classification report para ver as métricas do modelo.


**Nota**: não use outros parâmetros ou recursos não especificados aqui.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

# Predictors and target.
X = df.drop(columns=['Type of glass'])
y = df['Type of glass']

# Hold out 30% of the rows for testing, stratified by class, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=123)

# Base estimator and the hyperparameter grid to search.
# NOTE(review): range(3,10) yields depths 3..9, while the task text asks for
# "3 a 10" - confirm whether depth 10 should be included (range(3,11)).
base_estimator = DecisionTreeClassifier(random_state=123)
param_grid = {'max_depth': range(3,10), 'criterion': ['gini','entropy','log_loss']}

# 5-fold cross-validated grid search; candidates are ranked by macro-averaged F1.
clf = GridSearchCV(base_estimator, param_grid, cv=5, scoring='f1_macro')
clf.fit(X_train, y_train)

# print(clf.cv_results_)
print(clf.best_estimator_)

# Evaluate the selected model on the held-out test set.
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))


DecisionTreeClassifier(criterion='entropy', max_depth=9, random_state=123)
              precision    recall  f1-score   support

           C       0.85      0.80      0.82        44
           O       0.67      0.50      0.57         4
           V       0.44      0.57      0.50        14

    accuracy                           0.73        62
   macro avg       0.65      0.62      0.63        62
weighted avg       0.75      0.73      0.73        62



In [None]:
#@markdown check
# The training-set checksum is a float, so it is compared with a small absolute
# tolerance: an exact '!=' can fail on rounding differences between platforms or
# library versions even when the split is the expected one.
if not np.isclose(X_train.sum().sum(), -19.006240819955718, rtol=0.0, atol=1e-9):
  print('verifique train_test_split')
else:
  print(True)
# The base decision tree must have been created with random_state=123.
if clf.get_params()['estimator__random_state'] != 123:
  print('verifique o random state da árvore de decisão')
else:
  print(True)
# The grid search must rank candidates with the macro-averaged F1 score.
if clf.get_params()['scoring'] != 'f1_macro':
  print('verifique o score aplicado')
else:
  print(True)

True
True
True


# Predição de Novos Casos

Considere os casos abaixo.

In [None]:
# Read the new, unlabeled cases from the course repository (requires network access)
# and preview the first rows.
df_cases = pd.read_csv('https://github.com/Rogerio-mack/IMT_Ciencia_de_Dados/raw/main/data/glasses_test.csv')
df_cases.head()

Unnamed: 0,Id number,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe
0,0,1.515877,12.81,C,1.48,73.89,0.6,8.12,0.0,0.01
1,1,1.515627,12.89,C,1.52,74.1,0.67,7.83,0.0,0.01
2,2,1.518166,12.9,D,1.19,73.44,0.6,8.43,0.0,0.01
3,3,1.517456,13.33,B,1.52,73.04,0.58,8.79,0.0,0.01
4,4,1.514837,13.81,B,3.5,70.89,1.68,5.87,2.2,0.01


In [None]:
def preparacao(df_cases):
  """Apply the notebook's data preparation to a frame of new cases.

  The steps are: drop 'Id number', one-hot encode the non-numeric columns with the
  module-level `hot_encode`, place the encoded columns before the numeric ones, and
  scale everything with the module-level `scaler`. Both objects must already be
  fitted when this function is called.

  Parameters
  ----------
  df_cases : pandas.DataFrame
      Raw cases containing an 'Id number' column plus the attributes expected by
      `hot_encode` and `scaler`. The frame is not modified.

  Returns
  -------
  pandas.DataFrame
      Encoded and scaled attributes, carrying the same index as `df_cases`.

  Raises
  ------
  KeyError
      If `df_cases` has no 'Id number' column.
  """
  # drop() returns a new frame, so the caller's DataFrame is left untouched.
  df = df_cases.drop(columns=['Id number'])

  # index=df.index keeps the encoded rows aligned with the numeric ones; with a
  # default RangeIndex the column-wise concat would misalign (and fill with NaN)
  # any input whose index is not 0..n-1.
  df_hot_encode = pd.DataFrame(
      hot_encode.transform(df.select_dtypes(exclude='number')),
      columns=hot_encode.get_feature_names_out(),
      index=df.index,
  )

  # Encoded columns first, numeric attributes after.
  df = pd.concat([df_hot_encode,df.select_dtypes('number')],axis=1)

  # The scaler output is wrapped back into a DataFrame with the original labels.
  return pd.DataFrame(scaler.transform(df), columns=df.columns, index=df.index)

In [None]:
# Replace the raw cases with their encoded and scaled version.
# NOTE(review): df_cases is overwritten, so this cell cannot be re-run on its own -
# preparacao() expects the raw 'Id number' column; reload the CSV first.
df_cases = preparacao(df_cases)
df_cases.head()

Unnamed: 0,Mg_B,Mg_C,Mg_D,RI,Na,Al,Si,K,Ca,Ba,Fe
0,-0.53033,1.651446,-0.613057,-0.835216,-0.715658,0.063836,1.682816,0.123619,-0.577247,-0.361529,-0.50231
1,-0.53033,1.651446,-0.613057,-0.917776,-0.61031,0.144491,1.962516,0.23029,-0.781542,-0.361529,-0.50231
2,-0.53033,-0.60553,1.63117,-0.078963,-0.597142,-0.520914,1.083457,0.123619,-0.358864,-0.361529,-0.50231
3,1.885618,-0.60553,-0.613057,-0.313435,-0.030898,0.144491,0.550695,0.093142,-0.105257,-0.361529,-0.50231
4,1.885618,-0.60553,-0.613057,-1.178667,0.601188,4.136919,-2.312905,1.769398,-2.162289,3.991106,-0.50231


In [None]:
# Predict the recycling group of each new case and store it next to the attributes.
# The prediction column is dropped from the model input (errors='ignore' makes this
# a no-op on the first run), so re-running the cell does not feed a previously
# added 'Type of glass' column to the classifier.
df_cases['Type of glass'] = clf.predict(df_cases.drop(columns='Type of glass', errors='ignore'))
df_cases

Unnamed: 0,Mg_B,Mg_C,Mg_D,RI,Na,Al,Si,K,Ca,Ba,Fe,Type of glass
0,-0.53033,1.651446,-0.613057,-0.835216,-0.715658,0.063836,1.682816,0.123619,-0.577247,-0.361529,-0.50231,C
1,-0.53033,1.651446,-0.613057,-0.917776,-0.61031,0.144491,1.962516,0.23029,-0.781542,-0.361529,-0.50231,C
2,-0.53033,-0.60553,1.63117,-0.078963,-0.597142,-0.520914,1.083457,0.123619,-0.358864,-0.361529,-0.50231,C
3,1.885618,-0.60553,-0.613057,-0.313435,-0.030898,0.144491,0.550695,0.093142,-0.105257,-0.361529,-0.50231,C
4,1.885618,-0.60553,-0.613057,-1.178667,0.601188,4.136919,-2.312905,1.769398,-2.162289,3.991106,-0.50231,V
5,1.885618,-0.60553,-0.613057,-0.247386,-0.070403,0.366292,0.417504,0.077903,-0.091168,-0.361529,0.410756,C
6,-0.53033,-0.60553,-0.613057,-0.620559,1.325454,1.072024,2.215578,-0.790702,-0.295462,2.744669,0.207852,V
