In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score

uri = "https://gist.githubusercontent.com/guilhermesilveira/e99a526b2e7ccc6c3b70f53db43a87d2/raw/1605fc74aa778066bf2e6695e24d53cf65f2f447/machine-learning-carros-simulacao.csv"
dados = pd.read_csv('../data/modelo_carros.csv', sep=",").drop(columns=["Unnamed: 0"], axis=1)
dados.head()

In [None]:
dados.info()

In [None]:
# situação horrível de "azar" onde as classes estão ordenadas por padrão
dados_azar = dados.sort_values(by=['vendido'], ascending=True)
x_azar = dados[['preco', 'idade_do_modelo', 'km_por_ano']]
y_azar = dados['vendido']
dados_azar.head()

In [None]:
from sklearn.model_selection import cross_validate
from sklearn.dummy import DummyClassifier
SEED = 301
np.random.seed(SEED)

modelo = DummyClassifier()
results = cross_validate(modelo, x_azar, y_azar, cv=10, return_train_score=False)
media = results['test_score'].mean()
desvio_padrao = results['test_score'].std()
print('Accuracy média:  %.2f' %(media * 100))
print("Accuracy intervalo:  [%.2f %.2f]" % ((media - 2 * desvio_padrao) * 100, (media + 2 * desvio_padrao) *100))

In [None]:
from sklearn.model_selection import cross_validate
from sklearn.tree import DecisionTreeClassifier
SEED = 158020
np.random.seed(SEED)

modelo = DecisionTreeClassifier(max_depth=2)
results = cross_validate(modelo, x_azar, y_azar, cv=10, return_train_score=False)
media = results['test_score'].mean()
desvio_padrao = results['test_score'].std()
print('Accuracy média:  %.2f' %(media * 100))
print("Accuracy intervalo:  [%.2f %.2f]" % ((media - 2 * desvio_padrao) * 100, (media + 2 * desvio_padrao) *100))


In [None]:
# gerando dados aleatórios de modelo de carro para simulação de agrupamento ao usar nosso estimador

SEED = 301
np.random.seed(SEED)

dados['modelo'] = dados.idade_do_modelo + np.random.randint(-2, 3, size=dados.shape[0])
dados.modelo = dados.modelo + abs(dados.modelo.min()) + 1
dados.head()

In [None]:
def imprime_resulados(result):
  media = result['test_score'].mean()
  desvio_padrao = result['test_score'].std()
  print('Accuracy média:  %.2f' %(media * 100))
  print("Accuracy intervalo:  [%.2f %.2f]" % ((media - 2 * desvio_padrao) * 100, (media + 2 * desvio_padrao) *100))

In [None]:
# GroupKFold para analisar como o modelo se comporta com novos grupos

from sklearn.model_selection import cross_validate
SEED = 301
np.random.seed(SEED)

modelo = DecisionTreeClassifier(max_depth=2)
result = cross_validate(modelo, x_azar, y_azar, cv=5, return_train_score=False, groups=dados.modelo)
imprime_resulados(result)

In [None]:
from sklearn.model_selection import GroupKFold
SEED = 301
np.random.seed(SEED)

cv = GroupKFold(n_splits=10)
modelo = DecisionTreeClassifier(max_depth=3)
result = cross_validate(modelo, x_azar, y_azar, cv=cv, return_train_score=False, groups=dados.modelo)
imprime_resulados(result)

In [None]:
from sklearn.tree import export_graphviz
import graphviz

modelo.fit(x_azar, y_azar)
features = x_azar.columns
dot_data = export_graphviz(modelo, out_file=None, filled=True, rounded=True, class_names=["Não", "Sim"],feature_names=features)
graph = graphviz.Source(dot_data)
graph

In [None]:
# GroupKFold em um pipeline com StandardScaler e SVC

from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GroupKFold

SEED = 301
np.random.seed(SEED)

scaler = StandardScaler()
modelo = SVC()

pipeline = Pipeline([('transformacao', scaler), ('estimador', modelo)])
cv = GroupKFold(n_splits=10)
result = cross_validate(pipeline, x_azar, y_azar, cv=cv, return_train_score=False, groups=dados.modelo)
imprime_resulados(result)

In [None]:
# Testando parametros para uma dimensão

def roda_arvore_de_decisao(max_depth):
  SEED = 301
  np.random.seed(SEED)
  
  cv = GroupKFold(n_splits=10)
  modelo = DecisionTreeClassifier(max_depth=max_depth)
  results = cross_validate(modelo, x_azar, y_azar, cv=cv, return_train_score=True, groups=dados.modelo)
  teste_score = results['test_score'].mean() * 100
  treino_score = results['train_score'].mean() * 100
  print("max_depth = %d, treino = %.2f, teste = %.2f" % (max_depth, treino_score, teste_score))
  tabela = [max_depth, treino_score, teste_score]
  return tabela

resultados = [roda_arvore_de_decisao(i) for i in range(1, 33)]
resultados = pd.DataFrame(resultados, columns=['max_depth', 'train', 'test'])
  

#### OVERFIT: ficou perfeito para o treino mas ruim para o teste

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(style="white")

sns.lineplot(x="max_depth", y="train", data=resultados)
sns.lineplot(x="max_depth", y="test", data=resultados)
plt.legend(["Treino", "Teste"])

In [None]:
resultados.sort_values(by=['test'], ascending=False).head()

In [None]:
# explorando parâmetro com duas dimensão

def roda_arvore_de_decisao(max_depth, min_sample_leaf):
  SEED = 301
  np.random.seed(SEED)
  
  cv = GroupKFold(n_splits=10)
  model = DecisionTreeClassifier(max_depth=max_depth, min_samples_leaf=min_sample_leaf)
  results = cross_validate(model, x_azar, y_azar, cv=cv, return_train_score=True, groups=dados.modelo)
  test_score = results['test_score'].mean() * 100
  train_score = results['train_score'].mean() * 100
  print("max_depth = %d, min_sample_leaf = %d, treino = %.2f, teste = %.2f" % (max_depth, min_sample_leaf, train_score, test_score))
  table = [max_depth, min_sample_leaf, train_score, test_score]
  return table

def busca():
  results = []
  for max_depth in range(1,16):
    for min_sample_leaf in [92, 128, 256, 512]:
      table = roda_arvore_de_decisao(max_depth, min_sample_leaf)
      results.append(table)
  results = pd.DataFrame(results, columns=['max_depth', 'min_sample_leaf', 'train', 'test'])
  return results

results = busca()  
results.head()

In [None]:
results.sort_values(by=["test"], ascending=False).head()

In [None]:
# Resultado correlacionado
corr = results.corr()

In [None]:

sns.heatmap(corr)

In [None]:
sns.pairplot(results)

In [None]:

# Generate a mask for the upper triangle
mask = np.zeros_like(corr, dtype=np.bool_)
mask[np.triu_indices_from(mask)] = True

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(15, 10))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Draw the heatmap with the mask and corrent aspect ratio
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=3, center=0, square=True, linewidths=.5, cbar_kws={'shrink':.5})

In [None]:
results.sort_values(by=["test"], ascending=False).head()

In [None]:
# Explorando 3 dimensões de hiper parâmetro

def roda_arvore_de_decisao(max_depth, min_sample_leaf, min_sample_split):
  SEED = 301
  np.random.seed(SEED)
  
  cv = GroupKFold(n_splits=10)
  model = DecisionTreeClassifier(max_depth=max_depth, min_samples_leaf=min_sample_leaf, min_samples_split=min_sample_split)
  results = cross_validate(model, x_azar, y_azar, cv=cv, return_train_score=True, groups=dados.modelo)
  fit_time = results['fit_time'].mean()
  score_time = results['score_time'].mean()
  test_score = results['test_score'].mean() * 100
  train_score = results['train_score'].mean() * 100
  table = [max_depth, min_sample_split, min_sample_leaf, train_score, test_score, fit_time, score_time]
  return table

def busca():
  results = []
  for max_depth in range(1,16):
    for min_sample_leaf in [92, 128, 256, 512]:
      for min_sample_split in [32, 64, 128, 256]:
        table = roda_arvore_de_decisao(max_depth, min_sample_leaf, min_sample_split)
        results.append(table)
  results = pd.DataFrame(results, columns=['max_depth', 'min_sample_split','min_sample_leaf', 'train', 'test', 'fit_time', 'score_time'])
  return results

results = busca()  
results.head()

In [None]:
corr = results.corr()
mask = np.zeros_like(corr, dtype=np.bool_)
mask[np.triu_indices_from(mask)] = True
f, ax = plt.subplots(figsize=(15, 10))
cmap = sns.diverging_palette(220, 10, as_cmap=True)
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=3, center=0, square=True, linewidths=.5, cbar_kws={'shrink':.5})

In [None]:
results.sort_values(by=["test"], ascending=False).head()

In [None]:
# Explorando espaço de hiper parâmetro com GridSearchCV

from sklearn.model_selection import GridSearchCV

SEED = 301
np.random.seed(SEED)
espaco_de_parametros = {
  'max_depth': [3, 5], 
  'min_samples_split':[32,64,128], 
  'min_samples_leaf':[32,64,128],
  'criterion': ['gini', 'entropy']
  }

search = GridSearchCV(DecisionTreeClassifier(), espaco_de_parametros, cv=GroupKFold(n_splits=10))
search.fit(x_azar, y_azar, groups=dados.modelo)
results = pd.DataFrame(search.cv_results_)
results.head()

In [None]:
print(search.best_params_)
print(search.best_score_ * 100)

In [None]:
# Evitar essa abordagem pois estará sendo otimista

best_model = search.best_estimator_
prediction = best_model.predict(x_azar)
accuracy = accuracy_score(prediction, y_azar)
print("Accuracy %.2f" % (accuracy * 100))

In [None]:
# Como ter uma estimativa sem esse vício nos dados que já vimos?
# No caso de cross validation com busca de hipr paâmetros fazemos uma nova
# validação cruzada, chama-se Nested Cross Validation

# from sklearn.model_selection import cross_val_score
# scores = cross_val_score(search, x_azar, y_azar, cv=GroupKFold(n_splits=10), groups=dados.modelo)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold

SEED = 301
np.random.seed(SEED)
espaco_de_parametros = {
  'max_depth': [3, 5], 
  'min_samples_split':[32,64,128], 
  'min_samples_leaf':[32,64,128],
  'criterion': ['gini', 'entropy']
  }

search = GridSearchCV(DecisionTreeClassifier(), espaco_de_parametros, cv=KFold(n_splits=5,shuffle=True))
search.fit(x_azar, y_azar)
results = pd.DataFrame(search.cv_results_)
results.head()

In [None]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(search, x_azar, y_azar, cv=KFold(n_splits=5, shuffle=True))
scores

In [None]:
def print_score(score):
  media = score.mean() * 100
  desvio_padrao = score.std() * 100
  print('Accuracy média:  %.2f' %(media))
  print("Accuracy intervalo:  [%.2f %.2f]" % ((media - 2 * desvio_padrao), (media + 2 * desvio_padrao)))


print_score(scores)

In [None]:
the_best_model = search.best_estimator_
print(the_best_model)

In [None]:
features = x_azar.columns
dot_data = export_graphviz(the_best_model, out_file=None, filled=True, rounded=True, class_names=["Não", "Sim"],feature_names=features)
graph = graphviz.Source(dot_data)
graph

In [None]:
# Busca aleatória com RandomSearch

from sklearn.model_selection import RandomizedSearchCV
SEED = 301
np.random.seed(SEED)

espaco_de_parametros = {
  'max_depth': [3, 5], 
  'min_samples_split':[32,64,128], 
  'min_samples_leaf':[32,64,128],
  'criterion': ['gini', 'entropy']
  }

search = RandomizedSearchCV(DecisionTreeClassifier(), 
                            espaco_de_parametros, 
                            cv=KFold(n_splits=5,shuffle=True),
                            random_state=SEED, 
                            n_iter=16)
search.fit(x_azar, y_azar)
results = pd.DataFrame(search.cv_results_)
results.head()

In [None]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(search, x_azar, y_azar, cv=KFold(n_splits=5, shuffle=True))
print_score(scores)

In [None]:
the_best_model = search.best_estimator_
print(the_best_model)

In [None]:
features = x_azar.columns
dot_data = export_graphviz(the_best_model, out_file=None, filled=True, rounded=True, class_names=["Não", "Sim"],feature_names=features)
graph = graphviz.Source(dot_data)
graph

In [None]:
# Customizando o espaço de hiper paâmetros
from scipy.stats import randint

SEED = 301
np.random.seed(SEED)

espaco_de_parametros = {
  'max_depth': [3, 5, 10, 15,20, 30, None], 
  'min_samples_split': randint(32, 128), 
  'min_samples_leaf':randint(32, 128),
  'criterion': ['gini', 'entropy']
  }

search = RandomizedSearchCV(DecisionTreeClassifier(), 
                            espaco_de_parametros, 
                            cv=KFold(n_splits=5,shuffle=True),
                            random_state=SEED, 
                            n_iter=16)
search.fit(x_azar, y_azar)
results = pd.DataFrame(search.cv_results_)
results.head()

In [None]:
scores = cross_val_score(search, x_azar, y_azar, cv=KFold(n_splits=5, shuffle=True))
print(search.best_estimator_)
print_score(scores)

In [None]:
results_order_by_mean = results.sort_values(by=['mean_test_score'], ascending=False)
for index, line in results_order_by_mean.iterrows():
  print("%.3f +- (%.3f) %s" % (line.mean_test_score, line.std_test_score *2, line.params))

In [None]:
from sklearn.ensemble import RandomForestClassifier
import time

SEED = 301
np.random.seed(SEED)

espaco_de_parametros = {
  'n_estimators': [10, 100],
  'max_depth': [3, 5], 
  'min_samples_split': randint(32, 128), 
  'min_samples_leaf':randint(32, 128),
  'bootstrap':[True, False],
  'criterion': ['gini', 'entropy']
  }
tic = time.time()
search = RandomizedSearchCV(RandomForestClassifier(), 
                            espaco_de_parametros, 
                            cv=KFold(n_splits=5,shuffle=True),
                            random_state=SEED, 
                            n_iter=16)

search.fit(x_azar, y_azar)

tac = time.time()
tic_tac = tac - tic
print("Tempo %.2f segundos" % tic_tac)

results = pd.DataFrame(search.cv_results_)
results.head()

In [115]:
tic = time.time()
scores = cross_val_score(search, x_azar, y_azar, cv=KFold(n_splits=5, shuffle=True))
tac = time.time()
tic_tac = tac - tic
print("Tempo %.2f" % tic_tac)

print(search.best_estimator_)
print_score(scores)

Tempo 42.94
RandomForestClassifier(bootstrap=False, criterion='entropy', max_depth=5,
                       min_samples_leaf=66, min_samples_split=108,
                       n_estimators=10)
Accuracy média:  77.61
Accuracy intervalo:  [76.18 79.04]


In [118]:
# 0.6 treino => treino
# 0.2 teste => dev teste
# 0.2 validação => validação

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import train_test_split

SEED = 301
np.random.seed(SEED)

x_treino_teste, x_validacao, y_treino_teste, y_validacao =  train_test_split(x_azar, y_azar, test_size=0.2, shuffle=True, stratify=y_azar)

espaco_de_parametros = {
  'n_estimators': [10, 100],
  'max_depth': [3, 5], 
  'min_samples_split': randint(32, 128), 
  'min_samples_leaf':randint(32, 128),
  'bootstrap':[True, False],
  'criterion': ['gini', 'entropy']
  }

split = StratifiedShuffleSplit(n_splits=1, test_size=0.25)
search = RandomizedSearchCV(RandomForestClassifier(), 
                            espaco_de_parametros, 
                            cv=split,
                            random_state=SEED, 
                            n_iter=5)

search.fit(x_treino_teste, y_treino_teste)
results = pd.DataFrame(search.cv_results_)
results.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_bootstrap,param_criterion,param_max_depth,param_min_samples_leaf,param_min_samples_split,param_n_estimators,params,split0_test_score,mean_test_score,std_test_score,rank_test_score
0,0.31945,0.0,0.022993,0.0,False,gini,5,100,50,100,"{'bootstrap': False, 'criterion': 'gini', 'max...",0.7745,0.7745,0.0,3
1,0.043916,0.0,0.003987,0.0,False,entropy,5,66,108,10,"{'bootstrap': False, 'criterion': 'entropy', '...",0.7755,0.7755,0.0,1
2,0.023906,0.0,0.003022,0.0,True,gini,3,102,46,10,"{'bootstrap': True, 'criterion': 'gini', 'max_...",0.774,0.774,0.0,4
3,0.023928,0.0,0.002958,0.0,True,entropy,3,84,99,10,"{'bootstrap': True, 'criterion': 'entropy', 'm...",0.74,0.74,0.0,5
4,0.02693,0.0,0.003985,0.0,True,gini,5,57,73,10,"{'bootstrap': True, 'criterion': 'gini', 'max_...",0.775,0.775,0.0,2


In [121]:
scores = cross_val_score(search, x_validacao, y_validacao, cv=split)
# print_score(scores)
scores

array([0.774])