In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # data visualization
import seaborn as sns # statistical data visualization

In [None]:
# Labels applied to the seven columns of the table: six features plus the target.
col_names = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'class']

# header=None makes pandas treat the first record as data rather than as column
# labels, so the labels are attached explicitly afterwards.
df = pd.read_csv("car.data", header=None)
df.columns = col_names

In [None]:
# Split the table into the target column and the remaining predictor columns.
y = df['class']
X = df.drop(columns=['class'])

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

# Preview the first rows of the training features.
X_train.head()

In [None]:
# Replace the nominal (string) feature values with numeric codes.
# Before this step the feature columns hold objects; afterwards they hold numeric dtypes.

import category_encoders as ce

encoder = ce.OrdinalEncoder(cols=['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety'])

# Learn the category -> code mapping from the training split only, then apply the
# same fitted mapping to the test split.
X_train = encoder.fit_transform(X_train)
X_test = encoder.transform(X_test)

# Only the last expression of a cell is rendered automatically, so the preview has
# to be displayed explicitly; the dtypes below remain the cell's rendered result.
display(X_train.head())
X_train.dtypes

In [96]:
from sklearn.tree import DecisionTreeClassifier

# Depth-limited decision tree using the Gini criterion, with a fixed seed.
clf_gini = DecisionTreeClassifier(
    criterion='gini',
    max_depth=3,
    random_state=0,
)

# Fit on the encoded training split; as the last expression, the fitted estimator
# is also what the cell displays.
clf_gini.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=3, random_state=0)

In [97]:
# Class labels predicted by the Gini tree for the held-out test rows.
y_pred_gini = clf_gini.predict(X=X_test)

In [98]:
from sklearn.metrics import accuracy_score

# Accuracy of the Gini tree's predictions on the test split.
test_accuracy_gini = accuracy_score(y_test, y_pred_gini)
print(f'Score de precisão do set de TESTE utilizando Índice de Gini: {test_accuracy_gini}')

Score de precisão do set de TESTE utilizando Índice de Gini: 0.8021015761821366


In [99]:
# Predict on the same rows the tree was fitted on, for comparison with the test split.
y_pred_train_gini = clf_gini.predict(X=X_train)

# Show the predicted training labels.
y_pred_train_gini

array(['unacc', 'unacc', 'unacc', ..., 'unacc', 'unacc', 'acc'],
      dtype=object)

In [100]:
# Accuracy of the Gini tree's predictions on the training split.
train_accuracy_gini = accuracy_score(y_train, y_pred_train_gini)
print(f'Score de precisão do set de TREINO utilizando Índice de Gini: {train_accuracy_gini}')


Score de precisão do set de TREINO utilizando Índice de Gini: 0.7865168539325843


In [101]:
# Score the fitted tree on both splits via the estimator's own score() method,
# then report the two values.
train_score_gini = clf_gini.score(X_train, y_train)
test_score_gini = clf_gini.score(X_test, y_test)

print(f'Score do conjunto de TREINO utilizando Índice de Gini: {train_score_gini}')
print(f'Score do conjunto de TESTE utilizando Índice de Gini: {test_score_gini}')



Score do conjunto de TREINO utilizando Índice de Gini: 0.7865168539325843
Score do conjunto de TESTE utilizando Índice de Gini: 0.8021015761821366


In [None]:
from sklearn import tree

import graphviz

# Render the fitted Gini tree with Graphviz.
#
# class_names must be the distinct target classes in the order the classifier
# stores them (clf_gini.classes_). Passing the per-sample training labels instead
# makes each node's label a lookup into that Series by class index, which yields
# wrong labels or a KeyError. Both name arguments are passed as plain lists
# because some scikit-learn releases reject other sequence types here.
dot_data = tree.export_graphviz(
    clf_gini,
    out_file=None,
    feature_names=list(X_train.columns),
    class_names=[str(label) for label in clf_gini.classes_],
    filled=True,
    rounded=True,
    special_characters=True,
)

graph = graphviz.Source(dot_data)

# Last expression of the cell: the notebook renders the graph inline.
graph

In [102]:
from sklearn import tree
from sklearn.model_selection import GridSearchCV

# Candidate tree depths compared by cross-validation.
parameters = {'max_depth': range(3, 20)}

# The base estimator is seeded so that the selected depth and the reported scores
# are the same on every run; an unseeded tree can differ between runs.
clf = GridSearchCV(tree.DecisionTreeClassifier(random_state=0), parameters, n_jobs=2)
clf.fit(X=X_train, y=y_train)

# Predictions from the search object come from the best configuration found.
y_test_pred_gini_cv = clf.predict(X_test)

print(f'Score de precisão do set de TESTE utilizando Índice de Gini + cross validation (k-folding): {accuracy_score(y_test, y_test_pred_gini_cv)}')

# Keep a handle on the winning tree and report its mean cross-validated score
# together with the chosen hyper-parameters.
tree_model = clf.best_estimator_
print(clf.best_score_, clf.best_params_)

Score de precisão do set de TESTE utilizando Índice de Gini + cross validation (k-folding): 0.9439579684763573
0.9585124645469474 {'max_depth': 16}
