### Base de dados: StudentsPerformance

1000 instâncias

5 atributos

2 classes (none, completed)


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.feature_extraction import DictVectorizer
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
base = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Bases/StudentsPerformance.csv", sep=";")
del base["parental level of education"]
base.head()

Unnamed: 0,gender,race,test preparation course,math score,reading score,writing score
0,female,group B,none,72,72,74
1,female,group C,completed,69,90,88
2,female,group B,none,90,95,93
3,male,group A,none,47,57,44
4,male,group C,none,76,78,75


In [None]:
base.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   gender                   1000 non-null   object
 1   race                     1000 non-null   object
 2   test preparation course  1000 non-null   object
 3   math score               1000 non-null   int64 
 4   reading score            1000 non-null   int64 
 5   writing score            1000 non-null   int64 
dtypes: int64(3), object(3)
memory usage: 47.0+ KB


In [None]:
base.select_dtypes(include='int').describe()

Unnamed: 0,math score,reading score,writing score
count,1000.0,1000.0,1000.0
mean,66.089,69.169,68.054
std,15.16308,14.600192,15.195657
min,0.0,17.0,10.0
25%,57.0,59.0,57.75
50%,66.0,70.0,69.0
75%,77.0,79.0,79.0
max,100.0,100.0,100.0


In [None]:
base.select_dtypes(include='object').describe()

Unnamed: 0,gender,race,test preparation course
count,1000,1000,1000
unique,2,5,2
top,female,group C,none
freq,518,319,642


In [None]:
X_dict = base.iloc[:,base.columns != 'test preparation course'].T.to_dict().values()
X_dict

dict_values([{'gender': 'female', 'race': 'group B', 'math score': 72, 'reading score': 72, 'writing score': 74}, {'gender': 'female', 'race': 'group C', 'math score': 69, 'reading score': 90, 'writing score': 88}, {'gender': 'female', 'race': 'group B', 'math score': 90, 'reading score': 95, 'writing score': 93}, {'gender': 'male', 'race': 'group A', 'math score': 47, 'reading score': 57, 'writing score': 44}, {'gender': 'male', 'race': 'group C', 'math score': 76, 'reading score': 78, 'writing score': 75}, {'gender': 'female', 'race': 'group B', 'math score': 71, 'reading score': 83, 'writing score': 78}, {'gender': 'female', 'race': 'group B', 'math score': 88, 'reading score': 95, 'writing score': 92}, {'gender': 'male', 'race': 'group B', 'math score': 40, 'reading score': 43, 'writing score': 39}, {'gender': 'male', 'race': 'group D', 'math score': 64, 'reading score': 64, 'writing score': 67}, {'gender': 'female', 'race': 'group B', 'math score': 38, 'reading score': 60, 'writin

In [None]:
vect = DictVectorizer(sparse=False)
x = vect.fit_transform(X_dict)
x

array([[ 1.,  0., 72., ...,  0., 72., 74.],
       [ 1.,  0., 69., ...,  0., 90., 88.],
       [ 1.,  0., 90., ...,  0., 95., 93.],
       ...,
       [ 1.,  0., 59., ...,  0., 71., 65.],
       [ 1.,  0., 68., ...,  0., 78., 77.],
       [ 1.,  0., 77., ...,  0., 86., 86.]])

Carga de dados

90% da base para treinamento (900 registros)

10% da base para teste (100 regisros)

In [None]:
le = LabelEncoder()
y = le.fit_transform(base["test preparation course"])

class_names = le.classes_
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42, test_size=0.1)

### Árvore de decisão



In [None]:
arvore = DecisionTreeClassifier(random_state=0)
arvore = arvore.fit(x_train, y_train)
y_pred = arvore.predict(x_test)

In [None]:
print("Classificador Árvore de Decisão:\n")
print("--------------------------------------")
print("Acurácia da base de treinamento: {:.2f}".format(arvore.score(x_train, y_train)))
print("--------------------------------------")

print("--------------------------------------")
print("Acurácia da base de teste: {:.4f}".format(arvore.score(x_test, y_test)))
print("--------------------------------------")

print(classification_report(y_test, y_pred, target_names=class_names))

arvore_matrix = confusion_matrix(y_test, y_pred)
arvore_table = pd.DataFrame(data=arvore_matrix, index=class_names, columns=[x + "(prev)" for x in class_names])
print(arvore_table)


Classificador Árvore de Decisão:

--------------------------------------
Acurácia da base de treinamento: 1.00
--------------------------------------
--------------------------------------
Acurácia da base de teste: 0.6200
--------------------------------------
              precision    recall  f1-score   support

   completed       0.48      0.41      0.44        37
        none       0.68      0.75      0.71        63

    accuracy                           0.62       100
   macro avg       0.58      0.58      0.58       100
weighted avg       0.61      0.62      0.61       100

           completed(prev)  none(prev)
completed               15          22
none                    16          47


In [None]:
tree_params = {"max_depth": np.arange(1, 20),
               "criterion": ["gini", "entropy"],
               "min_samples_split": np.arange(1, 20),
               "min_samples_leaf": np.arange(1, 20),
               }

In [None]:
arvore2 = random_search = RandomizedSearchCV(arvore,
                                   param_distributions=tree_params,
                                   n_iter=10,
                                   cv=5,
                                   random_state=0)

In [None]:
arvore2.fit(x_train, y_train)

RandomizedSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=0),
                   param_distributions={'criterion': ['gini', 'entropy'],
                                        'max_depth': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19]),
                                        'min_samples_leaf': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19]),
                                        'min_samples_split': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19])},
                   random_state=0)

In [None]:
arvore2.best_estimator_

DecisionTreeClassifier(criterion='entropy', max_depth=7, min_samples_leaf=11,
                       min_samples_split=11, random_state=0)

In [87]:
print("Classificador Árvore de Decisão:\n")
print("--------------------------------------")
print("Acurácia de treinamento:", arvore2.best_estimator_.score(x_train, y_train))
print("--------------------------------------")

print("--------------------------------------")
y_pred = arvore2.best_estimator_.predict(x_test)
print("Acurácia de previsão:", accuracy_score(y_test, y_pred))
print("--------------------------------------")

print(classification_report(y_test, y_pred, target_names=class_names))

arvore_matrix = confusion_matrix(y_test, y_pred)
arvore_table = pd.DataFrame(data=arvore_matrix, index=class_names, columns=[x + "(prev)" for x in class_names])
print(arvore_table)

Classificador Árvore de Decisão:

--------------------------------------
Acurácia de treinamento: 0.7677777777777778
--------------------------------------
--------------------------------------
Acurácia de previsão: 0.7
--------------------------------------
              precision    recall  f1-score   support

   completed       0.61      0.54      0.57        37
        none       0.75      0.79      0.77        63

    accuracy                           0.70       100
   macro avg       0.68      0.67      0.67       100
weighted avg       0.69      0.70      0.70       100

           completed(prev)  none(prev)
completed               20          17
none                    13          50


### Random Forest

In [None]:
floresta = RandomForestClassifier(random_state=0)
floresta = floresta.fit(x_train, y_train)
y_pred = floresta.predict(x_test)

In [None]:
print("Classificador Random Forest:\n")
print("--------------------------------------")
print("Acurácia da base de treinamento: {:.2f}".format(floresta.score(x_train, y_train)))
print("--------------------------------------")

print("--------------------------------------")
print("Acurácia da base de teste: {:.4f}".format(floresta.score(x_test, y_test)))
print("--------------------------------------")

print(classification_report(y_test, y_pred, target_names=class_names))

floresta_matrix = confusion_matrix(y_test, y_pred)
floresta_table = pd.DataFrame(data=floresta_matrix, index=class_names, columns=[x + "(prev)" for x in class_names])
print(floresta_table)


Classificador Random Forest:

--------------------------------------
Acurácia da base de treinamento: 1.00
--------------------------------------
--------------------------------------
Acurácia da base de teste: 0.6500
--------------------------------------
              precision    recall  f1-score   support

   completed       0.71      0.32      0.44        37
        none       0.70      0.92      0.79        63

    accuracy                           0.70       100
   macro avg       0.70      0.62      0.62       100
weighted avg       0.70      0.70      0.66       100

           completed(prev)  none(prev)
completed               12          25
none                     5          58


In [None]:
forest_params = {"max_depth": np.arange(1, 20),
               "criterion": ["gini", "entropy"],
               "min_samples_split": np.arange(1, 20),
               "min_samples_leaf": np.arange(1, 20),
               }

In [None]:
floresta2 = random_search = RandomizedSearchCV(floresta,
                                   param_distributions=forest_params,
                                   n_iter=10,
                                   cv=5,
                                   random_state=0)

In [None]:
floresta2.fit(x_train, y_train)

RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(random_state=0),
                   param_distributions={'criterion': ['gini', 'entropy'],
                                        'max_depth': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19]),
                                        'min_samples_leaf': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19]),
                                        'min_samples_split': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19])},
                   random_state=0)

In [None]:
floresta2.best_estimator_

RandomForestClassifier(criterion='entropy', max_depth=9, min_samples_leaf=6,
                       min_samples_split=4, random_state=0)

In [85]:
print("Classificador Random Forest:\n")
print("--------------------------------------")
print("Acurácia de treinamento:", floresta2.best_estimator_.score(x_train, y_train))
print("--------------------------------------")

print("--------------------------------------")
y_pred = floresta2.best_estimator_.predict(x_test)
print("Acurácia de previsão:", accuracy_score(y_test, y_pred))
print("--------------------------------------")

print(classification_report(y_test, y_pred, target_names=class_names))

floresta_matrix = confusion_matrix(y_test, y_pred)
floresta_table = pd.DataFrame(data=floresta_matrix, index=class_names, columns=[x + "(prev)" for x in class_names])
print(floresta_table)

Classificador Random Forest:

--------------------------------------
Acurácia de treinamento: 0.7988888888888889
--------------------------------------
--------------------------------------
Acurácia de previsão: 0.7
--------------------------------------
              precision    recall  f1-score   support

   completed       0.71      0.32      0.44        37
        none       0.70      0.92      0.79        63

    accuracy                           0.70       100
   macro avg       0.70      0.62      0.62       100
weighted avg       0.70      0.70      0.66       100

           completed(prev)  none(prev)
completed               12          25
none                     5          58
