In [2]:
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.colors import ListedColormap

from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
# NOTE(review): plot_confusion_matrix and plot_roc_curve were removed in
# scikit-learn 1.2, so this line raises ImportError on newer releases.
# Neither name is used in the visible cells -- confirm the target version.
from sklearn.metrics import classification_report, plot_confusion_matrix, plot_roc_curve
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [34]:
# Load the banknote authentication data. The file is read without a header
# row (header=None), so the column names are supplied explicitly.
DATA_URL = "https://pycourse.s3.amazonaws.com/banknote_authentication.txt"
COLUMN_NAMES = ['variance', 'skewness', 'curtosis', 'entropy', 'class']

dataset = pd.read_csv(DATA_URL, header=None, names=COLUMN_NAMES)
dataset.head()

Unnamed: 0,variance,skewness,curtosis,entropy,class
0,3.6216,8.6661,-2.8073,-0.44699,0
1,4.5459,8.1674,-2.4586,-1.4621,0
2,3.866,-2.6383,1.9242,0.10645,0
3,3.4566,9.5228,-4.0112,-3.5944,0
4,0.32924,-4.4552,4.5718,-0.9888,0


In [35]:
# What is the size of this dataset (number of rows, number of columns)?
dataset.shape

(1372, 5)

In [13]:
# Which variable has the largest range (difference between its maximum and
# minimum value)? Computed per column.
col_max = dataset.max()
col_min = dataset.min()
col_max - col_min

variance    13.8669
skewness    26.7247
curtosis    23.2135
entropy     10.9977
class        1.0000
dtype: float64

In [14]:
# What is the mean of the skewness column?
dataset['skewness'].mean()

1.9223531206393603

In [15]:
# What is the mean of the entropy column?
dataset['entropy'].mean()

-1.191656520043731

In [17]:
 # What is the median of the variance column?
dataset['variance'].median()

0.49618

In [22]:
# What is the standard deviation of the curtosis column?
# Series.std() defaults to ddof=1, i.e. the sample standard deviation -- the
# same estimator as statistics.stdev -- without a mid-notebook import or a
# Python-level pass over the values.
dataset['curtosis'].std()

4.310030090106595

In [26]:
# What percentage of the examples in the dataset are fake banknotes (class=1)?
is_fake = dataset['class'] == 1
dataset_ced_1 = dataset[is_fake]

# Row-count ratio first, then scaled to a percentage.
fake_fraction = len(dataset_ced_1) / len(dataset)
fake_fraction * 100

44.460641399416915

In [28]:
 # What is the Pearson correlation between the skewness and curtosis variables?
 # (Series.corr uses the Pearson method by default.)
dataset['skewness'].corr(dataset['curtosis'])

-0.7868952243065797

In [65]:
# Prepare the data for the learning algorithms: the four measurement columns
# are the inputs and the class column is the model output.
feature_cols = ['variance', 'skewness', 'curtosis', 'entropy']
X = dataset.loc[:, feature_cols].to_numpy()
y = dataset.loc[:, 'class'].to_numpy()

In [72]:
# Split into train and test sets: 25% of the rows are held out, the split is
# stratified on y, and the seed is fixed so the split is reproducible.
split_options = dict(test_size=0.25, random_state=0, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [73]:
# Scale the variables. The scaler is fitted on the training split only and
# then applied to both splits, so test-set statistics never enter the fit.
# NOTE: X_train / X_test are overwritten with their scaled versions here.
train_as_float = X_train.astype(np.float64)
test_as_float = X_test.astype(np.float64)

sc = StandardScaler()
sc.fit(train_as_float)
X_train = sc.transform(train_as_float)
X_test = sc.transform(test_as_float)

In [74]:
# KNN
# k-nearest-neighbours classifier using the 5 nearest training points.
# Despite the `_reg` suffix in the name, this is a classifier, not a regressor.
knn_reg = KNeighborsClassifier(n_neighbors=5)
knn_reg.fit(X_train, y_train)

KNeighborsClassifier()

In [75]:
# Predict test-set labels with the KNN model.
# NOTE: `y_pred` is reassigned by every later model, so the report cell that
# follows must be run before any other predict cell.
y_pred = knn_reg.predict(X_test)

In [71]:
# What is the accuracy of KNN on the test set?
# NOTE(review): the saved output of this cell reports 412 test samples, while
# every later report shows 343, and this cell's execution count (71) precedes
# the split cell's (72). The saved numbers look stale -- re-run this cell
# after the current split before quoting them.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       232
           1       0.99      1.00      1.00       180

    accuracy                           1.00       412
   macro avg       1.00      1.00      1.00       412
weighted avg       1.00      1.00      1.00       412



In [78]:
# Decision tree
# Single decision-tree classifier with a fixed seed so the fitted tree is
# reproducible between runs.
clf_arvore = tree.DecisionTreeClassifier(random_state=1)

In [79]:
# Fit the decision tree on the (scaled) training split.
clf_arvore.fit(X_train, y_train)

DecisionTreeClassifier(random_state=1)

In [80]:
# Predict test-set labels with the decision tree (overwrites the previous y_pred).
y_pred = clf_arvore.predict(X_test)

In [81]:
# What is the accuracy of the Decision Tree on the test set?
# (y_pred holds the decision-tree predictions at this point.)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.98      0.98       191
           1       0.97      0.99      0.98       152

    accuracy                           0.98       343
   macro avg       0.98      0.98      0.98       343
weighted avg       0.98      0.98      0.98       343



In [82]:
# Random Forest
# Trees are limited to depth 8; the seed is fixed for reproducibility.
clf_floresta = RandomForestClassifier(max_depth=8, random_state=1)

In [83]:
# Fit the random forest on the (scaled) training split.
clf_floresta.fit(X_train, y_train)

RandomForestClassifier(max_depth=8, random_state=1)

In [85]:
# Predict test-set labels with the random forest (overwrites the previous y_pred).
y_pred = clf_floresta.predict(X_test)

In [86]:
# What is the accuracy of the Random Forest on the test set?
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.99      0.99       191
           1       0.99      1.00      0.99       152

    accuracy                           0.99       343
   macro avg       0.99      0.99      0.99       343
weighted avg       0.99      0.99      0.99       343



In [89]:
# Looking at the relative feature importances of the Random Forest
# (feature_importances_ attribute): which feature contributed most to
# predicting class, and what is the relative importance of skewness?
#
# The importances are labelled with the names of the first four dataset
# columns -- the same columns, in the same order, that X was built from --
# and sorted so the strongest contributor is listed first.
feature_names = dataset.columns[:4]
pd.Series(
    clf_floresta.feature_importances_,
    index=feature_names,
    name='importance',
).sort_values(ascending=False)

[0.52969205 0.23825188 0.1644553  0.06760077]


In [92]:
# SVM
# Support vector classifier with an RBF kernel. Per the scikit-learn docs,
# gamma='auto' sets gamma to 1 / n_features.
clf_svm = SVC(gamma='auto', kernel='rbf', random_state=1)

In [93]:
# Fit the SVM on the (scaled) training split.
clf_svm.fit(X_train, y_train)

SVC(gamma='auto', random_state=1)

In [94]:
# Predict test-set labels with the SVM (overwrites the previous y_pred).
y_pred = clf_svm.predict(X_test)

In [95]:
 # What is the accuracy of the SVM on the test set?
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       191
           1       1.00      1.00      1.00       152

    accuracy                           1.00       343
   macro avg       1.00      1.00      1.00       343
weighted avg       1.00      1.00      1.00       343



In [96]:
# MLP
# Multi-layer perceptron with a single hidden layer of 2 units, trained with
# the L-BFGS solver; the seed is fixed for reproducibility.
clf_mlp = MLPClassifier(hidden_layer_sizes=(2,), solver='lbfgs', random_state=1)

In [97]:
# Fit the MLP on the (scaled) training split.
clf_mlp.fit(X_train, y_train)

MLPClassifier(hidden_layer_sizes=(2,), random_state=1, solver='lbfgs')

In [98]:
# Predict test-set labels with the MLP (overwrites the previous y_pred).
y_pred = clf_mlp.predict(X_test)

In [99]:
# Precision / recall / F1 / accuracy of the MLP on the test set.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       191
           1       1.00      1.00      1.00       152

    accuracy                           1.00       343
   macro avg       1.00      1.00      1.00       343
weighted avg       1.00      1.00      1.00       343

