# Avaliando a generalização de algoritmos
## Breast Cancer Wisconsin (Original) Dataset
Retirado de https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/

### 1 - Carregar uma base de dados de classificação

In [43]:
import pandas as pd
# Load the raw spreadsheet. header=None tells pandas not to treat the first row
# as column names, so the columns get integer labels instead.
df = pd.read_excel(io='breast_cancer_wisconsin.xlsx', header=None)

In [44]:
# Preview the first five rows to check the layout of the loaded sheet.
df.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [45]:
# Drop column 6 because (per the original author's note) it contains some
# missing values.
# The frame is rebound instead of mutated with inplace=True, and
# errors="ignore" is passed, so this cell is idempotent: running it again after
# the column is already gone is a no-op instead of raising KeyError.
df = df.drop(columns=6, errors="ignore")

In [46]:
# Summarise the remaining columns (dtype and non-null count per column).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 10 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   0       699 non-null    int64
 1   1       699 non-null    int64
 2   2       699 non-null    int64
 3   3       699 non-null    int64
 4   4       699 non-null    int64
 5   5       699 non-null    int64
 6   7       699 non-null    int64
 7   8       699 non-null    int64
 8   9       699 non-null    int64
 9   10      699 non-null    int64
dtypes: int64(10)
memory usage: 54.7 KB


In [47]:
# Feature matrix: every column except the first and the last.
# NOTE(review): column 0 looks like a sample identifier and the last column
# like the class label — confirm against the dataset description.
X = df.iloc[:, 1:-1].to_numpy()

# Binary target taken from the last column: values greater than 3 become 1,
# everything else 0.
# NOTE(review): presumably the classes are coded 2 (benign) / 4 (malignant),
# which would make 1 mean "malignant" — verify against the UCI documentation.
y = (df.iloc[:, -1] > 3).astype(int).to_numpy()

In [48]:
# Sanity check: X and y must have the same number of rows.
X.shape, y.shape

((699, 8), (699,))

### 2 - Comparar os classificadores Logistic Regression e KNN
#### 2.1 - Logistic Regression

In [49]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, cross_validate
from sklearn.metrics import make_scorer, mean_squared_error, accuracy_score
import numpy as np

In [50]:
# Standardise the features, then fit a logistic regression. Scaling is a
# pipeline step, so it is fitted only on the training portion of each split.
pipeline_logistic_regression = Pipeline([
    ("padronizacao", StandardScaler()),
    ("logistic_regression", LogisticRegression())
])

# Hyper-parameter grid: L2-regularised vs. unpenalised logistic regression.
# The unpenalised option is spelled `None`: the string 'none' was deprecated in
# scikit-learn 1.2 and removed in 1.4, where parameter validation rejects it.
# (`penalty=None` requires scikit-learn >= 1.2.)
parametros = {'logistic_regression__penalty': ['l2', None]}

# Inner model selection. On 0/1 labels the mean squared error equals the
# misclassification rate, so this scoring ranks candidates like accuracy would.
modelo = GridSearchCV(pipeline_logistic_regression, parametros, scoring='neg_mean_squared_error')

##### Usando como avaliação a raiz do erro quadrático médio (RMSE, com sinal negativo)

In [51]:
# Nested cross-validation: 10 outer folds, with the grid search in `modelo`
# re-run on the training part of each fold.
# The metric is the root mean squared error, negated so that larger is better.
# The built-in scorer name replaces
# make_scorer(mean_squared_error, greater_is_better=False, squared=False):
# the `squared` keyword was deprecated in scikit-learn 1.4 and removed in 1.6,
# while 'neg_root_mean_squared_error' yields the same values.
scores = cross_validate(modelo, X, y, scoring='neg_root_mean_squared_error', cv=10)
print(scores['test_score'])
compad = np.mean(scores['test_score'])
print(f"Com padronização: {compad}")

[-0.29277002 -0.26726124 -0.20701967 -0.31622777 -0.16903085 -0.26726124
 -0.11952286 -0.11952286 -0.16903085 -0.12038585]
Com padronização: -0.20480332163730072


##### Usando como avaliação o accuracy_score

In [52]:
# Same 10-fold evaluation of the logistic-regression grid search, now scored
# with accuracy (fraction of correctly classified samples per fold).
scores = cross_validate(estimator=modelo, X=X, y=y, cv=10, scoring='accuracy')
print(scores['test_score'])
# Average accuracy over the ten folds.
compad = scores['test_score'].mean()
print(f"Com padronização: {compad}")

[0.91428571 0.92857143 0.95714286 0.9        0.97142857 0.92857143
 0.98571429 0.98571429 0.97142857 0.98550725]
Com padronização: 0.9528364389233955


#### 2.2 - KNN

In [53]:
# Standardise the features before KNN: it is distance-based, so unscaled
# features with larger ranges would dominate the neighbour search.
pipeline_knn = Pipeline(steps=[
    ("padronizacao", StandardScaler()),
    ("knn", KNeighborsClassifier()),
])

# Candidate neighbourhood sizes: the odd values 3, 5, 7 and 9.
parametros = {'knn__n_neighbors': list(range(3, 10, 2))}

# Inner model selection over the number of neighbours.
modelo_knn = GridSearchCV(
    estimator=pipeline_knn,
    param_grid=parametros,
    scoring='neg_mean_squared_error',
)

##### Usando como avaliação a raiz do erro quadrático médio (RMSE, com sinal negativo)

In [54]:
# Nested cross-validation: 10 outer folds, with the grid search in
# `modelo_knn` re-run on the training part of each fold.
# The metric is the root mean squared error, negated so that larger is better.
# The built-in scorer name replaces
# make_scorer(mean_squared_error, greater_is_better=False, squared=False):
# the `squared` keyword was deprecated in scikit-learn 1.4 and removed in 1.6,
# while 'neg_root_mean_squared_error' yields the same values.
scores = cross_validate(modelo_knn, X, y, scoring='neg_root_mean_squared_error', cv=10)
print(scores['test_score'])
compad = np.mean(scores['test_score'])
print(f"Com padronização: {compad}")

[-0.29277002 -0.16903085 -0.16903085 -0.31622777 -0.16903085 -0.26726124
 -0.11952286 -0.11952286 -0.16903085 -0.12038585]
Com padronização: -0.19118140085492832


##### Usando como avaliação o accuracy_score

In [55]:
# Same 10-fold evaluation of the KNN grid search, now scored with accuracy
# (fraction of correctly classified samples per fold).
scores = cross_validate(estimator=modelo_knn, X=X, y=y, cv=10, scoring='accuracy')
print(scores['test_score'])
# Average accuracy over the ten folds.
compad = scores['test_score'].mean()
print(f"Com padronização: {compad}")

[0.91428571 0.97142857 0.97142857 0.9        0.97142857 0.92857143
 0.98571429 0.98571429 0.97142857 0.98550725]
Com padronização: 0.9585507246376812


Ambos os classificadores obtiveram resultados parecidos nas duas métricas em que foram avaliados, mas o KNN foi ligeiramente melhor (acurácia média de 0,959 contra 0,953 e RMSE médio de 0,191 contra 0,205).