**Declaração do problema: Valores Faltantes (missing)**

**Carregando o dataset:**

In [None]:
import pandas as pd

# Read cancer_de_mama.csv, parsing every "?" entry as a missing value (NaN).
# Alternative sentinel, if the file marks gaps differently: na_values=["NaN"]
df = pd.read_csv(
    "cancer_de_mama.csv",
    na_values=["?"],
)

# Count the missing values in each column (isna is the same check as isnull).
df.isna().sum()

FileNotFoundError: ignored

**Dividindo em X (features) e Y (labels)**

In [None]:
# Positional split: every column except the last is a feature (X),
# and the last column is the label (y).
X, y = df.iloc[:, :-1], df.iloc[:, -1]

**Split Dataset**

In [None]:
from sklearn.model_selection import train_test_split

# Holdout split: 30% of the rows go to the test set. The split is stratified
# on y so both parts keep the label proportions, and the fixed random_state
# makes it reproducible.
train, test, train_labels, test_labels = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=12,
    stratify=y,
)

**Trabalhando com Valores Faltantes (missing) - Com Holdout**

In [None]:
import numpy as np
from sklearn.impute import SimpleImputer

# Mean imputation for NaN entries. Other options for `strategy`:
# 'median', 'most_frequent', or 'constant' together with fill_value=0.
imp = SimpleImputer(missing_values=np.nan, strategy='mean')

# Learn the column means on the training split only, then reuse those same
# means on the test split so no test information leaks into the imputer.
train_filled = imp.fit_transform(train)
test_filled = imp.transform(test)

# Imputed means are rounded to whole numbers before being wrapped in frames.
# NOTE(review): the frames are built from bare arrays, so they get default
# 0..n-1 column and row labels instead of the labels of `train` / `test`.
new_train = pd.DataFrame(np.round(train_filled))
new_test = pd.DataFrame(np.round(test_filled))

In [None]:
# Display the imputed (and rounded) training features.
new_train

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,8.0,2.0,1.0,1.0,5.0,1.0,1.0,1.0,1.0
1,6.0,10.0,10.0,10.0,4.0,10.0,7.0,10.0,1.0
2,8.0,7.0,8.0,2.0,4.0,2.0,5.0,10.0,1.0
3,7.0,5.0,6.0,3.0,3.0,8.0,7.0,4.0,1.0
4,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...
484,3.0,2.0,2.0,2.0,2.0,1.0,3.0,2.0,1.0
485,4.0,1.0,1.0,1.0,2.0,1.0,2.0,1.0,1.0
486,3.0,2.0,2.0,3.0,2.0,1.0,1.0,1.0,1.0
487,10.0,10.0,10.0,7.0,9.0,10.0,7.0,10.0,10.0


In [None]:
# Sanity check: count the remaining missing values per column of the imputed training frame.
new_train.isna().sum()

0    0
1    0
2    0
3    0
4    0
5    0
6    0
7    0
8    0
dtype: int64

In [None]:
# Display the imputed (and rounded) test features.
new_test

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,10.0,10.0,10.0,1.0,6.0,1.0,2.0,8.0,1.0
1,3.0,3.0,6.0,4.0,5.0,8.0,4.0,4.0,1.0
2,2.0,1.0,1.0,1.0,2.0,1.0,3.0,1.0,1.0
3,2.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0
4,1.0,1.0,3.0,2.0,2.0,1.0,3.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...
205,5.0,5.0,7.0,8.0,6.0,10.0,7.0,4.0,1.0
206,4.0,1.0,1.0,1.0,2.0,1.0,3.0,6.0,1.0
207,9.0,5.0,5.0,2.0,2.0,2.0,5.0,1.0,1.0
208,4.0,2.0,4.0,3.0,2.0,2.0,2.0,1.0,1.0


In [None]:
# Sanity check: count the remaining missing values per column of the imputed test frame.
new_test.isna().sum()

0    0
1    0
2    0
3    0
4    0
5    0
6    0
7    0
8    0
dtype: int64

**Trabalhando com Valores Faltantes (missing) - Com cross-validation**

Usamos um Pipeline para definir o fluxo de modelagem: os dados passam primeiro pela transformação do imputador e depois são fornecidos ao modelo. Isso garante que, em cada dobra da validação cruzada, o imputador e o modelo sejam ajustados (fit) apenas com os dados de treinamento e avaliados nos dados de teste daquela dobra. Isso é importante para evitar vazamento de dados (data leakage).

In [None]:
from numpy import nan
from pandas import read_csv
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# Load the data with this cell's own `read_csv` import so the cell does not
# rely on names created by earlier cells; "?" entries are parsed as NaN.
dataset = read_csv("cancer_de_mama.csv", na_values=["?"])

# Build features/labels from the frame loaded above (previously these were
# taken from `df`, a variable left over from the first cell, so `dataset`
# was never actually used). Last column is the label, the rest are features.
X = dataset.iloc[:, :-1]
y = dataset.iloc[:, -1]

# The imputer is a pipeline step, so cross_val_score re-fits it on the
# training part of every fold and only applies it to the held-out part.
imputer = SimpleImputer(missing_values=nan, strategy='median')
lda = LinearDiscriminantAnalysis()
pipeline = Pipeline(steps=[('imputer', imputer), ('model', lda)])

# 3 shuffled folds with a fixed seed for reproducibility.
kfold = KFold(n_splits=3, shuffle=True, random_state=1)

# One accuracy score per fold; report their mean.
result = cross_val_score(pipeline, X, y, cv=kfold, scoring='accuracy')

print('Accuracy: %.3f' % result.mean())

Accuracy: 0.959
