In [101]:
import numpy as np
import pandas as pd
# Attribute names in file order; 'classify' is the first column of the file.
COLUMN_NAMES = [
    'classify',
    'cap-shape', 'cap-surface', 'cap-color',
    'bruises?', 'odor',
    'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
    'stalk-shape', 'stalk-root',
    'stalk-surface-above-ring', 'stalk-surface-below-ring',
    'stalk-color-above-ring', 'stalk-color-below-ring',
    'veil-type', 'veil-color',
    'ring-number', 'ring-type',
    'spore-print-color', 'population', 'habitat',
]

# The file is read without a header row; "?", "NULL" and "NaN" are treated as
# missing values so that pandas reports them as NaN.
data = pd.read_csv(
    'agaricus-lepiota.data',
    header=None,
    na_values=["?", "NULL", "NaN"],
)
data.columns = COLUMN_NAMES


In [102]:
# Preview the first rows of the freshly loaded table (values are still letter codes).
data.head()

Unnamed: 0,classify,cap-shape,cap-surface,cap-color,bruises?,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [103]:
from sklearn.preprocessing import LabelEncoder


# Label-encode every column to float codes so that the table is fully numeric.
# Missing entries stay NaN, and the fitted encoder of each column is stored so the
# codes can be mapped back to the original values later.
#
# NOTE: this cell replaces the columns of `data` with numeric codes. Re-running it
# without reloading the CSV would refit the encoders on those codes instead of on
# the original letters.
label_encoders = {}

for col in data.columns:
    le = LabelEncoder()

    # Rows that actually hold a value in this column.
    observed = data[col].notna().to_numpy()

    # Build the encoded column as a fresh float array (float, because an integer
    # column cannot hold NaN) and assign it as a whole. Writing integer codes
    # element-wise into the existing text column only works while that column
    # tolerates mixed value types.
    codes = np.full(len(data), np.nan)
    codes[observed] = le.fit_transform(data.loc[observed, col])
    data[col] = codes

    # Keep the fitted encoder for the later inverse transform.
    label_encoders[col] = le




In [104]:
# Preview the table after label encoding: the columns now hold float codes.
data.head()
# Disabled alternative check: per-column count of missing values.
#data.isnull().sum()

Unnamed: 0,classify,cap-shape,cap-surface,cap-color,bruises?,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,1.0,5.0,2.0,4.0,1.0,6.0,1.0,0.0,1.0,4.0,...,2.0,7.0,7.0,0.0,2.0,1.0,4.0,2.0,3.0,5.0
1,0.0,5.0,2.0,9.0,1.0,0.0,1.0,0.0,0.0,4.0,...,2.0,7.0,7.0,0.0,2.0,1.0,4.0,3.0,2.0,1.0
2,0.0,0.0,2.0,8.0,1.0,3.0,1.0,0.0,0.0,5.0,...,2.0,7.0,7.0,0.0,2.0,1.0,4.0,3.0,2.0,3.0
3,1.0,5.0,3.0,8.0,1.0,6.0,1.0,0.0,1.0,5.0,...,2.0,7.0,7.0,0.0,2.0,1.0,4.0,2.0,3.0,5.0
4,0.0,5.0,2.0,3.0,0.0,5.0,1.0,1.0,0.0,4.0,...,2.0,7.0,7.0,0.0,2.0,1.0,0.0,3.0,0.0,1.0


In [105]:
from sklearn.impute import KNNImputer

# Number of nearest rows whose codes are combined to fill a missing entry.
N_NEIGHBORS = 5

# Fit the imputer on the encoded table and fill its NaN entries.
# NOTE(review): every column of `data`, including 'classify', takes part in the
# neighbour search, and the filled values are numeric combinations of label codes.
# Confirm that this is intended for these nominal attributes.
imputer = KNNImputer(n_neighbors=N_NEIGHBORS)
data_imputed = imputer.fit(data).transform(data)


In [108]:
# Wrap the imputer output in a DataFrame again, restoring both the column names
# and the row index of `data` so that rows stay aligned with the source table.
data_imputed = pd.DataFrame(data_imputed, columns=data.columns, index=data.index)
# Per-column count of values that are still missing after imputation.
data_imputed.isna().sum()

classify                    0
cap-shape                   0
cap-surface                 0
cap-color                   0
bruises?                    0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64

In [109]:
# Map the (possibly fractional) imputed codes back to the original values:
# round to the nearest integer code, then apply the encoder fitted for that column.
for col in data_imputed.columns:
    nearest_codes = data_imputed[col].round().astype(int)
    data_imputed[col] = label_encoders[col].inverse_transform(nearest_codes)


In [110]:
# Preview the imputed table after the codes were mapped back to letters.
data_imputed.head()

Unnamed: 0,classify,cap-shape,cap-surface,cap-color,bruises?,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [None]:
# Optionally, add dictionaries that map the letter codes to readable names before overwriting mushroom_decoded.csv.

In [111]:
# NOTE(review): `DICT` is a pickle opcode constant and is not used in this cell;
# it looks like an accidental auto-import. It is kept only in case something
# outside this cell references it - delete once confirmed unused.
from pickle import DICT

# Letter code -> readable label, one mapping per decoded column.
DICT1 = {'e':'edible', 'p':'poisonous'}
DICT2 = {'a': 'almond', 'l': 'anise', 'c':'creosote','y':'fishy','f':'foul', 
         'm':'musty','n':'none', 'p':'pungent','s':'spicy'}
DICT3 = {'b':'bell', 'c':'conical', 'x':'convex', 'f': 'flat', 'k':'knobbed', 's':'sunken'}
DICT4 = {'f':'fibrous', 'g':'grooves', 'y':'scaly', 's':'smooth'}
DICT5 = {'a':'abundant','c':'clustered','n':'numerous',
         's':'scattered','v':'several','y':'solitary'}
DICT6 = {'b':'bulbous','c':'club','u':'cup','e':'equal', 'z':'rhizomorphs', 'r':'rooted'}

# Column name -> mapping to apply. Driving the replacement from one table keeps
# each column/mapping pair in a single place instead of six copy-pasted lines.
COLUMN_LABELS = {
    'classify': DICT1,
    'odor': DICT2,
    'cap-shape': DICT3,
    'cap-surface': DICT4,
    'population': DICT5,
    'stalk-root': DICT6,
}

# Replace the codes with their labels and store each column as a categorical.
for column, labels in COLUMN_LABELS.items():
    data_imputed[column] = data_imputed[column].replace(labels).astype('category')

In [112]:
# Preview the table after the selected columns were expanded to readable labels.
data_imputed.head()

Unnamed: 0,classify,cap-shape,cap-surface,cap-color,bruises?,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,poisonous,convex,smooth,n,t,pungent,f,c,n,k,...,s,w,w,p,w,o,p,k,scattered,u
1,edible,convex,smooth,y,t,almond,f,c,b,k,...,s,w,w,p,w,o,p,n,numerous,g
2,edible,bell,smooth,w,t,anise,f,c,b,n,...,s,w,w,p,w,o,p,n,numerous,m
3,poisonous,convex,scaly,w,t,pungent,f,c,n,n,...,s,w,w,p,w,o,p,k,scattered,u
4,edible,convex,smooth,g,f,none,f,w,b,k,...,s,w,w,p,w,o,e,n,abundant,g


In [None]:
# The agaricus-lepiota.names file states that column 11 (stalk-root) contains missing values marked "?".
# Unfortunately, data_imputed.isnull() did not detect these entries, even after applying DICT6,
# so na_values had to be set when loading the data.
# The observed values are label-encoded (the "?" entries stay NaN), the missing ones are then
# estimated with KNN, and the result is decoded back to labels (DICT6 for stalk-root).

In [113]:
# Write the imputed, decoded table to disk without the row index column.
decoded_csv_path = 'mushroom_decoded.csv'
data_imputed.to_csv(decoded_csv_path, index=False)