In [2]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys

In [31]:
# Load the raw feature table and the matching target table from the data folder.
X = pd.read_csv('../data/secondary_mushroom_features.csv')
y = pd.read_csv('../data/secondary_mushroom_targets.csv')
# Show both shapes side by side so the row counts can be compared at a glance.
(X.shape, y.shape)

((61069, 20), (61069, 1))

In [32]:
# Preview the first five feature rows.
X.head(n=5)

Unnamed: 0,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,stem-width,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
0,15.26,x,g,o,f,e,,w,16.95,17.09,s,y,w,u,w,t,g,,d,w
1,16.6,x,g,o,f,e,,w,17.99,18.19,s,y,w,u,w,t,g,,d,u
2,14.07,x,g,o,f,e,,w,17.8,17.74,s,y,w,u,w,t,g,,d,w
3,14.17,f,h,e,f,e,,w,15.77,15.98,s,y,w,u,w,t,p,,d,w
4,14.64,x,h,o,f,e,,w,16.53,17.2,s,y,w,u,w,t,p,,d,w


In [33]:
# Summarise the feature frame: one line per column with its non-null count and dtype.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61069 entries, 0 to 61068
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   cap-diameter          61069 non-null  float64
 1   cap-shape             61069 non-null  object 
 2   cap-surface           46949 non-null  object 
 3   cap-color             61069 non-null  object 
 4   does-bruise-or-bleed  61069 non-null  object 
 5   gill-attachment       51185 non-null  object 
 6   gill-spacing          36006 non-null  object 
 7   gill-color            61069 non-null  object 
 8   stem-height           61069 non-null  float64
 9   stem-width            61069 non-null  float64
 10  stem-root             9531 non-null   object 
 11  stem-surface          22945 non-null  object 
 12  stem-color            61069 non-null  object 
 13  veil-type             3177 non-null   object 
 14  veil-color            7413 non-null   object 
 15  has-ring           

In [34]:
# Count the missing values in every feature column.
X.isna().sum()


cap-diameter                0
cap-shape                   0
cap-surface             14120
cap-color                   0
does-bruise-or-bleed        0
gill-attachment          9884
gill-spacing            25063
gill-color                  0
stem-height                 0
stem-width                  0
stem-root               51538
stem-surface            38124
stem-color                  0
veil-type               57892
veil-color              53656
has-ring                    0
ring-type                2471
spore-print-color       54715
habitat                     0
season                      0
dtype: int64

In [35]:
# Share of missing values per column, expressed as a percentage
# (the mean of the boolean null mask is the fraction of nulls).
100 * X.isna().mean()

cap-diameter             0.000000
cap-shape                0.000000
cap-surface             23.121387
cap-color                0.000000
does-bruise-or-bleed     0.000000
gill-attachment         16.184971
gill-spacing            41.040462
gill-color               0.000000
stem-height              0.000000
stem-width               0.000000
stem-root               84.393064
stem-surface            62.427746
stem-color               0.000000
veil-type               94.797688
veil-color              87.861272
has-ring                 0.000000
ring-type                4.046243
spore-print-color       89.595376
habitat                  0.000000
season                   0.000000
dtype: float64

In [40]:
# Preview the first five target rows.
y.head(n=5)


Unnamed: 0,class
0,p
1,p
2,p
3,p
4,p


In [36]:
# Keep only the columns in which fewer than 20% of the values are missing;
# every column at or above that threshold is discarded.
X = X.loc[:, X.isna().mean() < 0.2]
# Confirm how many columns survived the filter.
X.shape

(61069, 13)

In [37]:
# Re-attach the targets to the features so that dropping a duplicated row
# also drops its label, keeping X and y row-aligned afterwards.
# pd.concat(axis=1) aligns on the index, not on position: if the two indexes
# differed, the result would be silently padded with NaN rows, so fail loudly.
assert X.index.equals(y.index), "X and y must share the same index before concatenation"
data = pd.concat([X, y], axis=1)
data.shape

(61069, 14)

In [38]:
# Percentage of rows that are exact repeats of an earlier row
# (features and target combined).
100 * data.duplicated().mean()

np.float64(0.27182367485958503)

In [39]:
# Remove repeated rows, retaining the first occurrence of each.
data = data.drop_duplicates(keep='first')
# Row count after de-duplication.
data.shape

(60903, 14)

In [41]:
# Separate the de-duplicated table back into features and target:
# X holds every column except 'class', y is the 'class' column itself.
X = data.drop(columns='class')
y = data['class']


In [42]:
# Split the features into numerical and non-numerical (categorical) columns.
# Selecting on 'number' and on its complement puts every column in exactly one
# of the two frames. An explicit whitelist such as ['object'] and
# ['int64', 'float64'] would silently leave out any column with another dtype
# (e.g. int32, bool, category or a dedicated string dtype).
X_num = X.select_dtypes(include='number')
X_cat = X.select_dtypes(exclude='number')
X_cat.shape, X_num.shape


((60903, 10), (60903, 3))

In [43]:
# Persist the results of this first cleaning pass to disk.
# index=False keeps the row index out of the written files.
for frame, csv_path in (
    (X_cat, '../data/cleaned_categorical_features(1).csv'),
    (X_num, '../data/cleaned_numerical_features(1).csv'),
    (y, "../data/secondary_mushroom_targets(1).csv"),
):
    frame.to_csv(csv_path, index=False)