# Different ways to fill missing values (NaN) in data

In [2]:
import pandas as pd
import numpy as np

In [16]:
# Farthest run per person, in miles; np.nan marks the single missing value.
miles = pd.DataFrame([50, 62, np.nan, 100, 26, 13, 31, 50], columns=['farthest_run_mi'])

In [18]:
# Show the frame; row 2 holds the missing value.
miles

Unnamed: 0,farthest_run_mi
0,50.0
1,62.0
2,
3,100.0
4,26.0
5,13.0
6,31.0
7,50.0


In [20]:
# Count the missing values in each column.
miles.isna().sum()

farthest_run_mi    1
dtype: int64

In [22]:
from sklearn.impute import SimpleImputer

In [24]:
# Imputer that replaces each missing entry with the mean of its column.
imp_mean = SimpleImputer(strategy='mean')

In [26]:
imp_mean.fit_transform(miles)   # The NaN in row 2 is replaced by the mean of the other seven values (about 47.43)

array([[ 50.        ],
       [ 62.        ],
       [ 47.42857143],
       [100.        ],
       [ 26.        ],
       [ 13.        ],
       [ 31.        ],
       [ 50.        ]])

In [28]:
# Imputer that replaces each missing entry with the median of its column.
imp_median = SimpleImputer(strategy='median')

In [32]:
imp_median.fit_transform(miles)   # The NaN in row 2 is replaced by the median of the other values, 50

array([[ 50.],
       [ 62.],
       [ 50.],
       [100.],
       [ 26.],
       [ 13.],
       [ 31.],
       [ 50.]])

In [34]:
imp_median = SimpleImputer(strategy='most_frequent')

In [36]:
imp_median.fit_transform(miles)

array([[ 50.],
       [ 62.],
       [ 50.],
       [100.],
       [ 26.],
       [ 13.],
       [ 31.],
       [ 50.]])

In [42]:
imp_median = SimpleImputer(strategy='constant', fill_value=13) # Missing value has been changed for 13 or fill_value

In [44]:
imp_median.fit_transform(miles)


array([[ 50.],
       [ 62.],
       [ 13.],
       [100.],
       [ 26.],
       [ 13.],
       [ 31.],
       [ 50.]])

In [50]:
# Single text column with one missing entry (np.nan) in row 4.
names = pd.DataFrame(['ryan', 'nolan', 'honus', 'wagner', np.nan, 'ruth'], columns=['names'])

In [52]:
# Show the frame; row 4 holds the missing name.
names

Unnamed: 0,names
0,ryan
1,nolan
2,honus
3,wagner
4,
5,ruth


In [54]:
# For a text column, a constant string can stand in for the missing entry.
imp_constant_cat = SimpleImputer(strategy='constant', fill_value = 'babe')

In [56]:
# The NaN in row 4 becomes 'babe'; the result is an object-dtype array.
imp_constant_cat.fit_transform(names)

array([['ryan'],
       ['nolan'],
       ['honus'],
       ['wagner'],
       ['babe'],
       ['ruth']], dtype=object)

In [58]:
# add_indicator=True appends a column that flags which rows were imputed.
imp_mean_marked = SimpleImputer(strategy='mean', add_indicator = True)

In [62]:
imp_mean_marked.fit_transform(miles) # The NaN is replaced by the mean, and the extra column is 1 for that row and 0 for all others

array([[ 50.        ,   0.        ],
       [ 62.        ,   0.        ],
       [ 47.42857143,   1.        ],
       [100.        ,   0.        ],
       [ 26.        ,   0.        ],
       [ 13.        ,   0.        ],
       [ 31.        ,   0.        ],
       [ 50.        ,   0.        ]])

In [None]:
# NOTE(review): machine-specific absolute path; Jooks.xlsx is later read relative to this directory,
# so the notebook only runs where this folder exists.
%cd "C:\\Users\\User\\OneDrive\\Töölaud\\Töö\\Port\\Pyhton\\Sklearn"

In [68]:
pip install xlrd

Note: you may need to restart the kernel to use updated packages.


In [74]:
# Load the runners table from the current working directory.
df = pd.read_excel("Jooks.xlsx")

In [76]:
# Each column has one missing value: farthest_run_mi in row 2 and Name in row 5.
df

Unnamed: 0,Name,farthest_run_mi
0,Ryan,50.0
1,Nolan,62.0
2,Walter,
3,Honus,100.0
4,Christy,26.0
5,,13.0
6,Napoleon,31.0
7,Tris,50.0


In [78]:
from sklearn.compose import make_column_transformer

In [80]:
# Apply a different imputer to each column: the constant 'babe' fill for Name
# and the mean fill for farthest_run_mi. remainder='drop' discards any column not listed.
ct = make_column_transformer(
    (imp_constant_cat, ['Name']),
    (imp_mean, ['farthest_run_mi']),
    remainder='drop'
)

In [82]:
# Ask the transformer to return a pandas DataFrame from its transform output.
ct.set_output(transform='pandas')

In [84]:
df_pandas = ct.fit_transform(df)

In [86]:
# Output columns are prefixed with the generated transformer names
# (simpleimputer-1__, simpleimputer-2__).
df_pandas

Unnamed: 0,simpleimputer-1__Name,simpleimputer-2__farthest_run_mi
0,Ryan,50.0
1,Nolan,62.0
2,Walter,47.428571
3,Honus,100.0
4,Christy,26.0
5,babe,13.0
6,Napoleon,31.0
7,Tris,50.0
