## Import the Packages

In [2]:
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
# Toy records: every column carries exactly one missing value (np.nan).
dict1 = dict(
    Names=['Suresh', 'Ramesh', 'Sathish', np.nan],
    Age=[24, 45, np.nan, 21],
    City=[np.nan, 'Hyd', 'Hyd', 'Blr'],
)
dict1

{'Names': ['Suresh', 'Ramesh', 'Sathish', nan],
 'Age': [24, 45, nan, 21],
 'City': [nan, 'Hyd', 'Hyd', 'Blr']}

In [9]:
# Turn the dict of equal-length lists into a DataFrame (one column per key).
df = pd.DataFrame(data=dict1)
df

Unnamed: 0,Names,Age,City
0,Suresh,24.0,
1,Ramesh,45.0,Hyd
2,Sathish,,Hyd
3,,21.0,Blr


In [13]:
# Boolean mask of the frame: True wherever a value is missing.
df.isna()

Unnamed: 0,Names,Age,City
0,False,False,True
1,False,False,False
2,False,True,False
3,True,False,False


In [15]:
# Count the missing values in each column.
df.isna().sum()

Names    1
Age      1
City     1
dtype: int64

# Dropna

- Drop the rows or columns that contain null values (`axis=0` drops rows, `axis=1` drops columns)

In [22]:
# Re-display the three-column frame as the starting point for the dropna examples.
df

Unnamed: 0,Names,Age,City
0,Suresh,24.0,
1,Ramesh,45.0,Hyd
2,Sathish,,Hyd
3,,21.0,Blr


In [28]:
# Drop every row that has at least one missing value (the defaults, spelled out).
df.dropna(axis=0, how='any')

Unnamed: 0,Names,Age,City
1,Ramesh,45.0,Hyd


In [30]:
# Drop every column that has at least one missing value.
# With this frame all three columns contain a NaN, so only the index remains.
df.dropna(axis='columns')

0
1
2
3


In [32]:
# Same toy records as before plus a fully populated 'Company' column,
# so that a column-wise dropna has something left to keep.
dict2 = dict(
    Names=['Suresh', 'Ramesh', 'Sathish', np.nan],
    Age=[24, 45, np.nan, 21],
    City=[np.nan, 'Hyd', 'Hyd', 'Blr'],
    Company=['Google', 'Microsoft', 'IBM', 'Apple'],
)
df = pd.DataFrame(data=dict2)

In [34]:
# Display the rebuilt frame, which now includes the 'Company' column.
df

Unnamed: 0,Names,Age,City,Company
0,Suresh,24.0,,Google
1,Ramesh,45.0,Hyd,Microsoft
2,Sathish,,Hyd,IBM
3,,21.0,Blr,Apple


In [36]:
# Drop every column that has at least one missing value;
# 'Company' is the only column without a NaN, so it is the only one kept.
df.dropna(axis='columns')

Unnamed: 0,Company
0,Google
1,Microsoft
2,IBM
3,Apple


# Fillna

In [40]:
# Show the current frame before the fillna examples.
df

Unnamed: 0,Names,Age,City,Company
0,Suresh,24.0,,Google
1,Ramesh,45.0,Hyd,Microsoft
2,Sathish,,Hyd,IBM
3,,21.0,Blr,Apple


- First select the column

- Then apply `fillna` to it

In [43]:
# Fill every missing cell in the whole frame with the constant 30.
# Note that this also lands in the text columns (Names, City).
df.fillna(value=30)

Unnamed: 0,Names,Age,City,Company
0,Suresh,24.0,30,Google
1,Ramesh,45.0,Hyd,Microsoft
2,Sathish,30.0,Hyd,IBM
3,30,21.0,Blr,Apple


In [47]:
# Select the 'Age' column first, then fill only its gaps with 30.
df.loc[:, 'Age'].fillna(value=30)

0    24.0
1    45.0
2    30.0
3    21.0
Name: Age, dtype: float64

**Drawback: filling missing values with an arbitrary constant is not a good approach**

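- To avoid that, pandas offers directional fill methods:

    - backfill : fill a gap with the next valid value

    - bfill : alias of backfill

    - ffill : fill a gap with the previous valid value

    - pad : alias of ffill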

In [52]:
# Show the frame again: the earlier fillna calls returned new objects and were
# not assigned back, so df still contains its missing values.
df

Unnamed: 0,Names,Age,City,Company
0,Suresh,24.0,,Google
1,Ramesh,45.0,Hyd,Microsoft
2,Sathish,,Hyd,IBM
3,,21.0,Blr,Apple


In [56]:
# Silence only FutureWarning instead of every warning, so genuine problems
# still surface. NOTE(review): presumably the filter was added to hide the
# pandas deprecation notices raised by the fill examples in this section.
warnings.filterwarnings('ignore', category=FutureWarning)

# Backward fill down each column: a gap takes the next valid value below it.
# `fillna(method='backfill')` is deprecated in recent pandas; `bfill()` is the
# supported spelling and yields the same result. A gap in the last row stays
# NaN because nothing follows it.
df.bfill()

Unnamed: 0,Names,Age,City,Company
0,Suresh,24.0,Hyd,Google
1,Ramesh,45.0,Hyd,Microsoft
2,Sathish,21.0,Hyd,IBM
3,,21.0,Blr,Apple


In [58]:
# Backward fill along each row: a gap takes the next valid value to its right.
# Uses `bfill` because `fillna(method='backfill')` is deprecated in recent pandas.
# Filling across columns mixes unrelated fields (e.g. a company name into City).
df.bfill(axis=1)

Unnamed: 0,Names,Age,City,Company
0,Suresh,24.0,Google,Google
1,Ramesh,45.0,Hyd,Microsoft
2,Sathish,Hyd,Hyd,IBM
3,21.0,21.0,Blr,Apple


In [66]:
# 'bfill' is the short name for 'backfill': a gap takes the next valid value
# below it. Called as `df.bfill()` because the `method=` argument of fillna
# is deprecated in recent pandas.
df.bfill()

Unnamed: 0,Names,Age,City,Company
0,Suresh,24.0,Hyd,Google
1,Ramesh,45.0,Hyd,Microsoft
2,Sathish,21.0,Hyd,IBM
3,,21.0,Blr,Apple


In [68]:
# Row-wise backward fill: a gap takes the next valid value to its right.
# Called as `df.bfill(axis=1)` because the `method=` argument of fillna is
# deprecated in recent pandas.
df.bfill(axis=1)

Unnamed: 0,Names,Age,City,Company
0,Suresh,24.0,Google,Google
1,Ramesh,45.0,Hyd,Microsoft
2,Sathish,Hyd,Hyd,IBM
3,21.0,21.0,Blr,Apple


In [70]:
# Forward fill down each column: a gap takes the last valid value above it.
# `fillna(method='ffill')` is deprecated in recent pandas; `ffill()` is the
# supported spelling. A gap in the first row stays NaN because nothing precedes it.
df.ffill()

Unnamed: 0,Names,Age,City,Company
0,Suresh,24.0,,Google
1,Ramesh,45.0,Hyd,Microsoft
2,Sathish,45.0,Hyd,IBM
3,Sathish,21.0,Blr,Apple


In [72]:
# Forward fill along each row: a gap takes the last valid value to its left.
# Uses `ffill` because `fillna(method='ffill')` is deprecated in recent pandas.
# A gap in the first column stays NaN because nothing sits to its left.
df.ffill(axis=1)

Unnamed: 0,Names,Age,City,Company
0,Suresh,24.0,24.0,Google
1,Ramesh,45.0,Hyd,Microsoft
2,Sathish,Sathish,Hyd,IBM
3,,21.0,Blr,Apple


In [74]:
# 'pad' is another name for forward fill: a gap takes the last valid value
# above it. Called as `df.ffill()` because the `method=` argument of fillna
# is deprecated in recent pandas.
df.ffill()

Unnamed: 0,Names,Age,City,Company
0,Suresh,24.0,,Google
1,Ramesh,45.0,Hyd,Microsoft
2,Sathish,45.0,Hyd,IBM
3,Sathish,21.0,Blr,Apple


In [76]:
# Row-wise 'pad' (forward fill): a gap takes the last valid value to its left.
# Called as `df.ffill(axis=1)` because the `method=` argument of fillna is
# deprecated in recent pandas.
df.ffill(axis=1)

Unnamed: 0,Names,Age,City,Company
0,Suresh,24.0,24.0,Google
1,Ramesh,45.0,Hyd,Microsoft
2,Sathish,Sathish,Hyd,IBM
3,,21.0,Blr,Apple


# Mean - Median - Mode

- Numerical : Mean, Median

    - Outliers There     : Median

    - Outliers Not There : Mean

- Categorical : Mode

In [81]:
# Show the frame before the mean / median / mode imputation examples.
df

Unnamed: 0,Names,Age,City,Company
0,Suresh,24.0,,Google
1,Ramesh,45.0,Hyd,Microsoft
2,Sathish,,Hyd,IBM
3,,21.0,Blr,Apple


In [83]:
# Impute the numeric 'Age' column with its mean (NaN is skipped when averaging).
age_data = df.loc[:, 'Age']
age_data.fillna(value=age_data.mean())

0    24.0
1    45.0
2    30.0
3    21.0
Name: Age, dtype: float64

In [85]:
# Impute the categorical 'City' column with its most frequent value.
city_data = df['City']
# Series.mode() returns a Series (ties are possible), and fillna() aligns a
# Series argument on index labels, so passing it directly would only fill a
# gap in the row labelled 0. Take the first mode as a scalar so every gap is
# filled, wherever it sits.
city_data.fillna(city_data.mode().iloc[0])

0    Hyd
1    Hyd
2    Hyd
3    Blr
Name: City, dtype: object

In [87]:
# Mean of the known ages; missing entries are skipped (the default, spelled out).
age_data.mean(skipna=True)

30.0

In [89]:
# Median of the known ages; missing entries are skipped (the default, spelled out).
age_data.median(skipna=True)

24.0

In [95]:
# mode() returns a Series of the most frequent value(s); take the first as a scalar.
city_data.mode().iloc[0]

'Hyd'

# KNN Imputer

In [99]:
# List everything the scikit-learn imputation module exposes
# (the imputer classes appear first, followed by module internals).
import sklearn.impute as impute
dir(impute)

['KNNImputer',
 'MissingIndicator',
 'SimpleImputer',
 '__all__',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__getattr__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 '_base',
 '_knn',
 'typing']

In [101]:
from sklearn.impute import KNNImputer

# 4 samples x 3 features, with one missing entry in each of the first three rows.
X = [
    [1, 2, np.nan],
    [3, np.nan, 3],
    [np.nan, 60, 5],
    [8, 8, 7],
]

# Each gap becomes the plain (unweighted) average of that feature over the
# 2 nearest neighbours.
imputer = KNNImputer(weights='uniform', n_neighbors=2)
imputer.fit_transform(X)

array([[ 1. ,  2. ,  5. ],
       [ 3. , 31. ,  3. ],
       [ 5.5, 60. ,  5. ],
       [ 8. ,  8. ,  7. ]])

In [103]:
from sklearn.impute import KNNImputer

# Same 4 x 3 matrix as in the uniform-weights example.
X = [
    [1, 2, np.nan],
    [3, np.nan, 3],
    [np.nan, 60, 5],
    [8, 8, 7],
]

# Same 2 neighbours, but each one is weighted by the inverse of its distance,
# so the closer neighbour contributes more to the imputed value.
imputer = KNNImputer(weights='distance', n_neighbors=2)
imputer.fit_transform(X)

array([[ 1.        ,  2.        ,  3.93905505],
       [ 3.        , 31.        ,  3.        ],
       [ 3.25775362, 60.        ,  5.        ],
       [ 8.        ,  8.        ,  7.        ]])

In [117]:
# Square root of 2713 (2713 == 52**2 + 9).
# NOTE(review): looks like a scratch check related to the KNN example above - confirm or remove.
np.sqrt(2713)

52.08646657242167

In [115]:
# Scratch arithmetic: 52 squared plus 9.
52 * 52 + 9

2713