### Reference: EDA and Data Preprocessing

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [21]:
# Small sample frame: columns A and B each contain two NaNs, C is fully populated.
df = pd.DataFrame(
    {
        'A': [1, 2, np.nan, 5, np.nan, 6],
        'B': [8, 9, np.nan, 6, 8, np.nan],
        'C': [5, 3, 9, 4, 1, 8],
    }
)
df

Unnamed: 0,A,B,C
0,1.0,8.0,5
1,2.0,9.0,3
2,,,9
3,5.0,6.0,4
4,,8.0,1
5,6.0,,8


In [6]:
# Detecting missing data: count the NaN entries in each column.
df.isna().sum()

A    2
B    2
C    0
dtype: int64

In [7]:
# Print a per-column summary (non-null counts and dtypes) plus memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A       4 non-null      float64
 1   B       4 non-null      float64
 2   C       6 non-null      int64  
dtypes: float64(2), int64(1)
memory usage: 208.0 bytes


In [8]:
# Handling missing data: drop every row that contains at least one NaN
# (default how='any'). A new frame is returned; df itself is not modified.
df.dropna()

Unnamed: 0,A,B,C
0,1.0,8.0,5
1,2.0,9.0,3
3,5.0,6.0,4


In [9]:
# Drop only rows in which *every* column is NaN. Column C is fully populated
# in the sample frame, so no row qualifies and all six rows are kept.
df.dropna(how='all')

Unnamed: 0,A,B,C
0,1.0,8.0,5
1,2.0,9.0,3
2,,,9
3,5.0,6.0,4
4,,8.0,1
5,6.0,,8


In [10]:
# Keep rows that have at least 2 non-NaN values; row 2 (only C present) is dropped.
df.dropna(thresh=2)

Unnamed: 0,A,B,C
0,1.0,8.0,5
1,2.0,9.0,3
3,5.0,6.0,4
4,,8.0,1
5,6.0,,8


In [16]:
# Only look at columns A and C when deciding which rows to drop; a NaN in B is ignored.
# NOTE(review): on the sample frame this should keep rows 0, 1, 3 and 5 (row 5 has
# A and C present). The saved output omits row 5, so it was probably captured
# against a different state of df -- re-run to refresh it.
df.dropna(subset=['A', 'C'])

Unnamed: 0,A,B,C
0,1.0,8.0,5
1,2.0,9.0,3
3,5.0,6.0,4


In [20]:
# Keep only the fully populated rows, bound to a new name rather than using
# inplace=True. The fillna / ffill / bfill examples that follow need `df` to
# still contain its NaNs, so mutating it here would break a top-to-bottom run.
df_complete_rows = df.dropna()
df_complete_rows

Unnamed: 0,A,B,C
0,1.0,8.0,5
1,2.0,9.0,3
3,5.0,6.0,4


In [23]:
# Replace every NaN with the constant 0; returns a new frame and leaves df unchanged.
df.fillna(0)

Unnamed: 0,A,B,C
0,1.0,8.0,5
1,2.0,9.0,3
2,0.0,0.0,9
3,5.0,6.0,4
4,0.0,8.0,1
5,6.0,0.0,8


In [24]:
# Forward fill: each NaN takes the last valid value above it in the same column.
# DataFrame.ffill() replaces fillna(method='ffill'), whose `method` keyword is
# deprecated in pandas 2.1+.
df.ffill()

Unnamed: 0,A,B,C
0,1.0,8.0,5
1,2.0,9.0,3
2,2.0,9.0,9
3,5.0,6.0,4
4,5.0,8.0,1
5,6.0,8.0,8


In [26]:
# Backward fill: each NaN takes the next valid value below it in the same column.
# A NaN in the last row (B, row 5) has nothing below it and therefore stays NaN.
# DataFrame.bfill() replaces fillna(method='bfill'), whose `method` keyword is
# deprecated in pandas 2.1+.
df.bfill()

Unnamed: 0,A,B,C
0,1.0,8.0,5
1,2.0,9.0,3
2,5.0,6.0,9
3,5.0,6.0,4
4,6.0,8.0,1
5,6.0,,8


In [27]:
# Per-column replacement values: NaNs in A become 0, in B become 2, in C become 4
# (C has no NaNs in the sample frame, so its entry has no visible effect).
values = {
    'A': 0,
    'B': 2,
    'C': 4,
}
df.fillna(values)

Unnamed: 0,A,B,C
0,1.0,8.0,5
1,2.0,9.0,3
2,0.0,2.0,9
3,5.0,6.0,4
4,0.0,8.0,1
5,6.0,2.0,8


In [28]:
# Mean imputation for column A: Series.mean() skips NaNs, so the fill value is
# the average of the observed entries only.
mean_of_a = df['A'].mean()
df['A'].fillna(mean_of_a)

0    1.0
1    2.0
2    3.5
3    5.0
4    3.5
5    6.0
Name: A, dtype: float64

In [30]:
# Heights to be rescaled; 'After Scaling' starts as an all-zero placeholder column
# that the scaling cells overwrite.
df = pd.DataFrame(
    {
        'Height': [50, 59, 70, 95, 80, 35, 40, 45, 85, 60],
        'After Scaling': [0] * 10,
    }
)
df

Unnamed: 0,Height,After Scaling
0,50,0
1,59,0
2,70,0
3,95,0
4,80,0
5,35,0
6,40,0
7,45,0
8,85,0
9,60,0


In [31]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scaling: map Height linearly so its minimum becomes 0 and its maximum 1.
# The scaler expects 2-D input, hence the double brackets around 'Height'.
scaler = MinMaxScaler()
df['After Scaling'] = scaler.fit_transform(df[['Height']])
df

Unnamed: 0,Height,After Scaling
0,50,0.25
1,59,0.4
2,70,0.583333
3,95,1.0
4,80,0.75
5,35,0.0
6,40,0.083333
7,45,0.166667
8,85,0.833333
9,60,0.416667


In [32]:
from sklearn.preprocessing import StandardScaler

# Standardisation: subtract the column mean and divide by its standard deviation,
# overwriting the previous 'After Scaling' values.
scaler = StandardScaler()
df['After Scaling'] = scaler.fit_transform(df[['Height']])
df

Unnamed: 0,Height,After Scaling
0,50,-0.621607
1,59,-0.151484
2,70,0.423111
3,95,1.729008
4,80,0.945469
5,35,-1.405145
6,40,-1.143966
7,45,-0.882786
8,85,1.206649
9,60,-0.099248


In [33]:
# Categorical sample frame: four distinct countries, two of which appear twice.
df = pd.DataFrame(
    {
        'Country': ['Japan', 'US', 'India', 'China', 'US', 'India'],
        'Population': [100, 200, 250, 300, 200, 250],
    }
)
df

Unnamed: 0,Country,Population
0,Japan,100
1,US,200
2,India,250
3,China,300
4,US,200
5,India,250


In [34]:
from sklearn.preprocessing import LabelEncoder

# Label encoding: replace each country name with an integer code. Codes follow
# the sorted class labels (China=0, India=1, Japan=2, US=3).
# Note that this overwrites the original string column in df.
encoder = LabelEncoder()
country_codes = encoder.fit_transform(df['Country'])
df['Country'] = country_codes
df

Unnamed: 0,Country,Population
0,2,100
1,3,200
2,1,250
3,0,300
4,3,200
5,1,250


In [35]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encoding: one 0/1 indicator column per distinct Country value.
encoder = OneHotEncoder()
# The encoder output is sparse by default; .toarray() densifies it so it can be
# wrapped in a DataFrame (columns are auto-numbered 0..n-1).
enc_df = pd.DataFrame(encoder.fit_transform(df[['Country']]).toarray())
# Drop indicator columns left behind by a previous run before joining, so that
# re-running this cell does not raise on overlapping column names.
df = df.drop(columns=enc_df.columns, errors='ignore').join(enc_df)
df

Unnamed: 0,Country,Population,0,1,2,3
0,2,100,0.0,0.0,1.0,0.0
1,3,200,0.0,0.0,0.0,1.0
2,1,250,0.0,1.0,0.0,0.0
3,0,300,1.0,0.0,0.0,0.0
4,3,200,0.0,0.0,0.0,1.0
5,1,250,0.0,1.0,0.0,0.0
