In [53]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer

In [54]:
df = pd.read_csv("Data.csv")
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,Yes
1,Spain,27.0,48000.0,Yes
2,,30.0,54000.0,
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


## Null Values Handling - Numeric

### 1. Removing the Rows

In [41]:
df.dropna()

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,Yes
1,Spain,27.0,48000.0,Yes
3,Spain,38.0,61000.0,No
5,France,35.0,58000.0,Yes
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [42]:
# Average Age
avg_age = df['Age'].sum()/len(df['Age'])
avg_age

34.9

In [43]:
# Average Salary 
avg_salary = df['Salary'].sum()/len(df['Salary'])
avg_salary

57400.0

### 2. Using Imputers

In [44]:
df.iloc[:,:]

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,Yes
1,Spain,27.0,48000.0,Yes
2,,30.0,54000.0,
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [45]:
df.iloc[:,1:3]

Unnamed: 0,Age,Salary
0,44.0,72000.0
1,27.0,48000.0
2,30.0,54000.0
3,38.0,61000.0
4,40.0,
5,35.0,58000.0
6,,52000.0
7,48.0,79000.0
8,50.0,83000.0
9,37.0,67000.0


In [46]:
df.iloc[:,1:3].values

array([[4.4e+01, 7.2e+04],
       [2.7e+01, 4.8e+04],
       [3.0e+01, 5.4e+04],
       [3.8e+01, 6.1e+04],
       [4.0e+01,     nan],
       [3.5e+01, 5.8e+04],
       [    nan, 5.2e+04],
       [4.8e+01, 7.9e+04],
       [5.0e+01, 8.3e+04],
       [3.7e+01, 6.7e+04]])

In [47]:
imputer = SimpleImputer(missing_values=np.nan,strategy='most_frequent')
imputer.fit(df.iloc[:,1:3].values)
df.iloc[:,1:3] = imputer.transform(df.iloc[:,1:3].values)

In [48]:
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,Yes
1,Spain,27.0,48000.0,Yes
2,,30.0,54000.0,
3,Spain,38.0,61000.0,No
4,Germany,40.0,48000.0,Yes
5,France,35.0,58000.0,Yes
6,Spain,27.0,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


### Null Value Handling- Categorical 

In [49]:
imputer = SimpleImputer(missing_values=np.nan,strategy='most_frequent')
imputer.fit(df.iloc[:,:1].values)
df.iloc[:,:1] = imputer.transform(df.iloc[:,:1].values)

In [50]:
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,Yes
1,Spain,27.0,48000.0,Yes
2,France,30.0,54000.0,
3,Spain,38.0,61000.0,No
4,Germany,40.0,48000.0,Yes
5,France,35.0,58000.0,Yes
6,Spain,27.0,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [51]:
imputer = SimpleImputer(missing_values=np.nan,strategy='most_frequent')
imputer.fit(df.iloc[:,3:].values)
df.iloc[:,3:] = imputer.transform(df.iloc[:,3:].values)

In [52]:
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,Yes
1,Spain,27.0,48000.0,Yes
2,France,30.0,54000.0,Yes
3,Spain,38.0,61000.0,No
4,Germany,40.0,48000.0,Yes
5,France,35.0,58000.0,Yes
6,Spain,27.0,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [55]:
# We can directly replace nan with most_frequent value in each column for all columns irrespective of numeric or categorical  since strategy is same for both
# Taking whole dataframe
imputer = SimpleImputer(missing_values=np.nan,strategy='most_frequent')
imputer.fit(df.iloc[:,:].values)
df.iloc[:,:] = imputer.transform(df.iloc[:,:].values)

In [56]:
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,Yes
1,Spain,27.0,48000.0,Yes
2,France,30.0,54000.0,Yes
3,Spain,38.0,61000.0,No
4,Germany,40.0,48000.0,Yes
5,France,35.0,58000.0,Yes
6,Spain,27.0,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


### Null Values Handling on GooglePlaystore Dataset

In [57]:
df = pd.read_csv("googleplaystore.csv")
df.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


In [58]:
df.isnull().sum()

App                  0
Category             0
Rating            1474
Reviews              0
Size                 0
Installs             0
Type                 1
Price                0
Content Rating       1
Genres               0
Last Updated         0
Current Ver          8
Android Ver          3
dtype: int64

In [60]:
df.iloc[:,2:3]

Unnamed: 0,Rating
0,4.1
1,3.9
2,4.7
3,4.5
4,4.3
...,...
10836,4.5
10837,5.0
10838,
10839,4.5


In [61]:
imputer = SimpleImputer(missing_values=np.nan,strategy='mean')
imputer.fit(df.iloc[:,2:3].values)
df.iloc[:,2:3] = imputer.transform(df.iloc[:,2:3].values)

In [64]:
df.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


In [63]:
df.isnull().sum()

App               0
Category          0
Rating            0
Reviews           0
Size              0
Installs          0
Type              1
Price             0
Content Rating    1
Genres            0
Last Updated      0
Current Ver       8
Android Ver       3
dtype: int64

In [65]:
df = df.dropna()

In [68]:
df.isnull().sum()

App               0
Category          0
Rating            0
Reviews           0
Size              0
Installs          0
Type              0
Price             0
Content Rating    0
Genres            0
Last Updated      0
Current Ver       0
Android Ver       0
dtype: int64

In [69]:
len(df)

10829