In [1]:
# 2 ways to handle NaN values: 
# 1) delete the particular row or column  (generally dont prefer)
# 2) fill the cell containing NaN values with some value, and that value can be :
#    a) mean of that column   (generally we prefer this)
#    b) most occuring value in the same column
#    c) mean of the values of that column that has the same flower_type (in case if the nan values belong to same flower_type)

In [2]:
import pandas as pd

In [5]:
iris = pd.read_csv('iris.data')

In [7]:
iris.columns = ['sl','sw','pl','pw',"flower_type"]

In [8]:
df = iris.copy()

In [10]:
df

Unnamed: 0,sl,sw,pl,pw,flower_type
0,4.9,3.0,1.4,0.2,Iris-setosa
1,4.7,3.2,1.3,0.2,Iris-setosa
2,4.6,3.1,1.5,0.2,Iris-setosa
3,5.0,3.6,1.4,0.2,Iris-setosa
4,5.4,3.9,1.7,0.4,Iris-setosa
...,...,...,...,...,...
144,6.7,3.0,5.2,2.3,Iris-virginica
145,6.3,2.5,5.0,1.9,Iris-virginica
146,6.5,3.0,5.2,2.0,Iris-virginica
147,6.2,3.4,5.4,2.3,Iris-virginica


In [11]:
# now since the data doesnot contain any nan values so we will make some values as nan, and for this we will use numpy
df.head()

Unnamed: 0,sl,sw,pl,pw,flower_type
0,4.9,3.0,1.4,0.2,Iris-setosa
1,4.7,3.2,1.3,0.2,Iris-setosa
2,4.6,3.1,1.5,0.2,Iris-setosa
3,5.0,3.6,1.4,0.2,Iris-setosa
4,5.4,3.9,1.7,0.4,Iris-setosa


In [12]:
import numpy as np

In [13]:
df.iloc[2:4,1:3] = np.nan
df.head()

Unnamed: 0,sl,sw,pl,pw,flower_type
0,4.9,3.0,1.4,0.2,Iris-setosa
1,4.7,3.2,1.3,0.2,Iris-setosa
2,4.6,,,0.2,Iris-setosa
3,5.0,,,0.2,Iris-setosa
4,5.4,3.9,1.7,0.4,Iris-setosa


In [14]:
df.describe()

Unnamed: 0,sl,sw,pl,pw
count,149.0,147.0,147.0,149.0
mean,5.848322,3.046939,3.806122,1.205369
std,0.828594,0.434048,1.750351,0.761292
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.4,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [15]:
# 1st way by dropping rows containing nan values
df.dropna(inplace=True)
df.head()

Unnamed: 0,sl,sw,pl,pw,flower_type
0,4.9,3.0,1.4,0.2,Iris-setosa
1,4.7,3.2,1.3,0.2,Iris-setosa
4,5.4,3.9,1.7,0.4,Iris-setosa
5,4.6,3.4,1.4,0.3,Iris-setosa
6,5.0,3.4,1.5,0.2,Iris-setosa


In [16]:
# as we can see that label 2 and 3 got deleted 

In [17]:
df.reset_index(drop=True, inplace=True)
df.head()

Unnamed: 0,sl,sw,pl,pw,flower_type
0,4.9,3.0,1.4,0.2,Iris-setosa
1,4.7,3.2,1.3,0.2,Iris-setosa
2,5.4,3.9,1.7,0.4,Iris-setosa
3,4.6,3.4,1.4,0.3,Iris-setosa
4,5.0,3.4,1.5,0.2,Iris-setosa


In [23]:
# 2nd way by filling the mean values of the same column

df = iris.copy()
df.head()

Unnamed: 0,sl,sw,pl,pw,flower_type
0,4.9,3.0,1.4,0.2,Iris-setosa
1,4.7,3.2,1.3,0.2,Iris-setosa
2,4.6,3.1,1.5,0.2,Iris-setosa
3,5.0,3.6,1.4,0.2,Iris-setosa
4,5.4,3.9,1.7,0.4,Iris-setosa


In [24]:
df.iloc[2:4,1:3] = np.nan
df.head()

Unnamed: 0,sl,sw,pl,pw,flower_type
0,4.9,3.0,1.4,0.2,Iris-setosa
1,4.7,3.2,1.3,0.2,Iris-setosa
2,4.6,,,0.2,Iris-setosa
3,5.0,,,0.2,Iris-setosa
4,5.4,3.9,1.7,0.4,Iris-setosa


In [25]:
df.sw.fillna(df.sw.mean(), inplace=True)
df.head()

Unnamed: 0,sl,sw,pl,pw,flower_type
0,4.9,3.0,1.4,0.2,Iris-setosa
1,4.7,3.2,1.3,0.2,Iris-setosa
2,4.6,3.046939,,0.2,Iris-setosa
3,5.0,3.046939,,0.2,Iris-setosa
4,5.4,3.9,1.7,0.4,Iris-setosa


In [26]:
df.pl.fillna(df.pl.mean(), inplace=True)
df.head()

Unnamed: 0,sl,sw,pl,pw,flower_type
0,4.9,3.0,1.4,0.2,Iris-setosa
1,4.7,3.2,1.3,0.2,Iris-setosa
2,4.6,3.046939,3.806122,0.2,Iris-setosa
3,5.0,3.046939,3.806122,0.2,Iris-setosa
4,5.4,3.9,1.7,0.4,Iris-setosa
