Generally, we don't leave missing data as NaN.  
We either:  
    1. Drop the NaN values  
    2. Fill the NaN values

In [18]:
import pandas as pd
import numpy as np

## Load the iris data set straight from the UCI repository.
## NOTE(review): no `header` argument is passed, so read_csv takes the first line of the
## file as the column names. The describe() count of 149 further down suggests the first
## record is being consumed as the header -- confirm against the raw file.
iris = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data")
iris 

##df = iris    ## Plain assignment does NOT make a copy: both names would refer to the
               ## same DataFrame object, so a change made through one name would also
               ## show up through the other, i.e. in the original dataframe.
        
df = iris.copy()  ## So we use copy(), which gives df its own data to work on.


In [19]:
df.columns = ['sl','sw','pl','pw', 'Flower_type']    ## replace the column labels with these five names.

In [20]:
df.head()

Unnamed: 0,sl,sw,pl,pw,Flower_type
0,4.9,3.0,1.4,0.2,Iris-setosa
1,4.7,3.2,1.3,0.2,Iris-setosa
2,4.6,3.1,1.5,0.2,Iris-setosa
3,5.0,3.6,1.4,0.2,Iris-setosa
4,5.4,3.9,1.7,0.4,Iris-setosa


In [21]:
## This dataframe does not contain any NaN yet, so we create some ourselves. Here we first
## overwrite a few cells with 0; the numpy constant 'nan' (np.nan) can be used the same way.
## iloc is position based and excludes the end of each slice, so this touches
## rows 2 and 3 of the columns at positions 1 and 2 (sw and pl).
df.iloc[2:4, 1:3] = 0   
df.head()

Unnamed: 0,sl,sw,pl,pw,Flower_type
0,4.9,3.0,1.4,0.2,Iris-setosa
1,4.7,3.2,1.3,0.2,Iris-setosa
2,4.6,0.0,0.0,0.2,Iris-setosa
3,5.0,0.0,0.0,0.2,Iris-setosa
4,5.4,3.9,1.7,0.4,Iris-setosa


In [22]:
df.iloc[2:4, 1:3] = np.nan   ## here we set rows 2 and 3 of sw and pl to NaN
df.head()

Unnamed: 0,sl,sw,pl,pw,Flower_type
0,4.9,3.0,1.4,0.2,Iris-setosa
1,4.7,3.2,1.3,0.2,Iris-setosa
2,4.6,,,0.2,Iris-setosa
3,5.0,,,0.2,Iris-setosa
4,5.4,3.9,1.7,0.4,Iris-setosa


In [23]:
## Now we summarise the numeric columns.
df.describe()

## As we can see, count is 147 for sw and pl but 149 for sl and pw: describe() counts
## only the non-NaN values, so the two NaN entries in each of those columns are left out.


Unnamed: 0,sl,sw,pl,pw
count,149.0,147.0,147.0,149.0
mean,5.848322,3.046939,3.806122,1.205369
std,0.828594,0.434048,1.750351,0.761292
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.4,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [24]:
## To handle NaN values we have two options.
## The first is dropna(), which drops every row that contains a NaN.
df.dropna()

## The call above returns a new dataframe without rows 2 and 3, but df itself has not
## been changed. To modify df directly we pass inplace = True.
df.dropna(inplace = True)
df.head()  ## rows 2 and 3 are now gone from df



Unnamed: 0,sl,sw,pl,pw,Flower_type
0,4.9,3.0,1.4,0.2,Iris-setosa
1,4.7,3.2,1.3,0.2,Iris-setosa
4,5.4,3.9,1.7,0.4,Iris-setosa
5,4.6,3.4,1.4,0.3,Iris-setosa
6,5.0,3.4,1.5,0.2,Iris-setosa


In [25]:
## Dropping rows left gaps in the index (0, 1, 4, ...), so we renumber it with
## reset_index(). drop = True discards the old index instead of keeping it as a column.
df.reset_index(drop = True, inplace = True)
df.head()
df.tail()  ## only this last expression is shown as the cell output

Unnamed: 0,sl,sw,pl,pw,Flower_type
142,6.7,3.0,5.2,2.3,Iris-virginica
143,6.3,2.5,5.0,1.9,Iris-virginica
144,6.5,3.0,5.2,2.0,Iris-virginica
145,6.2,3.4,5.4,2.3,Iris-virginica
146,5.9,3.0,5.1,1.8,Iris-virginica


In [26]:
## The second solution is to fill the NaN entries with a value of our choice.
df.iloc[2:4, 1:3] = np.nan   ## first we put NaN into rows 2 and 3 of sw and pl again
df.head()

Unnamed: 0,sl,sw,pl,pw,Flower_type
0,4.9,3.0,1.4,0.2,Iris-setosa
1,4.7,3.2,1.3,0.2,Iris-setosa
2,5.4,,,0.4,Iris-setosa
3,4.6,,,0.3,Iris-setosa
4,5.0,3.4,1.5,0.2,Iris-setosa


In [30]:
## Here we fill the NaN values of a column with the mean of that column.
df.sw.fillna(df.sw.mean())  ## returns the sw column with NaN replaced by the column mean.
## In the output we can see that the NaN entries of sw (rows 2 and 3) now show the mean value.
## Note: fillna() returns a new Series; df itself is only changed if the result is
## assigned back, e.g. df['sw'] = df.sw.fillna(df.sw.mean()).

0      3.000000
1      3.200000
2      3.038621
3      3.038621
4      3.400000
         ...   
142    3.000000
143    2.500000
144    3.000000
145    3.400000
146    3.000000
Name: sw, Length: 147, dtype: float64

In [31]:
## Same for pl: this returns the pl column with NaN replaced by the column mean.
## The result is not assigned back, so df.pl itself still contains the NaN values.
df.pl.fillna(df.pl.mean())


0      1.400000
1      1.300000
2      3.837241
3      3.837241
4      1.500000
         ...   
142    5.200000
143    5.000000
144    5.200000
145    5.400000
146    5.100000
Name: pl, Length: 147, dtype: float64

In [34]:
## Keep only the rows whose Flower_type is 'Iris-setosa'.
a = df[df.Flower_type =='Iris-setosa']
a.pl.mean()   
## The above gives the mean of the pl column for the rows where the flower type is
## 'Iris-setosa'. NaN entries are skipped by mean() by default.

1.4622222222222219

# Handling Strings in Data


In [40]:
## Add a string column to practise on: every row starts out as 'Female'.
df['Gender'] = 'Female'
## Gender is the column at position 5, so this sets the first ten rows to 'Male'.
df.iloc[0:10, 5] = 'Male'
df.head()

Unnamed: 0,sl,sw,pl,pw,Flower_type,Gender
0,4.9,3.0,1.4,0.2,Iris-setosa,Male
1,4.7,3.2,1.3,0.2,Iris-setosa,Male
2,5.4,3.038621,,0.4,Iris-setosa,Male
3,4.6,3.038621,,0.3,Iris-setosa,Male
4,5.0,3.4,1.5,0.2,Iris-setosa,Male


In [42]:
def f(s):
    """Encode a Gender value as a number: 0 for "Male", 1 for anything else."""
    return 0 if s == "Male" else 1
    
    
df['sex'] = df.Gender.apply(f)


## apply() calls the function 'f' on every value of the Gender column, one value
## at a time. The returned values are then stored in a newly created
## column 'sex'.


In [43]:
df.head()

## The above is one way to turn string entries into numeric entries.

Unnamed: 0,sl,sw,pl,pw,Flower_type,Gender,sex
0,4.9,3.0,1.4,0.2,Iris-setosa,Male,0
1,4.7,3.2,1.3,0.2,Iris-setosa,Male,0
2,5.4,3.038621,,0.4,Iris-setosa,Male,0
3,4.6,3.038621,,0.3,Iris-setosa,Male,0
4,5.0,3.4,1.5,0.2,Iris-setosa,Male,0
