In [4]:
# Handling missing data
import pandas as pd
import numpy as np

In [5]:
# pandas excludes missing values by default when performing calculations.
# Build a small string Series with one missing entry (np.nan) to experiment on.
string_data = pd.Series(
    ["aardvark", "artichoke", np.nan, "avocado"],
)

In [6]:
# Inspect the Series; the entry at index 2 is the missing value (NaN).
string_data

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object

In [7]:
# Element-wise missing-value check: True where the entry is NaN.
string_data.isnull()

0    False
1    False
2     True
3    False
dtype: bool

In [8]:
# Boolean-mask indexing: keep only the entries that isnull() flags as missing.
string_data[string_data.isnull()]

2    NaN
dtype: object

In [9]:
from numpy import nan as NA

In [10]:
# Numeric Series with two missing entries (NA is numpy's nan); it is the
# input for the dropna() example that filters the missing data out.
data = pd.Series(
    [1, NA, 3.5, NA, 7],
)

In [11]:
# Return only the non-missing entries; the surviving values keep their
# original index labels (0, 2, 4).
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [12]:
# 4x3 frame mixing complete, partially missing and fully missing rows.
data = pd.DataFrame(
    [
        [1.0, 6.5, 3.0],  # complete row
        [1.0, NA, NA],    # two values missing
        [NA, NA, NA],     # entirely missing
        [NA, 6.5, 3.0],   # one value missing
    ]
)

In [13]:
# Inspect the frame: row 0 is complete, rows 1 and 3 are partially NaN,
# and row 2 is entirely NaN.
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [14]:
data.dropna(axis=0) 
# Drops every row that contains at least one NaN (the default is how='any'),
# so only the fully populated row 0 remains.
# To drop only rows that are entirely NaN, pass how='all'.

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [15]:
# axis=1 drops every column that contains at least one NaN (default how='any').
# Every column of `data` has a NaN, so no columns are left -- only the index.
# To drop only columns that are entirely NaN, pass how='all'.
data.dropna(axis=1)

0
1
2
3


In [16]:
# Seed NumPy's global generator so this cell produces the same frame on every
# Restart & Run All; without a seed the values change on each execution.
np.random.seed(42)
# 7x3 frame of standard-normal draws used for the dropna/fillna examples.
df = pd.DataFrame(np.random.randn(7, 3))

In [17]:
# Inspect the freshly generated 7x3 frame (no missing values yet).
df

Unnamed: 0,0,1,2
0,0.011111,0.53978,-1.424375
1,1.806644,0.866338,0.030414
2,1.39752,-1.820408,-0.735407
3,-1.098902,-0.319864,-0.519418
4,-0.18046,-0.763978,-1.121179
5,0.183997,2.020421,0.084681
6,0.82511,0.655836,0.047674


In [18]:
# Inject missing values in place (positional indexing):
# column 1 becomes NaN in rows 0-3 ...
df.iloc[:4, 1] = NA
# ... and column 2 becomes NaN in rows 0-1.
df.iloc[:2, 2] = NA

In [19]:
# Re-display df after the in-place assignment: column 1 is NaN in rows 0-3
# and column 2 is NaN in rows 0-1.
df

Unnamed: 0,0,1,2
0,0.011111,,
1,1.806644,,
2,1.39752,,-0.735407
3,-1.098902,,-0.519418
4,-0.18046,-0.763978,-1.121179
5,0.183997,2.020421,0.084681
6,0.82511,0.655836,0.047674


In [20]:
df.dropna() 
# axis defaults to 0 (rows): every row containing at least one NaN is dropped,
# leaving only the fully populated rows 4-6.

Unnamed: 0,0,1,2
4,-0.18046,-0.763978,-1.121179
5,0.183997,2.020421,0.084681
6,0.82511,0.655836,0.047674


In [21]:
# Same operation with the axis spelled out; the result matches df.dropna().
df.dropna(axis=0)

Unnamed: 0,0,1,2
4,-0.18046,-0.763978,-1.121179
5,0.183997,2.020421,0.084681
6,0.82511,0.655836,0.047674


In [22]:
# axis=1 works column-wise: columns 1 and 2 each contain a NaN and are
# dropped, so only column 0 remains.
df.dropna(axis=1)

Unnamed: 0,0
0,0.011111
1,1.806644
2,1.39752
3,-1.098902
4,-0.18046
5,0.183997
6,0.82511


In [23]:
df.dropna(axis=0,thresh=2) 
# thresh counts NON-missing values: a row is kept only if it has at least
# 2 non-NaN entries. With 3 columns this happens to drop rows holding 2 or
# more NaNs (rows 0 and 1), but the rule is about valid values, not NaN counts.

Unnamed: 0,0,1,2
2,1.39752,,-0.735407
3,-1.098902,,-0.519418
4,-0.18046,-0.763978,-1.121179
5,0.183997,2.020421,0.084681
6,0.82511,0.655836,0.047674


In [24]:
# Filling in missing values

In [25]:
# Starting point for the fill examples: df still holds its NaNs, because the
# dropna() calls above returned new frames and were never assigned back.
df

Unnamed: 0,0,1,2
0,0.011111,,
1,1.806644,,
2,1.39752,,-0.735407
3,-1.098902,,-0.519418
4,-0.18046,-0.763978,-1.121179
5,0.183997,2.020421,0.084681
6,0.82511,0.655836,0.047674


In [26]:
# Replace every NaN with the scalar 0; the result is a new frame and df itself
# is left unchanged.
df.fillna(0)

Unnamed: 0,0,1,2
0,0.011111,0.0,0.0
1,1.806644,0.0,0.0
2,1.39752,0.0,-0.735407
3,-1.098902,0.0,-0.519418
4,-0.18046,-0.763978,-1.121179
5,0.183997,2.020421,0.084681
6,0.82511,0.655836,0.047674


In [27]:
# Use a different fill value per column: the dict maps column label to its
# replacement (column 1 -> 0.5, column 2 -> 0). Column 0 is not listed and is
# left as is.
df.fillna({1: 0.5, 2: 0})

Unnamed: 0,0,1,2
0,0.011111,0.5,0.0
1,1.806644,0.5,0.0
2,1.39752,0.5,-0.735407
3,-1.098902,0.5,-0.519418
4,-0.18046,-0.763978,-1.121179
5,0.183997,2.020421,0.084681
6,0.82511,0.655836,0.047674


In [28]:
# DATA TRANSFORMATION

In [30]:
# Removing duplicates

In [31]:
# Example frame for the duplicate-row demonstrations: the last two rows are
# both ('two', 4).
data = pd.DataFrame(
    {
        "k1": ["one", "two"] * 3 + ["two"],
        "k2": [1, 1, 2, 3, 3, 4, 4],
    }
)

In [32]:
# Inspect the frame; rows 5 and 6 hold the same values ('two', 4).
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [33]:
# Flag duplicate rows: True marks a row whose values repeat an earlier row
# (here only row 6, which repeats row 5); the first occurrence stays False.
data.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool