#handling missing data

In [14]:
import pandas as pd
import numpy as np
from numpy import nan as NA

data = pd.DataFrame([[1.,6.5,3.],[1,NA,NA],
                    [NA,NA,NA],[NA,6.5,3.]],
                    columns=list('abc'))

In [8]:
data

Unnamed: 0,a,b,c
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [12]:
#this wil drop all null values
cleaned = data.dropna()
#how =all will drop the rows that are all NA
cleaned = data.dropna(how='all')
cleaned

Unnamed: 0,a,b,c
0,1.0,6.5,3.0


In [10]:
#dropping columns that have  all null values
data.dropna(axis=1,how='all')

Unnamed: 0,a,b,c
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [11]:
display(data)

Unnamed: 0,a,b,c
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


# filling missing data

In [22]:
df = pd.DataFrame(np.random.randn(7,3))
df.iloc[:4,1]= NA
df.iloc[:2,2] = NA

In [24]:
#we will use fill na to fill all the NA values or 
#we can use fillna({1:0.5,2:0}) here col 1will be filled with 0.5 and sam for col 2

#we can also use ffil
filled=df.fillna({1:1.5,2:0})
display(filled)

Unnamed: 0,0,1,2
0,-2.816682,1.5,0.0
1,0.062613,1.5,0.0
2,-0.058849,1.5,0.88328
3,0.471983,1.5,-0.228606
4,0.33925,1.576027,-0.761771
5,-0.997838,-1.293263,-0.062828
6,0.222726,-2.565595,-0.338768


In [25]:
# removing duplicated

In [29]:
import pandas as pd
data=pd.DataFrame({'k1':['one','two']*3 + ['two'],
                  'k2':[1,1,2,3,3,4,4]})
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [34]:
data.duplicated()
const=data.drop_duplicates()
const
data['v1']=['one','two','one','four','one','six','two']
data.drop_duplicates(['k1','v1']) #onli k1 v1 col dup removed

Unnamed: 0,k1,k2,v1
0,one,1,one
1,two,1,two
3,two,3,four
5,two,4,six


In [36]:
display(data)

Unnamed: 0,k1,k2,v1
0,one,1,one
1,two,1,two
2,one,2,one
3,two,3,four
4,one,3,one
5,two,4,six
6,two,4,two


In [37]:
# replacing values

In [44]:
df = pd.DataFrame(np.random.randn(7,3))
df.iloc[:4,1]=np.nan
df.iloc[:2,2]=np.nan
display(df)
replaced=df.replace(np.nan,-999)
#display(replaced)
nrep=replaced.replace([-999,-9],[1.5,2])
display(nrep)

Unnamed: 0,0,1,2
0,0.248813,,
1,0.942901,,
2,-0.353971,,1.440715
3,-1.14387,,0.113702
4,-0.043395,0.424526,-0.29625
5,0.182007,2.419201,-0.559587
6,-0.386822,0.06386,-0.004732


Unnamed: 0,0,1,2
0,0.248813,1.5,1.5
1,0.942901,1.5,1.5
2,-0.353971,1.5,1.440715
3,-1.14387,1.5,0.113702
4,-0.043395,0.424526,-0.29625
5,0.182007,2.419201,-0.559587
6,-0.386822,0.06386,-0.004732


# Renaming indexes

In [54]:
data=pd.DataFrame((np.arange(12).reshape(3,4)),
index=['ohio','colorado' , 'new york'],
columns=['one','two','three','four'])
data

Unnamed: 0,one,two,three,four
ohio,0,1,2,3
colorado,4,5,6,7
new york,8,9,10,11


In [55]:
display(data)
transform=lambda x:x[:4].upper()
data=data.index.map(transform)
display(data)

Unnamed: 0,one,two,three,four
ohio,0,1,2,3
colorado,4,5,6,7
new york,8,9,10,11


Index(['OHIO', 'COLO', 'NEW '], dtype='object')

# detecting filtering outliers

In [58]:
data=pd.DataFrame(np.random.randn(1000,4))
data.describe()
#display(data)

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.040513,-0.012197,0.029685,0.058453
std,0.965534,1.008052,0.978591,0.996622
min,-3.725152,-3.760811,-2.792703,-3.713338
25%,-0.678241,-0.735214,-0.654189,-0.610792
50%,-0.040046,-0.043221,-0.022918,0.082359
75%,0.649059,0.706901,0.642401,0.728493
max,2.801725,4.169792,3.080636,3.055051


In [64]:
col=data[2]
col[np.abs(col)>3]

8    3.080636
Name: 2, dtype: float64

In [67]:
outliers = data[(np.abs(data)>3).any(1)]

In [69]:
len(outliers)

9

# permutation and random sampling

In [71]:
df =pd.DataFrame(np.arange(5*4).reshape((5,4)))
print(df.shape)

(5, 4)


In [72]:
display(df)

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


In [73]:
sampler =np.random.permutation(4)

In [74]:
sampler

array([0, 3, 1, 2])

In [75]:
df =df[sampler]

In [76]:
df

Unnamed: 0,0,3,1,2
0,0,3,1,2
1,4,7,5,6
2,8,11,9,10
3,12,15,13,14
4,16,19,17,18
