In [32]:
import pandas as pd
import numpy as np
data=pd.Series([1.2,-3.5,np.nan,0,None])
data

0    1.2
1   -3.5
2    NaN
3    0.0
4    NaN
dtype: float64

In [33]:
data.isna() # both np.nan,none value are taken as missing value

0    False
1    False
2     True
3    False
4     True
dtype: bool

In [34]:
data.notna()

0     True
1     True
2    False
3     True
4    False
dtype: bool

In [35]:
data[data.notna()] #it returns the nonnull values and corresponding index values.

0    1.2
1   -3.5
3    0.0
dtype: float64

In [36]:
data.dropna()#it returns the Series with only the nonnull data and corresponding index values.

0    1.2
1   -3.5
3    0.0
dtype: float64

In [37]:
data=pd.DataFrame([[1.2,-3.5,np.nan,4],
                   [np.nan,0,7,None],
                   [4.7,7.4,-0.7,0.4]])
data

Unnamed: 0,0,1,2,3
0,1.2,-3.5,,4.0
1,,0.0,7.0,
2,4.7,7.4,-0.7,0.4


In [38]:
data.dropna(axis=1) #it will drop the nonnull value of column   

Unnamed: 0,1
0,-3.5
1,0.0
2,7.4


In [39]:
data.dropna(axis=0)#it will drop the nonnull value of row   

Unnamed: 0,0,1,2,3
2,4.7,7.4,-0.7,0.4


In [40]:
data.dropna(how='all')

Unnamed: 0,0,1,2,3
0,1.2,-3.5,,4.0
1,,0.0,7.0,
2,4.7,7.4,-0.7,0.4


In [41]:
data.dropna(how='any') #it will return the row where all value are nonnull.

Unnamed: 0,0,1,2,3
2,4.7,7.4,-0.7,0.4


In [42]:
data.dropna(thresh=3)#it will drop the row where at least 3 are non-null value.

Unnamed: 0,0,1,2,3
0,1.2,-3.5,,4.0
2,4.7,7.4,-0.7,0.4


In [43]:
data.dropna(thresh=4)# at least 4 value are non-value are present and it checks rowwise.

Unnamed: 0,0,1,2,3
2,4.7,7.4,-0.7,0.4


In [44]:
dic={0:99,1:999,2:9999} # it fill the null value according to the data given in dictionary
data.fillna(value=dic)

Unnamed: 0,0,1,2,3
0,1.2,-3.5,9999.0,4.0
1,99.0,0.0,7.0,
2,4.7,7.4,-0.7,0.4


# fillna

In [45]:
data.fillna(method='ffill')

  data.fillna(method='ffill')


Unnamed: 0,0,1,2,3
0,1.2,-3.5,,4.0
1,1.2,0.0,7.0,4.0
2,4.7,7.4,-0.7,0.4


In [46]:
data.fillna(0,limit=2) # it will fill the nan value with 0 in 0 and 1 index.

Unnamed: 0,0,1,2,3
0,1.2,-3.5,0.0,4.0
1,0.0,0.0,7.0,0.0
2,4.7,7.4,-0.7,0.4


In [51]:
data.fillna(method='ffill',limit=1)# it specified,this is the maximum number of consecutive NaN value to ffill and bffill

  data.fillna(method='ffill',limit=1)# it specified,this is the maximum number of consecutive NaN value to ffill and bffill


Unnamed: 0,0,1,2,3
0,1.2,-3.5,,4.0
1,1.2,0.0,7.0,4.0
2,4.7,7.4,-0.7,0.4


In [52]:
data.fillna(method='ffill',limit=2) # it checks column wise.

  data.fillna(method='ffill',limit=2)


Unnamed: 0,0,1,2,3
0,1.2,-3.5,,4.0
1,1.2,0.0,7.0,4.0
2,4.7,7.4,-0.7,0.4


In [48]:
data.fillna(method='bfill',limit=2) # it will check the o and 1 row

  data.fillna(method='bfill',limit=2) # it will check the o and 1 row


Unnamed: 0,0,1,2,3
0,1.2,-3.5,7.0,4.0
1,4.7,0.0,7.0,0.4
2,4.7,7.4,-0.7,0.4


# Data transformation

In [64]:
data1=pd.DataFrame({"c1":['a','b','a','b','a','b'],'c2':[1,1,1,2,2,2]})
data1

Unnamed: 0,c1,c2
0,a,1
1,b,1
2,a,1
3,b,2
4,a,2
5,b,2


In [65]:
data1.duplicated() # it will return boolean series true or false if row wise data repeted or not

0    False
1    False
2     True
3    False
4    False
5     True
dtype: bool

In [66]:
data1.drop_duplicates()#it will return the dataframe with rows where the duplicated array is false.it removes the repeted value.

Unnamed: 0,c1,c2
0,a,1
1,b,1
3,b,2
4,a,2


In [67]:
data1.fillna(method='ffill',limit=1)# it specified,this is the maximum number of consecutive NaN value to ffill and bffill

  data1.fillna(method='ffill',limit=1)# it specified,this is the maximum number of consecutive NaN value to ffill and bffill


Unnamed: 0,c1,c2
0,a,1
1,b,1
2,a,1
3,b,2
4,a,2
5,b,2


In [68]:
data1['c3']=['!','@','#','$','#','@']
data1.drop_duplicates('c3')

Unnamed: 0,c1,c2,c3
0,a,1,!
1,b,1,@
2,a,1,#
3,b,2,$


In [69]:
data1.drop_duplicates(subset=['c3'],keep='first')# it drop the first occurences of duplicates

Unnamed: 0,c1,c2,c3
0,a,1,!
1,b,1,@
2,a,1,#
3,b,2,$


In [70]:
data1.drop_duplicates(subset=['c3'],keep='last')# it drop the last occurences of duplicates

Unnamed: 0,c1,c2,c3
0,a,1,!
3,b,2,$
4,a,2,#
5,b,2,@


In [71]:
import pandas as pd 
import numpy as np
frame=pd.DataFrame(np.random.randn(4,3),columns=list('abc'),index=['w','x','y','z'])
frame

Unnamed: 0,a,b,c
w,0.920705,1.45395,-1.101107
x,0.601765,0.571386,-0.895126
y,-0.811973,0.561183,1.344552
z,-1.147386,-0.171267,0.032527


In [72]:
col=lambda x:x.max()
frame.apply(col)

a    0.920705
b    1.453950
c    1.344552
dtype: float64

In [75]:
index=lambda x:x.max()
frame.apply(index)

a    0.920705
b    1.453950
c    1.344552
dtype: float64

In [76]:
ele=lambda x:x**2
frame.applymap(ele)# it will works element wise on a dataframe

  frame.applymap(ele)


Unnamed: 0,a,b,c
w,0.847698,2.11397,1.212437
x,0.362121,0.326482,0.80125
y,0.6593,0.314926,1.807821
z,1.316495,0.029333,0.001058


In [77]:
frame['a'].map(ele)# it will return the columne 1st element wise

w    0.847698
x    0.362121
y    0.659300
z    1.316495
Name: a, dtype: float64

In [78]:
frame1=pd.DataFrame({'Name':['salman','aiswaraya','shahid','kareena'],'Height':[1.68,1.63,1.71,1.65],'Weight':[78,55,72,53]})
frame1

Unnamed: 0,Name,Height,Weight
0,salman,1.68,78
1,aiswaraya,1.63,55
2,shahid,1.71,72
3,kareena,1.65,53


In [83]:
maximum_height=frame1.loc[frame1['Height'].idxmax(),['Name']]
maximum_height

Name    shahid
Name: 2, dtype: object

In [85]:
ele= lambda x:x**2
x=frame1['Height'].map(ele)
y=frame1['Weight']
frame1['BMI']=y/x
frame1

Unnamed: 0,Name,Height,Weight,BMI
0,salman,1.68,78,27.636054
1,aiswaraya,1.63,55,20.700817
2,shahid,1.71,72,24.622961
3,kareena,1.65,53,19.467401


In [86]:
ow=lambda x:'Obessed' if x>20 else'Normal'
frame1['Category']=frame1['BMI'].map(ow)
frame1

Unnamed: 0,Name,Height,Weight,BMI,Category
0,salman,1.68,78,27.636054,Obessed
1,aiswaraya,1.63,55,20.700817,Obessed
2,shahid,1.71,72,24.622961,Obessed
3,kareena,1.65,53,19.467401,Normal


In [94]:
Name_Gen={'salman':'Male','aiswaraya':'Female','shahid':'Male','kareena':'Female'}
frame1["Gender"]=frame1['Name'].map(Name_Gen)
frame1

Unnamed: 0,Name,Height,Weight,BMI,Category,Gender
0,salman,1.68,78,27.636054,Obessed,Male
1,aiswaraya,1.63,55,20.700817,Obessed,Female
2,shahid,1.71,72,24.622961,Obessed,Male
3,kareena,1.65,53,19.467401,Normal,Female


In [95]:
data1.replace(to_replace="a",value='*') #single value replacement

Unnamed: 0,c1,c2,c3
0,*,1,!
1,b,1,@
2,*,1,#
3,b,2,$
4,*,2,#
5,b,2,@


In [96]:
data1.replace(to_replace=["a","b"],value='*') # multiple value replacement

Unnamed: 0,c1,c2,c3
0,*,1,!
1,*,1,@
2,*,1,#
3,*,2,$
4,*,2,#
5,*,2,@


In [97]:
data1.replace({'a':'*','b':'&'}) # 

Unnamed: 0,c1,c2,c3
0,*,1,!
1,&,1,@
2,*,1,#
3,&,2,$
4,*,2,#
5,&,2,@


In [98]:
data1.replace(to_replace=["a","b"],value=['*','&']) # replace in form of list 

Unnamed: 0,c1,c2,c3
0,*,1,!
1,&,1,@
2,*,1,#
3,&,2,$
4,*,2,#
5,&,2,@


In [99]:
data1.replace(to_replace={'c1':{'a':'*','b':'&'}})# replace in form of dictionary of dictionary


Unnamed: 0,c1,c2,c3
0,*,1,!
1,&,1,@
2,*,1,#
3,&,2,$
4,*,2,#
5,&,2,@


In [102]:
data1=pd.DataFrame({"c1":['a','b','a','b','a','b','a'],'c2':['a',1,1,1,2,2,2]},index=['d','s','f','g','h','j','k'])
data1

Unnamed: 0,c1,c2
d,a,a
s,b,1
f,a,1
g,b,1
h,a,2
j,b,2
k,a,2


In [103]:
def transform(x):
    return x.upper()
data1.index.map(transform)

Index(['D', 'S', 'F', 'G', 'H', 'J', 'K'], dtype='object')

In [105]:
data1.index=data1.index.map(transform)
data1

Unnamed: 0,c1,c2
D,a,a
S,b,1
F,a,1
G,b,1
H,a,2
J,b,2
K,a,2


In [106]:
data1.rename(index=str.title,columns=str.upper)

Unnamed: 0,C1,C2
D,a,a
S,b,1
F,a,1
G,b,1
H,a,2
J,b,2
K,a,2


In [108]:
data1.rename(str.lower,axis='columns')

Unnamed: 0,c1,c2
D,a,a
S,b,1
F,a,1
G,b,1
H,a,2
J,b,2
K,a,2


In [109]:
data1.rename(str.lower,axis='index')

Unnamed: 0,c1,c2
d,a,a
s,b,1
f,a,1
g,b,1
h,a,2
j,b,2
k,a,2


# Discretization

In [110]:
x=np.array([1,2,3,7,4])
pd.cut(x,bins=2)

[(0.994, 4.0], (0.994, 4.0], (0.994, 4.0], (4.0, 7.0], (0.994, 4.0]]
Categories (2, interval[float64, right]): [(0.994, 4.0] < (4.0, 7.0]]

In [111]:
x=np.array([1,2,3,7,4])
y=[0,2,6]
pd.cut(x,bins=y)

[(0.0, 2.0], (0.0, 2.0], (2.0, 6.0], NaN, (2.0, 6.0]]
Categories (2, interval[int64, right]): [(0, 2] < (2, 6]]