In [27]:
import pandas as pd
import numpy as np

# Base frame for the missing-data examples: the integers 0..14 laid out
# row-wise in a 5x3 grid, rows labelled a..e and columns labelled c1..c3.
df = pd.DataFrame(
    np.arange(15).reshape((5, 3)),
    index=list('abcde'),
    columns=[f'c{n}' for n in range(1, 4)],
)
df

Unnamed: 0,c1,c2,c3
a,0,1,2
b,3,4,5
c,6,7,8
d,9,10,11
e,12,13,14


In [28]:
# Introduce missing values into df.
# Order matters: 'c4' must exist before row 'f' is added, because the
# four values 15..18 are spread across the four columns c1..c4.
df['c4'] = np.nan                 # new column, entirely NaN
df.loc['f'] = np.arange(15, 19)   # new fully-populated row
df.loc['g'] = np.nan              # new row, entirely NaN
df['c5'] = np.nan                 # new column, entirely NaN (including rows f and g)
# Set a single cell with one .loc call. The chained form df['c4']['a'] = 20
# assigns into an intermediate Series, which triggers SettingWithCopyWarning
# and does not modify df when pandas Copy-on-Write is active.
df.loc['a', 'c4'] = 20
df

Unnamed: 0,c1,c2,c3,c4,c5
a,0.0,1.0,2.0,20.0,
b,3.0,4.0,5.0,,
c,6.0,7.0,8.0,,
d,9.0,10.0,11.0,,
e,12.0,13.0,14.0,,
f,15.0,16.0,17.0,18.0,
g,,,,,


In [30]:
# Boolean mask of the frame: True wherever a cell is missing (NaN)
df.isna()

Unnamed: 0,c1,c2,c3,c4,c5
a,False,False,False,False,True
b,False,False,False,True,True
c,False,False,False,True,True
d,False,False,False,True,True
e,False,False,False,True,True
f,False,False,False,False,True
g,True,True,True,True,True


In [31]:
# Number of missing values in each column (True counts as 1 when summed)
df.isna().sum(axis=0)

c1    1
c2    1
c3    1
c4    5
c5    7
dtype: int64

In [32]:
# Total number of missing values in the whole frame:
# per-column counts first, then the sum of those counts
df.isna().sum(axis=0).sum()

15

In [33]:
# Number of non-null values in each column
df.count(axis=0)

c1    6
c2    6
c3    6
c4    2
c5    0
dtype: int64

In [34]:
# Total number of non-null values across all columns
df.count(axis=0).sum()

20

In [37]:
# Alternative NaN count per column: number of rows minus non-null count
df.shape[0] - df.count()

c1    1
c2    1
c3    1
c4    5
c5    7
dtype: int64

In [38]:
# Alternative total NaN count: sum of (rows - non-null count) over the columns
(df.shape[0] - df.count()).sum()

15

In [40]:
#retrieve the values of c4 that are not NaN
# Row mask is True where c4 is present; .loc applies it and picks column c4
df.loc[df['c4'].notna(), 'c4']

a    20.0
f    18.0
Name: c4, dtype: float64

In [41]:
# Drop only the rows in which every value is NaN (df itself is left unchanged)
x = df.dropna(axis=0, how='all')
x

Unnamed: 0,c1,c2,c3,c4,c5
a,0.0,1.0,2.0,20.0,
b,3.0,4.0,5.0,,
c,6.0,7.0,8.0,,
d,9.0,10.0,11.0,,
e,12.0,13.0,14.0,,
f,15.0,16.0,17.0,18.0,


In [42]:
# Drop only the columns in which every value is NaN (df itself is left unchanged)
x = df.dropna(axis='columns', how='all')
x

Unnamed: 0,c1,c2,c3,c4
a,0.0,1.0,2.0,20.0
b,3.0,4.0,5.0,
c,6.0,7.0,8.0,
d,9.0,10.0,11.0,
e,12.0,13.0,14.0,
f,15.0,16.0,17.0,18.0
g,,,,


In [44]:
# Replace every NaN with 0.
# Work on a deep copy so the original df keeps its missing values;
# fillna returns a new frame, so df2 is not modified either.
df2 = df.copy(deep=True)
fill = df2.fillna(value=0)
fill

Unnamed: 0,c1,c2,c3,c4,c5
a,0.0,1.0,2.0,20.0,0.0
b,3.0,4.0,5.0,0.0,0.0
c,6.0,7.0,8.0,0.0,0.0
d,9.0,10.0,11.0,0.0,0.0
e,12.0,13.0,14.0,0.0,0.0
f,15.0,16.0,17.0,18.0,0.0
g,0.0,0.0,0.0,0.0,0.0


In [47]:
# Replacement values keyed by index label; used to fill NaNs label-by-label
fill_values = pd.Series({'a': 100, 'e': 101, 'g': 102})
fill_values

a    100
e    101
g    102
dtype: int64

In [48]:
# Fill NaNs in c4 from fill_values, matched on index label; labels that
# already hold a value or have no entry in fill_values are left untouched
df2['c4'].fillna(value=fill_values)

a     20.0
b      NaN
c      NaN
d      NaN
e    101.0
f     18.0
g    102.0
Name: c4, dtype: float64

In [49]:
# Fill each column's NaNs with that column's mean.
# The means are computed on df; df2 was created as a copy of df.
# A column that is entirely NaN has a NaN mean, so it stays NaN.
df2.fillna(value=df.mean())

Unnamed: 0,c1,c2,c3,c4,c5
a,0.0,1.0,2.0,20.0,
b,3.0,4.0,5.0,19.0,
c,6.0,7.0,8.0,19.0,
d,9.0,10.0,11.0,19.0,
e,12.0,13.0,14.0,19.0,
f,15.0,16.0,17.0,18.0,
g,7.5,8.5,9.5,19.0,


In [50]:
# Interpolation: estimate unknown values that fall between known values.
# Three NaNs sit between 1 and 2 and are filled with evenly spaced values.
s = pd.Series([1, *([np.nan] * 3), 2])
s.interpolate(method='linear')

0    1.00
1    1.25
2    1.50
3    1.75
4    2.00
dtype: float64

In [52]:
# Sample frame containing duplicate rows: (x,1), (y,3) and (y,4) each appear twice
df3 = pd.DataFrame({
    'a': list('xxxyyyy'),
    'b': [1, 1, 2, 3, 3, 4, 4],
})
df3

Unnamed: 0,a,b
0,x,1
1,x,1
2,x,2
3,y,3
4,y,3
5,y,4
6,y,4


In [53]:
# Flag duplicate rows: the first occurrence is False, later repeats are True
df3.duplicated(keep='first')

0    False
1     True
2    False
3    False
4     True
5    False
6     True
dtype: bool

In [54]:
# Remove duplicate rows, keeping the first occurrence of each (returns a new frame)
df3.drop_duplicates(subset=None, keep='first')

Unnamed: 0,a,b
0,x,1
2,x,2
3,y,3
5,y,4


In [57]:
# Mapping example: one record per person; note that 'age' is stored as strings
data = {
    'first_name': ['Ezra', 'Sherman', 'Martin', 'Ray', 'Joseph'],
    'last_name':  ['Odhiambo', 'Ochieng', 'Omondi', 'Odero', 'Onyango'],
    'age':        ['10', '20', '30', '40', '50'],
    'town':       ['Ahero', 'Eldoret', 'Kasarani', 'Mtwapa', 'Juja'],
}

# Column order follows the insertion order of the dict keys
df3 = pd.DataFrame(data, columns=list(data.keys()))
df3

Unnamed: 0,first_name,last_name,age,town
0,Ezra,Odhiambo,10,Ahero
1,Sherman,Ochieng,20,Eldoret
2,Martin,Omondi,30,Kasarani
3,Ray,Odero,40,Mtwapa
4,Joseph,Onyango,50,Juja


In [58]:
# Lookup table from town name to the county it belongs to
town_to_county = {
    'Ahero': 'Kisumu',
    'Eldoret': 'Uasin Gishu',
    'Kasarani': 'Nairobi',
    'Mtwapa': 'Mombasa',
    'Juja': 'Kiambu',
}

# Derive a 'county' column by looking each town up in the table
df3['county'] = df3.loc[:, 'town'].map(town_to_county)
df3

Unnamed: 0,first_name,last_name,age,town,county
0,Ezra,Odhiambo,10,Ahero,Kisumu
1,Sherman,Ochieng,20,Eldoret,Uasin Gishu
2,Martin,Omondi,30,Kasarani,Nairobi
3,Ray,Odero,40,Mtwapa,Mombasa
4,Joseph,Onyango,50,Juja,Kiambu


In [62]:
# Simple replace: swap every occurrence of the value 3 for 7.
# The series is printed before and after so both states are visible.
s = pd.Series(list(range(6)))
print(s)
s = s.replace(to_replace=3, value=7)
print(s)

0    0
1    1
2    2
3    3
4    4
5    5
dtype: int64
0    0
1    1
2    2
3    7
4    4
5    5
dtype: int64


In [63]:
# Student marks table used to demonstrate creating a column with a function:
# ids 1..10 paired with marks 10, 20, ..., 100
raw_data = {
    'student_id': list(range(1, 11)),
    'marks': list(range(10, 101, 10)),
}

# Column order follows the insertion order of the dict keys
df3 = pd.DataFrame(raw_data, columns=list(raw_data))
df3

Unnamed: 0,student_id,marks
0,1,10
1,2,20
2,3,30
3,4,40
4,5,50
5,6,60
6,7,70
7,8,80
8,9,90
9,10,100
