In [2]:
import pandas as pd
import numpy as np

In [3]:
# Toy records that mix several spellings of "missing": np.nan, None and the
# placeholder strings 'NA' / 'Missing'. Ages are deliberately kept as strings.
people = dict(
    first=['marym', 'sara', 'nada', 'aseil', np.nan, None, 'NA'],
    last=['mohamed', 'ahmed', 'zyad', 'yossef', np.nan, np.nan, 'Missing'],
    email=[
        'marym@gmail.com',
        'sara@gmail.com',
        'nada@gmail.com',
        None,
        np.nan,
        'shada@gamil.com',
        'NA',
    ],
    age=['20', '35', '40', '36', None, None, 'Missing'],
)

In [16]:
# Build the frame and turn the placeholder strings 'NA' and 'Missing' into
# real NaN values in a single pass. replace() returns a new frame, so nothing
# is mutated in place and re-running the cell always gives the same result.
df = pd.DataFrame(people).replace(['NA', 'Missing'], np.nan)

In [17]:
# Show the frame; the 'NA' / 'Missing' placeholders now appear as missing values.
df

Unnamed: 0,first,last,email,age
0,marym,mohamed,marym@gmail.com,20.0
1,sara,ahmed,sara@gmail.com,35.0
2,nada,zyad,nada@gmail.com,40.0
3,aseil,yossef,,36.0
4,,,,
5,,,shada@gamil.com,
6,,,,


# Drop null values

In [18]:
# Drop every row that has a missing value in at least one of its columns.
df.dropna()

Unnamed: 0,first,last,email,age
0,marym,mohamed,marym@gmail.com,20
1,sara,ahmed,sara@gmail.com,35
2,nada,zyad,nada@gmail.com,40


In [19]:
# Drop every row that has a missing value in at least one of its columns.
# axis='index' and how='any' are the defaults; they are spelled out here so
# they can be changed.
df.dropna(how='any', axis='index')

Unnamed: 0,first,last,email,age
0,marym,mohamed,marym@gmail.com,20
1,sara,ahmed,sara@gmail.com,35
2,nada,zyad,nada@gmail.com,40


In [20]:
# Drop a column only when every value in it is missing.
# No column here is completely empty, so all four columns are kept.
df.dropna(how='all', axis='columns')

Unnamed: 0,first,last,email,age
0,marym,mohamed,marym@gmail.com,20.0
1,sara,ahmed,sara@gmail.com,35.0
2,nada,zyad,nada@gmail.com,40.0
3,aseil,yossef,,36.0
4,,,,
5,,,shada@gamil.com,
6,,,,


In [21]:
# Drop every column that has a missing value in at least one of its rows.
# Each column here has at least one missing value, so all the data is dropped
# and only the index remains.
df.dropna(how='any', axis='columns')

0
1
2
3
4
5
6


In [22]:
# Look only at the 'email' column: drop the rows whose email is missing.
df.dropna(subset=['email'], how='any', axis='index')

Unnamed: 0,first,last,email,age
0,marym,mohamed,marym@gmail.com,20.0
1,sara,ahmed,sara@gmail.com,35.0
2,nada,zyad,nada@gmail.com,40.0
5,,,shada@gamil.com,


In [23]:
# Drop a row when either 'email' or 'last' is missing (how='any' applied to
# the listed subset of columns only).
df.dropna(subset=['email', 'last'], how='any', axis='index')

Unnamed: 0,first,last,email,age
0,marym,mohamed,marym@gmail.com,20
1,sara,ahmed,sara@gmail.com,35
2,nada,zyad,nada@gmail.com,40


# Display null values

In [24]:
# Boolean mask of the frame: True marks a cell that holds a missing value.
df.isna()

Unnamed: 0,first,last,email,age
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,True,False
4,True,True,True,True
5,True,True,False,True
6,True,True,True,True


In [25]:
# isnull() is an alias of isna(), so it produces the same boolean mask.
df.isnull()

Unnamed: 0,first,last,email,age
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,True,False
4,True,True,True,True
5,True,True,False,True
6,True,True,True,True


In [26]:
# Summing the boolean mask counts the missing values in each column.
df.isna().sum()

first    3
last     3
email    3
age      3
dtype: int64

# Fill null values

In [27]:
# Replace every missing value with the string 'MISSING'. This returns a new
# frame; df itself is not changed because the result is not assigned.
df.fillna('MISSING')

Unnamed: 0,first,last,email,age
0,marym,mohamed,marym@gmail.com,20
1,sara,ahmed,sara@gmail.com,35
2,nada,zyad,nada@gmail.com,40
3,aseil,yossef,MISSING,36
4,MISSING,MISSING,MISSING,MISSING
5,MISSING,MISSING,shada@gamil.com,MISSING
6,MISSING,MISSING,MISSING,MISSING


In [28]:
# Replace every missing value with 0. Note that the integer 0 is also written
# into the text columns (first, last, email), not just into 'age'.
df.fillna(0)

Unnamed: 0,first,last,email,age
0,marym,mohamed,marym@gmail.com,20
1,sara,ahmed,sara@gmail.com,35
2,nada,zyad,nada@gmail.com,40
3,aseil,yossef,0,36
4,0,0,0,0
5,0,0,shada@gamil.com,0
6,0,0,0,0


# Data types

In [29]:
# Display the data type of each column.
df.dtypes

first    object
last     object
email    object
age      object
dtype: object

In [34]:
# np.nan is itself a float, which is why 'age' is cast to float (not int) below.
type(np.nan)

float

In [30]:
# Cast the 'age' strings to floating point so the column can be used in
# numeric calculations; the missing entries remain NaN.
df['age'] = df['age'].astype('float64')

In [31]:
# Check the column types again: 'age' is now float64, the rest are still object.
df.dtypes

first     object
last      object
email     object
age      float64
dtype: object

In [32]:
# mean() skips missing values by default: (20 + 35 + 40 + 36) / 4 = 32.75.
df['age'].mean()

32.75