In [16]:
import pandas as pd
import numpy as np

# Toy gradebook used throughout the notebook: one row per student (indexed by
# first name), an ID column and one column of marks per subject.
# np.nan marks an exam the student did not sit.
df = pd.DataFrame(
    data={
        'ID': ['H342', 'H543', 'H129', 'H309', 'H219'],
        'Mathematics': [np.nan, 18, np.nan, 19, 20],
        'Physics': [19, 18, np.nan, 10, np.nan],
        'Arts': [17, np.nan, 18, 10, 15],
    },
    index=['Glenn', 'Maria', 'Tim', 'Robert', 'Eric'],
)
df

Unnamed: 0,ID,Mathematics,Physics,Arts
Glenn,H342,,19.0,17.0
Maria,H543,18.0,18.0,
Tim,H129,,,18.0
Robert,H309,19.0,10.0,10.0
Eric,H219,20.0,,15.0


### Task1: Remove columns or rows with missing data

In [17]:
# Column-wise drop: with how='any' (the default) a column is removed as soon
# as it contains a single missing value, so only ID survives here.
t1 = df.dropna(axis='columns', how='any')
t1

Unnamed: 0,ID
Glenn,H342
Maria,H543
Tim,H129
Robert,H309
Eric,H219


In [18]:
# how='all' only removes a column when every one of its values is missing.
# No column in df is entirely empty, so the result keeps all four columns.
t2 = df.dropna(how='all', axis='columns')
t2

Unnamed: 0,ID,Mathematics,Physics,Arts
Glenn,H342,,19.0,17.0
Maria,H543,18.0,18.0,
Tim,H129,,,18.0
Robert,H309,19.0,10.0,10.0
Eric,H219,20.0,,15.0


In [19]:
# thresh is the minimum number of non-missing values a column must have in
# order to be kept, so thresh=4 keeps columns with AT LEAST 4 valid entries
# (not "more than 4"): Arts has exactly 4 and is kept, while Mathematics and
# Physics have only 3 each and are dropped.
t3 = df.dropna(axis=1, thresh=4)
t3

Unnamed: 0,ID,Arts
Glenn,H342,17.0
Maria,H543,
Tim,H129,18.0
Robert,H309,10.0
Eric,H219,15.0


In [20]:
# Row-wise drop restricted to two columns: a student is removed when either
# the Mathematics or the Arts mark is missing. Gaps in other columns are
# ignored, which is why Eric stays despite his missing Physics mark.
t4 = df.dropna(subset=['Mathematics', 'Arts'], axis='index')
t4

Unnamed: 0,ID,Mathematics,Physics,Arts
Robert,H309,19.0,10.0,10.0
Eric,H219,20.0,,15.0


### Task2: Filling missing data in the DataFrame

In [21]:
# Replace every missing value with 0. fillna returns a new frame and leaves df
# untouched; df.fillna(0, inplace=True) would overwrite df itself instead.
t1 = df.fillna(value=0)
t1

Unnamed: 0,ID,Mathematics,Physics,Arts
Glenn,H342,0.0,19.0,17.0
Maria,H543,18.0,18.0,0.0
Tim,H129,0.0,0.0,18.0
Robert,H309,19.0,10.0,10.0
Eric,H219,20.0,0.0,15.0


In [22]:
# Row-wise forward fill: each gap takes the nearest valid value to its left in
# the same row. limit=1 fills at most one cell per run of consecutive gaps, so
# Tim's Physics mark stays missing.
# NOTE: ID sits immediately to the left of Mathematics, so a missing
# Mathematics mark is filled with the student's ID string (see Glenn and Tim).
# Restrict the fill to the mark columns if that is not wanted.
t2 = df.ffill(limit=1, axis='columns')
t2

Unnamed: 0,ID,Mathematics,Physics,Arts
Glenn,H342,H342,19.0,17.0
Maria,H543,18.0,18.0,18.0
Tim,H129,H129,,18.0
Robert,H309,19.0,10.0,10.0
Eric,H219,20.0,20.0,15.0


In [23]:
# Row-wise backward fill: each gap takes the next valid value to its right in
# the same row. A gap in the last column has nothing to copy from, so Maria's
# Arts mark stays missing.
t3 = df.bfill(axis='columns')
t3

Unnamed: 0,ID,Mathematics,Physics,Arts
Glenn,H342,19.0,19.0,17.0
Maria,H543,18.0,18.0,
Tim,H129,18.0,18.0,18.0
Robert,H309,19.0,10.0,10.0
Eric,H219,20.0,15.0,15.0


In [25]:
# Show the original frame again: the fill calls above were assigned to new
# variables (no inplace=True), so df still contains its missing values.
df

Unnamed: 0,ID,Mathematics,Physics,Arts
Glenn,H342,,19.0,17.0
Maria,H543,18.0,18.0,
Tim,H129,,,18.0
Robert,H309,19.0,10.0,10.0
Eric,H219,20.0,,15.0


#### How to fill missing data is a decision that depends on your understanding of the dataset.
#### The cells below show more targeted fills (one column at a time); which one is appropriate again depends on your knowledge of the dataset.

In [26]:
# Fill only the Mathematics column with 0; a dict passed to fillna maps
# column name -> fill value, so gaps in Physics and Arts are left as they are.
# The result is a new frame, df itself is not modified.
df1 = df.fillna(value={'Mathematics': 0})
df1

Unnamed: 0,ID,Mathematics,Physics,Arts
Glenn,H342,0.0,19.0,17.0
Maria,H543,18.0,18.0,
Tim,H129,0.0,,18.0
Robert,H309,19.0,10.0,10.0
Eric,H219,20.0,,15.0


In [27]:
# Fill the gaps in Mathematics with the column mean. mean() ignores missing
# values, so it is the average of the marks that exist (18, 19, 20 -> 19.0).
# assign returns a new frame with the column replaced; df is not modified.
df2 = df.assign(
    Mathematics=lambda frame: frame['Mathematics'].fillna(frame['Mathematics'].mean())
)
df2

Unnamed: 0,ID,Mathematics,Physics,Arts
Glenn,H342,19.0,19.0,17.0
Maria,H543,18.0,18.0,
Tim,H129,19.0,,18.0
Robert,H309,19.0,10.0,10.0
Eric,H219,20.0,,15.0


#### Instead of fillna, replace can be used too

In [29]:
# replace works on any value in the frame, not just missing data.
# Used with np.nan as the target it gives the same result as df.fillna(1).
t1 = df.replace(to_replace=np.nan, value=1)
t1

Unnamed: 0,ID,Mathematics,Physics,Arts
Glenn,H342,1.0,19.0,17.0
Maria,H543,18.0,18.0,1.0
Tim,H129,1.0,1.0,18.0
Robert,H309,19.0,10.0,10.0
Eric,H219,20.0,1.0,15.0


In [30]:
# A dict performs several substitutions in one call, applied to every column:
# missing values become 0 and every mark of 10 becomes 1 (Robert's Physics
# and Arts).
t2 = df.replace(to_replace={np.nan: 0, 10: 1})
t2

Unnamed: 0,ID,Mathematics,Physics,Arts
Glenn,H342,0.0,19.0,17.0
Maria,H543,18.0,18.0,0.0
Tim,H129,0.0,0.0,18.0
Robert,H309,19.0,1.0,1.0
Eric,H219,20.0,0.0,15.0


In [31]:
# Same idea as a single-column fillna, written with replace: only the
# Mathematics gaps become 0, the other columns keep their missing values.
# assign returns a new frame with the column replaced; df is not modified.
df3 = df.assign(Mathematics=df['Mathematics'].replace(np.nan, 0))
df3

Unnamed: 0,ID,Mathematics,Physics,Arts
Glenn,H342,0.0,19.0,17.0
Maria,H543,18.0,18.0,
Tim,H129,0.0,,18.0
Robert,H309,19.0,10.0,10.0
Eric,H219,20.0,,15.0


### Task3: Calculations with missing data

In [32]:
# Average mark per student (one value per row). numeric_only=True leaves the
# ID strings out, and missing marks are skipped, so each average is taken over
# the exams that student has a mark for (Glenn: (19 + 17) / 2 = 18.0).
# sum or prod can be used in the same way.
t1 = df.mean(axis='columns', numeric_only=True)
t1

Glenn     18.0
Maria     18.0
Tim       18.0
Robert    13.0
Eric      17.5
dtype: float64

In [33]:
# Display the original marks once more as a reference for the practice
# questions that follow.
df

Unnamed: 0,ID,Mathematics,Physics,Arts
Glenn,H342,,19.0,17.0
Maria,H543,18.0,18.0,
Tim,H129,,,18.0
Robert,H309,19.0,10.0,10.0
Eric,H219,20.0,,15.0


### Two practice questions:
#### 1: Return the average mark for any student who missed the maths exam
#### 2: Maria missed the exam for Arts. What is the average mark for the rest of the class?