In [20]:
import pandas as pd

In [21]:
# Read the data:

# Define the following dataframe:
# ------------------------
# |  A  |  B  |  C  | D  |
# ------------------------
# |  1  |  6  | 11  |23  |
# |  2  | NaN | 12  |11  |
# | NaN |  8  | 11  |101 |
# |  4  | NaN | NaN |22  |
# |  5  | 10  | 11  |59  |
# ------------------------

df = pd.DataFrame({
    'A': [ 1,    2, None,    4,  5],
    'B': [ 6, None,    8, None, 10],
    'C': [11,   12,   11, None, 11],
     'D': [23,   11,   101, 22, 59]
})

df

Unnamed: 0,A,B,C,D
0,1.0,6.0,11.0,23
1,2.0,,12.0,11
2,,8.0,11.0,101
3,4.0,,,22
4,5.0,10.0,11.0,59


## Identifying and Counting Missing Values:

In [22]:
# use the existing function in dataframe to identify the missing values
missing = df.isna()
missing

Unnamed: 0,A,B,C,D
0,False,False,False,False
1,False,True,False,False
2,True,False,False,False
3,False,True,True,False
4,False,False,False,False


In [23]:
# Print the number of missing values per column:
missing_counts_per_column = df.isna().sum()
missing_counts_per_column

A    1
B    2
C    1
D    0
dtype: int64

In [24]:
# Print the number of missing values in all the dataframe:
missing_counts = df.isna().sum().sum()
missing_counts

np.int64(4)

## Methods to Deal with Missing Values:

## 1 - Dropping Missing Values:


In [25]:
# 1 - Dropping Missing Values:
df.dropna()

Unnamed: 0,A,B,C,D
0,1.0,6.0,11.0,23
4,5.0,10.0,11.0,59


## 2 - Imputing Missing Values:

### 2.1 - Mean Imputation:
Here we will try to use the mean over the first column 'A' and replace the missing values of this column with the mean in the original dataframe without needing to assign the result back to it


In [26]:
# 2 - Imputing Missing Values:
# 2.2 - Mean Imputation:
mean =  df['A'].mean()
df['A'].fillna(mean, inplace= True)
df

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['A'].fillna(mean, inplace= True)


Unnamed: 0,A,B,C,D
0,1.0,6.0,11.0,23
1,2.0,,12.0,11
2,3.0,8.0,11.0,101
3,4.0,,,22
4,5.0,10.0,11.0,59


Now we want to replace the missing values of second column 'B' with its mean also bu we want to assign the result back to it.

In [27]:
mean =  df['B'].mean()
df['B'] = df['B'].fillna(mean)


In [28]:
df

Unnamed: 0,A,B,C,D
0,1.0,6.0,11.0,23
1,2.0,8.0,12.0,11
2,3.0,8.0,11.0,101
3,4.0,8.0,,22
4,5.0,10.0,11.0,59


### 2.2 - Median Imputation:

Here we want to replace the missing values in the other columns using its median .

In [29]:
# 2 - Imputing Missing Values:
# 2.2 - Median Imputation:
median = df['C'].median()
# add your code here
df['C'] = df['C'].fillna(median)
df

Unnamed: 0,A,B,C,D
0,1.0,6.0,11.0,23
1,2.0,8.0,12.0,11
2,3.0,8.0,11.0,101
3,4.0,8.0,11.0,22
4,5.0,10.0,11.0,59


In [30]:
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [31]:
print("END !!")

END !!
