In [27]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

# Show every column when a DataFrame is displayed (no column limit).
pd.options.display.max_columns = None

In [28]:
# Missing data Not At Random (MNAR)

In [29]:
# Read the Titanic passenger data from its CSV file.
titanic_csv = 'desktop/feature-engineering-for-machine-learning-main/feature-engineering-for-machine-learning-main/titanic.csv'
data = pd.read_csv(titanic_csv)
# Preview the first 5 rows.
data.head(n=5)

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22,S,,,"Montreal, PQ / Chesterville, ON"


In [6]:
# Count the missing values in each column: flag every NaN, then sum the flags.

data.isna().sum()

pclass          0
survived        0
name            0
sex             0
age           263
sibsp           0
parch           0
ticket          0
fare            1
cabin        1014
embarked        2
boat          823
body         1188
home.dest     564
dtype: int64

In [7]:
#There are 263 missing values for Age, 1014 for Cabin and 2 for Embarked.

In [8]:
# The mean of the NaN flags gives the fraction of missing values in each column.

data.isna().mean()

pclass       0.000000
survived     0.000000
name         0.000000
sex          0.000000
age          0.200917
sibsp        0.000000
parch        0.000000
ticket       0.000000
fare         0.000764
cabin        0.774637
embarked     0.001528
boat         0.628724
body         0.907563
home.dest    0.430863
dtype: float64

In [30]:
# In the variable Age, 20% of the data is missing.
# In the variable Cabin (the cabin in which the passenger was traveling), 77% of the data is missing.
 

In [31]:
# Flag the rows where cabin is missing: 1 if the value is NaN, 0 otherwise.
data['cabin_null'] = data['cabin'].isnull().astype(int)

In [11]:
# Compare how often cabin is missing among survivors and non-survivors.
# survived is 1 for passengers who survived and 0 otherwise; cabin_null is a
# 0/1 flag, so its mean within each group is the fraction of missing cabin values.

data.groupby('survived')['cabin_null'].mean()

survived
0    0.873918
1    0.614000
Name: cabin_null, dtype: float64

In [None]:
# The percentage of missing values is higher for those who did not survive (87% vs 61% for survivors).
# This finding could support our hypothesis that the data is missing because, after people died, the information could not be retrieved.

In [12]:
# Repeat the analysis for the variable age.

# Flag the rows where age is missing (1 = missing, 0 = present).
data['age_null'] = data['age'].isnull().astype(int)

# Fraction of missing age values among non-survivors (0) and survivors (1).
data.groupby('survived')['age_null'].mean()

survived
0    0.234858
1    0.146000
Name: age_null, dtype: float64

In [13]:
# Age, too, is missing more often for the people who did not survive (23% vs 15% for survivors).
#The analysis therefore suggests that there was a systematic loss of data: people who did not survive had more missing information.
#Presumably, the method chosen to gather the information contributes to the generation of this missing data.

In [14]:
#Missing data Completely At Random (MCAR)

In [15]:
# Missing values in the variable embarked.
# Keep only the rows in which embarked is missing.

data.loc[data['embarked'].isna()]

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest,cabin_null,age_null
168,1,1,"Icard, Miss. Amelie",female,38.0,0,0,113572,80.0,B28,,6,,,0,0
284,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,113572,80.0,B28,,6,,"Cincinatti, OH",0,0


In [16]:
# There is no indication that the missing information in the variable "embarked" depends on any other variable.
# Moreover, both of these women survived, so they could have been asked for this information.

#It is very likely the values were lost at the time of building the dataset.

#If these values are MCAR, the likelihood of data missing for these two women is the same as the likelihood of data
#missing for any other person on the Titanic. Of course, this will be hard, if possible at all, to prove.

In [17]:
# Missing data at Random (MAR)

In [32]:
# Load only the two employment-related columns of the loan dataset.
loan_csv = 'desktop/feature-engineering-for-machine-learning-main/feature-engineering-for-machine-learning-main/loan.csv'
loan_columns = ['employment', 'time_employed']
data = pd.read_csv(loan_csv, usecols=loan_columns)

# Preview the first 5 rows.
data.head(n=5)

Unnamed: 0,employment,time_employed
0,Teacher,<=5 years
1,Accountant,<=5 years
2,Statistician,<=5 years
3,Other,<=5 years
4,Bus driver,>5 years


In [19]:
# Fraction of missing values in each of the two columns.

data.isna().mean()

employment       0.0611
time_employed    0.0529
dtype: float64

In [20]:
#Both variables have roughly the same percentage of missing observations.

In [34]:
# Let's inspect the different employment types.

# Distinct values of employment. unique() keeps NaN as a value of its own,
# so the count printed below includes the missing-value marker.
employment_values = data['employment'].unique()
print('Number of employments: {}'.format(len(employment_values)))

# The distinct values themselves.
employment_values

Number of employments: 12


array(['Teacher', 'Accountant', 'Statistician', 'Other', 'Bus driver',
       'Secretary', 'Software developer', 'Nurse', 'Taxi driver', nan,
       'Civil Servant', 'Dentist'], dtype=object)

In [35]:
# Distinct values of the variable time_employed.

data.loc[:, 'time_employed'].unique()

array(['<=5 years', '>5 years', nan], dtype=object)

In [36]:
# Hypothesis: a customer who is not employed cannot enter a value for time employed, so a missing employment should usually come with a missing time_employed.

In [38]:
# Proportion of missing values in time_employed among the customers
# who declared an employment.

t = data[data['employment'].notnull()]  # rows where employment is filled in
# Fraction of those rows in which time_employed is missing.
t['time_employed'].isna().mean()


0.0005325380764724678

In [39]:
# Now the same for the borrowers who did not declare an employment.
t = data[data['employment'].isna()]  # rows where employment is missing

# Fraction of those rows in which time_employed is missing as well.
t['time_employed'].isna().mean()


0.8576104746317512

In [40]:
#The number of borrowers who have reported occupation and have missing values in time_employed is minimal. 
#Customers who did not report an occupation, on the other hand, mostly show missing values in the time_employed variable.

In [None]:
#This supports the hypothesis that the missing values in employment are related to the missing values in time_employed.

#This is an example of MAR.