In [2]:
# importing required libraries

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statistics as stat
import random
%matplotlib inline

# Show every column when a DataFrame is displayed (None removes the column limit).
# Call the documented top-level `pd.set_option`; `pd.pandas` is not a documented
# entry point and only works as a side effect of how the package imports itself.
pd.set_option('display.max_columns', None)

In [3]:
# Load Travel.csv (expected in the notebook's working directory) into a DataFrame.
dataset = pd.read_csv('Travel.csv')

# Preview the first five rows (5 is head()'s default row count).
dataset.head(5)

Unnamed: 0,CustomerID,ProdTaken,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfPersonVisiting,NumberOfFollowups,ProductPitched,PreferredPropertyStar,MaritalStatus,NumberOfTrips,Passport,PitchSatisfactionScore,OwnCar,NumberOfChildrenVisiting,Designation,MonthlyIncome
0,200000,1,41.0,Self Enquiry,3,6.0,Salaried,Female,3,3.0,Deluxe,3.0,Single,1.0,1,2,1,0.0,Manager,20993.0
1,200001,0,49.0,Company Invited,1,14.0,Salaried,Male,3,4.0,Deluxe,4.0,Divorced,2.0,0,3,1,2.0,Manager,20130.0
2,200002,1,37.0,Self Enquiry,1,8.0,Free Lancer,Male,3,4.0,Basic,3.0,Single,7.0,1,3,0,0.0,Executive,17090.0
3,200003,0,33.0,Company Invited,1,9.0,Salaried,Female,2,3.0,Basic,3.0,Divorced,2.0,1,5,1,1.0,Executive,17909.0
4,200004,0,,Self Enquiry,1,8.0,Small Business,Male,2,3.0,Basic,4.0,Divorced,1.0,0,5,1,0.0,Executive,18468.0


### 1.1 Getting all features containing NaN values 

In [4]:
# Collect every column that has at least one missing value.
# The threshold must be `> 0`: with `> 1`, a column holding exactly one NaN
# would be silently left out and never imputed by the later cells.
missing_counts = dataset.isnull().sum()  # NaN count per column, computed once
feature_nan = [feature for feature in dataset.columns if missing_counts[feature] > 0]
feature_nan

['Age',
 'TypeofContact',
 'DurationOfPitch',
 'NumberOfFollowups',
 'PreferredPropertyStar',
 'NumberOfTrips',
 'NumberOfChildrenVisiting',
 'MonthlyIncome']

### 1.2 Getting categorical feature containing nan values

In [5]:
# From the NaN-containing columns, keep those stored with object dtype ("O");
# these are the ones this notebook treats as categorical.
categorical_feature_nan = []
for column in feature_nan:
    if dataset[column].dtype == "O":
        categorical_feature_nan.append(column)
categorical_feature_nan

['TypeofContact']

### 1.3 Getting numerical feature containing nan values

In [6]:
# The remaining NaN-containing columns (anything that is not object dtype)
# are the ones this notebook treats as numerical.
numerical_feature_nan = list(
    filter(lambda column: dataset[column].dtype != 'O', feature_nan)
)
numerical_feature_nan

['Age',
 'DurationOfPitch',
 'NumberOfFollowups',
 'PreferredPropertyStar',
 'NumberOfTrips',
 'NumberOfChildrenVisiting',
 'MonthlyIncome']

## 2.0 Filling missing value in categorical feature

### 2.1 Using mode

In [26]:
# Impute each categorical column with its most frequent value (the mode).
# The copy is taken once, before the loop: copying inside the loop would throw
# away the fills of earlier columns whenever there is more than one column.
data1 = dataset.copy()
for feature in categorical_feature_nan:
    # mode() returns a Series of the most frequent value(s); take the first.
    value = dataset[feature].mode()[0]
    # Pass the variable, not the quoted literal 'value' - the literal would
    # write the text "value" into every missing cell instead of the mode.
    data1[feature] = data1[feature].fillna(value)

# Sanity check: no missing entries should remain in TypeofContact.
data1['TypeofContact'].isnull().sum()

0

### 2.2 Using random value from feature 

In [8]:
# Number of missing TypeofContact entries in the untouched dataset.
dataset['TypeofContact'].isna().sum()

25

In [9]:
# Impute each categorical column with a single value drawn at random from
# that column's observed (non-missing) entries.
# The copy is taken once, before the loop, so fills of earlier columns survive.
data1 = dataset.copy()
for feature in categorical_feature_nan:
    # Sample only from non-missing entries: drawing from the raw column can
    # return NaN itself, and fillna(NaN) would leave the gaps unfilled.
    observed = dataset[feature].dropna().tolist()
    if not observed:
        # Nothing to sample from (column is entirely missing); leave it as is.
        continue
    value = random.choice(observed)
    data1[feature] = data1[feature].fillna(value)

# Sanity check: no missing entries should remain in TypeofContact.
data1['TypeofContact'].isnull().sum()

0

### 2.3 Using a "Missing" placeholder string

In [10]:
# Impute each categorical column with the explicit placeholder label 'Missing',
# which keeps "was missing" visible as its own category.
# The copy is taken once, before the loop, so fills of earlier columns survive.
# (The random draw the original cell made here was never used and is removed.)
data1 = dataset.copy()
for feature in categorical_feature_nan:
    data1[feature] = data1[feature].fillna('Missing')

# Sanity check: no missing entries should remain in TypeofContact.
data1['TypeofContact'].isnull().sum()

0

In [11]:
# Spot-check the TypeofContact value stored under row label 224.
data1.loc[224, 'TypeofContact']

'Missing'

### 2.4 Dropping rows with missing values in the feature

In [12]:
# Build a new frame without the rows whose TypeofContact is missing;
# `dataset` itself is left unchanged.
data1 = dataset.dropna(subset=['TypeofContact'])

# Sanity check: the remaining rows have no missing TypeofContact.
data1['TypeofContact'].isna().sum()

0

## 3.0 Filling missing values in numerical features

###  Getting Missing Value Percentage

In [13]:
# Report, per numerical column, the share of missing entries as a percentage.
for feature in numerical_feature_nan:
    # The mean of the boolean NaN mask is the fraction of missing rows.
    missing_pct = dataset[feature].isna().mean() * 100
    print("{} has {} % of missing values".format(feature, np.round(missing_pct, 3)))

Age has 4.624 % of missing values
DurationOfPitch has 5.135 % of missing values
NumberOfFollowups has 0.921 % of missing values
PreferredPropertyStar has 0.532 % of missing values
NumberOfTrips has 2.864 % of missing values
NumberOfChildrenVisiting has 1.35 % of missing values
MonthlyIncome has 4.767 % of missing values


### 3.1 Dropping rows that contain missing values

In [14]:
# Build a new frame without any row that has a missing value in any column;
# `dataset` itself is left unchanged.
data2 = dataset.dropna()

In [15]:
# Re-check the missing-value percentage of each numerical column in data2.
report_lines = [
    "{} has {} % of missing values".format(
        feature, np.round((data2[feature].isna().mean()) * 100, 3)
    )
    for feature in numerical_feature_nan
]
for line in report_lines:
    print(line)

Age has 0.0 % of missing values
DurationOfPitch has 0.0 % of missing values
NumberOfFollowups has 0.0 % of missing values
PreferredPropertyStar has 0.0 % of missing values
NumberOfTrips has 0.0 % of missing values
NumberOfChildrenVisiting has 0.0 % of missing values
MonthlyIncome has 0.0 % of missing values


### 3.2 Filling missing values with Maximum value from each feature 

In [16]:
# Fresh copy of the data in which each numerical column's gaps are replaced
# by that column's maximum observed value.
data2 = dataset.assign(**{
    feature: dataset[feature].fillna(dataset[feature].max())
    for feature in numerical_feature_nan
})


In [17]:
# Re-check the missing-value percentage of each numerical column in data2.
for feature in numerical_feature_nan:
    nan_fraction = data2[feature].isna().mean()
    rounded_pct = np.round(nan_fraction * 100, 3)
    print("{} has {} % of missing values".format(feature, rounded_pct))

Age has 0.0 % of missing values
DurationOfPitch has 0.0 % of missing values
NumberOfFollowups has 0.0 % of missing values
PreferredPropertyStar has 0.0 % of missing values
NumberOfTrips has 0.0 % of missing values
NumberOfChildrenVisiting has 0.0 % of missing values
MonthlyIncome has 0.0 % of missing values


### 3.3 Filling missing values with Minimum value from each feature 

In [18]:
# Replace the gaps of each numerical column with that column's minimum value.
# The per-column fill values are gathered first, then applied in one call.
minimum_by_feature = {
    feature: dataset[feature].min() for feature in numerical_feature_nan
}
data2 = dataset.fillna(value=minimum_by_feature)

# Re-check the missing-value percentage of each numerical column in data2.
for feature in numerical_feature_nan:
    missing_pct = data2[feature].isna().mean() * 100
    print("{} has {} % of missing values".format(feature, np.round(missing_pct, 3)))

Age has 0.0 % of missing values
DurationOfPitch has 0.0 % of missing values
NumberOfFollowups has 0.0 % of missing values
PreferredPropertyStar has 0.0 % of missing values
NumberOfTrips has 0.0 % of missing values
NumberOfChildrenVisiting has 0.0 % of missing values
MonthlyIncome has 0.0 % of missing values


### 3.4 Filling missing values with mean value from each feature 

In [19]:
# Replace the gaps of each numerical column with that column's mean,
# rounded to the nearest whole number.
data2 = dataset.copy()
for feature in numerical_feature_nan:
    column = data2[feature]
    rounded_mean = round(column.mean())
    data2[feature] = column.fillna(rounded_mean)

# Re-check the missing-value percentage of each numerical column in data2.
for feature in numerical_feature_nan:
    nan_share = data2[feature].isna().mean()
    print("{} has {} % of missing values".format(feature, np.round(nan_share * 100, 3)))

Age has 0.0 % of missing values
DurationOfPitch has 0.0 % of missing values
NumberOfFollowups has 0.0 % of missing values
PreferredPropertyStar has 0.0 % of missing values
NumberOfTrips has 0.0 % of missing values
NumberOfChildrenVisiting has 0.0 % of missing values
MonthlyIncome has 0.0 % of missing values


### 3.5 Filling missing values with median value from each feature 

In [20]:
# Fresh copy of the data in which each numerical column's gaps are replaced
# by that column's median.
data2 = dataset.assign(**{
    feature: dataset[feature].fillna(dataset[feature].median())
    for feature in numerical_feature_nan
})

# Re-check the missing-value percentage of each numerical column in data2.
report_lines = [
    "{} has {} % of missing values".format(
        feature, np.round((data2[feature].isna().mean()) * 100, 3)
    )
    for feature in numerical_feature_nan
]
for line in report_lines:
    print(line)

Age has 0.0 % of missing values
DurationOfPitch has 0.0 % of missing values
NumberOfFollowups has 0.0 % of missing values
PreferredPropertyStar has 0.0 % of missing values
NumberOfTrips has 0.0 % of missing values
NumberOfChildrenVisiting has 0.0 % of missing values
MonthlyIncome has 0.0 % of missing values


### 3.6 Filling missing values with mode value from each feature 

In [21]:
# Replace the gaps of each numerical column with that column's mode.
# mode() returns a Series of the most frequent value(s); [0] takes the first.
most_frequent = {
    feature: dataset[feature].mode()[0] for feature in numerical_feature_nan
}
data2 = dataset.fillna(value=most_frequent)

# Re-check the missing-value percentage of each numerical column in data2.
for feature in numerical_feature_nan:
    missing_pct = np.round(data2[feature].isna().mean() * 100, 3)
    print("{} has {} % of missing values".format(feature, missing_pct))

Age has 0.0 % of missing values
DurationOfPitch has 0.0 % of missing values
NumberOfFollowups has 0.0 % of missing values
PreferredPropertyStar has 0.0 % of missing values
NumberOfTrips has 0.0 % of missing values
NumberOfChildrenVisiting has 0.0 % of missing values
MonthlyIncome has 0.0 % of missing values


## 3.7 Filling missing values with end-of-distribution (tail) values
1. Replacing missing values with a value at the end of the variable distribution is equivalent to replacing them with an arbitrary value, but instead of identifying the arbitrary values manually, these values are automatically selected as those at the very end of the variable distribution. The values that are used to replace missing information are estimated using the mean plus or minus three times the standard deviation if the variable is normally distributed, or the inter-quartile range (IQR) proximity rule otherwise. According to the IQR proximity rule, missing values will be replaced with the 75th quantile + (IQR * 1.5) at the right tail or by the 25th quantile - (IQR * 1.5) at the left tail. The IQR is given by the 75th quantile - the 25th quantile.

### 3.7.1 Using value at left tail

In [22]:
# End-of-distribution imputation, left tail:
# fill each numerical column's gaps with Q1 - 1.5 * IQR (IQR = Q3 - Q1).
data2 = dataset.copy()

for feature in numerical_feature_nan:
    column = data2[feature]
    first_quartile = column.quantile(0.25)
    third_quartile = column.quantile(0.75)
    interquartile_range = third_quartile - first_quartile
    left_tail_value = first_quartile - (1.5 * interquartile_range)
    data2[feature] = column.fillna(left_tail_value)

# Re-check the missing-value percentage of each numerical column in data2.
for feature in numerical_feature_nan:
    missing_pct = data2[feature].isna().mean() * 100
    print("{} has {} % of missing values".format(feature, np.round(missing_pct, 3)))

Age has 0.0 % of missing values
DurationOfPitch has 0.0 % of missing values
NumberOfFollowups has 0.0 % of missing values
PreferredPropertyStar has 0.0 % of missing values
NumberOfTrips has 0.0 % of missing values
NumberOfChildrenVisiting has 0.0 % of missing values
MonthlyIncome has 0.0 % of missing values


### 3.7.2 Using value at right tail

In [23]:
# End-of-distribution imputation, right tail:
# fill each numerical column's gaps with Q3 + 1.5 * IQR (IQR = Q3 - Q1).
data2 = dataset.copy()

for feature in numerical_feature_nan:
    column = data2[feature]
    third_quartile = column.quantile(0.75)
    first_quartile = column.quantile(0.25)
    interquartile_range = third_quartile - first_quartile
    right_tail_value = third_quartile + (1.5 * interquartile_range)
    data2[feature] = column.fillna(right_tail_value)

# Re-check the missing-value percentage of each numerical column in data2.
for feature in numerical_feature_nan:
    nan_fraction = data2[feature].isna().mean()
    print("{} has {} % of missing values".format(feature, np.round(nan_fraction * 100, 3)))

Age has 0.0 % of missing values
DurationOfPitch has 0.0 % of missing values
NumberOfFollowups has 0.0 % of missing values
PreferredPropertyStar has 0.0 % of missing values
NumberOfTrips has 0.0 % of missing values
NumberOfChildrenVisiting has 0.0 % of missing values
MonthlyIncome has 0.0 % of missing values


## 3.8 Filling missing values using forward and backward filling

### 3.8.1 Using Forward filling
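1. When `ffill()` is applied along the index, each missing value is replaced by the last valid (non-missing) value above it in the same column. Missing values at the very start of a column, which have no earlier valid value, stay missing.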

In [24]:
# Fresh copy of the data in which each numerical column is forward-filled.
data2 = dataset.assign(**{
    feature: dataset[feature].ffill() for feature in numerical_feature_nan
})

# Re-check the missing-value percentage of each numerical column in data2.
for feature in numerical_feature_nan:
    missing_pct = np.round(data2[feature].isna().mean() * 100, 3)
    print("{} has {} % of missing values".format(feature, missing_pct))

Age has 0.0 % of missing values
DurationOfPitch has 0.0 % of missing values
NumberOfFollowups has 0.0 % of missing values
PreferredPropertyStar has 0.0 % of missing values
NumberOfTrips has 0.0 % of missing values
NumberOfChildrenVisiting has 0.0 % of missing values
MonthlyIncome has 0.0 % of missing values


### 3.8.2 Using Backward filling
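1. When `bfill()` is applied, each missing value is replaced by the next valid (non-missing) value below it in the same column; if the next row is also missing, the fill comes from the first valid value further down. Only missing values at the very end of a column, which have no later valid value, stay missing.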

In [25]:
# Backward-fill each numerical column on a fresh copy of the data.
data2 = dataset.copy()
for feature in numerical_feature_nan:
    back_filled = data2[feature].bfill()
    data2[feature] = back_filled

# Re-check the missing-value percentage of each numerical column in data2.
report_lines = [
    "{} has {} % of missing values".format(
        feature, np.round((data2[feature].isna().mean()) * 100, 3)
    )
    for feature in numerical_feature_nan
]
for line in report_lines:
    print(line)

Age has 0.0 % of missing values
DurationOfPitch has 0.0 % of missing values
NumberOfFollowups has 0.0 % of missing values
PreferredPropertyStar has 0.0 % of missing values
NumberOfTrips has 0.0 % of missing values
NumberOfChildrenVisiting has 0.0 % of missing values
MonthlyIncome has 0.0 % of missing values


### creating dataset after filling missing values.

In [28]:
### Categorical features

# Impute each categorical column with its most frequent value (the mode).
# The copy is taken once, before the loop: copying inside the loop would throw
# away the fills of earlier columns whenever there is more than one column.
data1 = dataset.copy()
for feature in categorical_feature_nan:
    # mode() returns a Series of the most frequent value(s); take the first.
    value = dataset[feature].mode()[0]
    # Pass the variable, not the quoted literal 'value' - the literal would
    # put the text "value" into every missing cell and into the exported file.
    data1[feature] = data1[feature].fillna(value)

# Remaining missing values per column (numerical columns are handled next).
data1.isnull().sum()

CustomerID                    0
ProdTaken                     0
Age                         226
TypeofContact                 0
CityTier                      0
DurationOfPitch             251
Occupation                    0
Gender                        0
NumberOfPersonVisiting        0
NumberOfFollowups            45
ProductPitched                0
PreferredPropertyStar        26
MaritalStatus                 0
NumberOfTrips               140
Passport                      0
PitchSatisfactionScore        0
OwnCar                        0
NumberOfChildrenVisiting     66
Designation                   0
MonthlyIncome               233
dtype: int64

In [30]:
# Numerical features: fill each column's gaps with that column's median.
# The medians are gathered first, then applied in a single fillna call.
median_by_feature = {
    feature: data1[feature].median() for feature in numerical_feature_nan
}
data1 = data1.fillna(value=median_by_feature)

# Remaining missing values per column.
data1.isna().sum()

CustomerID                  0
ProdTaken                   0
Age                         0
TypeofContact               0
CityTier                    0
DurationOfPitch             0
Occupation                  0
Gender                      0
NumberOfPersonVisiting      0
NumberOfFollowups           0
ProductPitched              0
PreferredPropertyStar       0
MaritalStatus               0
NumberOfTrips               0
Passport                    0
PitchSatisfactionScore      0
OwnCar                      0
NumberOfChildrenVisiting    0
Designation                 0
MonthlyIncome               0
dtype: int64

In [33]:
# Write the imputed frame to disk; index=False keeps the row index out of the file.
output_path = 'Travel_missing_filled.csv'
data1.to_csv(output_path, index=False)