In [None]:
'''
Handling Missing Values:
    - Check if there are any null values
    - Boolean indicator showing whether each column has null values
    - Column-wise total of null values
    - Total number of missing values in the dataframe
    - Filter and display data where null values are present
    - Filter data where no null values are present
    - Filter all rows in the dataframe where null values are present
    - Drop all rows with NaN values
    - Drop all rows with at least one NaN present
    - Drop columns with at least one NaN
    - Replacing NaNs with a single constant value
    - Replacing NaNs using Median/Mean of the column
    - Using the replace method
    - Using the interpolate() function to fill NaN values
    - Forward Fill Missing DataFrame Values
    - Backward Fill Missing DataFrame Values
    - Using SCIKIT Learn Imputer Method
    - Drop Duplicate data in dataframe

Cleaning DataFrame
    - Remove Special Characters
    - Stopword Removal
'''

In [4]:
import pandas as pd

In [6]:
# Load the Titanic training set and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="Titanic_train.csv")

df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [15]:
# Is there at least one missing value anywhere in the frame?
df.isna().to_numpy().any()

True

In [14]:
# One boolean per column: True when that column holds at least one null.
# In this dataset 'Age', 'Cabin' and 'Embarked' are the columns with nulls.
df.isna().any(axis=0)

PassengerId    False
Survived       False
Pclass         False
Name           False
Sex            False
Age             True
SibSp          False
Parch          False
Ticket         False
Fare           False
Cabin           True
Embarked        True
dtype: bool

In [11]:
# Count the nulls in every column of the DataFrame.
# Here: 'Age' has 177, 'Cabin' has 687 and 'Embarked' has 2 null values.
df.isna().sum(axis=0)

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [23]:
# Grand total of missing cells: per-column counts first, then their sum.
df.isna().sum(axis=0).sum()

866

In [12]:
# Number of non-null observations in each column.
df.count()

PassengerId    891
Survived       891
Pclass         891
Name           891
Sex            891
Age            714
SibSp          891
Parch          891
Ticket         891
Fare           891
Cabin          204
Embarked       889
dtype: int64

In [20]:
# Show only the rows whose 'Embarked' value is missing.

df[df['Embarked'].isna()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
61,62,1,1,"Icard, Miss. Amelie",female,38.0,0,0,113572,80.0,B28,
829,830,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,113572,80.0,B28,


In [22]:
# First five rows whose 'Age' value is missing.
df.loc[df['Age'].isna()].head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
17,18,1,2,"Williams, Mr. Charles Eugene",male,,0,0,244373,13.0,,S
19,20,1,3,"Masselmani, Mrs. Fatima",female,,0,0,2649,7.225,,C
26,27,0,3,"Emir, Mr. Farred Chehab",male,,0,0,2631,7.225,,C
28,29,1,3,"O'Dwyer, Miss. Ellen ""Nellie""",female,,0,0,330959,7.8792,,Q


In [25]:
# Keep only the rows where 'Cabin' is present (not null); show the first five.

df.loc[df['Cabin'].notna()].head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
10,11,1,3,"Sandstrom, Miss. Marguerite Rut",female,4.0,1,1,PP 9549,16.7,G6,S
11,12,1,1,"Bonnell, Miss. Elizabeth",female,58.0,0,0,113783,26.55,C103,S


In [27]:
import numpy as np

# Small demo frame: one numeric and one string column, each with gaps.
data = {
    'first_set': [1, 2, 3, 4, 5, np.nan, 6, 7, np.nan, np.nan, 8, 9, 10, np.nan],
    'second_set': ['a', 'b', np.nan, np.nan, 'c', 'd', 'e', np.nan, np.nan, 'f', 'g', np.nan, 'h', 'i'],
}

df1 = pd.DataFrame.from_dict(data)

df1

Unnamed: 0,first_set,second_set
0,1.0,a
1,2.0,b
2,3.0,
3,4.0,
4,5.0,c
5,,d
6,6.0,e
7,7.0,
8,,
9,,f


In [28]:
# isna(): pick every row whose 'first_set' value is NaN.

df1.loc[df1['first_set'].isna()]

Unnamed: 0,first_set,second_set
5,,d
8,,
9,,f
13,,i


In [29]:
# isnull(): the same single-column NaN selection, using the alias of isna().
df1.loc[df1['first_set'].isnull()]

Unnamed: 0,first_set,second_set
5,,d
8,,
9,,f
13,,i


In [30]:
# isna() across the whole DataFrame: rows with a NaN in at least one column.

df1.loc[df1.isna().any(axis='columns')]

Unnamed: 0,first_set,second_set
2,3.0,
3,4.0,
5,,d
7,7.0,
8,,
9,,f
11,9.0,
13,,i


In [31]:
# isnull() across the whole DataFrame: rows with a NaN in at least one column.
df1.loc[df1.isnull().any(axis='columns')]

Unnamed: 0,first_set,second_set
2,3.0,
3,4.0,
5,,d
7,7.0,
8,,
9,,f
11,9.0,
13,,i


In [32]:
# Remove, in place, every row that contains a NaN in any column.
df1.dropna(axis='index', how='any', inplace=True)

df1

Unnamed: 0,first_set,second_set
0,1.0,a
1,2.0,b
4,5.0,c
6,6.0,e
10,8.0,g
12,10.0,h


In [33]:
# Rebuild the demo frame (the previous cell dropped rows in place).
data = {
    'first_set': [1, 2, 3, 4, 5, np.nan, 6, 7, np.nan, np.nan, 8, 9, 10, np.nan],
    'second_set': ['a', 'b', np.nan, np.nan, 'c', 'd', 'e', np.nan, np.nan, 'f', 'g', np.nan, 'h', np.nan],
}

df1 = pd.DataFrame.from_dict(data)

df1

Unnamed: 0,first_set,second_set
0,1.0,a
1,2.0,b
2,3.0,
3,4.0,
4,5.0,c
5,,d
6,6.0,e
7,7.0,
8,,
9,,f


In [34]:
# Return a copy without the rows that have at least one NaN.
df1.dropna(axis='index', how='any')


Unnamed: 0,first_set,second_set
0,1.0,a
1,2.0,b
4,5.0,c
6,6.0,e
10,8.0,g
12,10.0,h


In [35]:
# Same two-column demo frame again; its last row is NaN in both columns.
data = {
    'first_set': [1, 2, 3, 4, 5, np.nan, 6, 7, np.nan, np.nan, 8, 9, 10, np.nan],
    'second_set': ['a', 'b', np.nan, np.nan, 'c', 'd', 'e', np.nan, np.nan, 'f', 'g', np.nan, 'h', np.nan],
}

df1 = pd.DataFrame.from_dict(data)

df1

Unnamed: 0,first_set,second_set
0,1.0,a
1,2.0,b
2,3.0,
3,4.0,
4,5.0,c
5,,d
6,6.0,e
7,7.0,
8,,
9,,f


In [36]:
# Return a copy without the rows in which every value is NaN.
df1.dropna(axis='index', how='all')

Unnamed: 0,first_set,second_set
0,1.0,a
1,2.0,b
2,3.0,
3,4.0,
4,5.0,c
5,,d
6,6.0,e
7,7.0,
9,,f
10,8.0,g


In [37]:
# Return a copy without the columns that have at least one NaN.
# Both columns contain NaNs here, so only the index is left.
df1.dropna(axis='columns', how='any')

0
1
2
3
4
5
6
7
8
9
10


In [38]:
# Demo frame with an extra column ('third_set') that is NaN in every row.
data = {
    'first_set': [1, 2, 3, 4, 5, np.nan, 6, 7, np.nan, np.nan, 8, 9, 10, np.nan],
    'second_set': ['a', 'b', np.nan, np.nan, 'c', 'd', 'e', np.nan, np.nan, 'f', 'g', np.nan, 'h', np.nan],
    'third_set': [np.nan] * 14,
}

df1 = pd.DataFrame.from_dict(data)

df1

Unnamed: 0,first_set,second_set,third_set
0,1.0,a,
1,2.0,b,
2,3.0,,
3,4.0,,
4,5.0,c,
5,,d,
6,6.0,e,
7,7.0,,
8,,,
9,,f,


In [39]:
# Return a copy without the columns in which every value is NaN.
df1.dropna(axis='columns', how='all')

Unnamed: 0,first_set,second_set
0,1.0,a
1,2.0,b
2,3.0,
3,4.0,
4,5.0,c
5,,d
6,6.0,e
7,7.0,
8,,
9,,f


In [41]:
# Load the bank employee data and preview its first five rows.
# NOTE: the preview shows placeholder text such as 'n.a.' that was read as an
# ordinary string, not NaN; pass na_values=[...] to read_csv if such
# placeholders should count as missing.
df2 = pd.read_csv(filepath_or_buffer="Bank.csv")

df2.head(n=5)

Unnamed: 0,First Name,Gender,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,97308.0,6.945,True,Marketing
1,Thomas,Male,61933.0,,True,
2,Jerry,Male,,9.34,True,Finance
3,Dennis,n.a.,115163.0,10.125,False,Legal
4,,Female,0.0,11.598,,Finance


In [42]:
# Replacing NaNs with a single constant value

# Assign the filled column back rather than calling fillna(inplace=True) on
# df2['Salary']: the in-place form operates on an intermediate Series, raises
# a chained-assignment warning in pandas 2.x and leaves df2 unchanged once
# Copy-on-Write is active.
df2['Salary'] = df2['Salary'].fillna(0)

df2

Unnamed: 0,First Name,Gender,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,97308.0,6.945,True,Marketing
1,Thomas,Male,61933.0,,True,
2,Jerry,Male,0.0,9.34,True,Finance
3,Dennis,n.a.,115163.0,10.125,False,Legal
4,,Female,0.0,11.598,,Finance
5,Angela,,0.0,18.523,True,Engineering
6,Shawn,Male,111737.0,6.414,False,na
7,Rachel,Female,142032.0,12.599,False,Business Development
8,Linda,Female,57427.0,9.557,True,Client Services
9,Stephanie,Female,36844.0,5.574,True,Business Development


In [43]:
# Fill missing 'Gender' entries with an explicit placeholder label.
# Assigned back (not inplace=True on the selected column) so df2 itself is
# updated even when pandas treats df2['Gender'] as a copy.
df2['Gender'] = df2['Gender'].fillna('No Gender')
df2

Unnamed: 0,First Name,Gender,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,97308.0,6.945,True,Marketing
1,Thomas,Male,61933.0,,True,
2,Jerry,Male,0.0,9.34,True,Finance
3,Dennis,n.a.,115163.0,10.125,False,Legal
4,,Female,0.0,11.598,,Finance
5,Angela,No Gender,0.0,18.523,True,Engineering
6,Shawn,Male,111737.0,6.414,False,na
7,Rachel,Female,142032.0,12.599,False,Business Development
8,Linda,Female,57427.0,9.557,True,Client Services
9,Stephanie,Female,36844.0,5.574,True,Business Development


In [45]:
# Reload the raw file so the next example starts from unfilled data
# (the previous cells filled 'Salary' and 'Gender').
df2 = pd.read_csv(filepath_or_buffer="Bank.csv")

df2.head(n=5)

Unnamed: 0,First Name,Gender,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,97308.0,6.945,True,Marketing
1,Thomas,Male,61933.0,,True,
2,Jerry,Male,,9.34,True,Finance
3,Dennis,n.a.,115163.0,10.125,False,Legal
4,,Female,0.0,11.598,,Finance


In [46]:
# Replacing NaNs using Median/Mean of the column

# using median
# median() skips NaN by default, so it is computed from the known salaries.
# The result is assigned back instead of using inplace=True on the selected
# column, which would act on an intermediate Series (chained assignment).
df2['Salary'] = df2['Salary'].fillna(df2['Salary'].median())

df2

Unnamed: 0,First Name,Gender,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,97308.0,6.945,True,Marketing
1,Thomas,Male,61933.0,,True,
2,Jerry,Male,79620.5,9.34,True,Finance
3,Dennis,n.a.,115163.0,10.125,False,Legal
4,,Female,0.0,11.598,,Finance
5,Angela,,79620.5,18.523,True,Engineering
6,Shawn,Male,111737.0,6.414,False,na
7,Rachel,Female,142032.0,12.599,False,Business Development
8,Linda,Female,57427.0,9.557,True,Client Services
9,Stephanie,Female,36844.0,5.574,True,Business Development


In [47]:
# Reload the raw file: the previous cell replaced the missing salaries
# with the median.
df2 = pd.read_csv(filepath_or_buffer="Bank.csv")

df2.head(n=5)

Unnamed: 0,First Name,Gender,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,97308.0,6.945,True,Marketing
1,Thomas,Male,61933.0,,True,
2,Jerry,Male,,9.34,True,Finance
3,Dennis,n.a.,115163.0,10.125,False,Legal
4,,Female,0.0,11.598,,Finance


In [48]:
# using mean
# mean() skips NaN by default, so it is computed from the known salaries.
# The result is assigned back instead of using inplace=True on the selected
# column, which would act on an intermediate Series (chained assignment).
df2['Salary'] = df2['Salary'].fillna(df2['Salary'].mean())

df2

Unnamed: 0,First Name,Gender,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,97308.0,6.945,True,Marketing
1,Thomas,Male,61933.0,,True,
2,Jerry,Male,77805.5,9.34,True,Finance
3,Dennis,n.a.,115163.0,10.125,False,Legal
4,,Female,0.0,11.598,,Finance
5,Angela,,77805.5,18.523,True,Engineering
6,Shawn,Male,111737.0,6.414,False,na
7,Rachel,Female,142032.0,12.599,False,Business Development
8,Linda,Female,57427.0,9.557,True,Client Services
9,Stephanie,Female,36844.0,5.574,True,Business Development


In [49]:
# Reload the raw file: the previous cell replaced the missing salaries
# with the mean.
df2 = pd.read_csv(filepath_or_buffer="Bank.csv")

df2.head(n=5)

Unnamed: 0,First Name,Gender,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,97308.0,6.945,True,Marketing
1,Thomas,Male,61933.0,,True,
2,Jerry,Male,,9.34,True,Finance
3,Dennis,n.a.,115163.0,10.125,False,Legal
4,,Female,0.0,11.598,,Finance


In [51]:
# Using the replace method

# replace() maps NaN to 0 just like fillna(0). The result is assigned back
# because replace(..., inplace=True) on df2['Salary'] would modify an
# intermediate Series rather than df2 (chained assignment).
df2['Salary'] = df2['Salary'].replace(to_replace=np.nan, value=0)

df2

Unnamed: 0,First Name,Gender,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,97308.0,6.945,True,Marketing
1,Thomas,Male,61933.0,,True,
2,Jerry,Male,0.0,9.34,True,Finance
3,Dennis,n.a.,115163.0,10.125,False,Legal
4,,Female,0.0,11.598,,Finance
5,Angela,,0.0,18.523,True,Engineering
6,Shawn,Male,111737.0,6.414,False,na
7,Rachel,Female,142032.0,12.599,False,Business Development
8,Linda,Female,57427.0,9.557,True,Client Services
9,Stephanie,Female,36844.0,5.574,True,Business Development


In [53]:
# interpolate() fills NaN values using one of several interpolation techniques

df2 = pd.read_csv("Bank.csv")

# `limit_direction` is the keyword that controls the fill direction; the
# earlier `direction=` spelling is not an interpolate() parameter.
# Linear interpolation places each missing salary on the straight line
# between its nearest known neighbours. The result is assigned back instead of
# using inplace=True on the selected column (chained assignment).
df2['Salary'] = df2['Salary'].interpolate(method='linear', limit_direction='forward')

df2.head()

Unnamed: 0,First Name,Gender,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,97308.0,6.945,True,Marketing
1,Thomas,Male,61933.0,,True,
2,Jerry,Male,88548.0,9.34,True,Finance
3,Dennis,n.a.,115163.0,10.125,False,Legal
4,,Female,0.0,11.598,,Finance


In [55]:
# Forward Fill Missing DataFrame Value

df2 = pd.read_csv("Bank.csv")

# ffill() copies the last known value forward into each gap. It replaces
# fillna(method='ffill'), which newer pandas releases deprecate, and the
# result is assigned back instead of using inplace=True on the selected column.
df2['Salary'] = df2['Salary'].ffill()

df2

Unnamed: 0,First Name,Gender,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,97308.0,6.945,True,Marketing
1,Thomas,Male,61933.0,,True,
2,Jerry,Male,61933.0,9.34,True,Finance
3,Dennis,n.a.,115163.0,10.125,False,Legal
4,,Female,0.0,11.598,,Finance
5,Angela,,0.0,18.523,True,Engineering
6,Shawn,Male,111737.0,6.414,False,na
7,Rachel,Female,142032.0,12.599,False,Business Development
8,Linda,Female,57427.0,9.557,True,Client Services
9,Stephanie,Female,36844.0,5.574,True,Business Development


In [56]:
# Backward Fill Missing DataFrame Values

df2 = pd.read_csv("Bank.csv")

# bfill() copies the next known value backward into each gap. It replaces
# fillna(method='bfill'), which newer pandas releases deprecate, and the
# result is assigned back instead of using inplace=True on the selected column.
df2['Salary'] = df2['Salary'].bfill()

df2

Unnamed: 0,First Name,Gender,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,97308.0,6.945,True,Marketing
1,Thomas,Male,61933.0,,True,
2,Jerry,Male,115163.0,9.34,True,Finance
3,Dennis,n.a.,115163.0,10.125,False,Legal
4,,Female,0.0,11.598,,Finance
5,Angela,,111737.0,18.523,True,Engineering
6,Shawn,Male,111737.0,6.414,False,na
7,Rachel,Female,142032.0,12.599,False,Business Development
8,Linda,Female,57427.0,9.557,True,Client Services
9,Stephanie,Female,36844.0,5.574,True,Business Development


In [62]:
# Using SCIKIT Learn Imputer Method

from sklearn.impute import SimpleImputer
import pandas as pd

# Toy 3x3 matrix whose last column is missing in the first two rows.
data = np.array([[1,4,np.nan],[4,6,np.nan],[-3,5,2]])

# Give every column its own label: with a repeated name, df['Column2'] would
# select two columns at once.
df = pd.DataFrame(data,columns=['Column1','Column2','Column3'])

print(df)

# Imputing with mean-strategy: each NaN is replaced by the mean of the
# known values in its column.
imp = SimpleImputer(strategy='mean')
print(imp.fit_transform(data),'\n')

   Column1  Column2  Column2
0      1.0      4.0      NaN
1      4.0      6.0      NaN
2     -3.0      5.0      2.0
[[ 1.  4.  2.]
 [ 4.  6.  2.]
 [-3.  5.  2.]] 



In [59]:
from IPython.display import display_html
from itertools import chain,cycle
def display_side_by_side(*args,titles=cycle([''])):
    """Render several DataFrames next to each other in one notebook output.

    Parameters
    ----------
    *args
        Objects to show, left to right. Each must provide ``to_html()``.
    titles : iterable of str, optional
        Heading for each frame, matched by position and inserted as raw HTML
        (not escaped). Frames beyond the end of ``titles`` get a line break
        as their heading. Defaults to an endless stream of empty titles.

    Returns
    -------
    None
        The assembled markup is handed to ``display_html(..., raw=True)``.
    """
    fragments = []
    # Pad ``titles`` with line breaks so zip() never drops a trailing frame.
    for frame, title in zip(args, chain(titles, cycle(['<br>']))):
        # Style only the opening <table> tag. Replacing every occurrence of
        # the word 'table' would also rewrite the closing tag and any cell
        # text that happens to contain "table".
        table_html = frame.to_html().replace('<table', '<table style="display:inline"', 1)
        fragments.append(
            '<th style="text-align:center"><td style="vertical-align:top">'
            f'<h3>{title}</h3>'
            f'{table_html}'
            '</td></th>'
        )
    display_html(''.join(fragments), raw=True)

In [60]:
# Drop Duplicate data in dataframe

# The last record repeats the first one exactly, so it is the row that
# drop_duplicates() removes.
# Named `employee_records` rather than `dict` so the built-in dict type is not
# shadowed for the rest of the kernel session.
employee_records = {"Name":['Santhosh','Anand','Sreedevi','Prashanth','Sachin','Santhosh'],
                    "Age":[32,34,31,30,37,32],
                    "Company":["ABB",'DELL','Tesco','Tesco','Volvo','ABB']}

df = pd.DataFrame(employee_records)

# With default arguments, rows count as duplicates only when every column
# matches, and the first occurrence is kept.
df_droped_duplicate = df.drop_duplicates()

display_side_by_side(df,df_droped_duplicate, titles=["Original",'Duplicate Removed'])

Unnamed: 0,Name,Age,Company
0,Santhosh,32,ABB
1,Anand,34,DELL
2,Sreedevi,31,Tesco
3,Prashanth,30,Tesco
4,Sachin,37,Volvo
5,Santhosh,32,ABB

Unnamed: 0,Name,Age,Company
0,Santhosh,32,ABB
1,Anand,34,DELL
2,Sreedevi,31,Tesco
3,Prashanth,30,Tesco
4,Sachin,37,Volvo
