## 🧹🧼🫧 Data Cleaning
80% of the actual responsibilities of a Data Scientist

In [2]:
import numpy as np
import pandas as pd

In [47]:
data = {"ClgID": ["c8", np.nan, np.nan, "c1"],
        "Name": ["   Alice", "Bob", np.nan, "David"],
       "Age": [25, np.nan, 22, 22],
       "Marks": [85, 90, np.nan, 78]}

data

{'ClgID': ['c8', nan, nan, 'c1'],
 'Name': ['   Alice', 'Bob', nan, 'David'],
 'Age': [25, nan, 22, 22],
 'Marks': [85, 90, nan, 78]}

In [49]:
df = pd.DataFrame(data)

In [4]:
print(df)

  ClgID      Name   Age  Marks
0    c8     Alice  25.0   85.0
1   NaN       Bob   NaN   90.0
2   NaN       NaN  22.0    NaN
3    c1     David  22.0   78.0


# Handling Missing Data

In [6]:
df.isnull()

Unnamed: 0,ClgID,Name,Age,Marks
0,False,False,False,False
1,True,False,True,False
2,True,True,False,True
3,False,False,False,False


In [7]:
df.isnull().sum()

ClgID    2
Name     1
Age      1
Marks    1
dtype: int64

In [8]:
df.isnull().mean() * 100
# less than threshold value set by us, i.e., 30%

ClgID    50.0
Name     25.0
Age      25.0
Marks    25.0
dtype: float64

## 1. Drop missing value row or column

In [10]:
# df.dropna(inplace = True) # drop rows with NaN
# df.dropna(axis=0) # drops rows with NaN 
# df.dropna(axis=1) # drops columns with NaN 

## 2. Fill missing value cell

In [12]:
#2.1 as per your intelligence fill with CDT 

# Age = mean() 
# Marks = median()

df["Age"].fillna(df["Age"].mean(), inplace = True) # Replace NaN with mean
df

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Age"].fillna(df["Age"].mean(), inplace = True) # Replace NaN with mean


Unnamed: 0,ClgID,Name,Age,Marks
0,c8,Alice,25.0,85.0
1,,Bob,23.0,90.0
2,,,22.0,
3,c1,David,22.0,78.0


In [13]:
df["Marks"].fillna(df["Marks"].median(), inplace = True) # replace NaN with median
df

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Marks"].fillna(df["Marks"].median(), inplace = True) # replace NaN with median


Unnamed: 0,ClgID,Name,Age,Marks
0,c8,Alice,25.0,85.0
1,,Bob,23.0,90.0
2,,,22.0,85.0
3,c1,David,22.0,78.0


In [14]:
# 2.1.1 Collective addressing of missing values in an object dtype

# df["Name"].fillna("Unkown", inplace = True)
df

Unnamed: 0,ClgID,Name,Age,Marks
0,c8,Alice,25.0,85.0
1,,Bob,23.0,90.0
2,,,22.0,85.0
3,c1,David,22.0,78.0


In [15]:
# 2.1.2 Individually address any empty value by ffill or bfill

# ffill - Forward filling
# df.fillna(method="ffill", inplace=True) # enters previous value
df.fillna(method="bfill", inplace=True) # enters next value
df

  df.fillna(method="bfill", inplace=True) # enters next value


Unnamed: 0,ClgID,Name,Age,Marks
0,c8,Alice,25.0,85.0
1,c1,Bob,23.0,90.0
2,c1,David,22.0,85.0
3,c1,David,22.0,78.0


# Duplicate Values

In [17]:
# 1. detect duplicate values
print(df.duplicated().sum())

0


In [18]:
# Treating Duplicate Values
# by removing

df.drop_duplicates(inplace=True)
df

Unnamed: 0,ClgID,Name,Age,Marks
0,c8,Alice,25.0,85.0
1,c1,Bob,23.0,90.0
2,c1,David,22.0,85.0
3,c1,David,22.0,78.0


In [19]:
df

Unnamed: 0,ClgID,Name,Age,Marks
0,c8,Alice,25.0,85.0
1,c1,Bob,23.0,90.0
2,c1,David,22.0,85.0
3,c1,David,22.0,78.0


In [20]:
# removing whitespace
df["Name"] = df["Name"].str.strip()
df

Unnamed: 0,ClgID,Name,Age,Marks
0,c8,Alice,25.0,85.0
1,c1,Bob,23.0,90.0
2,c1,David,22.0,85.0
3,c1,David,22.0,78.0


In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   ClgID   4 non-null      object 
 1   Name    4 non-null      object 
 2   Age     4 non-null      float64
 3   Marks   4 non-null      float64
dtypes: float64(2), object(2)
memory usage: 260.0+ bytes


In [22]:
# type conversion of columns

df["Age"] = df["Age"].astype(int)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   ClgID   4 non-null      object 
 1   Name    4 non-null      object 
 2   Age     4 non-null      int32  
 3   Marks   4 non-null      float64
dtypes: float64(1), int32(1), object(2)
memory usage: 244.0+ bytes


In [23]:
df

Unnamed: 0,ClgID,Name,Age,Marks
0,c8,Alice,25,85.0
1,c1,Bob,23,90.0
2,c1,David,22,85.0
3,c1,David,22,78.0


In [24]:
# rename columns
df.rename(columns={"Marks":"Score"}, inplace=True)
df

Unnamed: 0,ClgID,Name,Age,Score
0,c8,Alice,25,85.0
1,c1,Bob,23,90.0
2,c1,David,22,85.0
3,c1,David,22,78.0


In [25]:
# value replace
df["Name"].replace("Alice", "Alicia", inplace=True)
df

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Name"].replace("Alice", "Alicia", inplace=True)


Unnamed: 0,ClgID,Name,Age,Score
0,c8,Alicia,25,85.0
1,c1,Bob,23,90.0
2,c1,David,22,85.0
3,c1,David,22,78.0
