In [1]:
# Data Cleaning & Preprocessing -

# Real-world data is messy. Pandas gives us powerful tools to clean and transform data before analysis.


In [3]:
# Handling Missing Values -
# Check for Missing Data

# df.isnull()              # True for NaNs  (Not a Number)
# df.isnull().sum()        # Count missing per column

In [2]:
# Practice --


import pandas as pd

df = pd.read_csv("data_cleaning_sample.csv")  # to read a csv file 
df

Unnamed: 0,Name,Age,City,Gender,Email,Join Date
0,Alice,25.0,New York,F,alice@example.com,01-05-2021
1,Charlie,,Delhi,M,charlie@example,20-07-2021
2,Bob,30.0,Los Angeles,M,bob@example.com,15-06-2020
3,Charlie,,Delhi,M,charlie@example,20-07-2021
4,David,22.0,Mumbai,M,david@example.com,12-11-2019
5,,28.0,Delhi,F,eve@domain.com,
6,Alice,25.0,New York,F,alice@example.com,01-05-2021
7,Alice,25.0,New York,F,alice@example.com,01-05-2021
8,Charlie,,Delhi,M,charlie@example,20-07-2021


In [3]:
df.isnull()   # to check all the missing values, returns true for missing values. 

Unnamed: 0,Name,Age,City,Gender,Email,Join Date
0,False,False,False,False,False,False
1,False,True,False,False,False,False
2,False,False,False,False,False,False
3,False,True,False,False,False,False
4,False,False,False,False,False,False
5,True,False,False,False,False,True
6,False,False,False,False,False,False
7,False,False,False,False,False,False
8,False,True,False,False,False,False


In [4]:
df.isnull().sum() # to check column wise how many mission items are there in particular colums - in numbers.

Name         1
Age          3
City         0
Gender       0
Email        0
Join Date    1
dtype: int64

In [5]:
df.dropna()   # gives the data without missing items.  This is for rows !! 

Unnamed: 0,Name,Age,City,Gender,Email,Join Date
0,Alice,25.0,New York,F,alice@example.com,01-05-2021
2,Bob,30.0,Los Angeles,M,bob@example.com,15-06-2020
4,David,22.0,Mumbai,M,david@example.com,12-11-2019
6,Alice,25.0,New York,F,alice@example.com,01-05-2021
7,Alice,25.0,New York,F,alice@example.com,01-05-2021


In [6]:
df.dropna(axis=1) # in case you only want columns without missing items - this is for column -  " axis=1 for columns(features) "

Unnamed: 0,City,Gender,Email
0,New York,F,alice@example.com
1,Delhi,M,charlie@example
2,Los Angeles,M,bob@example.com
3,Delhi,M,charlie@example
4,Mumbai,M,david@example.com
5,Delhi,F,eve@domain.com
6,New York,F,alice@example.com
7,New York,F,alice@example.com
8,Delhi,M,charlie@example


In [7]:
df.fillna(0)               # returns 0 where ever there was missing data (NAN)

Unnamed: 0,Name,Age,City,Gender,Email,Join Date
0,Alice,25.0,New York,F,alice@example.com,01-05-2021
1,Charlie,0.0,Delhi,M,charlie@example,20-07-2021
2,Bob,30.0,Los Angeles,M,bob@example.com,15-06-2020
3,Charlie,0.0,Delhi,M,charlie@example,20-07-2021
4,David,22.0,Mumbai,M,david@example.com,12-11-2019
5,0,28.0,Delhi,F,eve@domain.com,0
6,Alice,25.0,New York,F,alice@example.com,01-05-2021
7,Alice,25.0,New York,F,alice@example.com,01-05-2021
8,Charlie,0.0,Delhi,M,charlie@example,20-07-2021


In [8]:
df.ffill()  # Copy the previous value downward to fill the blank.”

Unnamed: 0,Name,Age,City,Gender,Email,Join Date
0,Alice,25.0,New York,F,alice@example.com,01-05-2021
1,Charlie,25.0,Delhi,M,charlie@example,20-07-2021
2,Bob,30.0,Los Angeles,M,bob@example.com,15-06-2020
3,Charlie,30.0,Delhi,M,charlie@example,20-07-2021
4,David,22.0,Mumbai,M,david@example.com,12-11-2019
5,David,28.0,Delhi,F,eve@domain.com,12-11-2019
6,Alice,25.0,New York,F,alice@example.com,01-05-2021
7,Alice,25.0,New York,F,alice@example.com,01-05-2021
8,Charlie,25.0,Delhi,M,charlie@example,20-07-2021


In [9]:
df.bfill()    # “Copy the next value upward to fill the blank.”

Unnamed: 0,Name,Age,City,Gender,Email,Join Date
0,Alice,25.0,New York,F,alice@example.com,01-05-2021
1,Charlie,30.0,Delhi,M,charlie@example,20-07-2021
2,Bob,30.0,Los Angeles,M,bob@example.com,15-06-2020
3,Charlie,22.0,Delhi,M,charlie@example,20-07-2021
4,David,22.0,Mumbai,M,david@example.com,12-11-2019
5,Alice,28.0,Delhi,F,eve@domain.com,01-05-2021
6,Alice,25.0,New York,F,alice@example.com,01-05-2021
7,Alice,25.0,New York,F,alice@example.com,01-05-2021
8,Charlie,,Delhi,M,charlie@example,20-07-2021


In [13]:
df["Age"].fillna(df["Age"]).mean()  # to replace with mean

# df["whatever you wish to replace"] . fillna([" whatever you want to replace it with"]). expression, in this case we wanted it to be mean. so .mean()

np.float64(25.833333333333332)

In [14]:
df.duplicated()  # Returns true for the duplicate rows index wise starting from 0 

0    False
1    False
2    False
3     True
4    False
5    False
6     True
7     True
8     True
dtype: bool

In [15]:
df.drop_duplicates()    # Returns the data that does not have duplicate items. 

Unnamed: 0,Name,Age,City,Gender,Email,Join Date
0,Alice,25.0,New York,F,alice@example.com,01-05-2021
1,Charlie,,Delhi,M,charlie@example,20-07-2021
2,Bob,30.0,Los Angeles,M,bob@example.com,15-06-2020
4,David,22.0,Mumbai,M,david@example.com,12-11-2019
5,,28.0,Delhi,F,eve@domain.com,


In [18]:
# filtered data without Nan items 

df2 = df.dropna().copy()
df2

Unnamed: 0,Name,Age,City,Gender,Email,Join Date
0,Alice,25.0,New York,F,alice@example.com,01-05-2021
2,Bob,30.0,Los Angeles,M,bob@example.com,15-06-2020
4,David,22.0,Mumbai,M,david@example.com,12-11-2019
6,Alice,25.0,New York,F,alice@example.com,01-05-2021
7,Alice,25.0,New York,F,alice@example.com,01-05-2021


In [20]:
# Typecast 

df2["Age"] = df2["Age"].astype(int)      # typecast float into Interger using .astype()
df2

Unnamed: 0,Name,Age,City,Gender,Email,Join Date
0,Alice,25,New York,F,alice@example.com,01-05-2021
2,Bob,30,Los Angeles,M,bob@example.com,15-06-2020
4,David,22,Mumbai,M,david@example.com,12-11-2019
6,Alice,25,New York,F,alice@example.com,01-05-2021
7,Alice,25,New York,F,alice@example.com,01-05-2021


In [23]:
#Apply()

df2["Age_Group"] = df2["Age"].apply(lambda x: "Adult" if x >= 25 else "Minor")  
df2

# it lets you say "apply a function and run it on each item (or row/column) 
# of a collection of data. in simple words - .apply() Take a function and run it on each item (or row/column) of a collection of data.

# Used apply() to create a new column called Age group made of conditions ( a function )with the age column that already existed. 

Unnamed: 0,Name,Age,City,Gender,Email,Join Date,Age_Group
0,Alice,25,New York,F,alice@example.com,01-05-2021,Adult
2,Bob,30,Los Angeles,M,bob@example.com,15-06-2020,Adult
4,David,22,Mumbai,M,david@example.com,12-11-2019,Minor
6,Alice,25,New York,F,alice@example.com,01-05-2021,Adult
7,Alice,25,New York,F,alice@example.com,01-05-2021,Adult


In [24]:
# Map()  It returns a new Series: This new Series contains all the transformed values, leaving your original Series unchanged. 
# It helps you change every element of a column to something else.


# It takes a dict 

gender_map = {"M" : "Male", "F" : "Female", "O" : "Others"}
df2["Gender"] = df2["Gender"].map(gender_map)
df2

Unnamed: 0,Name,Age,City,Gender,Email,Join Date,Age_Group
0,Alice,25,New York,Female,alice@example.com,01-05-2021,Adult
2,Bob,30,Los Angeles,Male,bob@example.com,15-06-2020,Adult
4,David,22,Mumbai,Male,david@example.com,12-11-2019,Minor
6,Alice,25,New York,Female,alice@example.com,01-05-2021,Adult
7,Alice,25,New York,Female,alice@example.com,01-05-2021,Adult


In [25]:
# replace()          #It simply replaces the specific values as per instructions example below :

# It takes a dict  

df2["City"] = df2["City"].replace({"Mumbai": "New Delhi"})   # changed Mumbai to delhi 
df2

Unnamed: 0,Name,Age,City,Gender,Email,Join Date,Age_Group
0,Alice,25,New York,Female,alice@example.com,01-05-2021,Adult
2,Bob,30,Los Angeles,Male,bob@example.com,15-06-2020,Adult
4,David,22,New Delhi,Male,david@example.com,12-11-2019,Minor
6,Alice,25,New York,Female,alice@example.com,01-05-2021,Adult
7,Alice,25,New York,Female,alice@example.com,01-05-2021,Adult


In [26]:
      # pd.to_datetime()

# Why is pd.to_datetime() special?

# Unlike astype(), which works on simple data types (like integers, strings, etc.), pd.to_datetime() is designed to:

# Handle different date formats (e.g., "YYYY-MM-DD", "MM/DD/YYYY", etc.).

# Handle mixed types (e.g., some date strings, some NaT, or missing values).

# Convert integer timestamps (e.g., UNIX time) into datetime objects.

# Recognize timezones if provided.


In [None]:
# Summary - 

# Use isnull() # to check all the missing values, returns true for missing values. 
# fillna() #  # returns 0 where ever there was missing data (NAN)
# dropna() for missing data
# Clean text with .str
# convert types with .astype()
# Use apply(), map(), replace() to transform your columns
# Data cleaning is where 80% of your time goes in real projects