# **Data Cleaning**

## Import Libraries and Load Data

In [13]:
import pandas as pd
import matplotlib.pyplot as plt

In [14]:
# Path of the raw student CSV (presumably a Google Drive mount inside Colab;
# adjust when running elsewhere).
dataset1 = '/content/drive/MyDrive/Ds Data Sets/student3.csv'

# Parse the CSV into the working DataFrame used by the following cells.
df = pd.read_csv(filepath_or_buffer=dataset1)

In [15]:
# Display the first few rows of the dataframe.
# Use head() instead of echoing the bare frame so the preview stays short
# no matter how many records the CSV contains.
df.head(10)

Unnamed: 0,roll,name,class,marks,age
0,1,anil,TE,56.77,22.0
1,2,amit,TE,59.77,21.0
2,3,aniket,BE,76.88,19.0
3,4,ajinkya,TE,69.66,20.0
4,5,asha,TE,63.28,
5,6,ayesha,BE,,20.0
6,7,amar,BE,65.34,19.0
7,8,Amita,be,68.33,23.0
8,9,amol,TE,56.75,20.0
9,9,amol,TE,56.75,20.0


## Inspecting the Dataset

In [16]:
# Display the mean of 'age' and 'marks'.
# The raw CSV header carries stray whitespace (the marks column is literally
# 'marks ' right after loading), so the lookup goes through a copy whose
# labels are stripped. `rename` returns a new frame and leaves `df` untouched,
# which keeps this cell working both before and after the column names are
# cleaned, instead of raising KeyError on a re-run.
stats_frame = df.rename(columns=str.strip)
display(stats_frame['age'].mean())
display(stats_frame['marks'].mean())

20.785714285714285

66.96714285714286

## Clean Column Names

In [17]:
# Trim surrounding whitespace from every column label (e.g. 'marks ' -> 'marks')
# and write the cleaned labels back onto the frame.
clean_labels = df.columns.str.strip()
df.columns = clean_labels

# Echo the resulting labels to confirm the clean-up.
df.columns

Index(['roll', 'name', 'class', 'marks', 'age'], dtype='object')

## Standardize Column Data

In [18]:
# Title-case every student name so that e.g. 'anil' and 'Amita' share one
# capitalisation style.
# NOTE(review): the 'class' column also has mixed case ('be' vs 'BE') and is
# not normalised here - confirm whether it should be.
titled_names = df['name'].str.title()
df['name'] = titled_names

# Show the standardised column.
df['name']

Unnamed: 0,name
0,Anil
1,Amit
2,Aniket
3,Ajinkya
4,Asha
5,Ayesha
6,Amar
7,Amita
8,Amol
9,Amol


## Handle Duplicate Rows

In [19]:
# Flag rows that repeat an earlier row across all columns; the first
# occurrence is reported as False and later repeats as True.
df.duplicated(keep='first')

Unnamed: 0,0
0,False
1,False
2,False
3,False
4,False
5,False
6,False
7,False
8,False
9,True


In [20]:
# Drop duplicate rows, keeping the first occurrence of each.
# Reassign instead of using inplace=True: the result is the same set of rows,
# but the cell no longer mutates an object that earlier cells displayed and it
# can be re-run safely. The original index labels are kept (gaps remain where
# rows were removed).
df = df.drop_duplicates()

# Re-check: every remaining row should now be reported as False.
df.duplicated()

Unnamed: 0,0
0,False
1,False
2,False
3,False
4,False
5,False
6,False
7,False
8,False
10,False


## Check for Missing Values

In [21]:
# Per-column tally of present (non-missing) entries: build a boolean mask of
# non-null cells and sum it column-wise.
df.notna().sum()

Unnamed: 0,0
roll,15
name,15
class,14
marks,13
age,13


In [22]:
# Per-column tally of missing entries: mask the null cells, then sum each
# column of the mask.
df.isna().sum()

Unnamed: 0,0
roll,0
name,0
class,1
marks,2
age,2


## Impute Missing Values

In [23]:
# Impute each column by assigning the filled Series back onto the frame.
# Calling `fillna(..., inplace=True)` on `df[col]` is chained assignment: it
# acts on an intermediate object, raises a FutureWarning on pandas 2.x and
# leaves `df` unchanged once Copy-on-Write is the default behaviour.

# Fill missing values in 'marks' column with the mean of 'marks'
df['marks'] = df['marks'].fillna(df['marks'].mean())

# Fill missing values in 'age' column with the median of 'age'
df['age'] = df['age'].fillna(df['age'].median())

# Fill missing values in 'class' column with the mode of 'class'
# (mode() returns a Series; [0] picks the first most-frequent value)
df['class'] = df['class'].fillna(df['class'].mode()[0])

## Forward Fill and Backward Fill Missing Values

In [26]:
# Build two alternative gap-filled copies of the frame.
# NOTE(review): the imputation cell above appears to have filled every column
# that reported missing values, so both copies likely equal `df` - confirm
# whether these fills were meant to run on the un-imputed data instead.

# Propagate the last valid observation downwards along the rows.
df_ffill = df.ffill(axis=0)

# Propagate the next valid observation upwards along the rows.
df_bfill = df.bfill(axis=0)

# Preview the first five rows of the forward-filled copy.
df_ffill.head(n=5)

Unnamed: 0,roll,name,class,marks,age
0,1,Anil,TE,56.77,22.0
1,2,Amit,TE,59.77,21.0
2,3,Aniket,BE,76.88,19.0
3,4,Ajinkya,TE,69.66,20.0
4,5,Asha,TE,63.28,21.0


In [27]:
# Preview the first five rows of the backward-filled copy for comparison.
df_bfill.head(n=5)

Unnamed: 0,roll,name,class,marks,age
0,1,Anil,TE,56.77,22.0
1,2,Amit,TE,59.77,21.0
2,3,Aniket,BE,76.88,19.0
3,4,Ajinkya,TE,69.66,20.0
4,5,Asha,TE,63.28,21.0


## Saving Cleaned Dataset

In [28]:
# Use the forward-filled frame as the final cleaned dataset.
# This binds a second name to the same object; it does not copy the data.
cleaned_dataset = df_ffill

In [30]:
# Write the cleaned frame to a CSV file in the current working directory.
# NOTE(review): index=True also writes the row index as an unnamed first
# column (with gaps where duplicates were dropped) - confirm that is wanted.
cleaned_dataset.to_csv(path_or_buf='cleaned_dataset.csv', index=True)

# Tell the user where the file went.
print("Cleaned Dataset saved as 'cleaned_dataset.csv'")

Cleaned Dataset saved as 'cleaned_dataset.csv'
