In [45]:
import pandas as pd
import numpy as np

# Deliberately messy sample records: every column carries exactly one missing
# entry, rows 0 and 3 are identical, and Salary is stored as text.
data = dict(
    Name=["Amit", "Neha", "Ravi", "Amit", "Priya", None],
    Age=[21, 22, np.nan, 21, 23, 22],
    City=["Pune", "Mumbai", "Delhi", "Pune", None, "Mumbai"],
    Salary=["50000", "60000", "55000", "50000", "65000", None],
)

# Column order follows the insertion order of the mapping above.
df = pd.DataFrame(data=data)

## **🔹 Task 1: Detect Missing Values**

**Goal: Identify messy data.**

1.  Display the DataFrame
2.  Check missing values using `df.isna()` and count them per column with `df.isna().sum()`

In [46]:
# Show the raw table before any cleaning.
print("Original Dataset : ", df, sep="\n")

# Build the element-wise missing-value mask once and reuse it below.
missing_mask = df.isna()
print("\nChecking Missing Values : ", missing_mask, sep="\n")

# Summing the boolean mask gives the number of missing entries per column.
print("\nTotal Missing Values : ", missing_mask.sum(), sep="\n")

Original Dataset : 
    Name   Age    City Salary
0   Amit  21.0    Pune  50000
1   Neha  22.0  Mumbai  60000
2   Ravi   NaN   Delhi  55000
3   Amit  21.0    Pune  50000
4  Priya  23.0    None  65000
5   None  22.0  Mumbai   None

Checking Missing Values : 
    Name    Age   City  Salary
0  False  False  False   False
1  False  False  False   False
2  False   True  False   False
3  False  False  False   False
4  False  False   True   False
5   True  False  False    True

Total Missing Values : 
Name      1
Age       1
City      1
Salary    1
dtype: int64


## **🔹 Task 2: Handle Missing Values**

**Goal: Clean incomplete data.**

1. Fill missing Age with the mean Age
2. Fill missing City with "Unknown"
3. Drop rows where Salary is missing

In [47]:
# Impute the missing Age with the mean of the ages that are present
# (Series.mean skips NaN by default).
mean_age = df["Age"].mean()
df["Age"] = df["Age"].fillna(mean_age)
print("Fill Missing Age with the mean Age : ", df, sep="\n")

# A missing City becomes the explicit placeholder "Unknown".
df["City"] = df["City"].fillna("Unknown")
print("\nFill missing City with 'Unknown' : ", df, sep="\n")

# A row without a Salary is discarded entirely.
df = df.dropna(subset=["Salary"])
print("\nDrop rows where Salary is missing", df, sep="\n")

Fill Missing Age with the mean Age : 
    Name   Age    City Salary
0   Amit  21.0    Pune  50000
1   Neha  22.0  Mumbai  60000
2   Ravi  21.8   Delhi  55000
3   Amit  21.0    Pune  50000
4  Priya  23.0    None  65000
5   None  22.0  Mumbai   None

Fill missing City with 'Unknown' : 
    Name   Age     City Salary
0   Amit  21.0     Pune  50000
1   Neha  22.0   Mumbai  60000
2   Ravi  21.8    Delhi  55000
3   Amit  21.0     Pune  50000
4  Priya  23.0  Unknown  65000
5   None  22.0   Mumbai   None

Drop rows where Salary is missing
    Name   Age     City Salary
0   Amit  21.0     Pune  50000
1   Neha  22.0   Mumbai  60000
2   Ravi  21.8    Delhi  55000
3   Amit  21.0     Pune  50000
4  Priya  23.0  Unknown  65000


## **🔹 Task 3: Change Data Types**
**Goal: Make data usable for calculations.**

1. Display data types of all columns  
2. Convert **Salary** column to integer  
3. Verify updated data types

In [48]:
# Inspect the column dtypes before the conversion.
print("Display Data Types of All Columns : ", df.dtypes, sep="\n")

# Cast only the Salary column to integer; every other column is left as is.
df = df.astype({"Salary": int})

# Re-inspect the dtypes to confirm that the cast took effect.
print("\nVerify Updated Data Types : ", df.dtypes, sep="\n")

Display Data Types of All Columns : 
Name       object
Age       float64
City       object
Salary     object
dtype: object

Verify Updated Data Types : 
Name       object
Age       float64
City       object
Salary      int64
dtype: object


## **🔹 Task 4: Replace Values**
**Goal: Standardize categorical data.**

1. Replace `"Pune"` with `"PUNE"`  
2. Replace `"Mumbai"` with `"MUMBAI"`  
3. Display the updated DataFrame  

In [49]:
# Upper-case spellings for the two city names being standardised; any city
# not listed here is left exactly as it is.
city_spellings = {"Pune": "PUNE", "Mumbai": "MUMBAI"}
df["City"] = df["City"].replace(to_replace=city_spellings)

# Show the table with the standardised City column.
print("Updated DataFrame : ", df, sep="\n")

Updated DataFrame : 
    Name   Age     City  Salary
0   Amit  21.0     PUNE   50000
1   Neha  22.0   MUMBAI   60000
2   Ravi  21.8    Delhi   55000
3   Amit  21.0     PUNE   50000
4  Priya  23.0  Unknown   65000


## **🔹 Task 5: Detect & Remove Duplicates**  
**Goal: Ensure data accuracy.**

1. Detect duplicate rows  
2. Remove duplicate records  
3. Display DataFrame shape after removing duplicates

In [50]:
# Flag every row that repeats an earlier row across all columns
# (the first occurrence itself is not flagged).
duplicate_flags = df.duplicated()
print("Detect Duplicate Rows : ", duplicate_flags, sep="\n")

# Keep only the first occurrence of each distinct row.
df = df.drop_duplicates()

# Report (rows, columns) of the de-duplicated table.
print("\nDisplay DataFrame shape after removing duplicates : ", df.shape, sep="\n")

Detect Duplicate Rows : 
0    False
1    False
2    False
3     True
4    False
dtype: bool

Display DataFrame shape after removing duplicates : 
(4, 4)


## **🔹 Task 6: Drop Rows & Columns**  
**Goal: Remove unnecessary data.**

1. Drop the **City** column  
2. Drop the **first row** using index  
3. Display the final DataFrame  

In [54]:
# Drop the City column. Reassigning the result (rather than mutating in place)
# makes it explicit which frame this step produces.
df = df.drop(columns="City")

# Drop whichever row is currently first by looking its label up from the
# index. A hard-coded label 0 only works while the first row still carries
# label 0 and raises KeyError otherwise; slicing the index with [:1] also
# copes with an empty frame (nothing is dropped).
df = df.drop(index=df.index[:1])

# Display the final DataFrame
print("Display the final DataFrame : ")
print(df)

Display the final DataFrame : 
    Name   Age  Salary
1   Neha  22.0   60000
2   Ravi  21.8   55000
4  Priya  23.0   65000


## **🔹 Task 7: Final Data Inspection**
**Goal: Verify cleaned dataset.**

1. Display the first 5 rows  
2. Display summary statistics  
3. Check dataset information  

In [59]:
# Display the first 5 rows
print("Display the first 5 rows : ")
print(df.head())

# Display summary statistics for the numeric columns
print("\nDisplay summary statistics : ")
print(df.describe())

# Check dataset information.
# DataFrame.info() writes its report straight to stdout and returns None, so
# it is called directly; wrapping it in print() would emit a stray "None"
# line after the report.
print("\nCheck dataset information : ")
df.info()

Display the first 5 rows : 
    Name   Age  Salary
1   Neha  22.0   60000
2   Ravi  21.8   55000
4  Priya  23.0   65000

Display summary statistics : 
             Age   Salary
count   3.000000      3.0
mean   22.266667  60000.0
std     0.642910   5000.0
min    21.800000  55000.0
25%    21.900000  57500.0
50%    22.000000  60000.0
75%    22.500000  62500.0
max    23.000000  65000.0

Check dataset information : 
<class 'pandas.core.frame.DataFrame'>
Index: 3 entries, 1 to 4
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Name    3 non-null      object 
 1   Age     3 non-null      float64
 2   Salary  3 non-null      int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 96.0+ bytes
None
