In [3]:
# Day 11 ‚Äî Numpy & Pandas (BMI Calculation + Data Cleaning)

# --------------------------------------------------------------
# ‚úÖ Task 1: There are 5 students with height & weight. Calculate BMI.
# --------------------------------------------------------------

import pandas as pd

# Sample student data
data = {
    'Name': ['A', 'B', 'C', 'D', 'E'],
    'Height_cm': [160, 170, 165, 155, 180],  # height in centimeters
    'Weight_kg': [55, 65, 70, 50, 80]        # weight in kilograms
}

# Create DataFrame
df = pd.DataFrame(data)

# Convert height from cm to meters
df['Height_m'] = df['Height_cm'] / 100

# BMI Formula = Weight (kg) / (Height (m) ^ 2)
df['BMI'] = df['Weight_kg'] / (df['Height_m'] ** 2)

print("‚úÖ BMI Calculation Result:")
print(df)


‚úÖ BMI Calculation Result:
  Name  Height_cm  Weight_kg  Height_m        BMI
0    A        160         55      1.60  21.484375
1    B        170         65      1.70  22.491349
2    C        165         70      1.65  25.711662
3    D        155         50      1.55  20.811655
4    E        180         80      1.80  24.691358


In [4]:
# --------------------------------------------------------------
# ‚úÖ Task 2: Demonstrate Data Cleaning Methods in Pandas
# --------------------------------------------------------------

import pandas as pd
import numpy as np

# Sample dataset with missing values, wrong data, duplicates
clean_df = pd.DataFrame({
    'Date': ['2025-10-01', None, '2025-10-03', '2025-10-04'],
    'Calories': [420, None, 300, 300],
    'Duration': [45, 450, 30, 450]
})

print("üîπ Original Data:")
display(clean_df)

# 1) Drop Null Values
print("\n1Ô∏è‚É£ Drop Null Rows (dropna):")
display(clean_df.dropna())

# 2) Replace Empty Values
print("\n2Ô∏è‚É£ Replace All Empty with 130 (fillna):")
temp = clean_df.copy()
temp.fillna(130, inplace=True)
display(temp)

# 3) Replace Only Specific Column
print("\n3Ô∏è‚É£ Replace Only 'Calories' Null with 130:")
temp = clean_df.copy()
temp['Calories'].fillna(130, inplace=True)
display(temp)

# 4) Replace Using Mean
print("\n4Ô∏è‚É£ Replace Using Mean of 'Calories':")
temp = clean_df.copy()
temp['Calories'].fillna(temp['Calories'].mean(), inplace=True)
display(temp)

# 5) Drop rows with wrong Date format
print("\n5Ô∏è‚É£ Drop rows where Date is Null:")
temp = clean_df.copy()
temp = temp.dropna(subset=['Date'])
display(temp)

# 6) Convert to Correct Date Format
print("\n6Ô∏è‚É£ Convert 'Date' to Datetime Format:")
temp = clean_df.copy()
temp['Date'] = pd.to_datetime(temp['Date'], errors='coerce')
display(temp)

# 7) Replace Wrong Data (Duration > 120)
print("\n7Ô∏è‚É£ Replace Duration > 120 with 120:")
temp = clean_df.copy()
temp.loc[temp['Duration'] > 120, 'Duration'] = 120
display(temp)

# 8) Drop Wrong Data (Duration > 120)
print("\n8Ô∏è‚É£ Drop Rows Where Duration > 120:")
temp = clean_df.copy()
temp = temp[temp['Duration'] <= 120]
display(temp)

# 9) Check Duplicates
print("\n9Ô∏è‚É£ Check for Duplicates:")
print(clean_df.duplicated())

# 10) Remove Duplicates
print("\nüîü Remove Duplicates:")
display(clean_df.drop_duplicates())


üîπ Original Data:


Unnamed: 0,Date,Calories,Duration
0,2025-10-01,420.0,45
1,,,450
2,2025-10-03,300.0,30
3,2025-10-04,300.0,450



1Ô∏è‚É£ Drop Null Rows (dropna):


Unnamed: 0,Date,Calories,Duration
0,2025-10-01,420.0,45
2,2025-10-03,300.0,30
3,2025-10-04,300.0,450



2Ô∏è‚É£ Replace All Empty with 130 (fillna):


Unnamed: 0,Date,Calories,Duration
0,2025-10-01,420.0,45
1,130,130.0,450
2,2025-10-03,300.0,30
3,2025-10-04,300.0,450



3Ô∏è‚É£ Replace Only 'Calories' Null with 130:


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  temp['Calories'].fillna(130, inplace=True)


Unnamed: 0,Date,Calories,Duration
0,2025-10-01,420.0,45
1,,130.0,450
2,2025-10-03,300.0,30
3,2025-10-04,300.0,450



4Ô∏è‚É£ Replace Using Mean of 'Calories':


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  temp['Calories'].fillna(temp['Calories'].mean(), inplace=True)


Unnamed: 0,Date,Calories,Duration
0,2025-10-01,420.0,45
1,,340.0,450
2,2025-10-03,300.0,30
3,2025-10-04,300.0,450



5Ô∏è‚É£ Drop rows where Date is Null:


Unnamed: 0,Date,Calories,Duration
0,2025-10-01,420.0,45
2,2025-10-03,300.0,30
3,2025-10-04,300.0,450



6Ô∏è‚É£ Convert 'Date' to Datetime Format:


Unnamed: 0,Date,Calories,Duration
0,2025-10-01,420.0,45
1,NaT,,450
2,2025-10-03,300.0,30
3,2025-10-04,300.0,450



7Ô∏è‚É£ Replace Duration > 120 with 120:


Unnamed: 0,Date,Calories,Duration
0,2025-10-01,420.0,45
1,,,120
2,2025-10-03,300.0,30
3,2025-10-04,300.0,120



8Ô∏è‚É£ Drop Rows Where Duration > 120:


Unnamed: 0,Date,Calories,Duration
0,2025-10-01,420.0,45
2,2025-10-03,300.0,30



9Ô∏è‚É£ Check for Duplicates:
0    False
1    False
2    False
3    False
dtype: bool

üîü Remove Duplicates:


Unnamed: 0,Date,Calories,Duration
0,2025-10-01,420.0,45
1,,,450
2,2025-10-03,300.0,30
3,2025-10-04,300.0,450
