In [None]:
import pandas as pd
import numpy as np

# Year against which DOB-derived ages are computed. Kept at the value the
# pipeline has always used so estimated ages stay consistent with the ages
# already recorded in the dataset; pass ``reference_year`` to override.
REFERENCE_YEAR = 2025

# Columns (after header normalisation) that the cleaning steps read.
REQUIRED_COLUMNS = ("full_name", "gender", "country", "date_of_birth", "age")

# Short/variant spellings mapped to their canonical value. Values that are
# already canonical need no entry: anything not listed is left unchanged.
GENDER_ALIASES = {"m": "male", "f": "female"}
COUNTRY_ALIASES = {"u.s.a": "usa", "united states": "usa"}


def clean_dataset(
    raw: pd.DataFrame, reference_year: int = REFERENCE_YEAR
) -> pd.DataFrame:
    """Return a cleaned copy of *raw*; the input frame is not modified.

    Steps, in order:

    1. Lower-case, strip and underscore the column headers.
    2. Normalise ``gender`` (lower-case, stripped, ``m``/``f`` expanded).
    3. Normalise ``country`` (lower-case, stripped, USA variants unified).
    4. Parse ``date_of_birth`` day-first; unparseable values become NaT.
    5. Coerce ``age`` to numeric, then fill missing ages with
       ``reference_year - birth year``.
    6. Store ``age`` as nullable ``Int64`` (fractional ages are floored).
    7. Drop exact duplicate rows.
    8. Drop rows missing ``full_name``, ``gender`` or ``country``.
    9. Render ``date_of_birth`` as ``dd-mm-yyyy`` text.

    Args:
        raw: Input frame. After header normalisation it must contain the
            columns listed in ``REQUIRED_COLUMNS``.
        reference_year: Year used to estimate an age from a birth year.

    Returns:
        A new, cleaned DataFrame.

    Raises:
        KeyError: If a required column is absent.
    """
    df = raw.copy()

    # 1. Clean column headers to be lowercase and underscored
    df.columns = (
        df.columns.str.lower().str.strip().str.replace(" ", "_", regex=False)
    )

    # Fail early with a readable message instead of a bare KeyError from
    # somewhere in the middle of the pipeline.
    missing = [name for name in REQUIRED_COLUMNS if name not in df.columns]
    if missing:
        raise KeyError(f"Dataset is missing required column(s): {missing}")

    # 2. Standardize gender values
    df["gender"] = df["gender"].str.lower().str.strip().replace(GENDER_ALIASES)

    # 3. Standardize country names
    df["country"] = (
        df["country"].str.lower().str.strip().replace(COUNTRY_ALIASES)
    )

    # 4. Convert 'date_of_birth' to datetime format (and handle errors)
    df["date_of_birth"] = pd.to_datetime(
        df["date_of_birth"], errors="coerce", dayfirst=True
    )

    # 5. Fill missing age values using year from DOB.
    # Coerce to numeric *first* so that non-numeric placeholders (e.g.
    # "unknown") count as missing and can be filled from the DOB as well.
    age = pd.to_numeric(df["age"], errors="coerce")
    estimated = reference_year - df["date_of_birth"].dt.year
    # A birth year after the reference year would give a negative age;
    # leave those missing rather than inventing an impossible value.
    estimated = estimated.where(estimated >= 0)
    age = age.fillna(estimated)

    # 6. Fix age data type to Int (nullable).
    # Floor first: casting a fractional float straight to Int64 raises.
    df["age"] = np.floor(age).astype("Int64")

    # 7. Drop duplicate rows (after standardisation, so that rows differing
    # only in spelling/casing collapse together)
    df = df.drop_duplicates()

    # 8. Drop rows missing essential fields
    df = df.dropna(subset=["full_name", "gender", "country"])

    # 9. Format date to dd-mm-yyyy before saving (NaT stays missing)
    df = df.assign(date_of_birth=df["date_of_birth"].dt.strftime("%d-%m-%Y"))

    return df


if __name__ == "__main__":
    # Load dataset
    df = pd.read_csv("your_dataset.csv")

    df = clean_dataset(df)

    # 10. Save the cleaned dataset
    df.to_csv("Cleaned_Dataset.csv", index=False)
    print(" Cleaned dataset saved as 'Cleaned_Dataset.csv'")


✅ Cleaned dataset saved as 'Cleaned_Dataset.csv'
