In [1]:
# Question 1: Handling Missing Values with Conditional Filling
# Description: Fill missing values in a specific column based on a condition from another column.



In [2]:
# Question 2: Removing Outliers by Rescaling
# Description: Remove outliers by standardizing a numerical column using z-scores.



In [3]:
# Question 3: Applying Data Type Conversion
# Description: Convert the 'Age' column to integers after filling missing values.



In [4]:
# Question 4: Automating Data Cleaning with Functions
# Description: Create a function that automates the process of filling missing values, removing duplicates, and standardizing column names.



In [5]:
# Question 5: Complex Data Normalization
# Description: Normalize a numeric column to a range using min-max scaling.



In [6]:
import pandas as pd
import numpy as np
from scipy import stats

# -------------------------------------
# Create sample synthetic dataset
# -------------------------------------
data = pd.DataFrame({
    'Gender': ['Male', 'Female', 'Female', 'Male', np.nan, 'Female', 'Male'],
    'Age': [25, np.nan, 22, 28, 32, np.nan, 40],
    'Salary': [50000, 55000, 52000, 58000, 1000000, 54000, 49000],  # Outlier in Salary
    'Department': ['HR', 'Finance', 'Finance', 'HR', 'HR', 'Finance', 'HR']
})

print("Original Data:")
print(data)

# --------------------------------------------------
# Question 1: Fill missing 'Age' based on 'Gender'
# --------------------------------------------------
# First, handle missing Gender to avoid groupby issues
data['Gender'] = data['Gender'].fillna('Unknown')

# Fill missing Age by Gender group
data['Age'] = data.groupby('Gender')['Age'].transform(lambda x: x.fillna(x.mean()))

# --------------------------------------------------
# Question 2: Remove outliers using z-score on 'Salary'
# --------------------------------------------------
z_scores = np.abs(stats.zscore(data['Salary']))
data = data[z_scores < 3]  # Keep only rows where z-score < 3

# --------------------------------------------------
# Question 3: Convert 'Age' to integer safely
# --------------------------------------------------
# Fill any remaining NaNs (in case groupby failed for some edge case)
data['Age'] = data['Age'].fillna(data['Age'].mean())
data['Age'] = data['Age'].round().astype(int)

# --------------------------------------------------
# Question 4: Automate data cleaning
# --------------------------------------------------
def clean_data(df):
    df = df.copy()
    
    # Fill missing Gender
    df['Gender'] = df['Gender'].fillna('Unknown')

    # Fill missing Age by Gender group
    df['Age'] = df.groupby('Gender')['Age'].transform(lambda x: x.fillna(x.mean()))
    df['Age'] = df['Age'].fillna(df['Age'].mean()).round().astype(int)

    # Remove duplicates
    df = df.drop_duplicates()

    # Standardize column names
    df.columns = [col.strip().lower().replace(' ', '_') for col in df.columns]

    return df

# Apply cleaning function
data = clean_data(data)

# --------------------------------------------------
# Question 5: Normalize 'salary' with min-max scaling
# --------------------------------------------------
min_salary = data['salary'].min()
max_salary = data['salary'].max()
data['salary_normalized'] = (data['salary'] - min_salary) / (max_salary - min_salary)

# --------------------------------------------------
# Final Output
# --------------------------------------------------
print("\nCleaned & Processed Data:")
print(data)


Original Data:
   Gender   Age   Salary Department
0    Male  25.0    50000         HR
1  Female   NaN    55000    Finance
2  Female  22.0    52000    Finance
3    Male  28.0    58000         HR
4     NaN  32.0  1000000         HR
5  Female   NaN    54000    Finance
6    Male  40.0    49000         HR

Cleaned & Processed Data:
    gender  age   salary department  salary_normalized
0     Male   25    50000         HR           0.001052
1   Female   22    55000    Finance           0.006309
2   Female   22    52000    Finance           0.003155
3     Male   28    58000         HR           0.009464
4  Unknown   32  1000000         HR           1.000000
5   Female   22    54000    Finance           0.005258
6     Male   40    49000         HR           0.000000
