### Types of Data Quality Issues in ML Pipelines
**Description**: Analyze a sample dataset (provided by the instructor) and identify instances of data quality issues such as missing values, bias, or noise.

In [None]:
# Write your code from here

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Step 1: Build a small in-memory sample dataset that deliberately contains
# data quality problems (NaN ages and a missing income) for the checks below.
data = dict(
    age=[25, 30, np.nan, 22, 40, 35, 29, np.nan, 50, 27],
    income=[50000, 60000, 55000, 58000, 62000, None, 58000, 59000, 60000, 61000],
    gender=["M", "F", "F", "M", "F", "M", "M", "M", "F", "F"],
    purchase=[1, 0, 1, 0, 1, 1, 0, 1, 0, 1],
)
df = pd.DataFrame(data)

# The table has only 10 rows, so print it whole to make the gaps visible.
print("=== Dataset Preview ===", df, sep="\n")

# Step 2: Detect Missing Values
# Count the null (NaN/None) cells in every column; any non-zero count marks
# a column with gaps.
print("\n=== Missing Values ===")
missing_per_column = df.isna().sum()
print(missing_per_column)

# Step 3: Detect Bias (Example: Gender bias in purchase)
# Averaging `purchase` within each gender gives that group's purchase rate;
# a gap between the groups is a hint (not proof) of bias.
purchase_rate_by_gender = (
    df["purchase"]
    .groupby(df["gender"])
    .mean()
)
print("\n=== Purchase Rate by Gender (Potential Bias Indicator) ===")
print(purchase_rate_by_gender)

# Step 4: Detect Noise (Outliers in numerical features using boxplots)
# Draw one box plot per numeric column, side by side in a single figure.
panels = [
    ("age", "Age Distribution"),
    ("income", "Income Distribution"),
]
fig, axes = plt.subplots(1, 2, figsize=(10, 4))
for ax, (column, title) in zip(axes, panels):
    sns.boxplot(x=df[column], ax=ax)
    ax.set_title(title)

# Adjust spacing so the two panels and their titles do not overlap.
fig.tight_layout()
plt.show()

# Optional: Summary
# Every finding is computed from the data instead of being a fixed string.
# A hard-coded summary can contradict the results printed above: in the
# sample defined in this script both genders have the same purchase rate,
# yet the old text always claimed the rate "differs by gender".
def _iqr_outliers(values):
    """Return the non-null entries of *values* outside the 1.5 * IQR fences.

    Args:
        values: A numeric pandas Series; nulls are dropped before the
            quartiles are computed.

    Returns:
        The subset of the non-null values lying below Q1 - 1.5 * IQR or
        above Q3 + 1.5 * IQR. Empty when nothing qualifies, including
        when *values* has no non-null entries.
    """
    values = values.dropna()
    q1, q3 = values.quantile(0.25), values.quantile(0.75)
    margin = 1.5 * (q3 - q1)
    return values[(values < q1 - margin) | (values > q3 + margin)]


def _print_quality_summary(frame):
    """Print missing-value, bias and outlier findings computed from *frame*.

    Args:
        frame: DataFrame with the columns "age", "income", "gender" and
            "purchase".
    """
    print("\nSummary:")

    # Missing values: list only the columns that actually contain nulls.
    null_counts = frame.isnull().sum()
    null_counts = null_counts[null_counts > 0]
    if null_counts.empty:
        print("- No missing values found.")
    else:
        listed = ", ".join(
            f"'{column}' ({count})" for column, count in null_counts.items()
        )
        print(f"- Missing values found in: {listed}.")

    # Bias: report a difference only when the per-gender rates really differ.
    rates = frame.groupby("gender")["purchase"].mean()
    if len(rates) < 2:
        print("- Bias check skipped: fewer than two gender groups to compare.")
    elif np.isclose(rates.max(), rates.min()):
        print("- No bias indicated: purchase rate is the same for every gender.")
    else:
        gap = rates.max() - rates.min()
        print(
            "- Possible bias detected: purchase rate differs by gender "
            f"(gap {gap:.2f})."
        )

    # Noise: flag values outside the 1.5 * IQR fences in each numeric column.
    found_outliers = False
    for column in ("age", "income"):
        outliers = _iqr_outliers(frame[column])
        if not outliers.empty:
            found_outliers = True
            print(
                f"- Outliers (noise) in '{column}' by the 1.5 * IQR rule: "
                f"{outliers.tolist()}"
            )
    if not found_outliers:
        print("- No outliers (noise) found by the 1.5 * IQR rule.")
    print("- Check boxplots for outliers (noise) in numerical features.")


_print_quality_summary(df)

