In [1]:
import os
import random

import numpy as np
import pandas as pd

# Set random seed for reproducibility
np.random.seed(42)

# Generate synthetic dataset
num_rows = 5000

# 1. ID Column (unique identifier)
ids = np.arange(1, num_rows + 1)

# 2. Name Column (some duplicates for cleaning practice)
names = [random.choice(["Alice", "Bob", "Charlie", "David", "Eve", "Frank", "Grace", "Hank"]) for _ in range(num_rows)]
names[50] = "Alice "  # Intentional trailing space (inconsistent data)
names[200] = "bob"     # Lowercase variation (inconsistent data)

# 3. Age (some outliers)
ages = np.random.normal(35, 10, num_rows).astype(int)
ages[100] = 150  # Intentional outlier
ages[300] = -5   # Negative age (corrupted data)

# 4. Salary (some missing values)
salaries = np.random.normal(70000, 15000, num_rows).astype(int).astype(float)
salaries[::30] = np.nan  # Introduce missing values

# 5. Join Date (mixed formats)
join_dates = pd.date_range(start="2015-01-01", periods=num_rows, freq="D").strftime('%Y-%m-%d').tolist()
join_dates[25] = "01/15/2018"  # Different format (mm/dd/yyyy)
join_dates[400] = "March 5, 2019"  # Different format (text month)

# 6. Department (categorical with typos)
departments = [random.choice(["HR", "IT", "Finance", "Marketing", "IT ", "Finanace"]) for _ in range(num_rows)]

# 7. Education Level (Ordinal categorical)
education_levels = [random.choice(["High School", "Bachelor's", "Master's", "PhD"]) for _ in range(num_rows)]

# 8. Work Experience (some missing values)
work_experience = np.random.randint(0, 30, num_rows).astype(float)
work_experience[::40] = np.nan  # Introduce missing values

# 9. Performance Score (imbalanced target variable)
performance_scores = np.random.choice([1, 2, 3, 4, 5], num_rows, p=[0.05, 0.15, 0.35, 0.35, 0.1])

# 10. Bonus (has currency symbols)
bonuses = np.random.randint(1000, 10000, num_rows).astype(str)
bonuses = bonuses.astype(object)  # Ensure it's an object type for string operations
for i in range(0, num_rows, 20):
    bonuses[i] = "$" + bonuses[i]
for i in range(0, num_rows, 50):
    bonuses[i] = "€" + bonuses[i]

# 11. Remote Work (Boolean categorical variable)
remote_work = np.random.choice([0, 1], num_rows, p=[0.7, 0.3])

# 12. Job Satisfaction Score (skewed distribution for log transformation)
job_satisfaction = np.random.exponential(scale=3, size=num_rows)

# 13. Customer Satisfaction Rating (values between 1-10, for scaling)
customer_satisfaction = np.random.randint(1, 11, num_rows)

# Creating DataFrame
df = pd.DataFrame({
    "ID": ids,
    "Name": names,
    "Age": ages,
    "Salary": salaries,
    "Join_Date": join_dates,
    "Department": departments,
    "Education_Level": education_levels,
    "Work_Experience": work_experience,
    "Performance_Score": performance_scores,
    "Bonus": bonuses,
    "Remote_Work": remote_work,
    "Job_Satisfaction": job_satisfaction,
    "Customer_Satisfaction": customer_satisfaction
})

# Save to CSV
df.to_csv("./data/data_wrangling_dataset.csv", index=False)

print("Dataset saved as data_wrangling_dataset.csv")

Dataset saved as data_wrangling_dataset.csv
