In [7]:
import pandas as pd
import numpy as np
import os

# -------------------------------
# Step 1: Load the dataset
# -------------------------------
# The raw file is tab-separated even though it carries a .csv extension,
# so the separator is passed explicitly.
raw_csv_path = r'E:\GITHUB\Elevate Labs Data Analyst Internship\Day_01_Data_Cleaning\raw_dataset\marketing_campaign.csv'
df = pd.read_csv(raw_csv_path, sep='\t', encoding='utf-8')

# Normalise the header: remove whitespace padding around every column name
# so the later membership checks match on the bare names.
df.columns = df.columns.str.strip()

# -------------------------------
# Step 2: Initial inspection
# -------------------------------
# Preview the first rows to confirm the columns were split as expected.
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()
# sep="\n" starts the per-column counts on their own line without the extra
# space that print's default separator would insert before the first row.
print("Missing values:", df.isnull().sum(), sep="\n")
print("Duplicate rows:", df.duplicated().sum())

# -------------------------------
# Step 3: Handle missing values
# -------------------------------
# Impute gaps in Income with the column's mean (computed over the
# non-missing entries, since Series.mean skips NaN by default).
if 'Income' in df:
    income_mean = df['Income'].mean()
    df['Income'] = df['Income'].fillna(value=income_mean)

# -------------------------------
# Step 4: Standardize Education column
# -------------------------------
# Relabel the '2n Cycle' category; every other value is left as it is.
if 'Education' in df:
    df['Education'] = df['Education'].replace({'2n Cycle': 'Undergraduate'})

# -------------------------------
# Step 5: Remove duplicates
# -------------------------------
# Keep the first occurrence of each fully identical row and discard repeats.
df = df[~df.duplicated()]

# -------------------------------
# Step 6: Convert Dt_Customer to datetime
# -------------------------------
# Dates are parsed as day-month-year. Values that do not match the format are
# coerced to NaT rather than raising, but the number of values lost this way
# is reported so that the data loss is not silent.
if 'Dt_Customer' in df.columns:
    raw_dates = df['Dt_Customer']
    parsed_dates = pd.to_datetime(raw_dates, format='%d-%m-%Y', errors='coerce')

    # Count only entries that had a value before parsing and are NaT after it;
    # values that were already missing are not parsing failures.
    unparsed_count = int((parsed_dates.isna() & raw_dates.notna()).sum())
    if unparsed_count:
        print(
            f"Warning: {unparsed_count} Dt_Customer value(s) did not match "
            "the DD-MM-YYYY format and were set to NaT."
        )

    df['Dt_Customer'] = parsed_dates

# -------------------------------
# Step 7: Save cleaned dataset
# -------------------------------
# Ensure the destination folder exists (no error if it is already there),
# then write the cleaned frame without the index column.
output_path = r'E:\GITHUB\Elevate Labs Data Analyst Internship\Day_01_Data_Cleaning\cleaned_dataset'
os.makedirs(output_path, exist_ok=True)

cleaned_csv_path = os.path.join(output_path, 'marketing_campaign_cleaned.csv')
df.to_csv(cleaned_csv_path, index=False)

print("✅ Dataset cleaned and saved successfully.")


     ID  Year_Birth   Education Marital_Status   Income  Kidhome  Teenhome  \
0  5524        1957  Graduation         Single  58138.0        0         0   
1  2174        1954  Graduation         Single  46344.0        1         1   
2  4141        1965  Graduation       Together  71613.0        0         0   
3  6182        1984  Graduation       Together  26646.0        1         0   
4  5324        1981         PhD        Married  58293.0        1         0   

  Dt_Customer  Recency  MntWines  ...  NumWebVisitsMonth  AcceptedCmp3  \
0  04-09-2012       58       635  ...                  7             0   
1  08-03-2014       38        11  ...                  5             0   
2  21-08-2013       26       426  ...                  4             0   
3  10-02-2014       26        11  ...                  6             0   
4  19-01-2014       94       173  ...                  5             0   

   AcceptedCmp4  AcceptedCmp5  AcceptedCmp1  AcceptedCmp2  Complain  \
0             0