In [6]:
import pandas as pd

def clean_marketing_data(df):
    """Return a cleaned copy of a raw marketing-campaign DataFrame.

    Column names are normalized (stripped, lowercased, spaces replaced by
    underscores) *before* any column-specific step runs, so the cleaning
    works whether the incoming header is ``ID``/``Income``/... or already
    ``id``/``income``/... . Previously the per-column steps looked up the
    original-cased names and were silently skipped for lowercase headers.

    Steps, each applied only if the relevant column exists:
      1. drop the ``id`` column
      2. drop duplicate rows
      3. fill missing ``income`` with the column median
      4. strip + title-case ``education``; strip + lowercase ``marital_status``
      5. parse ``dt_customer`` as day-first dates (unparseable -> NaT)
      6. convert ``year_birth`` to nullable ``Int64`` (unparseable -> <NA>)

    Args:
        df: Raw DataFrame as loaded from the CSV. It is not modified.

    Returns:
        A new DataFrame with the steps above applied.
    """
    cleaned = df.copy()

    # Normalize headers first so every later lookup uses one spelling.
    cleaned.columns = [col.strip().lower().replace(' ', '_') for col in cleaned.columns]

    # 1. Drop ID column (not useful for analysis). Doing this before
    #    de-duplication lets rows that differ only by ID count as duplicates.
    if 'id' in cleaned.columns:
        cleaned.drop(columns='id', inplace=True)

    # 2. Remove duplicate rows
    cleaned.drop_duplicates(inplace=True)

    # 3. Handle missing values: fill missing income with the median
    if 'income' in cleaned.columns:
        cleaned['income'] = cleaned['income'].fillna(cleaned['income'].median())

    # 4. Standardize categorical columns
    if 'education' in cleaned.columns:
        cleaned['education'] = cleaned['education'].str.strip().str.title()

    if 'marital_status' in cleaned.columns:
        cleaned['marital_status'] = cleaned['marital_status'].str.strip().str.lower()

    # 5. Convert dt_customer to datetime format (day comes first in the text)
    if 'dt_customer' in cleaned.columns:
        cleaned['dt_customer'] = pd.to_datetime(
            cleaned['dt_customer'], errors='coerce', dayfirst=True
        )

    # 6. Correct data types: nullable integer keeps coerced failures as <NA>
    if 'year_birth' in cleaned.columns:
        cleaned['year_birth'] = pd.to_numeric(
            cleaned['year_birth'], errors='coerce'
        ).astype('Int64')

    return cleaned


# Notebook cells and direct script runs both execute as "__main__"; the guard
# only keeps the file I/O from firing when the cleaning function is imported.
if __name__ == "__main__":
    # Load dataset
    df = pd.read_csv('marketing_campaign.csv', sep='\t')

    # Initial checks
    print("Original shape:", df.shape)
    print("\nMissing values:\n", df.isnull().sum())
    print("\nData types:\n", df.dtypes)

    df = clean_marketing_data(df)

    # DEBUG: print column names after renaming
    print("\nColumns after renaming:\n", df.columns.tolist())

    if 'year_birth' not in df.columns:
        print("\nColumn 'year_birth' not found after renaming. Skipping type conversion.")

    # Final check
    print("\nCleaned shape:", df.shape)
    print("\nRemaining missing values:\n", df.isnull().sum())

    # Save cleaned dataset
    df.to_csv('cleaned_marketing_campaign.csv', index=False)
    print("\n✅ Cleaned dataset saved as 'cleaned_marketing_campaign.csv'")


Original shape: (2240, 29)

Missing values:
 id                      0
year_birth              0
education               0
marital_status          0
income                 24
kidhome                 0
teenhome                0
dt_customer             0
recency                 0
mntwines                0
mntfruits               0
mntmeatproducts         0
mntfishproducts         0
mntsweetproducts        0
mntgoldprods            0
numdealspurchases       0
numwebpurchases         0
numcatalogpurchases     0
numstorepurchases       0
numwebvisitsmonth       0
acceptedcmp3            0
acceptedcmp4            0
acceptedcmp5            0
acceptedcmp1            0
acceptedcmp2            0
complain                0
z_costcontact           0
z_revenue               0
response                0
dtype: int64

Data types:
 id                       int64
year_birth               int64
education               object
marital_status          object
income                 float64
kidhome            