In [18]:
import pandas as pd

raw_path = "/content/marketing_campaign.csv"

# The export is parsed by hand: each physical line has any outer double quotes
# removed and is then split on tabs.
# NOTE(review): errors="ignore" silently drops undecodable bytes — confirm that
# losing such characters is acceptable for this dataset.
with open(raw_path, "r", encoding="utf-8", errors="ignore") as f:
    # Trim only spaces and line terminators here, never tabs: the tab is the
    # field delimiter, and a bare .strip() would remove leading/trailing tabs,
    # shifting a row left when its first field is empty and dropping empty
    # trailing fields.
    lines = [line.strip(" \r\n").strip('"') for line in f]

# Split into fields. Blank lines (e.g. a trailing newline at end of file) carry
# no record and would otherwise become a bogus row that is later imputed.
rows = [
    [field.strip() for field in line.split("\t")]
    for line in lines
    if line.strip()
]

if not rows:
    raise ValueError(f"{raw_path} contains no header row")

# Build DataFrame: the first parsed row is the header, the rest are records.
header = rows[0]
data = rows[1:]
df = pd.DataFrame(data, columns=header)


Standardize column headers

In [19]:
# Normalise header names: trim surrounding whitespace, lower-case, then turn
# spaces and hyphens into underscores.
cleaned_names = df.columns.str.strip().str.lower()
for separator in (" ", "-"):
    cleaned_names = cleaned_names.str.replace(separator, "_", regex=False)
df.columns = cleaned_names


Convert Data types

In [20]:
# Parse day-month-year strings; values that do not match the format become NaT.
customer_since = df["dt_customer"]
df["dt_customer"] = pd.to_datetime(
    customer_since,
    errors="coerce",
    format="%d-%m-%Y",
)

In [21]:
# Every column except the two text fields and the date is coerced to a number.
# Thousands separators and dollar signs are removed first; anything that still
# fails to parse becomes NaN.
non_numeric_cols = ("education", "marital_status", "dt_customer")
numeric_cols = [name for name in df.columns if name not in non_numeric_cols]

for name in numeric_cols:
    as_text = df[name].astype(str)
    without_symbols = (
        as_text
        .str.replace(",", "", regex=False)
        .str.replace("$", "", regex=False)
    )
    df[name] = pd.to_numeric(without_symbols, errors="coerce")

Handling missing values

In [22]:
# Impute missing values column by column:
#   * numeric columns  -> column median
#   * datetime columns -> left as-is (NaT is kept)
#   * all other columns -> most frequent value (mode)
for col in df.columns:
    column = df[col]
    is_numeric = pd.api.types.is_numeric_dtype(column)
    is_datetime = pd.api.types.is_datetime64_any_dtype(column)

    if not (is_numeric or is_datetime):
        # A blank text field is an (empty) string, not NaN, so isna() alone
        # would never see it. Mark blank / whitespace-only values as missing.
        blank = column.astype(str).str.strip() == ""
        column = column.mask(blank)

    if not column.isna().any():
        continue

    if is_numeric:
        df[col] = column.fillna(column.median())
    elif is_datetime:
        # optional: df[col] = df[col].fillna(df[col].mode()[0]) or leave
        continue
    else:
        modes = column.mode()
        if modes.empty:
            # Column is entirely missing: there is no value to impute with.
            # Keep the blanks as NaN so the final null check reports them.
            df[col] = column
            continue
        df[col] = column.fillna(modes.iloc[0])

Remove duplicates

In [23]:
# Drop rows that repeat an earlier row in every column, keeping the first
# occurrence (the pandas defaults, spelled out).
df = df.drop_duplicates(subset=None, keep="first")

Standardize text fields

In [24]:
# Trim and lower-case the two categorical text columns so that values differing
# only in case or surrounding whitespace compare equal.
for text_col in ("education", "marital_status"):
    df[text_col] = df[text_col].str.strip().str.lower()


Final check

In [25]:
# Sanity report, one item per line: frame dimensions, per-column null counts,
# and the number of remaining duplicate rows.
report = (df.shape, df.isna().sum(), df.duplicated().sum())
print(*report, sep="\n")

(2240, 29)
id                     0
year_birth             0
education              0
marital_status         0
income                 0
kidhome                0
teenhome               0
dt_customer            0
recency                0
mntwines               0
mntfruits              0
mntmeatproducts        0
mntfishproducts        0
mntsweetproducts       0
mntgoldprods           0
numdealspurchases      0
numwebpurchases        0
numcatalogpurchases    0
numstorepurchases      0
numwebvisitsmonth      0
acceptedcmp3           0
acceptedcmp4           0
acceptedcmp5           0
acceptedcmp1           0
acceptedcmp2           0
complain               0
z_costcontact          0
z_revenue              0
response               0
dtype: int64
0


Save cleaned dataset

In [30]:
# Persist the cleaned frame; index=False leaves the row index out of the file.
clean_path = "/content/marketing_campaign_clean.csv"
df.to_csv(clean_path, index=False)
