In [1]:
# Step 1: Import necessary libraries
import pandas as pd

# Step 2: Load the dataset (assuming tab-separated format)
file_path = '/content/marketing_campaign.csv'
df = pd.read_csv(file_path, sep='\t')

# Step 3: View the first few rows and basic info
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() would append a stray "None" line.
df.info()

# Step 4: Strip whitespace and fix inconsistent casing in categorical columns.
# This runs before de-duplication so that rows differing only by stray
# spaces or letter case are recognised as duplicates of each other.
# NOTE(review): str.title() also rewrites mixed-case labels such as 'PhD'
# to 'Phd' - confirm that is the intended canonical spelling.
df['Education'] = df['Education'].str.strip().str.title()
df['Marital_Status'] = df['Marital_Status'].str.strip().str.title()

# Step 5: Convert 'Dt_Customer' column to datetime format (day-month-year)
df['Dt_Customer'] = pd.to_datetime(df['Dt_Customer'], format='%d-%m-%Y')

# Step 6: Remove duplicate rows (after normalisation, see Step 4)
df = df.drop_duplicates()

# Step 7: Handle missing values (fill nulls in Income column with median).
# The median is taken after de-duplication so repeated rows do not weigh
# more than once in the imputed value.
df['Income'] = df['Income'].fillna(df['Income'].median())

# Step 8: Recheck the data
df.info()
print(df.head())

# Optional: Save the cleaned dataset to a new CSV (row index is not written)
df.to_csv('cleaned_dataset.csv', index=False)


     ID  Year_Birth   Education Marital_Status   Income  Kidhome  Teenhome  \
0  5524        1957  Graduation         Single  58138.0        0         0   
1  2174        1954  Graduation         Single  46344.0        1         1   
2  4141        1965  Graduation       Together  71613.0        0         0   
3  6182        1984  Graduation       Together  26646.0        1         0   
4  5324        1981         PhD        Married  58293.0        1         0   

  Dt_Customer  Recency  MntWines  ...  NumWebVisitsMonth  AcceptedCmp3  \
0  04-09-2012       58       635  ...                  7             0   
1  08-03-2014       38        11  ...                  5             0   
2  21-08-2013       26       426  ...                  4             0   
3  10-02-2014       26        11  ...                  6             0   
4  19-01-2014       94       173  ...                  5             0   

   AcceptedCmp4  AcceptedCmp5  AcceptedCmp1  AcceptedCmp2  Complain  \
0             0