# Steps for Data Cleaning and Preprocessing

### Step 1: Importing necessary libraries

In [1]:
import pandas as pd
import numpy as np

### Step 2: Loading the dataset

In [2]:
# The raw export is tab-delimited rather than comma-delimited, so the
# separator is passed explicitly.
raw_csv_path = "marketing_campaign.csv"
df = pd.read_csv(raw_csv_path, sep="\t")

### Step 3: Previewing the dataset

In [3]:
# Report the (rows, columns) dimensions of the freshly loaded frame,
# then show its first five records.
initial_shape = df.shape
print("Initial dataset shape:", initial_shape)
first_rows = df.head()
print(first_rows)

Initial dataset shape: (2240, 29)
     ID  Year_Birth   Education Marital_Status   Income  Kidhome  Teenhome  \
0  5524        1957  Graduation         Single  58138.0        0         0   
1  2174        1954  Graduation         Single  46344.0        1         1   
2  4141        1965  Graduation       Together  71613.0        0         0   
3  6182        1984  Graduation       Together  26646.0        1         0   
4  5324        1981         PhD        Married  58293.0        1         0   

  Dt_Customer  Recency  MntWines  ...  NumWebVisitsMonth  AcceptedCmp3  \
0  04-09-2012       58       635  ...                  7             0   
1  08-03-2014       38        11  ...                  5             0   
2  21-08-2013       26       426  ...                  4             0   
3  10-02-2014       26        11  ...                  6             0   
4  19-01-2014       94       173  ...                  5             0   

   AcceptedCmp4  AcceptedCmp5  AcceptedCmp1  Accepte

### Step 4: Checking for missing values

In [4]:
# Count the null entries in every column and report them.
null_counts = df.isnull().sum()
print("\nMissing values in each column:")
print(null_counts)

# Remove every row that contains at least one null value.
# inplace=True modifies df itself instead of returning a new frame.
df.dropna(axis=0, how='any', inplace=True)
print("Shape after dropping missing values:", df.shape)


Missing values in each column:
ID                      0
Year_Birth              0
Education               0
Marital_Status          0
Income                 24
Kidhome                 0
Teenhome                0
Dt_Customer             0
Recency                 0
MntWines                0
MntFruits               0
MntMeatProducts         0
MntFishProducts         0
MntSweetProducts        0
MntGoldProds            0
NumDealsPurchases       0
NumWebPurchases         0
NumCatalogPurchases     0
NumStorePurchases       0
NumWebVisitsMonth       0
AcceptedCmp3            0
AcceptedCmp4            0
AcceptedCmp5            0
AcceptedCmp1            0
AcceptedCmp2            0
Complain                0
Z_CostContact           0
Z_Revenue               0
Response                0
dtype: int64
Shape after dropping missing values: (2216, 29)


### Step 5: Removing duplicates if any

In [5]:
# Remove rows that exactly repeat an earlier row (all columns must match),
# keeping the first occurrence. inplace=True modifies df itself.
# NOTE(review): the 'ID' column takes part in the comparison, so rows that
# differ only in ID are not treated as duplicates - confirm this is intended.
df.drop_duplicates(keep='first', inplace=True)
print("Shape after removing duplicates:", df.shape)

Shape after removing duplicates: (2216, 29)


### Step 6: Standardizing text values

In [6]:
# Normalise the Education labels: lower-case them and trim any
# surrounding whitespace, then list the distinct labels that remain.
education_clean = df['Education'].str.lower().str.strip()
df['Education'] = education_clean
print("Unique values in Education column:", df['Education'].unique())

Unique values in Education column: ['graduation' 'phd' 'master' 'basic' '2n cycle']


### Step 7: Converting date columns to datetime format

In [7]:
# Parse 'Dt_Customer' into datetime values. The raw strings are written as
# day-month-year (e.g. "21-08-2013"), so the format is given explicitly:
# `dayfirst=True` on its own is only a preference, not a strict rule, and can
# let a value that does not fit day-first be read month-first without any
# error. With an explicit format, a non-conforming value raises instead of
# being silently mis-parsed.
df['Dt_Customer'] = pd.to_datetime(df['Dt_Customer'], format="%d-%m-%Y")
print("Dt_Customer column type after conversion:", df['Dt_Customer'].dtype)

Dt_Customer column type after conversion: datetime64[ns]


### Step 8: Renaming columns to be clean and uniform

In [8]:
# Normalise the column labels: trim surrounding whitespace first, then
# lower-case and turn any remaining inner spaces into underscores.
# Stripping has to happen before the replacement; otherwise leading/trailing
# spaces are converted to underscores and the strip has nothing left to
# remove (e.g. " Income " would become "_income_" instead of "income").
df.columns = (
    df.columns
    .str.strip()
    .str.lower()
    .str.replace(" ", "_", regex=False)  # literal space, not a regex pattern
)
print("Updated column names:\n", df.columns)

Updated column names:
 Index(['id', 'year_birth', 'education', 'marital_status', 'income', 'kidhome',
       'teenhome', 'dt_customer', 'recency', 'mntwines', 'mntfruits',
       'mntmeatproducts', 'mntfishproducts', 'mntsweetproducts',
       'mntgoldprods', 'numdealspurchases', 'numwebpurchases',
       'numcatalogpurchases', 'numstorepurchases', 'numwebvisitsmonth',
       'acceptedcmp3', 'acceptedcmp4', 'acceptedcmp5', 'acceptedcmp1',
       'acceptedcmp2', 'complain', 'z_costcontact', 'z_revenue', 'response'],
      dtype='object')


### Step 9: Fixing data types

In [9]:
# Force the numeric columns to their intended dtypes.
# errors='coerce' turns any income value that cannot be parsed as a number
# into NaN instead of raising, so this step can reintroduce missing values.
df['income'] = pd.to_numeric(df['income'], errors='coerce')
# Cast both household count columns to plain integers.
for count_col in ('kidhome', 'teenhome'):
    df[count_col] = df[count_col].astype(int)

### Step 10: Saving the cleaned dataset

In [10]:
# Write the cleaned frame to disk; index=False keeps the row index out of
# the output file.
cleaned_csv_path = "cleaned_customer_personality.csv"
df.to_csv(cleaned_csv_path, index=False)
print("Cleaned dataset saved as 'cleaned_customer_personality.csv'")

Cleaned dataset saved as 'cleaned_customer_personality.csv'
