In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore") 

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# Read the tab-delimited campaign file into a DataFrame.
data = pd.read_csv("marketing_campaign.csv", delimiter="\t")




## Data Preprocessing

In [5]:
# Dataset overview: dimensions, duplicate rows, missing cells and column-type counts.
overview = [
    ('Number of variables: ', data.shape[1]),
    ('Number of observations: ', data.shape[0]),
    ('Number of duplicates: ', data.duplicated().sum()),
    ('Missing cells: ', data.isnull().sum().sum()),
    ('numeric values: ', len(data.select_dtypes(include=np.number).columns)),
    ('categories values: ', len(data.select_dtypes(include='O').columns)),
]
for label, value in overview:
    print(label, value)

Number of variables:  29
Number of observations:  2240
Number of duplicates:  0
Missing cells:  24
numeric values:  26
categories values:  3


In [7]:
# Print each column's name, non-null count and dtype for the loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2240 entries, 0 to 2239
Data columns (total 29 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ID                   2240 non-null   int64  
 1   Year_Birth           2240 non-null   int64  
 2   Education            2240 non-null   object 
 3   Marital_Status       2240 non-null   object 
 4   Income               2216 non-null   float64
 5   Kidhome              2240 non-null   int64  
 6   Teenhome             2240 non-null   int64  
 7   Dt_Customer          2240 non-null   object 
 8   Recency              2240 non-null   int64  
 9   MntWines             2240 non-null   int64  
 10  MntFruits            2240 non-null   int64  
 11  MntMeatProducts      2240 non-null   int64  
 12  MntFishProducts      2240 non-null   int64  
 13  MntSweetProducts     2240 non-null   int64  
 14  MntGoldProds         2240 non-null   int64  
 15  NumDealsPurchases    2240 non-null   i

In [15]:
# Discard every row that contains at least one missing value.
data = data.dropna(axis=0, how="any")

In [17]:

# Convert Dt_Customer from day-month-year strings to datetime values.
parsed_dates = pd.to_datetime(data["Dt_Customer"], format='%d-%m-%Y')
data["Dt_Customer"] = parsed_dates

## Feature Engineering

In [19]:
# Feature engineering
# Customer age in years, measured against a fixed reference year
# (not the current date, so re-running the notebook gives the same ages).
REFERENCE_YEAR = 2024
data["Age"] = REFERENCE_YEAR - data["Year_Birth"]

## Remove Outliers

In [21]:
# Drop outliers by capping Age and Income (both bounds are exclusive).
MAX_AGE = 90
MAX_INCOME = 600000

# Build one mask for both caps; this keeps exactly the rows the two
# sequential filters kept.
within_caps = (data["Age"] < MAX_AGE) & (data["Income"] < MAX_INCOME)

# .copy() makes the filtered frame independent of the frame it was sliced from.
# Later cells assign new columns to `data`; without the copy those assignments
# raise SettingWithCopyWarning on pandas < 3, which the notebook's global
# warnings filter silently hides.
data = data[within_caps].copy()
print("The total number of data-points after removing the outliers are:", len(data))

The total number of data-points after removing the outliers are: 2212


## RFM Model

In [117]:

# Preprocess data

# Fixed snapshot date against which Dt_Customer is measured.
REFERENCE_DATE = pd.to_datetime('2024-01-01')

# NOTE(review): this overwrites the `Recency` column that was loaded from the CSV
# with the number of days between Dt_Customer and REFERENCE_DATE. Confirm that
# this is the intended recency measure before relying on the R scores below.
data['Recency'] = (REFERENCE_DATE - data['Dt_Customer']).dt.days

# Frequency input: sum of the four Num*Purchases columns.
data['TotalPurchases'] = (data['NumWebPurchases'] + data['NumCatalogPurchases'] +
                          data['NumStorePurchases'] + data['NumDealsPurchases'])

# Monetary input: sum of the six Mnt* columns.
data['Monetary'] = (data['MntWines'] + data['MntFruits'] + data['MntMeatProducts'] +
                    data['MntFishProducts'] + data['MntSweetProducts'] + data['MntGoldProds'])

# Calculate RFM metrics
# Take an explicit copy: the score columns added below are then written to an
# independent frame instead of a column-subset of `data`, which on pandas < 3
# raises SettingWithCopyWarning (hidden by the notebook's global warnings filter).
rfm = data[['ID', 'Recency', 'TotalPurchases', 'Monetary']].copy()

# Assign RFM scores as quintiles (1-5).
# R labels are reversed, so the smallest Recency values receive a score of 5.
rfm['R_Score'] = pd.qcut(rfm['Recency'], 5, labels=[5, 4, 3, 2, 1])
# Ranking with method='first' gives every row a distinct value, so qcut cannot
# fail on duplicate bin edges when many customers share a purchase count.
rfm['F_Score'] = pd.qcut(rfm['TotalPurchases'].rank(method='first'), 5, labels=[1, 2, 3, 4, 5])
rfm['M_Score'] = pd.qcut(rfm['Monetary'], 5, labels=[1, 2, 3, 4, 5])

# Combine RFM scores into a three-character string in R, F, M order (e.g. '535').
rfm['RFM_Score'] = rfm['R_Score'].astype(str) + rfm['F_Score'].astype(str) + rfm['M_Score'].astype(str)

# Segment customers
def segment_customer(df):
    """Map one customer's RFM score string to a segment label.

    Parameters
    ----------
    df : mapping or pandas.Series
        A single row (despite the parameter name) exposing an ``'RFM_Score'``
        entry: a string whose characters are the R, F and M scores, in that order.

    Returns
    -------
    str
        ``'Best Customers'`` for ``'555'``, ``'Lost Customers'`` for ``'111'``;
        otherwise the label of the first position (R, then F, then M) that
        holds a ``'5'``, or ``'Others'`` when no position does.
    """
    score = df['RFM_Score']

    # Exact matches take priority over the per-position rules.
    if score == '555':
        return 'Best Customers'
    if score == '111':
        return 'Lost Customers'

    # The first position holding a '5' decides the label.
    position_labels = ('Loyal Customers', 'Frequent Customers', 'Big Spenders')
    for position, label in enumerate(position_labels):
        if score[position] == '5':
            return label
    return 'Others'

# Label every customer row with its segment.
rfm['Segment'] = rfm.apply(segment_customer, axis=1)

# Analyze segments: mean of each RFM metric per segment, plus the number of
# Monetary values in each segment.
summary_spec = {
    'Recency': 'mean',
    'TotalPurchases': 'mean',
    'Monetary': ['mean', 'count'],
}
segment_analysis = rfm.groupby('Segment').agg(summary_spec).reset_index()

print(segment_analysis)


              Segment      Recency TotalPurchases     Monetary      
                              mean           mean         mean count
0      Best Customers  3546.562500      25.812500  1681.187500    16
1        Big Spenders  3911.248804      18.746411  1606.636364   209
2  Frequent Customers  3925.633838      25.863636  1166.537879   396
3      Lost Customers  4098.825000       4.825000    31.750000    40
4     Loyal Customers  3544.147541      11.997658   435.941452   427
5              Others  3877.794484      11.611210   294.684164  1124
