In [None]:
# loading necessary libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import statsmodels.api as sm
from scipy.stats import linregress

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler


- The comment `# CHANGETHIS` in the code cells indicates places that you can modify, often by replacing a column name or a number, to run a different analysis
- **Hints** in the text cells are possible directions you can explore for your report

# Upload data
Make sure to upload `marketing-data.csv` to the `Files` panel on the left

In [None]:
# Read the marketing dataset from the current working directory
# (the CSV must have been uploaded beforehand) and preview its first five rows
data = pd.read_csv(filepath_or_buffer='./marketing-data.csv')
data.head(5)

In [None]:
# Show the names of all columns available in the loaded dataset
data.columns

# Data Cleaning



Some data cleaning is performed below so your data is ready for analysis

### **Data description**

Here are some of the columns in the data:

- AcceptedCmp1-5: Whether the customer accepted the offer in each of the five campaigns.
- Response: Whether the customer accepted the offer in the last campaign.
- Complain: Whether the customer filed a complaint in the last 2 years.
- DtCustomer: Date when the customer enrolled with the company.
- Income: Customer's yearly household income.
- Kidhome, Teenhome: Number of small children and teenagers in the household.
- MntWines, MntMeatProducts, MntFishProducts, MntFruits, MntSweetProducts, MntGoldProds: Amount spent on different product categories in the last 2 years.
- NumDealsPurchases: Number of purchases made with a discount.
- NumCatalogPurchases: Number of purchases made using catalogs.
- NumStorePurchases: Number of purchases made in physical stores.
- NumWebPurchases: Number of purchases made through the website.
- NumWebVisitsMonth: Number of visits to the website in the last month.
- Recency: Number of days since the customer's last purchase.

### **Data types**

In [None]:
# Summarize column names, non-null counts and dtypes.
# DataFrame.info() writes the summary itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line to the output.
data.info()

**Hint**: in your report, please describe which columns are categorical and which are numerical


### Exploratory Data Analysis

In [None]:
# CHANGETHIS: explore different columns
# The title and x-axis label are derived from this name, so changing it here
# keeps the plot correctly labelled.
dist_column = 'Age'

# Histogram (20 bins) with a kernel-density overlay to show the shape of the distribution
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(data[dist_column], bins=20, kde=True, ax=ax)
ax.set_title(f'Distribution of {dist_column}')
ax.set_xlabel(dist_column)
ax.set_ylabel('Frequency')
plt.show()

**Hints**:
- In your report, please report the distribution of selected columns (please provide reasons for selecting the columns).
- Please comment on the distributions

You can also plot the distribution for a specific group of customers, for example, the `Age` of customers with kids.

In [None]:
# CHANGETHIS: replace this condition to get other subgroup of customers
customers_with_kids = data.loc[data['Kidhome'] > 0]
# CHANGETHIS: short description of the subgroup selected above (shown in the plot title,
# so this figure can be told apart from the whole-population distribution)
subgroup_label = 'customers with kids'
# CHANGETHIS: column whose distribution is plotted for the subgroup
subgroup_column = 'Age'

# Histogram (20 bins) with a kernel-density overlay, restricted to the subgroup
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(customers_with_kids[subgroup_column], bins=20, kde=True, ax=ax)
ax.set_title(f'Distribution of {subgroup_column} ({subgroup_label})')
ax.set_xlabel(subgroup_column)
ax.set_ylabel('Frequency')
plt.show()

**Hints:**

- Are there interesting distributions of other variables in your data? For example, do the `Income` distributions differ between customers with different educational levels?
- Please comment on the distributions

# Linear Regression

Below is an example analysis exploring the linear relationship between two variables, e.g. `Age` and `Income`

In [None]:
# Choose the pair of numerical columns to relate (e.g., 'Age' and 'Income')
# CHANGETHIS: try other pairs of columns
x_column = 'Age'  # Independent variable
y_column = 'Income'  # Dependent variable

# Keep only the two columns of interest and drop rows where either value is missing
df = data[[x_column, y_column]].dropna()

# Scatterplot: look at the raw relationship before fitting a line
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(df[x_column], df[y_column], alpha=0.7)
ax.set_title(f'Scatterplot of {x_column} vs {y_column}')
ax.set_xlabel(x_column)
ax.set_ylabel(y_column)
ax.grid()
plt.show()

# Step 1: Ordinary least squares fit with statsmodels
y = df[y_column]
X = sm.add_constant(df[x_column])  # adds the intercept column to the predictor
model = sm.OLS(y, X).fit()
print(model.summary())  # full regression table

# Step 2: Same regression via scipy's linregress, which returns the slope,
# intercept, correlation coefficient, p-value and standard error
slope, intercept, r_value, p_value, std_err = linregress(df[x_column], df[y_column])

# One value per line; r_value is the correlation coefficient, so it is squared for R-squared
print(
    f"Slope: {slope}",
    f"Intercept: {intercept}",
    f"R-squared: {r_value**2}",
    f"P-value: {p_value}",
    f"Standard Error: {std_err}",
    sep="\n",
)


**Hints:**
- Can you write the linear equation linking `Age` and `Income` based on the `Slope` and `Intercept` values above?
- What is the hypothesis test for in this case? Specifically, what are the null and alternative hypotheses?
- What conclusion can we draw from the `P-value` above?
- The relationship between `Age` and `Income` does not mean much from a business analytics point of view. Can you repeat the linear regression analysis on other pairs of variables? Can you interpret such relationships?

# User segmentation with RFM framework

We can segment the customers using the RFM framework:
- R: Recency - how recent is the last purchase by a customer
- F: Frequency - how often does a customer make a purchase
- M: Monetary - how much does a customer pay in total

## Derive RFM features
In our data, the `Recency` value already exists; we need to derive the `Frequency` and `Monetary` values from the existing columns.

In [None]:
# Frequency: Sum of all purchase channels
# CHANGETHIS: are all purchase channels included in the list below?
frequency_columns = ['NumCatalogPurchases', 'NumDealsPurchases']
data['Frequency'] = data[frequency_columns].sum(axis='columns')

# Monetary: Sum of all monetary columns
# CHANGETHIS: are all product categories included in the list below?
monetary_columns = [
    'MntWines',
    'MntFishProducts',
    'MntMeatProducts',
    'MntSweetProducts',
    'MntGoldProds',
]
data['Monetary'] = data[monetary_columns].sum(axis='columns')

# Keep only the three RFM columns as the input for clustering
rfm_data = data.loc[:, ['Recency', 'Frequency', 'Monetary']]

## Perform K-means clustering
The code below performs K-means with 2 clusters (segments). You may need to try different numbers of clusters before deciding on an ideal number of clusters.

Note that you can also try using an automated method (e.g. the "elbow" method in the k-means tutorial given during our class) to help determine the ideal number of clusters. Otherwise, you can proceed by trial and error: visualize/analyze whether the clusters make sense and decide on an optimal number of clusters.

In [None]:
# CHANGETHIS: try different number of clusters
NUM_CLUSTER = 2
# Fixed seed: K-means starts from random centroids, so without it the cluster
# labels (and possibly the memberships) can change every time the cell is run,
# which would invalidate any interpretation written in the report.
RANDOM_STATE = 42

# Standardize the RFM features so that no single factor dominates the
# distance computation just because it is measured on a larger scale
scaler = StandardScaler()
rfm_scaled = scaler.fit_transform(rfm_data)

# n_init is set explicitly because its default differs between scikit-learn
# releases; pinning it keeps results comparable across environments.
kmeans = KMeans(
    n_clusters=NUM_CLUSTER,
    max_iter=50,
    n_init=10,
    random_state=RANDOM_STATE,
)
kmeans.fit(rfm_scaled)


## Analyze the clusters
You should analyze the clusters based on each of the 3 R, F, M factors.


In [None]:
# Attach the cluster label assigned by K-means to every customer row
data['Cluster_Id'] = kmeans.labels_

# CHANGETHIS: repeat the analysis for the 'Frequency' and 'Monetary' factors
rfm_factor = 'Recency'

# Boxplot of the chosen factor per cluster, to compare the segments side by side
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(x='Cluster_Id', y=rfm_factor, data=data, ax=ax)
ax.set_title(f'{rfm_factor} by cluster')
plt.show()

**Hints:**
- Can you repeat the visualization and analysis for `Frequency` and `Monetary` aspects?
- What can you say about the clusters based on the differences in R, F, M?
- What different business strategies (marketing, customer support, etc.) would you apply to the clusters?
