# Customer Segmentation with RFM & K-Means

In [None]:
import pandas as pd

from datetime import datetime

# Display figures inline in Jupyter notebook
import matplotlib.pyplot as plt
import seaborn as sn
# Use seaborn style defaults and set the default figure size
sn.set(rc={'figure.figsize':(11, 4)})
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the raw event log. Later cells read its `timestamp`, `visitorid`,
# `transactionid` and `itemid` columns.
events=pd.read_csv('events.csv')

# Data Preparation

In [None]:
# Convert the epoch-millisecond `timestamp` column to datetimes in a single
# vectorized call. Unlike a per-row datetime.fromtimestamp(), the result does
# not depend on the local timezone of the machine running the notebook: the
# values are timezone-naive and expressed in UTC.
events['timestamp'] = pd.to_datetime(events['timestamp'], unit='ms')

In [None]:
# Preview the events table after the timestamp conversion.
events

In [None]:
# Inspect the dtype of every column of the events table.
events.dtypes

In [None]:
# Work on an independent copy so the loaded `events` frame stays untouched.
df = events.copy(deep=True)

In [None]:
# Count the missing values in each column.
df.isna().sum()

In [None]:
# Keep only the rows in which every column is populated, then confirm that
# no missing values remain.
df = df.dropna()
df.isna().sum()

In [None]:
# Dimensions of `df` as (rows, columns).
df.shape

In [None]:
# Transposed summary statistics, including extra tail percentiles.
summary_percentiles = [0.01, 0.05, 0.25, 0.50, 0.75, 0.80, 0.90, 0.95, 0.99]
df.describe(percentiles=summary_percentiles).T

In [None]:
# Latest timestamp in the data. Series.max() is vectorized, whereas the
# builtin max() walks the column element by element in Python.
df["timestamp"].max()

# Recency, Frequency & Monetary value calculation


We’ll calculate the three key factors of RFM analysis (recency, frequency, and monetary value).
<br>
<b>Recency</b>: How recently customers made their purchase.<br>
<b>Frequency</b>: For simplicity, we’ll count the number of times each customer made a purchase.
<br>
<b>Monetary</b>: How much money they spent in total.
<br>
We are going to calculate these three key factors by grouping them by customers and taking <b>“2015/09/18”</b> as our reference end date since this is the last transaction date listed in our dataset.

In [None]:
# Recency   = Overall latest invoice date - individual customer's last invoice date
# Frequency = count of invoice no. of transaction(s)
# Monetary = Sum of Total amount for each customer

# Set 2015/09/18 as the overall last transaction date. This is to calculate recency in days.


In [None]:
# Inspect the frame that the RFM metrics are computed from.
df

In [None]:
# Build the per-visitor RFM table: one row per `visitorid` with the columns
# "Recency", "Frequency" and "Monetary".

# The `dt` alias is no longer needed here; the import is kept so the alias
# stays available to any other cell that relies on it.
import datetime as dt

# --- Recency metric ---------------------------------------------------------
# Reference date for recency. It is taken from the data (the latest event
# timestamp) instead of a hard-coded calendar date, so recency is measured
# relative to the end of the observation window; a later fixed date would
# only add the same constant number of days to every visitor.
today_date = df["timestamp"].max()
last_seen = df.groupby("visitorid")["timestamp"].max()
# Whole days between the reference date and each visitor's latest event.
recency_df = (today_date - last_seen).dt.days.rename("Recency")

# --- Frequency metric -------------------------------------------------------
# Number of distinct transactions per visitor.
freq_df = df.groupby("visitorid")["transactionid"].nunique().to_frame("Frequency")

# --- Monetary metric --------------------------------------------------------
# NOTE: this should be the amount spent, but the data has no price field.
# The sum of `itemid` values is only a stand-in and carries no monetary
# meaning; replace it once a price column is available.
monetary_df = df.groupby("visitorid")["itemid"].sum().to_frame("Monetary")

# Combine the three metrics side by side, aligned on `visitorid`.
rfm = pd.concat([recency_df, freq_df, monetary_df], axis=1)


In [None]:
# Score every visitor on each RFM metric and map the scores to a named segment.

# `df` is a second name for the same object as `rfm`, so the score columns
# added below are visible through both names.
df = rfm

# Quintile scores from 1 to 5. The recency labels are reversed so that the
# smallest recency values (most recent visitors) receive a 5.
# NOTE: pd.qcut raises ValueError when the quantile bin edges are not unique.
# Frequency is ranked with method="first" before binning, which breaks ties
# and guarantees unique edges; Recency and Monetary are binned on raw values.
df["RecencyScore"] = pd.qcut(df["Recency"], 5, labels=[5, 4, 3, 2, 1])
df["FrequencyScore"] = pd.qcut(df["Frequency"].rank(method="first"), 5, labels=[1, 2, 3, 4, 5])
df["MonetaryScore"] = pd.qcut(df["Monetary"], 5, labels=[1, 2, 3, 4, 5])

# String forms of the scores, concatenated digit by digit below.
recency_str = df["RecencyScore"].astype(str)
frequency_str = df["FrequencyScore"].astype(str)
df["RFM_SCORE"] = recency_str + frequency_str + df["MonetaryScore"].astype(str)

# Regex patterns over the two-character "<RecencyScore><FrequencyScore>"
# string; together they cover all 25 score combinations.
seg_map = {
        r'[1-2][1-2]': 'Hibernating',
        r'[1-2][3-4]': 'At Risk',
        r'[1-2]5': 'Can\'t Loose',
        r'3[1-2]': 'About to Sleep',
        r'33': 'Need Attention',
        r'[3-4][4-5]': 'Loyal Customers',
        r'41': 'Promising',
        r'51': 'New Customers',
        r'[4-5][2-3]': 'Potential Loyalists',
        r'5[4-5]': 'Champions'
}

# Segment depends on recency and frequency only; the monetary score is not used.
df["Segment"] = (recency_str + frequency_str).replace(seg_map, regex=True)
df.head()



In [None]:
# Keep only the raw metric columns (label slice from "Recency" through
# "Monetary", both inclusive); this is the frame that gets scaled and clustered.
rfm = df.loc[:,"Recency":"Monetary"]

In [None]:
# Show the segment assigned to the first few visitors. `df` is indexed by
# `visitorid` with one row per visitor, so grouping by visitor and "summing"
# the label strings would only return each label unchanged.
df[["Segment"]].head()


# Explore the RFM values


Once we have every visitor’s individual recency, frequency, and monetary value calculated, we’d like to see the distribution graph to understand the data better.

In [None]:
# One histogram (with a KDE curve) per RFM metric, drawn side by side.
fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(16, 8))
recency_ax, frequency_ax, monetary_ax = ax

# Recency: all visitors.
recency_values = rfm['Recency']
sn.histplot(recency_values, kde=True, ax=recency_ax)

# Frequency: only visitors with a frequency below 1000.
frequency_values = rfm.query('Frequency < 1000')['Frequency']
sn.histplot(frequency_values, kde=True, ax=frequency_ax)

# Monetary: only visitors with a monetary value below 10000.
monetary_values = rfm.query('Monetary < 10000')['Monetary']
sn.histplot(monetary_values, kde=True, ax=monetary_ax)

In [None]:
# Display the per-visitor frame with metrics, scores and segment.
df

# Clustering with the K-Means Algorithm


In [None]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Scale each RFM column to the [0, 1] range so that no single metric
# dominates the Euclidean distances used by K-Means.
sc = MinMaxScaler((0, 1))
# NOTE: from here on `df` is the scaled NumPy array, not the scored DataFrame.
df = sc.fit_transform(rfm)

# K-Means with 10 clusters. `random_state` fixes the centroid initialisation
# so reruns give the same clusters; `n_init` is set explicitly because its
# default differs between scikit-learn versions.
kmeans = KMeans(n_clusters=10, n_init=10, random_state=42)
k_fit = kmeans.fit(df)

In [None]:
# Number of clusters the fitted model was configured with.
k_fit.n_clusters

In [None]:
# Centroid coordinates of the fitted clusters, in the scaled feature space.
k_fit.cluster_centers_

In [None]:
# Cluster label assigned to each row of the scaled data.
k_fit.labels_ 

In [None]:
# First five rows of the scaled feature array.
df[0:5]

# Determining the Optimum Number of Clusters


In [None]:
# Elbow method: fit K-Means for a range of k and plot the inertia of each fit.

# NOTE: this two-cluster fit is not used by the sweep below; it is kept so
# that `k_fit` is bound exactly as before for any cell that reads it.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=42)
k_fit = kmeans.fit(df)

# Inertia (sum of squared distances of samples to their closest centroid)
# for every candidate k.
ssd = []

K = range(1, 30)

for k in K:
    # A fixed seed makes the curve reproducible from run to run.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42).fit(df)
    ssd.append(kmeans.inertia_)

plt.plot(K, ssd, "bx-")
# Label both axes with what they actually show.
plt.xlabel("Number of clusters (k)")
plt.ylabel("Sum of squared distances (inertia)")
plt.title("Elbow method for Optimum number of clusters")

In [None]:
# !pip install yellowbrick

In [None]:
# Locate the elbow automatically with Yellowbrick for k = 2 .. 19.
from yellowbrick.cluster import KElbowVisualizer

# Fixed seed so the detected elbow is reproducible.
kmeans = KMeans(n_init=10, random_state=42)
visu = KElbowVisualizer(kmeans, k=(2, 20))
visu.fit(df)
# `show()` replaces the deprecated `poof()`; the trailing semicolon suppresses
# the cell's text output in Jupyter.
visu.show();

In [None]:
# Final model with 6 clusters (presumably chosen from the elbow plots; confirm
# against their output). The seed is fixed so cluster labels are reproducible.
kmeans = KMeans(n_clusters=6, n_init=10, random_state=42).fit(df)
cluster = kmeans.labels_
# Pair every visitor id (the index of `rfm`) with its cluster label.
pd.DataFrame({"VisitorID": rfm.index, "cluster": cluster})