# Customer Segmentation/Clustering

## Import Required Libraries

In [7]:
import warnings
# Install a blanket 'ignore' filter: every warning category is suppressed for
# the remainder of the kernel session, including deprecation notices from the
# libraries imported below.
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from datetime import datetime

## Data Loading and Preprocessing

In [8]:
# Read the raw customer and transaction tables from the sibling datasets
# folder; both frames are loaded with pandas' default CSV parsing.
customers_df, transactions_df = (
    pd.read_csv(csv_path)
    for csv_path in ("../datasets/Customers.csv", "../datasets/Transactions.csv")
)

In [9]:
# Preview the first five customer rows.
customers_df.head(n=5)

Unnamed: 0,CustomerID,CustomerName,Region,SignupDate
0,C0001,Lawrence Carroll,South America,2022-07-10
1,C0002,Elizabeth Lutz,Asia,2022-02-13
2,C0003,Michael Rivera,South America,2024-03-07
3,C0004,Kathleen Rodriguez,South America,2022-10-09
4,C0005,Laura Weber,Asia,2022-08-15


In [10]:
# Preview the first five transaction rows.
transactions_df.head(n=5)

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68
2,T00166,C0127,P067,2024-04-25 7:38:55,1,300.68,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68


## Feature Engineering

In [14]:
# Aggregate transactions into one row per customer: number of transactions,
# total and mean spend, and first/last purchase timestamps.
#
# TransactionDate is parsed to datetime *before* aggregating. The raw column
# holds strings whose hour field is not zero-padded (e.g. "2024-04-25 7:38:55"),
# so a string min/max orders same-day timestamps lexicographically
# ("7:38:55" sorts after "12:00:00") and can pick the wrong first/last purchase.
# `.assign` works on a copy, so `transactions_df` itself is left untouched.
transaction_summary = (
    transactions_df
    .assign(TransactionDate=pd.to_datetime(transactions_df['TransactionDate']))
    .groupby('CustomerID')
    .agg({
        'TransactionID': 'count',
        'TotalValue': ['sum', 'mean'],
        'TransactionDate': ['min', 'max'],
    })
    .reset_index()
)

# The result still carries a two-level (column, aggregation) header here.
transaction_summary.head()

Unnamed: 0_level_0,CustomerID,TransactionID,TotalValue,TotalValue,TransactionDate,TransactionDate
Unnamed: 0_level_1,Unnamed: 1_level_1,count,sum,mean,min,max
0,C0001,5,3354.52,670.904,2024-01-19 3:12:55,2024-11-02 17:04:16
1,C0002,4,1862.74,465.685,2024-02-28 7:44:21,2024-12-03 1:41:41
2,C0003,4,2725.38,681.345,2024-02-18 2:50:37,2024-08-24 18:54:04
3,C0004,8,5354.88,669.36,2024-02-28 10:16:35,2024-12-23 14:13:52
4,C0005,3,2034.24,678.08,2024-03-15 4:08:59,2024-11-04 0:30:22


In [15]:
# Replace the two-level (column, aggregation) header with flat names, given in
# the same positional order as the aggregated frame's columns.
# NOTE(review): 'AngTransactionValue' is presumably a typo for
# 'AvgTransactionValue'; it is kept unchanged because later cells may reference
# this exact label -- confirm before renaming.
flat_column_names = [
    'CustomerID',            # group key
    'TotalTransactions',     # TransactionID -> count
    'TotalSpend',            # TotalValue -> sum
    'AngTransactionValue',   # TotalValue -> mean
    'FirstPurchaseDate',     # TransactionDate -> min
    'LastPurchaseDate',      # TransactionDate -> max
]
transaction_summary.columns = flat_column_names

In [16]:
# Derive how long each customer has been active: the span between their first
# and last purchase. Both purchase columns are coerced to datetime first so the
# subtraction yields a timedelta.
for purchase_col in ('FirstPurchaseDate', 'LastPurchaseDate'):
    transaction_summary[purchase_col] = pd.to_datetime(transaction_summary[purchase_col])

transaction_summary['ActiveCustomerDuration'] = (
    transaction_summary['LastPurchaseDate'] - transaction_summary['FirstPurchaseDate']
)

In [17]:
# Merge customers with transaction summary