User Segmentation Using RFM and debt/income/credit_score

In [46]:
import pandas as pd
import numpy as np

# Load the merged table: one row per transaction, with user_* and card_* attributes joined on.
df=pd.read_csv('merged-df.csv')

In [47]:
df.columns

Index(['transaction_id', 'date', 'user_id', 'card_id', 'amount', 'use_chip',
       'merchant_id', 'merchant_city', 'merchant_state', 'zip', 'mcc',
       'description', 'category', 'user_current_age', 'user_retirement_age',
       'user_birth_year', 'user_birth_month', 'user_gender', 'user_address',
       'user_latitude', 'user_longitude', 'user_per_capita_income',
       'user_yearly_income', 'user_total_debt', 'user_credit_score',
       'user_num_credit_cards', 'card_card_brand', 'card_card_type',
       'card_has_chip', 'card_cvv', 'card_expires', 'card_num_cards_issued',
       'card_credit_limit', 'card_acct_open_date',
       'card_year_pin_last_changed', 'card_card_on_dark_web'],
      dtype='object')

In [48]:
# Narrow the frame to the transaction-level columns that feed the per-user aggregation.
df_rfm = df[['transaction_id', 'date', 'user_id', 'amount','merchant_city', 'user_gender']]

df_rfm.head()

# There are no missing values, so I will start with the RFM analysis.
# NOTE(review): head() alone does not verify this - confirm with df_rfm.isna().sum().

Unnamed: 0,transaction_id,date,user_id,amount,merchant_city,user_gender
0,7475327,2010-01-01 00:01:00,1556,-77.0,Beulah,Female
1,7475328,2010-01-01 00:02:00,561,14.57,Bettendorf,Male
2,7475329,2010-01-01 00:02:00,1129,80.0,Vista,Male
3,7475331,2010-01-01 00:05:00,430,200.0,Crown Point,Female
4,7475332,2010-01-01 00:06:00,848,46.41,Harwood,Male


In [49]:
def most_fre_city(x):
    """Return the most frequent value in ``x``, or NaN when there is none.

    Parameters
    ----------
    x : pandas.Series
        Values belonging to a single group (used below on each user's
        ``merchant_city`` entries).

    Returns
    -------
    object
        The modal value. ``Series.mode`` returns the modes in sorted order, so
        on a tie the smallest one is returned. ``np.nan`` when ``x`` is empty
        or contains only missing values (``mode`` drops NaN by default).
    """
    # Compute the mode once; the previous version evaluated x.mode() twice per group.
    modes = x.mode()
    # .iloc is positional, so this does not depend on the index labels of `modes`.
    return modes.iloc[0] if not modes.empty else np.nan


# Per-user summary built with named aggregation: each output column is declared
# next to the input column and the function that produce it, so no positional
# renaming of a MultiIndex is needed afterwards.
# NOTE(review): 'most_frequebt_location' looks like a typo of
# 'most_frequent_location'; the name is kept unchanged here.
customer_agg = df_rfm.groupby('user_id').agg(
    total_spent=('amount', 'sum'),
    avg_spent=('amount', 'mean'),
    max_spent=('amount', 'max'),
    txn_count=('amount', 'count'),
    first_txn_date=('date', 'min'),
    last_txn_date=('date', 'max'),
    most_frequebt_location=('merchant_city', most_fre_city),
    gender=('user_gender', 'first'),
)




In [50]:
# Feature Engineering
# Promote user_id from the groupby index to a regular column. The guard keeps
# this cell idempotent: calling reset_index(inplace=True) a second time would
# insert a stray 'index' column.
if 'user_id' not in customer_agg.columns:
    customer_agg = customer_agg.reset_index()

customer_agg['first_txn_date'] = pd.to_datetime(customer_agg['first_txn_date'])
customer_agg['last_txn_date']  = pd.to_datetime(customer_agg['last_txn_date'])

# Recency: days since the user's last transaction, measured from one day after
# the latest transaction in the whole dataset (so the minimum recency is 1).
reference_date = pd.to_datetime(df['date']).max() + pd.Timedelta(days=1)

customer_agg['recency_days'] = (reference_date - customer_agg['last_txn_date']).dt.days

# Tenure: span between first and last transaction, counting both end days,
# floored at 1 so it is always safe to divide by.
customer_agg['tenure_days'] = (
    (customer_agg['last_txn_date'] - customer_agg['first_txn_date']).dt.days + 1
).clip(lower=1)

# Average transactions per day of tenure (the column is per day, not per month)
customer_agg['txn_per_day'] = customer_agg['txn_count'] / customer_agg['tenure_days']



In [51]:
rfm = customer_agg[['user_id', 'recency_days', 'txn_count', 'total_spent']].copy()
rfm.columns = ['user_id', 'Recency', 'Frequency', 'Monetary']

# Quartile scores where 4 = best and 1 = worst.
# rank(method='first') breaks ties so qcut always receives unique bin edges.
# Recency: fewer days since the last transaction is better, so the lowest
# quartile gets 4 (labels run 4 -> 1).
rfm['R'] = pd.qcut(rfm['Recency'].rank(method='first'), 4, labels=[4,3,2,1])
# Frequency and Monetary: higher is better, so the lowest quartile gets 1
# (labels run 1 -> 4). These previously reused the reversed labels of Recency,
# which scored the least active / lowest spending users as the best.
rfm['F'] = pd.qcut(rfm['Frequency'].rank(method='first'), 4, labels=[1,2,3,4])
rfm['M'] = pd.qcut(rfm['Monetary'].rank(method='first'), 4, labels=[1,2,3,4])

# Concatenated three-digit segment code, e.g. '444' = best on all three axes.
rfm['RFM Score'] = rfm['R'].astype(str) + rfm['F'].astype(str) + rfm['M'].astype(str)

In [52]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Baseline: cluster on the raw (standardised) R, F and M values.
X = rfm[['Recency','Frequency','Monetary']]
X_scaled = StandardScaler().fit_transform(X)

# n_init is set explicitly: omitting it raises a FutureWarning, and the default
# changes from 10 to 'auto' in newer scikit-learn releases, which would alter
# the clusters. 10 is the value this cell was implicitly using.
kmeans = KMeans(n_clusters=4, random_state=42, n_init=10)
rfm['Cluster'] = kmeans.fit_predict(X_scaled)

print(silhouette_score(X_scaled, rfm['Cluster']))

rfm['Cluster'].value_counts()

0.4624517884611449


  super()._check_params_vs_input(X, default_n_init=10)


Cluster
2    737
0    401
3     74
1      7
Name: count, dtype: int64

This leads to heavily imbalanced clusters, so try log-scaling the features.

In [53]:
# Add a log(1 + x) version of each RFM feature.
# NOTE(review): log1p returns NaN for values <= -1, and Monetary is a sum of
# amounts that can be negative - confirm no user has a total at or below -1.
for raw_col in ('Recency', 'Frequency', 'Monetary'):
    rfm[raw_col + '_log'] = np.log1p(rfm[raw_col])

# Standardise the log features so each contributes equally to the distances.
log_features = ['Recency_log', 'Frequency_log', 'Monetary_log']
X = rfm[log_features]
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Fit, then read the labels off the fitted estimator.
kmeans = KMeans(n_clusters=4, random_state=42, n_init='auto')
rfm['Cluster'] = kmeans.fit(X_scaled).labels_

print(silhouette_score(X_scaled, rfm['Cluster']))
rfm['Cluster'].value_counts()

0.38088465067588834


Cluster
2    581
0    344
1    281
3     13
Name: count, dtype: int64

The root cause might be that most users transact daily, so recency barely separates them. I will drop recency to see whether that gives a better clustering.

In [54]:
# Same pipeline as before, restricted to the Frequency and Monetary log features.
fm_features = ['Frequency_log', 'Monetary_log']
X = rfm[fm_features]
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Fit, then read the labels off the fitted estimator; this overwrites the
# previous content of rfm['Cluster'].
kmeans = KMeans(n_clusters=4, random_state=42, n_init='auto')
rfm['Cluster'] = kmeans.fit(X_scaled).labels_

print(silhouette_score(X_scaled, rfm['Cluster']))
rfm['Cluster'].value_counts()

0.36744156311506404


Cluster
2    487
3    357
0    282
1     93
Name: count, dtype: int64

It looks better. Next, I will try using debt, income, and credit score for clustering.

We **cannot** use DTI as there is no monthly debt payment

In [55]:
# Preview the user financial columns. This is still one row per transaction,
# not one row per user.
df_debt = df[['user_id', 'user_yearly_income', 'user_total_debt', 'user_credit_score']]

df_debt.head()

Unnamed: 0,user_id,user_yearly_income,user_total_debt,user_credit_score
0,1556,48277.0,110153.0,740
1,561,36853.0,112139.0,834
2,1129,34449.0,36540.0,686
3,430,53350.0,128676.0,685
4,848,68362.0,96182.0,711


In [56]:
# Collapse to one row per user, keeping the first observed income and debt.
# NOTE(review): user_credit_score is not part of this clustering even though
# the notes above mention it - confirm whether it should be included.
debt_cols = ['user_yearly_income', 'user_total_debt']
df_debt = df.groupby('user_id')[debt_cols].first().reset_index()

# log(1 + x) versions of both features.
df_debt = df_debt.assign(
    income_log=np.log1p(df_debt['user_yearly_income']),
    debt_log=np.log1p(df_debt['user_total_debt']),
)

X = df_debt[['income_log', 'debt_log']]
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Five clusters here (the RFM runs used four).
kmeans = KMeans(n_clusters=5, random_state=42, n_init='auto')
df_debt['Cluster'] = kmeans.fit(X_scaled).labels_

print(silhouette_score(X_scaled, df_debt['Cluster']))
print(df_debt['Cluster'].value_counts())

0.38808493810488764
Cluster
3    581
0    563
2     64
4      7
1      4
Name: count, dtype: int64


Based on the silhouette scores and cluster distributions, we will use frequency + monetary (the clustering with recency dropped) for the user segmentation.

In [57]:
# Export one cluster label per user.
# NOTE: rfm['Cluster'] holds whichever assignment ran last; when the notebook is
# executed top to bottom that is the KMeans fit on Frequency_log + Monetary_log.
clustering_df = rfm[['user_id','Cluster']].copy()

# index=False keeps the positional index out of the CSV.
clustering_df.to_csv('clustering_df.csv', index=False)