In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from matplotlib.lines import Line2D
import seaborn as sns
import random

usecols = ["id", "date", "amount", "card_id", "client_id", "use_chip", "mcc", "errors"]

# Sampling configuration: keep roughly SAMPLE_ROWS rows of the CSV.
SAMPLE_ROWS = 200000
TOTAL_ROWS = 13295276  # presumably the row count of transactions_data.csv -- TODO confirm
SAMPLE_SEED = 42

# A dedicated, seeded generator makes the sample identical on every
# "Restart & Run All" without touching the global `random` state.
sample_rng = random.Random(SAMPLE_SEED)
keep_fraction = SAMPLE_ROWS / TOTAL_ROWS

# Row 0 (the header) is always kept; every other row is kept with
# probability `keep_fraction`.
trans_data = pd.read_csv(
    "transactions_data.csv",
    skiprows=lambda i: i > 0 and sample_rng.random() > keep_fraction,
    low_memory=False,
    usecols=usecols,
)

# Strip the literal dollar sign. `regex=False` is required: as a regular
# expression '$' is the end-of-string anchor and would remove nothing, which
# makes the float conversion fail on pandas versions where regex defaults to True.
trans_data['amount'] = (
    trans_data['amount'].astype(str).str.replace('$', '', regex=False).astype(float)
)

# Independent working copy so later column edits never leak back into trans_data.
td = trans_data.copy()
td['is_refund'] = td['amount'] < 0   # negative amounts are flagged as refunds
td["amount"] = td["amount"].abs()    # keep only the magnitude from here on

# Grouping Client IDs by Amount Ranges
client_total_amount = td.groupby('client_id')['amount'].sum().reset_index()
# 30th / 70th percentiles of per-client totals; used as tier thresholds.
p30 = client_total_amount['amount'].quantile(0.3)
p70 = client_total_amount['amount'].quantile(0.7)

def amount_group(val):
    """Return the spend tier label for a client's total amount.

    Uses the module-level thresholds ``p30`` and ``p70``:
    values below ``p30`` are 'XLow', values below ``p70`` are 'Medium',
    and everything else (including values for which both comparisons
    are false, such as NaN) is 'High'.
    """
    return 'XLow' if val < p30 else ('Medium' if val < p70 else 'High')

# Tag every client's total with its spend tier, then order the table by the
# tier label in descending alphabetical order.
client_total_amount['spend_group'] = client_total_amount['amount'].map(amount_group)
client_total_amount = client_total_amount.sort_values('spend_group', ascending=False)

# Propagate each client's tier onto their individual transactions
# (lookup keyed by client_id).
spend_group_by_client = client_total_amount.set_index('client_id')['spend_group']
td['spend_group'] = td['client_id'].map(spend_group_by_client)

# Risk weight per payment channel; channels not listed here map to NaN.
use_chip_risk = {
    'Swipe Transaction': 1,
    'Online Transaction': 0.5,
    'Chip Transaction': 0.2,
}
td['use_chip_risk'] = td['use_chip'].map(use_chip_risk)

# 1 when the transaction carries any error text, 0 otherwise.
td['errors_en'] = td['errors'].notna().astype("int")

# Order transactions by ascending id.
td = td.sort_values('id')

def normalize(series):
    """Min-max scale ``series`` into the range [0, 1].

    Parameters
    ----------
    series : pandas.Series
        Numeric values to rescale.

    Returns
    -------
    pandas.Series
        ``(series - min) / (max - min)`` on the same index. When the range
        is zero (every non-missing value is identical) the result is 0.0
        for those values instead of the NaN a 0/0 division would give, so a
        constant feature contributes nothing rather than poisoning sums
        built from it. Missing values stay missing.
    """
    lowest = series.min()
    span = series.max() - lowest
    if span == 0:
        # Constant series: series - lowest is already all zeros (NaN where
        # the input was NaN); just make sure the dtype is float.
        return (series - lowest).astype(float)
    return (series - lowest) / span

# Build the date-derived columns on a copy of the transactions.
tdT = td.copy()

# Parse the timestamp once; values that cannot be parsed become NaT.
parsed_date = pd.to_datetime(tdT['date'], errors="coerce")

# Formatting the two parts separately avoids splitting a combined string,
# which fails with a KeyError when every date is NaT.
tdT['date'] = parsed_date.dt.strftime("%d-%m-%y")
tdT['time'] = parsed_date.dt.strftime("%H:%M")
tdT['hour'] = parsed_date.dt.hour

# Time-of-day risk: 1 for hours before 05:00, 0.3 for hours after 22
# (i.e. hour 23), 0 otherwise. Missing hours compare False and get 0.
tdT['time_r'] = np.select(
    [tdT['hour'] < 5, tdT['hour'] > 22],
    [1.0, 0.3],
    default=0.0,
)

# Keep only rows with a strictly positive amount. The explicit copy makes td
# an independent frame, so the column assignments below do not raise
# SettingWithCopyWarning.
td = tdT[tdT['amount'] > 0].copy()
td["amount_norm"] = np.log(td["amount"] + 1)

# Contribution of each component to the total score.
weights = {
    'amount': 0.25,
    'use_chip_risk': 0.3,
    'errors_en': 0.25,
    'time_risk': 0.2,
}

# Calculate risk score (higher = riskier)
td['amount_risk'] = normalize(td['amount_norm']) * weights['amount']
td['u_chip_risk'] = normalize(td['use_chip_risk']) * weights['use_chip_risk']
td['errors_risk'] = normalize(td['errors_en']) * weights['errors_en']
# time_r is used as-is (not passed through normalize).
td['time_risk'] = td['time_r'] * weights['time_risk']

td['total_risk'] = (
    td['amount_risk']
    + td['u_chip_risk']
    + td['errors_risk']
    + td['time_risk']
)

In [None]:
# Marker area for each time-of-day risk tier (keys are the time_r values).
time_risk_s = {
    1: 100,
    0.3: 50,
    0: 20
}
td['time_risk_s'] = td['time_r'].map(time_risk_s)

x = td['amount_norm'].to_numpy()
y = td['total_risk'].to_numpy()
colors = td['use_chip_risk'].to_numpy()
sizes = td['time_risk_s'].to_numpy()

# Position: log amount vs. total risk; colour: payment-channel risk;
# marker size: time-of-day risk tier.
plt.scatter(x, y, c=colors, cmap="Wistia", s=sizes)
plt.colorbar(label='Payment-channel risk (use_chip_risk)')
plt.xlabel('log(amount + 1)')
plt.ylabel('Total risk score')
plt.title('Total risk vs. transaction amount (marker size = time-of-day risk)')
plt.show()


## 2nd Visualisation: K-Means clusters of transactions by amount and risk

In [None]:
# Cluster on the two plotted dimensions. Rows with a missing value in either
# column cannot be clustered and are left out of the fit.
features = td[['amount_norm', 'total_risk']].dropna()
kmeans = KMeans(n_clusters=3, random_state=0)
cluster_labels = kmeans.fit_predict(features)

# Align labels by index: assigning the raw array would raise a length-mismatch
# error whenever dropna() removed rows. Rows that were dropped get NaN.
td['cluster'] = pd.Series(cluster_labels, index=features.index)

# Plot only the rows that were actually clustered, with integer labels.
clustered = td.loc[features.index].assign(cluster=cluster_labels)

# Scatter plot with cluster color-coding
plt.figure(figsize=(10, 6))
sns.scatterplot(
    data=clustered,
    x='amount_norm',
    y='total_risk',
    hue='cluster',
    palette='Set1',
    style='cluster',
    size='time_risk_s',  # optional: vary size by time risk
    sizes=(20, 200),
    alpha=0.7
)

plt.title('Transaction Clusters by Amount and Risk')
plt.xlabel('Normalized Amount (log scale)')
plt.ylabel('Total Risk Score')
plt.legend(title='Cluster')
plt.grid(True)
plt.tight_layout()
plt.show()
