In [None]:
import pandas as pd
# Load the simulated 2024 transaction data into the working DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the notebook will not
# run elsewhere unless the file exists at this exact location.
df = pd.read_csv("C:/Users/sheth/Desktop/DSMP/simulated_transaction_2024.csv")

# Preview the first five rows.
df.head(5)

In [None]:
#Lift the pandas row limit so the whole frequency table is printed
#(this is a global display option and stays in effect for later cells)
pd.set_option('display.max_rows', None)

#Number of transactions recorded for each distinct 'Third Party Name'
transaction_counts = df['Third Party Name'].value_counts()
print(transaction_counts)

Feature Engineering as done in Dataset 1

In [None]:
#Category label -> third-party names assigned to it.
#Built once here instead of re-creating and scanning eight lists on every call.
#'Lloyds Pharmacy' and "Blackwell's" used to be listed a second time under
#'Health & Wellness' and 'Education & Books'; those entries could never match
#because the 'Retail Stores' check ran first, so they are kept under
#'Retail Stores' only. Results are unchanged.
#NOTE(review): matching is case-sensitive, so 'AMAZON' (Retail Stores) and
#'Amazon' (Entertainment & Media) land in different categories - confirm this is intended.
_CATEGORY_MEMBERS = {
    'Retail Stores': [
        'Sports Direct', 'Mamas & Papas', 'Head', 'Gap Kids', 'Millets', 'HMV',
        'Collector Cave', 'Etsy', 'Topshop', 'Fat Face', 'Matalan', 'Specsavers',
        'Barbiee Boutique', 'Revella', 'AMAZON', 'Blackwell\'s', 'Reebok',
        'JD Sports', 'North Face', 'Boots', 'Lloyds Pharmacy',
    ],
    'Entertainment & Media': [
        'Netflix', 'Disney', 'Blizzard', 'Mojang Studios', 'Amazon', 'Xbox',
        'Gamestation', 'A Cut Above', 'The Crown',
    ],
    'Grocery': ['Sainsbury', 'Tesco', 'Coop Local', 'Sainsbury Local'],
    'Food & Dining': [
        'JustEat', 'Deliveroo', 'Starbucks', 'Five Senses Art', 'Coffee #1',
        'Costa Coffee', 'Jollyes', 'Rose & Crown', 'Kings Arms', 'Frankie & Bennies',
    ],
    'Financial Services': ['Halifax', 'LBG', 'Premier Finance', 'CPA'],
    'Health & Wellness': [
        'PureGym', 'Grand Union BJJ', 'Selfridges', 'Mothercare', 'RugbyFields',
        'Sunny Care Nursery', 'Remedy plus care', 'Vision Express', 'Pets Corner',
        'University College Hospital',
    ],
    'Education & Books': [
        'Brilliant Brushes', 'Craftastic', 'A Yarn Story', 'Cass Art', 'Foyles',
        'Lavender Primary', 'Green Park Academy',
    ],
    'Home & Lifestyle': [
        'The Works', 'Loosely Fitted', 'Wool', 'Hobby Lobby', 'Hobbycraft',
        'Happy Days Home', 'Lavender Fields',
    ],
}

#Flattened name -> category lookup used by categorize()
_CATEGORY_BY_NAME = {
    member: label
    for label, members in _CATEGORY_MEMBERS.items()
    for member in members
}


def categorize(name):
    """Map a third-party name to its spending category.

    Parameters
    ----------
    name : object
        Value from the 'Third Party Name' column. Matching is exact and
        case-sensitive.

    Returns
    -------
    str
        One of the category labels in ``_CATEGORY_MEMBERS``, or
        'Other Services' for anything not listed (including missing values).
    """
    try:
        return _CATEGORY_BY_NAME.get(name, 'Other Services')
    except TypeError:
        # Unhashable input cannot be a known name; treat it like any other unknown.
        return 'Other Services'

#Derive the 'Category' feature by running every third-party name through categorize()
third_party_names = df['Third Party Name']
df['Category'] = third_party_names.map(categorize)


In [None]:
#'Amount' has to be numeric for the aggregations below;
#values that cannot be parsed are turned into NaN (errors='coerce')
df['Amount'] = pd.to_numeric(df['Amount'], errors='coerce')

#Mean, smallest and largest amount within each category
amount_by_category = df.groupby('Category')['Amount']
category_stats = amount_by_category.agg(['mean', 'min', 'max'])
print(category_stats)

#Function for the transaction with minimum spend
def get_min_transaction_details(group):
    """Return the rows of ``group`` whose 'Amount' equals the group's minimum.

    Ties are all kept. Only the 'Third Party Name' and 'Amount' columns are
    returned, with the original row index.
    """
    amounts = group['Amount']
    is_smallest = amounts == amounts.min()
    return group.loc[is_smallest, ['Third Party Name', 'Amount']]

#Function for the transaction with maximum spend
def get_max_transaction_details(group):
    """Return the rows of ``group`` whose 'Amount' equals the group's maximum.

    Ties are all kept. Only the 'Third Party Name' and 'Amount' columns are
    returned, with the original row index.
    """
    amounts = group['Amount']
    is_largest = amounts == amounts.max()
    return group.loc[is_largest, ['Third Party Name', 'Amount']]

#Transactions with min and max spends for each category.
#The two columns the helpers need are selected before apply(), so the grouping
#column is not part of the groups. This avoids the DeprecationWarning that
#pandas >= 2.2 emits when apply() operates on grouping columns; the helpers only
#return these two columns, so the resulting frames are the same as before.
detail_columns = ['Third Party Name', 'Amount']
transactions_by_category = df.groupby('Category')[detail_columns]
min_transaction_details = transactions_by_category.apply(get_min_transaction_details)
max_transaction_details = transactions_by_category.apply(get_max_transaction_details)

#Statistics and transaction details, one report section per category
for category in category_stats.index:
    stats = category_stats.loc[category]
    print(f"Category: {category}")
    print(f"Average Spend: {stats['mean']:.2f}")
    print(f"Minimum Spend: {stats['min']:.2f}")
    print("Details of Minimum Spend Transaction:")
    #The detail frames are indexed by (Category, original row); .loc picks one category
    print(min_transaction_details.loc[category])
    print(f"Maximum Spend: {stats['max']:.2f}")
    print("Details of Maximum Spend Transaction:")
    print(max_transaction_details.loc[category])
    print("\n")

In [None]:
import matplotlib.pyplot as plt

#Average, minimum, and maximum spends
category_stats = df.groupby('Category')['Amount'].agg(['mean', 'min', 'max']).round(2)

#One bar chart per statistic, drawn by a single loop instead of three pasted copies.
#Each entry: (column in category_stats, word used in the title/y-label, bar colour)
stat_plots = [
    ('mean', 'Mean', 'skyblue'),
    ('min', 'Minimum', 'lightgreen'),
    ('max', 'Maximum', 'salmon'),
]

for stat_column, stat_label, bar_color in stat_plots:
    plt.figure(figsize=(10, 6))
    plt.bar(category_stats.index, category_stats[stat_column], color=bar_color)
    plt.title(f'{stat_label} Transaction Amounts by Category')
    plt.xlabel('Category')
    plt.ylabel(f'{stat_label} Amount')
    #Rotate the category names so they do not overlap
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()


In [None]:
#Variance of 'Amount' within each category, rounded to three decimal places
category_variance = df.groupby('Category')['Amount'].var().round(3)
print(category_variance)

In [None]:
#Plotting variances by category
variance_ax = category_variance.plot.bar(color='skyblue', figsize=(10, 6))
variance_ax.set_title('Variance of Transaction Amounts by Category')
variance_ax.set_xlabel('Category')
variance_ax.set_ylabel('Variance')
#Rotate the category names so they do not overlap
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

###  RFM Analysis
 This analysis involves allocating individual scores based on:
 * (R) recency
 * (F) frequency
 * (M) Monetary  
 
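 We use quantiles to score each of the three aspects on a scale of 1 to 4. For Recency, a value at or below the 25th percentile is given a score of 4, at or below the 50th percentile a 3, and so on (fewer days since the last transaction is better). For Frequency and Monetary the scale runs the other way: a value at or below the 25th percentile scores 1 and a value above the 75th percentile scores 4. After calculating the individual scores, we sum them to get an RFM score; the customers with the best RFM scores are our target customers for any new services, projects, etc.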

In [None]:
#Parse the day/month/year strings in 'Date' into timestamps
df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%Y')

#Calculating the most recent date plus one day for snapshot_date
snapshot_date = df['Date'].max() + pd.DateOffset(days=1)


def _days_since_last_transaction(dates):
    """Whole days between snapshot_date and the latest date in ``dates``."""
    return (snapshot_date - dates.max()).days


#How each source column is reduced per account, and the RFM name it gets afterwards
rfm_aggregations = {
    'Date': _days_since_last_transaction,  # Recency: Days since last transaction
    'Account No': 'count',                 # Frequency: Total number of transactions
    'Amount': 'sum',                       # Monetary: Sum of transaction values
}
rfm_column_names = {'Date': 'Recency', 'Account No': 'Frequency', 'Amount': 'Monetary'}

rfm_scores = (
    df.groupby('Account No')
      .agg(rfm_aggregations)
      .rename(columns=rfm_column_names)
)

#25th / 50th / 75th percentile of each RFM column, used as scoring thresholds
quantiles = rfm_scores.quantile(q=[0.25, 0.5, 0.75])


def assign_rfm_score(x, c, quantiles):
    """Turn a raw RFM value into a 1-4 quartile score.

    Parameters
    ----------
    x : number
        Raw value to score.
    c : str
        Column of ``quantiles`` to compare against. 'Recency' is scored in
        reverse (lower value -> higher score); any other column is scored
        directly (higher value -> higher score).
    quantiles : pandas.DataFrame
        Thresholds indexed by 0.25, 0.5 and 0.75 with one column per metric.

    Returns
    -------
    int
        Score from 1 to 4.
    """
    thresholds = quantiles[c]

    # Quartile the value falls into: 1 = at or below the 25th percentile,
    # 4 = above the 75th percentile (also the fall-through for NaN).
    if x <= thresholds.loc[0.25]:
        quartile = 1
    elif x <= thresholds.loc[0.5]:
        quartile = 2
    elif x <= thresholds.loc[0.75]:
        quartile = 3
    else:
        quartile = 4

    # Lower recency is better, so its scale is flipped (quartile 1 -> score 4).
    if c == 'Recency':
        return 5 - quartile
    return quartile

#Score each RFM metric against its own quartile thresholds
score_column_for = {'Recency': 'R_Score', 'Frequency': 'F_Score', 'Monetary': 'M_Score'}
for metric, score_column in score_column_for.items():
    rfm_scores[score_column] = rfm_scores[metric].apply(assign_rfm_score, args=(metric, quantiles))

#Summing the scores to create the RFM Score
rfm_scores['RFM_Score'] = rfm_scores[['R_Score', 'F_Score', 'M_Score']].sum(axis=1)


Summing up the R, F, and M scores to get the best customers from the dataset

In [None]:
#Calculate Total RFM score (sum of the three individual scores)
score_columns = ['R_Score', 'F_Score', 'M_Score']
rfm_scores['RFM_Score'] = rfm_scores[score_columns].sum(axis=1)

#Sorting customers by RFM score in descending order to get the best customers
best_customers = rfm_scores.sort_values('RFM_Score', ascending=False)

#(rows, columns) of the ranked table
best_customers.shape


In [None]:
# Top 500 customers: the first 500 rows of the table already ranked by RFM score
top_500_customers = best_customers.iloc[:500]
top_500_customers

### Understanding the distribution of Actual number of accounts over the transactions, to identify:
* High-Value Customers: Accounts with the highest scores in recency, frequency, and monetary values (4 on all scores)
* Loyal Customers: Accounts with high frequency and monetary scores (3-4), regardless of their recency score
* Emerging Customers: Accounts with the highest recency score (4) but lower frequency and monetary scores (1-3)
* At Risk Customers: Accounts with low recency scores (1-2) but high frequency and monetary scores (3-4)
* Lost Customers:  Accounts with low scores across recency, frequency, and monetary (1-2)
* Need Attention Customers: Accounts with medium scores in all categories (2-3)
 

In [None]:
# Preview the first five accounts of the RFM table
rfm_scores.head()

In [None]:
# Number of distinct, non-missing account numbers in the transaction data
unique_accounts_count = df['Account No'].nunique()
print("Unique number of accounts:", unique_accounts_count)

In [None]:
#Defining a function to classify customers as per the set thresholds
def classify_customer(row):
    """Assign a customer segment from the R, F and M scores of one account.

    Rules are checked in order and the first match wins:

    1. 'High Value'         - R, F and M are all 4
    2. 'At Risk'            - R is 1-2 while F and M are both 3-4
    3. 'Loyal Customers'    - F and M are both 3-4 (any remaining R)
    4. 'Emerging Customers' - R is 4 while F and M are both 1-3
    5. 'Lost Customers'     - R, F and M are all 1-2
    6. 'Need Attention'     - everything else

    'At Risk' is checked before 'Loyal Customers' because its condition is a
    strict subset of the loyal one; in the opposite order it can never match.

    Parameters
    ----------
    row : mapping
        Anything indexable by 'R_Score', 'F_Score' and 'M_Score'
        (e.g. a DataFrame row).

    Returns
    -------
    str
        The segment label.
    """
    r, f, m = row['R_Score'], row['F_Score'], row['M_Score']
    high_frequency_and_monetary = f in (3, 4) and m in (3, 4)

    if r == 4 and f == 4 and m == 4:
        return 'High Value'
    if r in (1, 2) and high_frequency_and_monetary:
        return 'At Risk'
    if high_frequency_and_monetary:
        return 'Loyal Customers'
    if r == 4 and f in (1, 2, 3) and m in (1, 2, 3):
        return 'Emerging Customers'
    if r in (1, 2) and f in (1, 2) and m in (1, 2):
        return 'Lost Customers'
    return 'Need Attention'

#Label every account (one row of rfm_scores each) with its customer segment
rfm_scores['Customer Segment'] = rfm_scores.apply(classify_customer, axis='columns')

In [None]:
# Number of accounts assigned to each customer segment
rfm_scores['Customer Segment'].value_counts()

In [None]:

#Bar colour for each segment; the dict order is the left-to-right order of the bars
segment_colors = {
    'High Value': '#B3CDE3',
    'Loyal Customers': '#FED9A6',
    'Emerging Customers': '#CCEBC5',
    'At Risk': '#DECBE4',
    'Lost Customers': '#FDDAEC',
    'Need Attention': '#F2F2F2',
}

#Counts are taken from rfm_scores rather than typed in by hand, so the chart
#always matches the current classification. Segments with no accounts are skipped.
segment_counts = rfm_scores['Customer Segment'].value_counts()
customer_segments = [segment for segment in segment_colors if segment in segment_counts.index]
counts = [int(segment_counts[segment]) for segment in customer_segments]

plt.figure(figsize=(10, 6))
plt.bar(customer_segments, counts, color=[segment_colors[segment] for segment in customer_segments])
plt.title('Customer Segment Distribution')
plt.xlabel('Customer Segment')
plt.ylabel('Number of Customers')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


In [None]:
# Sanity check: how many accounts ended up without a segment label
rfm_scores['Customer Segment'].isna().sum()

Clustering: K-means clustering and DBSCAN can be applied to segment customers based on their transactional behavior. By clustering customers on transaction attributes such as Balance, Amount, and Category, we can identify groups of customers with similar spending patterns or preferences, e.g. high spenders, frequent shoppers, or users of specific services.

In [None]:
# Count of missing entries in every column of the transaction data
missing_values = df.isna().sum()
print(missing_values)

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.cluster import KMeans


#Group by 'Account No' and fill NaNs with the median of the same account's other transactions.
#fillna(transform('median')) only touches the missing cells; values that are already
#present are left as they are (including on rows whose 'Account No' is itself missing).
grouped = df.groupby('Account No')
df['Balance'] = df['Balance'].fillna(grouped['Balance'].transform('median'))
df['Amount'] = df['Amount'].fillna(grouped['Amount'].transform('median'))


#After this step there can still be nulls in balance and amount where an account has no
#other usable records, so the remaining NaNs are filled with the overall median.
#Assigning the result back (instead of fillna(..., inplace=True) on a column selection)
#keeps this working under pandas Copy-on-Write, where the in-place form leaves df unchanged.
df['Balance'] = df['Balance'].fillna(df['Balance'].median())
df['Amount'] = df['Amount'].fillna(df['Amount'].median())

#Standardization
scaler = StandardScaler()
df[['Balance_scaled', 'Amount_scaled']] = scaler.fit_transform(df[['Balance', 'Amount']])

#Categorical Encoding using One-Hot Encoding
encoder = OneHotEncoder()
encoded_cats = encoder.fit_transform(df[['Category']]).toarray()
cat_feature_names = encoder.get_feature_names_out(['Category'])

df_encoded = pd.DataFrame(encoded_cats, columns=cat_feature_names, index=df.index)
#Drop one-hot columns left over from an earlier run of this cell so that re-running
#it does not append a second, duplicate set of 'Category_*' columns
df = pd.concat([df.drop(columns=cat_feature_names, errors='ignore'), df_encoded], axis=1)

Plotting the optimum number of clusters needed with the Elbow Plot

In [None]:
import matplotlib.pyplot as plt

#Features used for clustering: the two scaled numeric columns plus the one-hot category columns
elbow_features = df[['Balance_scaled', 'Amount_scaled'] + list(cat_feature_names)]
candidate_ks = range(1, 11)  # Adjust the range as needed

#Calculating the sum of squared distances (inertia) for different numbers of clusters
sse = []
for k in candidate_ks:
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(elbow_features)
    sse.append(kmeans.inertia_)

#Plotting SSE to find the elbow
plt.figure(figsize=(10, 6))
plt.plot(candidate_ks, sse, marker='o')
plt.title('Elbow Method For Optimal k')
plt.xlabel('Number of clusters')
plt.ylabel('Sum of squared distances')
plt.show()

In [None]:
#Clustering seems to be accurate with clusters ranging from 3-5. Therefore moving with 4 clusters.
kmeans = KMeans(n_clusters=4, random_state=42)
kmeans.fit(df[['Balance_scaled', 'Amount_scaled'] + list(cat_feature_names)])

#Store the cluster label the fitted model assigned to each row
df['Cluster'] = kmeans.labels_

In [None]:
# Number of rows (transactions) assigned to each K-means cluster
df['Cluster'].value_counts()

## Principal Component Analysis (PCA) for Visualization


Using PCA to reduce the dimensionality of our data to 2 or 3 principal components helps visualize the clusters in a two-dimensional or three-dimensional space. This approach allows us to plot the clusters and see how they are distributed, giving a visual intuition of how distinct they are.

In [None]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

#Reducing dimensions with PCA
pca = PCA(n_components=2)  # for 2D visualization
principal_components = pca.fit_transform(df[['Balance_scaled', 'Amount_scaled'] + list(cat_feature_names)])

#Creating a DataFrame with PCA results.
#index=df.index keeps the rows labelled like df, so the 'Cluster' assignment below
#lines up row-for-row even if df does not have a default 0..n-1 index
#(the one-hot frame df_encoded is built the same way).
pca_df = pd.DataFrame(
    data=principal_components,
    columns=['principal component 1', 'principal component 2'],
    index=df.index,
)
pca_df['Cluster'] = df['Cluster']

#Plotting the clusters, one scatter series (and legend entry) per cluster label
plt.figure(figsize=(8, 6))
colors = ['r', 'g', 'b', 'c', 'm', 'y', 'k', 'orange', 'purple', 'brown']  # adjust based on number of clusters
for i in range(kmeans.n_clusters):
    cluster_data = pca_df[pca_df['Cluster'] == i]
    plt.scatter(cluster_data['principal component 1'], cluster_data['principal component 2'],
                color=colors[i], label=f'Cluster {i}', alpha=0.5)
plt.title('Cluster visualization using PCA')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend()
plt.show()


In [None]:
from sklearn.cluster import DBSCAN
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

#Applying DBSCAN to the standardized features.
#The scaled columns are used here (not raw 'Balance'/'Amount') so that eps=0.5 is
#measured on the same scale as the one-hot category columns.
dbscan = DBSCAN(eps=0.5, min_samples=5)  # These are common starting values for eps and min_samples
df['DBSCAN_Cluster'] = dbscan.fit_predict(df[['Balance_scaled', 'Amount_scaled'] + list(cat_feature_names)])

#Unique clusters we found. DBSCAN marks noise points with the label -1, which is
#not a cluster, so it is excluded from the count and reported separately.
dbscan_labels = df['DBSCAN_Cluster']
n_dbscan_clusters = dbscan_labels[dbscan_labels != -1].nunique()
n_noise_points = int((dbscan_labels == -1).sum())
print(f"Unique clusters found by DBSCAN: {n_dbscan_clusters}")
print(f"Points labelled as noise by DBSCAN: {n_noise_points}")


Time Series Forecasting (ARIMA, LSTM): Time series forecasting techniques such as ARIMA (AutoRegressive Integrated Moving Average) or LSTM (Long Short-Term Memory) neural networks can be used to predict future transaction volumes or identify seasonal trends in customer behavior. They can guide resource allocation, marketing campaign planning, and inventory management.

Customer Churn Prediction: Logistic Regression, Random Forest, or Gradient Boosting Machines (GBM).
These models can forecast the likelihood of customers discontinuing their relationship with the bank based on their transaction history and behavior. By identifying customers at risk of churn, the bank can implement retention strategies to retain valuable customers and reduce churn rates.

Deep Learning for Anomaly Detection: Deep learning models, such as autoencoders or variational autoencoders (VAEs), could be trained on transactional data to detect anomalies or unusual patterns that may indicate fraudulent activities or outliers in customer behavior.