# RQ3: How can clustering techniques be used to identify groups of transactions that may be part of coordinated fraud activities? #

In [None]:
#Extending PATH with the user's ~/.local/bin directory for this process
import os
os.environ['PATH'] += os.pathsep + os.path.expanduser('~/.local/bin')
#Appending the user's site-packages directory to the module search path
#NOTE(review): the directory is hard-coded to python3.10 - confirm it matches the interpreter running this notebook
import sys
sys.path.append(os.path.expanduser('~/.local/lib/python3.10/site-packages'))

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
import seaborn as sns
import hdbscan
import gc
from scipy.sparse import csr_matrix, vstack
from scipy.cluster.hierarchy import dendrogram, linkage
from joblib import Parallel, delayed
from sklearn.metrics import silhouette_samples, silhouette_score
import random

## Data Loading and Merging
- **Objective**: Load the transaction and identity datasets and merge them based on `TransactionID`.
- **Steps**:
  - Loaded `train_transaction.csv` and `train_identity.csv`.
  - Merged the datasets using a left join on `TransactionID`, so every transaction is kept even when it has no matching identity record.

In [None]:
#Loading the datasets
#Both paths are relative, so the CSV files are read from the current working directory
data_transaction = pd.read_csv('train_transaction.csv')
data_identity = pd.read_csv('train_identity.csv')

In [None]:
#Merging datasets on TransactionID
#how='left' keeps every row of data_transaction; rows without a match in data_identity get NaN in the identity columns
data = pd.merge(data_transaction, data_identity, on='TransactionID', how='left')

## Feature Selection
- **Objective**: Select relevant features based on the project plan and methodology.
- **Selected Features**:
  - Numerical: `TransactionAmt`
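  - Categorical: `card1`, `card2`, `card3`, `card4`, `card5`, `card6`, `addr1`, `addr2` (only `card1`–`card6` are one-hot encoded below; `addr1` and `addr2` are standardized as numeric values)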
  - Derived: `D1` to `D15`, `V1` to `V339`

In [None]:
#Feature selection based on our project plan and methodology
selected_features = (
    ['TransactionAmt', 'card1', 'card2', 'card3', 'card4', 'card5', 'card6', 'addr1', 'addr2']
    + [f'D{i}' for i in range(1, 16)]
    + [f'V{i}' for i in range(1, 340)]
)
#Taking an explicit copy: later cells assign into individual columns of this frame (dtype conversion, scaling),
#and doing that on a bare column selection of the merged frame raises SettingWithCopyWarning
data = data[selected_features].copy()

## Optimizing Data Types
- **Objective**: Optimize data types to save memory and improve computational efficiency.
- **Steps**:
  - Converted float64 columns to float32.
  - Converted int64 columns to int32.

In [None]:
#Optimizing data types to save memory
def optimize_data_types(df):
    """Downcast the 64-bit numeric columns of ``df`` in place and return it.

    float64 columns are converted to float32 and int64 columns to int32;
    columns of any other dtype are left untouched. The DataFrame that was
    passed in is modified and is also the object that is returned.
    """
    downcasts = (('float64', 'float32'), ('int64', 'int32'))
    for wide_dtype, narrow_dtype in downcasts:
        wide_columns = df.select_dtypes(include=[wide_dtype]).columns
        for name in wide_columns:
            df[name] = df[name].astype(narrow_dtype)
    return df

In [None]:
#Downcasting float64 columns to float32 and int64 columns to int32 on the selected features
data = optimize_data_types(data)

## Normalization
- **Objective**: Standardize numerical features to have a mean of 0 and a standard deviation of 1.
- **Steps**:
  - Applied `StandardScaler` to `TransactionAmt`, `addr1`, and `addr2`.

In [None]:
#Normalizing numerical features
#The scaler is fit on these three columns only, and the scaled values overwrite the originals in data
scaler = StandardScaler()
data[['TransactionAmt', 'addr1', 'addr2']] = scaler.fit_transform(data[['TransactionAmt', 'addr1', 'addr2']])

## Encoding Categorical Features
- **Objective**: Convert categorical variables into a numerical format suitable for machine learning algorithms.
- **Steps**:
  - Used `OneHotEncoder` to encode `card1`, `card2`, `card3`, `card4`, `card5`, and `card6`.
  - Converted the encoded features into a sparse DataFrame for memory efficiency.
  - Concatenated the encoded features back with the original DataFrame and dropped the original categorical columns.

In [None]:
#One-hot encoding categorical features using sparse matrices
categorical_cols = ['card1', 'card2', 'card3', 'card4', 'card5', 'card6']
#sparse_output=True makes the encoder return a SciPy sparse matrix rather than a dense array
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=True)
encoded_cols = encoder.fit_transform(data[categorical_cols])

In [None]:
#Converting to sparse df (more memory efficient)
#Column names come from the fitted encoder, one per (feature, category) pair
encoded_df = pd.DataFrame.sparse.from_spmatrix(encoded_cols, columns=encoder.get_feature_names_out(categorical_cols))

#Concatenating with original df
#Both indexes are reset first so the two frames are joined by row position
data = pd.concat([data.reset_index(drop=True), encoded_df.reset_index(drop=True)], axis=1)
#The raw categorical columns are removed now that their one-hot columns are part of data
data.drop(categorical_cols, axis=1, inplace=True)

## Imputing Missing Values
- **Objective**: Handle missing values in the dataset to ensure complete data for analysis.
- **Steps**:
  - Implemented a function to impute missing values in chunks to handle large datasets efficiently.
  - Used `SimpleImputer` with the mean strategy to fill missing values.

In [None]:
#Imputing the missing values manually for sparse data
def impute_sparse_data(df, chunk_size=10000):
    """Mean-impute the NaNs of ``df`` chunk by chunk and return a CSR matrix.

    The column means are accumulated over the whole frame in a first pass
    and then used to fill every chunk in a second pass. Each column is
    therefore filled with one global mean, independent of how the rows are
    split into chunks. Only ``chunk_size`` rows are held as a dense array at
    any time.

    A column with no observed value at all is kept and filled with 0.0, so
    the result always has exactly ``df.shape[1]`` columns and can be
    relabelled with ``df.columns``.

    Args:
        df: DataFrame whose values can all be converted to float.
        chunk_size: Number of rows converted to a dense array at a time.
            Must be positive.

    Returns:
        A float64 ``scipy.sparse`` CSR matrix of shape ``df.shape`` that
        contains no NaN.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    n_rows, n_cols = df.shape
    if n_rows == 0:
        return csr_matrix((0, n_cols), dtype=np.float64)

    bounds = [(start, min(start + chunk_size, n_rows))
              for start in range(0, n_rows, chunk_size)]

    #First pass: per-column sum and count of the observed (non-NaN) values
    totals = np.zeros(n_cols, dtype=np.float64)
    counts = np.zeros(n_cols, dtype=np.float64)
    for start, end in bounds:
        block = np.asarray(df.iloc[start:end].values, dtype=np.float64)
        observed = ~np.isnan(block)
        totals += np.where(observed, block, 0.0).sum(axis=0)
        counts += observed.sum(axis=0)

    #Columns without any observed value fall back to 0.0 instead of NaN
    means = np.divide(totals, counts, out=np.zeros(n_cols, dtype=np.float64), where=counts > 0)

    #Second pass: filling every chunk with the global means and storing it sparsely
    sparse_chunks = []
    for start, end in bounds:
        block = np.asarray(df.iloc[start:end].values, dtype=np.float64)
        missing = np.isnan(block)
        if missing.any():
            block = np.where(missing, means, block)
        sparse_chunks.append(csr_matrix(block))

    #Combining all sparse chunks
    return vstack(sparse_chunks, format='csr')

In [None]:
#Using the function to impute the data
#The result is a SciPy sparse matrix built by stacking the imputed chunks (default chunk_size of 10000 rows)
imputed_data_sparse = impute_sparse_data(data)

In [None]:
#Converting the imputed sparse matrix back to a df
#Column labels are taken from the pre-imputation frame, so the matrix must have the same number of columns as data
data = pd.DataFrame.sparse.from_spmatrix(imputed_data_sparse, columns=data.columns)

#Clearing unused variables to free up memory
#NOTE(review): no names are deleted before this call, so earlier intermediates such as data_transaction and encoded_df stay referenced - consider `del` first
gc.collect()

## Dimensionality Reduction
- **Objective**: Reduce the dimensionality of the dataset to facilitate clustering.
- **Steps**:
  - Applied PCA to reduce the dataset to 10 principal components.

In [None]:
#Projecting the feature matrix onto its first 10 principal components
#random_state is fixed (same seed as the K-Means cells) so that a randomized SVD solver, if one is selected, yields the same components on every run
pca = PCA(n_components=10, random_state=42)
data_reduced = pca.fit_transform(data)

### K-Means Clustering
- **Objective**: Partition the dataset into distinct clusters using K-Means.
- **Steps**:
  - Applied K-Means clustering with 5 clusters.
  - Stored the cluster labels in the DataFrame.

In [None]:
#K-Means Clustering
#The model is fit on the PCA-reduced data, not on the full feature frame
kmeans = KMeans(n_clusters=5, random_state=42)
kmeans_labels = kmeans.fit_predict(data_reduced)
#Storing one label per row as a new column of data
data['KMeans_Cluster'] = kmeans_labels

### HDBSCAN Clustering
- **Objective**: Identify clusters of varying density using HDBSCAN.
- **Steps**:
  - Applied HDBSCAN with `min_samples` of 10 and `min_cluster_size` of 500.
  - Stored the cluster labels in the DataFrame.

In [None]:
#HDBSCAN Clustering
#The model is fit on the PCA-reduced data, not on the full feature frame
clusterer = hdbscan.HDBSCAN(min_samples=10, min_cluster_size=500)
hdbscan_labels = clusterer.fit_predict(data_reduced)
#NOTE(review): hdbscan is expected to mark noise points with the label -1 - confirm before treating every label as a cluster
data['HDBSCAN_Cluster'] = hdbscan_labels

### Hierarchical Clustering
- **Objective**: Understand the hierarchical structure of the data through clustering.
- **Steps**:
  - Randomly sampled 20,000 data points due to memory constraints.
  - Applied Agglomerative Clustering with 5 clusters.
  - Stored the cluster labels in the sampled DataFrame.

In [None]:
#Randomly sampling the data for hierarchical clustering
#The sample size is capped at the number of available rows (drawing without replacement fails otherwise),
#and the generator is seeded so the same rows are drawn on every run
sample_size = min(20000, data_reduced.shape[0])
sampled_indices = np.random.default_rng(42).choice(data_reduced.shape[0], size=sample_size, replace=False)
sampled_data_reduced = data_reduced[sampled_indices]

In [None]:
#Hierarchical Clustering on the sampled data
#hierarchical_labels[i] is the cluster of row i of sampled_data_reduced (i.e. of data_reduced[sampled_indices[i]])
hierarchical = AgglomerativeClustering(n_clusters=5)
hierarchical_labels = hierarchical.fit_predict(sampled_data_reduced)

#Creating a df for the sampled data to store the labels
#Columns are named PC1..PCn, one per principal component
sampled_data_df = pd.DataFrame(sampled_data_reduced, columns=[f'PC{i+1}' for i in range(sampled_data_reduced.shape[1])])
sampled_data_df['Hierarchical_Cluster'] = hierarchical_labels

## Evaluation and Visualization

### Evaluating Clustering with WCSS
- **Objective**: Determine the optimal number of clusters using the Elbow Method.
- **Steps**:
  - Calculated WCSS for different numbers of clusters.
  - Plotted the Elbow Method graph to identify the optimal number of clusters.

In [None]:
#Evaluating K-Means with WCSS
#The models fitted here get their own name so the 5-cluster `kmeans` model from the clustering step is not overwritten
cluster_counts = range(1, 11)
wcss = []
for n_clusters in cluster_counts:
    elbow_model = KMeans(n_clusters=n_clusters, random_state=42)
    elbow_model.fit(data_reduced)
    #inertia_ is the within-cluster sum of squares of the fitted model
    wcss.append(elbow_model.inertia_)
plt.plot(cluster_counts, wcss)
plt.title('Elbow Method for Optimal K')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()

### Silhouette Score Calculation
- **Objective**: Evaluate the quality of clustering using the Silhouette Score.
- **Steps**:
  - Randomly sampled 20,000 data points to calculate the Silhouette Score for K-Means, HDBSCAN, and Hierarchical Clustering.
  - Reported the Silhouette Scores for each clustering method.

In [None]:
#Silhouette Score Sampling
#The sample size is capped at the number of available rows (drawing without replacement fails otherwise),
#and the generator is seeded so the reported scores are reproducible
sample_size_for_silhouette = min(20000, data_reduced.shape[0])
random_indices_silhouette = np.random.default_rng(42).choice(data_reduced.shape[0], size=sample_size_for_silhouette, replace=False)
#The same indices select both the points and their labels, so row i and label i always belong together
sampled_data_reduced_silhouette = data_reduced[random_indices_silhouette]
sampled_kmeans_labels = kmeans_labels[random_indices_silhouette]
sampled_hdbscan_labels = hdbscan_labels[random_indices_silhouette]

In [None]:
#Silhouette Score for K-Means
kmeans_silhouette = silhouette_score(sampled_data_reduced_silhouette, sampled_kmeans_labels)
print(f'K-Means Silhouette Score: {kmeans_silhouette}')

#Silhouette Score for HDBSCAN
#NOTE(review): every distinct label is scored as a cluster here; if hdbscan marks noise with -1, consider excluding those points
hdbscan_silhouette = silhouette_score(sampled_data_reduced_silhouette, sampled_hdbscan_labels)
print(f'HDBSCAN Silhouette Score: {hdbscan_silhouette}')

#Silhouette Score for Hierarchical Clustering
#hierarchical_labels were produced for the rows of sampled_data_reduced, so they have to be scored against those same rows.
#Scoring them against sampled_data_reduced_silhouette (an independently drawn sample) would pair each label with an unrelated point.
sampled_hierarchical_labels = hierarchical_labels
hierarchical_silhouette = silhouette_score(sampled_data_reduced, sampled_hierarchical_labels)
print(f'Hierarchical Silhouette Score: {hierarchical_silhouette}')

In [None]:
#Silhouette Scores
#The chart uses the scores computed in the silhouette cell instead of overwriting them with constants,
#so it always reflects the current run.
#Values recorded from an earlier run, kept for reference: K-Means 0.8564, HDBSCAN -0.1843, Hierarchical -0.0637

#Labels and Scores
labels = ['K-Means', 'HDBSCAN', 'Hierarchical']
scores = [kmeans_silhouette, hdbscan_silhouette, hierarchical_silhouette]

#Plotting the Silhouette Scores
plt.figure(figsize=(10, 6))
bars = plt.bar(labels, scores, color=['blue', 'green', 'red'])

#Adding the scores next to the bars: above a positive bar, below a negative one
for bar in bars:
    yval = bar.get_height()
    if yval >= 0:
        offset, valign = 0.02, 'bottom'
    else:
        offset, valign = -0.02, 'top'
    plt.text(bar.get_x() + bar.get_width()/2, yval + offset, f'{yval:.2f}', ha='center', va=valign)

#The silhouette score is bounded by -1 and 1
plt.ylim(-1, 1)
plt.axhline(0, color='gray', linewidth=0.8)
plt.title('Silhouette Scores for Different Clustering Methods')
plt.xlabel('Clustering Method')
plt.ylabel('Silhouette Score')
plt.show()

### Visualizing the Clusters

#### Scatter Plot for K-Means
- **Objective**: Visualize the clusters formed by K-Means.
- **Steps**:
  - Created a scatter plot for the first two principal components colored by K-Means cluster labels.

In [None]:
#Scatter plot for K-Means
plt.figure(figsize=(10, 6))
#x and y are the first two PCA components of every row; points are colored by their K-Means label
sns.scatterplot(x=data_reduced[:, 0], y=data_reduced[:, 1], hue=kmeans_labels, palette='viridis')
plt.title('K-Means Clustering')
plt.show()

#### Heatmap of Selected Features
- **Objective**: Visualize the correlations between a subset of selected features to understand the relationships within the data.
- **Steps**:
  - Selected a subset of features for the correlation matrix to focus on key variables.
  - Sampled 20,000 data points to reduce memory usage and ensure computational efficiency.
  - Ensured the sampled data is in a dense format for processing.
  - Standardized the selected features without centering to prepare for correlation analysis.
  - Computed the correlation matrix on the standardized data.
  - Plotted a heatmap with adjusted font size and fewer decimal places to visualize the correlations between the selected features.

This approach allows us to visually assess the relationships between the selected features, providing insights into potential patterns and interactions within the data.

In [None]:
#Selecting a subset of features for the correlation matrix
selected_features_subset = ['TransactionAmt', 'card1', 'card2', 'card3', 'addr1', 'addr2'] + [f'D{i}' for i in range(1, 6)] + [f'V{i}' for i in range(1, 21)]

#Ensuring the selected features are in the dataset
#card1, card2 and card3 were dropped from data after one-hot encoding, so this filter removes them from the subset
selected_features_subset = [feature for feature in selected_features_subset if feature in data.columns]

#Sampling the data to reduce memory usage
#random_state is fixed, so the same rows are drawn on every run
sample_size = 20000
sampled_data_for_corr = data[selected_features_subset].sample(n=sample_size, random_state=42)

#Ensuring the sampled data is dense
sampled_data_for_corr_dense = sampled_data_for_corr.sparse.to_dense()

#Standardizing the data without centering (with_mean=False)
#NOTE: this rebinds `scaler`, replacing the scaler fitted in the normalization cell
scaler = StandardScaler(with_mean=False)
sampled_data_for_corr_scaled = pd.DataFrame(scaler.fit_transform(sampled_data_for_corr_dense), columns=selected_features_subset)

#Computing the correlation matrix on the standardized data
corr_matrix_sampled_scaled = sampled_data_for_corr_scaled.corr()

#Plotting the heatmap with adjusted font size and fewer decimal places
plt.figure(figsize=(12, 8))
sns.heatmap(corr_matrix_sampled_scaled, annot=True, fmt='.1f', annot_kws={"size": 8}, cmap='coolwarm')
plt.title('Heatmap of Selected Features (Sampled and Standardized Data)')
plt.show()

#### Dendrogram for Hierarchical Clustering
- **Objective**: Visualize the hierarchical structure of the clusters to understand the clustering relationships within the dataset.
- **Steps**:
  - Selected a subset of features for the correlation matrix and dendrogram to focus on key variables.
  - Sampled 5000 data points to reduce memory usage and ensure computational efficiency.
  - Standardized the selected features without centering to prepare for clustering.
  - Computed the correlation matrix and plotted a heatmap to visualize correlations between selected features.
  - Created a dendrogram using the linkage method with 'ward' to display the hierarchical clustering.
  - Increased the figure size and rotated x-axis labels for better readability.
  - Optionally truncated the dendrogram to show only the top levels of the hierarchy, making it more readable.

This approach allows us to visually assess the hierarchical relationships between clusters and understand how the selected features contribute to the clustering structure.

In [None]:
#Selecting a subset of features for the correlation matrix
selected_features_subset = ['TransactionAmt', 'card1', 'card2', 'card3', 'addr1', 'addr2'] + [f'D{i}' for i in range(1, 6)] + [f'V{i}' for i in range(1, 21)]

#Ensuring the selected features are in the dataset
#card1, card2 and card3 were dropped from data after one-hot encoding, so this filter removes them from the subset
selected_features_subset = [feature for feature in selected_features_subset if feature in data.columns]

#Sampling the data to reduce memory usage
#A smaller sample than in the heatmap cell is used here (5000 instead of 20000 rows)
sample_size = 5000
sampled_data_for_corr = data[selected_features_subset].sample(n=sample_size, random_state=42)

#Ensuring the sampled data is dense
sampled_data_for_corr_dense = sampled_data_for_corr.sparse.to_dense()

#Standardizing the data without centering (with_mean=False)
scaler = StandardScaler(with_mean=False)
sampled_data_for_corr_scaled = pd.DataFrame(scaler.fit_transform(sampled_data_for_corr_dense), columns=selected_features_subset)

#Computing the correlation matrix on the standardized data
corr_matrix_sampled_scaled = sampled_data_for_corr_scaled.corr()

#Plotting the heatmap with adjusted font size and fewer decimal places
plt.figure(figsize=(12, 8))
sns.heatmap(corr_matrix_sampled_scaled, annot=True, fmt='.1f', annot_kws={"size": 8}, cmap='coolwarm')
plt.title('Heatmap of Selected Features (Sampled and Standardized Data)')
plt.show()

#Dendrogram for Hierarchical Clustering on the sampled data
#The linkage is computed on the scaled feature subset, not on the PCA-reduced data used by the AgglomerativeClustering cell
linked = linkage(sampled_data_for_corr_scaled, method='ward')

#Plotting the dendrogram with increased figure size and rotated labels
plt.figure(figsize=(15, 10))
dendrogram(linked)
plt.xticks(rotation=90)
plt.title('Hierarchical Clustering Dendrogram')
plt.show()

#Truncating the dendrogram to make it more readable
#truncate_mode='level' with p=5 limits the plot to the top levels of the tree
plt.figure(figsize=(15, 10))
dendrogram(linked, truncate_mode='level', p=5)
plt.xticks(rotation=90)
plt.title('Hierarchical Clustering Dendrogram (Truncated)')
plt.show()

## Conclusion
- **Summary**:
  - The analysis provided insights into the clustering structure of the dataset.
  - K-Means clustering showed promising results with a high silhouette score.
  - Hierarchical clustering and HDBSCAN require further parameter tuning or alternative approaches to improve performance.
