In [None]:
# Imports
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
from tabulate import tabulate
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

In [None]:
# Please enter the absolute path of the Kaggle data
file_path = r""
creditcard_df = pd.read_csv(file_path)

# List out all the NaN values in the data frame.
# Only the last expression of a cell is rendered, so this intermediate
# result is printed explicitly instead of being silently discarded.
print("NaN count per column before imputation:")
print(creditcard_df.isnull().sum())

# Calculate the mean for each column to replace the NaN values
min_payment_mean = creditcard_df["MINIMUM_PAYMENTS"].mean()
credit_limit_mean = creditcard_df["CREDIT_LIMIT"].mean()

# Fill through the DataFrame (column -> fill value) and rebind the result.
# Calling `fillna(..., inplace=True)` on a selected column is chained
# assignment: it warns on recent pandas and does not update the parent frame
# when Copy-on-Write is enabled.
creditcard_df = creditcard_df.fillna(
    value={
        "MINIMUM_PAYMENTS": min_payment_mean,
        "CREDIT_LIMIT": credit_limit_mean,
    }
)

# Verify the replacement
creditcard_df.isnull().sum()

In [None]:
# We do not need the Customer ID to perform the K-means segmentation
cleared_df = creditcard_df.loc[:, creditcard_df.columns != "CUST_ID"]

# NOTE(review): the features are clustered unscaled although StandardScaler is
# imported at the top of the notebook - confirm that this is intended.

# Elbow method to get the K value: fit K-means once per candidate K and record
# the inertia (sum of squared distances of samples to their closest centroid).
scores_1 = []
range_values = range(1, 20)
for n_clusters in range_values:
    # random_state makes the curve reproducible between runs; n_init is pinned
    # so the number of restarts does not depend on the installed sklearn version.
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
    kmeans.fit(cleared_df)
    scores_1.append(kmeans.inertia_)

# Plot once, after all fits are done, with the real cluster counts on the
# x-axis (plotting inside the loop drew 19 overlapping partial curves against
# a 0-based index).
plt.plot(range_values, scores_1, "bx-")
plt.title("The Elbow Method")
plt.xlabel("Number of clusters")
plt.ylabel("SSE")
plt.show()

In [None]:
# Using 6 for K
num_clusters = 6
# random_state makes the cluster assignment reproducible between runs; n_init
# is pinned so the number of restarts does not depend on the sklearn version.
k_means = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)

# Features the segmentation is fitted on
feature_columns = [
    "BALANCE",
    "BALANCE_FREQUENCY",
    "PURCHASES",
    "ONEOFF_PURCHASES",
    "INSTALLMENTS_PURCHASES",
    "CASH_ADVANCE",
    "PURCHASES_FREQUENCY",
    "ONEOFF_PURCHASES_FREQUENCY",
    "PURCHASES_INSTALLMENTS_FREQUENCY",
    "CASH_ADVANCE_FREQUENCY",
    "CASH_ADVANCE_TRX",
    "PURCHASES_TRX",
    "CREDIT_LIMIT",
    "PAYMENTS",
    "MINIMUM_PAYMENTS",
    "PRC_FULL_PAYMENT",
    "TENURE",
]

# Get the cluster ID for each customer and append it to our cleared_df
cluster_id = k_means.fit_predict(cleared_df[feature_columns])
cleared_df = cleared_df.assign(CLUSTER=cluster_id)

# Visualizing the data for each cluster
# Individually: one tall figure per cluster, one subplot row per column.
plot_columns = list(cleared_df.columns)
for cluster_label in range(num_clusters):
    # Boolean indexing already returns a new frame, so no explicit copy is needed.
    cluster_df = cleared_df[cleared_df["CLUSTER"] == cluster_label]
    # The row count follows the frame instead of a hard-coded 18.
    fig, axes = plt.subplots(len(plot_columns), 1, figsize=(10, 50), squeeze=False)
    for ax, column in zip(axes.ravel(), plot_columns):
        # `sns.distplot` is deprecated; a density-normalised histogram plus a
        # separately styled KDE curve reproduces the same kind of plot.
        sns.histplot(x=cluster_df[column], stat="density", color="g", alpha=0.4, ax=ax)
        sns.kdeplot(x=cluster_df[column], color="b", linewidth=3, label="KDE", ax=ax)
        ax.set_title(column)
    fig.tight_layout()
    plt.show()

In [None]:
# Side-by-side
# Split the frame by cluster once up front; every column's figure then reuses
# the same six per-cluster frames.
frames_by_cluster = {
    label: cleared_df[cleared_df["CLUSTER"] == label] for label in range(6)
}

# One wide figure per column, with one histogram panel per cluster.
for column in cleared_df.columns:
    plt.figure(figsize=(35, 5))
    for cluster_id, cluster_members in frames_by_cluster.items():
        plt.subplot(1, 6, cluster_id + 1)
        cluster_members[column].hist(bins=20)
        plt.title("{}    \nCluster {} ".format(column, cluster_id))
    plt.show()

In [None]:
# Creating a table to present the mean values for the clusters.
# combined_dict maps each column name to the list of its per-cluster means,
# ordered by cluster ID (0..5).
combined_dict = {}
for cluster_id in range(6):
    cluster_members = cleared_df[cleared_df["CLUSTER"] == cluster_id]
    for attribute, mean_value in cluster_members.mean().to_dict().items():
        combined_dict.setdefault(attribute, []).append(mean_value)

# NOTE: This table formatting was created with the help of Chat-GPT!
# Number of decimal places to display
precision = 2

# Header row: every cell is left-aligned and padded to 35 characters
cluster_count = len(combined_dict["BALANCE"])
header_cells = ["Attribute"] + [f"Cluster {i}" for i in range(cluster_count)]
header = "".join("{:<35}".format(cell) for cell in header_cells)

# Separator row spanning the full header width
separator = "-" * len(header)

# One newline-terminated line per attribute, values limited to `precision` decimals
rows = "".join(
    "{:<35}".format(attribute)
    + "".join("{:<35}".format(f"{value:.{precision}f}") for value in values)
    + "\n"
    for attribute, values in combined_dict.items()
)

# Combine header, separator, and rows to create the table
table = "\n".join([header, separator, rows])

print(table)