In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from sklearn.cluster import SpectralClustering , KMeans
from sklearn.preprocessing import StandardScaler, normalize 
from sklearn.decomposition import PCA 
from sklearn.metrics import silhouette_score 

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Enumerate every file under the read-only Kaggle input directory.
# os.walk yields (directory, subdirectories, file names) for each directory it
# visits; only the directory and its file names are needed here.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Introduction

Using the [Credit Card Dataset](https://www.kaggle.com/arjunbhasin2013/ccdata?rvi=1), I will explore Spectral Clustering, an unsupervised clustering technique, alongside Principal Component Analysis.

### Sections

1. Data Import and Cleaning
2. Principal Component Analysis
3. Spectral Clustering
4. Example Visualizations

# Data Import and Cleaning

Import the data into a [Pandas DataFrame](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html) from the comma separated values file (CSV).

In [4]:
# Load the raw credit-card CSV into a DataFrame. The path is relative, so the
# file is looked up in the notebook's current working directory.
credit_card = pd.read_csv(filepath_or_buffer='CC GENERAL.csv')
# Bare last expression so the notebook renders the frame as a table.
credit_card

Unnamed: 0,CUST_ID,BALANCE,BALANCE_FREQUENCY,PURCHASES,ONEOFF_PURCHASES,INSTALLMENTS_PURCHASES,CASH_ADVANCE,PURCHASES_FREQUENCY,ONEOFF_PURCHASES_FREQUENCY,PURCHASES_INSTALLMENTS_FREQUENCY,CASH_ADVANCE_FREQUENCY,CASH_ADVANCE_TRX,PURCHASES_TRX,CREDIT_LIMIT,PAYMENTS,MINIMUM_PAYMENTS,PRC_FULL_PAYMENT,TENURE
0,C10001,40.900749,0.818182,95.40,0.00,95.40,0.000000,0.166667,0.000000,0.083333,0.000000,0,2,1000.0,201.802084,139.509787,0.000000,12
1,C10002,3202.467416,0.909091,0.00,0.00,0.00,6442.945483,0.000000,0.000000,0.000000,0.250000,4,0,7000.0,4103.032597,1072.340217,0.222222,12
2,C10003,2495.148862,1.000000,773.17,773.17,0.00,0.000000,1.000000,1.000000,0.000000,0.000000,0,12,7500.0,622.066742,627.284787,0.000000,12
3,C10004,1666.670542,0.636364,1499.00,1499.00,0.00,205.788017,0.083333,0.083333,0.000000,0.083333,1,1,7500.0,0.000000,,0.000000,12
4,C10005,817.714335,1.000000,16.00,16.00,0.00,0.000000,0.083333,0.083333,0.000000,0.000000,0,1,1200.0,678.334763,244.791237,0.000000,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8945,C19186,28.493517,1.000000,291.12,0.00,291.12,0.000000,1.000000,0.000000,0.833333,0.000000,0,6,1000.0,325.594462,48.886365,0.500000,6
8946,C19187,19.183215,1.000000,300.00,0.00,300.00,0.000000,1.000000,0.000000,0.833333,0.000000,0,6,1000.0,275.861322,,0.000000,6
8947,C19188,23.398673,0.833333,144.40,0.00,144.40,0.000000,0.833333,0.000000,0.666667,0.000000,0,5,1000.0,81.270775,82.418369,0.250000,6
8948,C19189,13.457564,0.833333,0.00,0.00,0.00,36.558778,0.000000,0.000000,0.000000,0.166667,2,0,500.0,52.549959,55.755628,0.250000,6


Note that for the purposes of our analysis the customer ID is irrelevant, so we remove it from the table (using the `drop` command).
There are also some missing values; we forward-fill them (*ffill*), which copies the most recent non-null value above each gap into it.

In [5]:
# Drop the customer identifier (not a feature) and forward-fill missing values.
# - drop(columns=..., errors="ignore") returns a new frame instead of mutating in
#   place, so re-running this cell no longer raises KeyError once CUST_ID is gone.
# - .ffill() replaces the deprecated fillna(method='ffill'); each NaN takes the
#   value of the closest preceding non-null row in the same column.
credit_card = credit_card.drop(columns=["CUST_ID"], errors="ignore").ffill()
credit_card

Unnamed: 0,BALANCE,BALANCE_FREQUENCY,PURCHASES,ONEOFF_PURCHASES,INSTALLMENTS_PURCHASES,CASH_ADVANCE,PURCHASES_FREQUENCY,ONEOFF_PURCHASES_FREQUENCY,PURCHASES_INSTALLMENTS_FREQUENCY,CASH_ADVANCE_FREQUENCY,CASH_ADVANCE_TRX,PURCHASES_TRX,CREDIT_LIMIT,PAYMENTS,MINIMUM_PAYMENTS,PRC_FULL_PAYMENT,TENURE
0,40.900749,0.818182,95.40,0.00,95.40,0.000000,0.166667,0.000000,0.083333,0.000000,0,2,1000.0,201.802084,139.509787,0.000000,12
1,3202.467416,0.909091,0.00,0.00,0.00,6442.945483,0.000000,0.000000,0.000000,0.250000,4,0,7000.0,4103.032597,1072.340217,0.222222,12
2,2495.148862,1.000000,773.17,773.17,0.00,0.000000,1.000000,1.000000,0.000000,0.000000,0,12,7500.0,622.066742,627.284787,0.000000,12
3,1666.670542,0.636364,1499.00,1499.00,0.00,205.788017,0.083333,0.083333,0.000000,0.083333,1,1,7500.0,0.000000,627.284787,0.000000,12
4,817.714335,1.000000,16.00,16.00,0.00,0.000000,0.083333,0.083333,0.000000,0.000000,0,1,1200.0,678.334763,244.791237,0.000000,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8945,28.493517,1.000000,291.12,0.00,291.12,0.000000,1.000000,0.000000,0.833333,0.000000,0,6,1000.0,325.594462,48.886365,0.500000,6
8946,19.183215,1.000000,300.00,0.00,300.00,0.000000,1.000000,0.000000,0.833333,0.000000,0,6,1000.0,275.861322,48.886365,0.000000,6
8947,23.398673,0.833333,144.40,0.00,144.40,0.000000,0.833333,0.000000,0.666667,0.000000,0,5,1000.0,81.270775,82.418369,0.250000,6
8948,13.457564,0.833333,0.00,0.00,0.00,36.558778,0.000000,0.000000,0.000000,0.166667,2,0,500.0,52.549959,55.755628,0.250000,6


Next, we apply the [normalize function from scikit-learn](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.normalize.html), which rescales each data point (row) to unit L2 norm.

In [6]:
# Scale every row (customer) to unit Euclidean length. norm="l2" and axis=1 are
# scikit-learn's defaults, spelled out here so the choice is visible.
normalized_data = normalize(credit_card, norm="l2", axis=1)
normalized_data

array([[3.93529208e-02, 7.87219116e-04, 9.17897286e-02, ...,
        1.34230246e-01, 0.00000000e+00, 1.15458778e-02],
       [2.93875725e-01, 8.34231054e-05, 0.00000000e+00, ...,
        9.84037362e-02, 2.03922922e-05, 1.10118488e-03],
       [3.10797802e-01, 1.24560826e-04, 9.63066935e-02, ...,
        7.81351110e-02, 0.00000000e+00, 1.49472991e-03],
       ...,
       [2.27729209e-02, 8.11047126e-04, 1.40538302e-01, ...,
        8.02142497e-02, 2.43314235e-04, 5.83954164e-03],
       [2.65239400e-02, 1.64244246e-03, 0.00000000e+00, ...,
        1.09890537e-01, 4.92732934e-04, 1.18255904e-02],
       [1.86405379e-01, 3.33425336e-04, 5.46775599e-01, ...,
        4.41566401e-02, 0.00000000e+00, 3.00082652e-03]])

# Principal Component Analysis

Our data contains 17 features, too many to visualize or analyze efficiently, so we must reduce the dimensionality while retaining as much information as we can. One of the main ways of doing this is [Principal Component Analysis](https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html), which uses SVD (Singular Value Decomposition).

In this code block we plot the cumulative explained variance ratio, which shows how much of the total variance is captured as each additional component is included.

In [7]:
# Fit a PCA with no component limit to see how the variance accumulates.
pca = PCA().fit(normalized_data)
# Running total of the per-component explained-variance ratios.
explained_var = pca.explained_variance_ratio_.cumsum()
# Component counts start at 1, matching "number of components kept".
component_counts = range(1, len(explained_var) + 1)
px.area(
    x=component_counts,
    y=explained_var,
    labels={"x": "# Components", "y": "Explained Variance"},
)


We see that 3 components, a convenient number to visualize, explain 77.3% of the total variance. Hence, we use 3 components for the rest of the analysis.

In [8]:
# Project the normalized data onto its first three principal components and
# wrap the result in a DataFrame with one named column per component.
pca_data = PCA(n_components=3)
projected = pca_data.fit_transform(normalized_data)
component_names = ['Component 1', 'Component 2', 'Component 3']
credit_pca = pd.DataFrame(data=projected, columns=component_names)
credit_pca

Unnamed: 0,Component 1,Component 2,Component 3
0,-0.315576,-0.044823,0.024746
1,0.317328,-0.156866,0.368285
2,-0.206978,-0.183004,-0.123505
3,-0.279409,-0.118059,-0.127551
4,0.140988,-0.090323,-0.093426
...,...,...,...
8945,-0.290478,0.165477,0.036824
8946,-0.312760,0.151931,0.021563
8947,-0.387197,-0.061227,-0.004984
8948,-0.331117,-0.180086,0.087395


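The elbow graph is a great tool for deciding how many clusters to use when separating the data. It plots inertia (the sum of squared distances between each point and the center of its assigned cluster) against the number of clusters. Since inertia keeps decreasing as clusters are added, we look for the "elbow": the point after which adding more clusters yields only small improvements.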

In [9]:
# Elbow analysis: fit K-Means for k = 1..9 on the PCA projection and record the
# inertia (within-cluster sum of squared distances) for each k.
cluster_counts = list(range(1, 10))
inertia = []
for i in cluster_counts:
    # A fixed seed and an explicit n_init keep the curve reproducible across
    # runs and across scikit-learn versions (the n_init default has changed).
    cluster = KMeans(n_clusters=i, n_init=10, random_state=42)
    cluster.fit(credit_pca)
    inertia.append(cluster.inertia_)
# Pass the cluster counts as x explicitly: plotting the bare list would use its
# 0-based position as "Clusters", shifting every point one cluster to the left.
px.line(
    x=cluster_counts,
    y=inertia,
    title="Elbow graph for Spectral Clustering",
    labels={"x": "Clusters", "y": "Inertia"},
)





















For this specific dataset, we see diminishing returns after 4 clusters, hence that is the number of clusters we will choose.

# Spectral Clustering

Now, we finally get to the **Spectral Clustering**.
Using the scikit-learn implementation, we can visualize the 4 different clusters within the 3 principal components.

In [10]:
# Spectral clustering into 4 groups on the 3-component PCA projection, using a
# nearest-neighbours graph as the affinity. random_state is fixed so the labels
# (and every plot coloured by them) are reproducible from run to run.
cluster_spectral = SpectralClustering(
    n_clusters=4,
    affinity='nearest_neighbors',
    random_state=42,
)
cluster_spectral.fit(credit_pca)
# NOTE: this adds a 'Labels' column to credit_card, which the later scatter
# plots read. Re-running the normalization cell after this one would include
# that column in its input, so restart the kernel before a full re-run.
credit_card['Labels'] = cluster_spectral.labels_
px.scatter_3d(
    x=credit_pca['Component 1'],
    y=credit_pca['Component 2'],
    z=credit_pca['Component 3'],
    color=cluster_spectral.labels_,
    size_max=18,
)

# Example Visualizations

In [9]:
# Clusters viewed in the original feature space: balance against purchases.
px.scatter(
    data_frame=credit_card,
    x='BALANCE',
    y='PURCHASES',
    color='Labels',
    title='Balance vs Purchases',
)

In [10]:
# Clusters viewed in the original feature space: credit limit against payments.
px.scatter(
    data_frame=credit_card,
    x='CREDIT_LIMIT',
    y='PAYMENTS',
    color='Labels',
    title='Credit limit vs Payments',
)

In [11]:
from sklearn.metrics import f1_score

In [13]:
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn_extra.cluster import KMedoids
from sklearn.preprocessing import StandardScaler

# Synthetic binary-classification data with a fixed seed.
X, y = make_classification(
    n_samples=1000,
    n_features=15,
    n_classes=2,
    random_state=89,
)

# Standardized copy of the features; in this cell it feeds only the clustering.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# K-Medoids with two clusters on the standardized features.
# NOTE(review): cluster_labels is not read again in this cell; the F1 score
# below evaluates the random forest only. Confirm whether that is intended.
kmedoids = KMedoids(n_clusters=2, random_state=42)
cluster_labels = kmedoids.fit_predict(X_scaled)

# Hold out 20% of the (unscaled) samples for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the random forest on the training split, then predict the held-out labels.
clf = RandomForestClassifier(random_state=42).fit(X_train, y_train)
y_pred = clf.predict(X_test)

# 'binary' averaging is used here; 'macro', 'micro' and 'weighted' are the
# other options.
f1 = f1_score(y_test, y_pred, average='binary')

print(f'F1 Score: {f1}')


F1 Score: 0.9150943396226414
