In [1]:
# Import required libraries and dependencies
import pandas as pd
from pathlib import Path
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the full dataset from disk
file_path = Path("elements.csv")
df_full = pd.read_csv(file_path)

# Feature frame: the nine z-score columns that get clustered
df = df_full.loc[:, [
    'zScore_form',
    'zScore_bps',
    'zScore_gpts',
    'zScore_ict',
    'zScore_vbps',
    'zScore_vsea',
    'zScore_cost',
    'zScore_sbp',
    'zScore_ictval',
]]

# Labelled class column, kept aside for comparison with the clusters
y = df_full["class"]

In [3]:
# classifier function
def classifierKMEANS(x):
    """Translate a K-Means cluster id into its tier name.

    Ids 0, 1 and 2 map to 'bronze', 'platin' and 'gold' respectively;
    any other value falls through to 'silver'.
    """
    tiers = ('bronze', 'platin', 'gold')
    for cluster_id, tier in enumerate(tiers):
        if x == cluster_id:
            return tier
    return 'silver'

In [4]:
# classifier function
def classifierPCA(x):
    """Translate a PCA + K-Means cluster id into its tier name.

    Ids 0, 1 and 2 map to 'gold', 'silver' and 'bronze' respectively;
    any other value falls through to 'platin'.
    """
    tiers = ('gold', 'silver', 'bronze')
    for cluster_id, tier in enumerate(tiers):
        if x == cluster_id:
            return tier
    return 'platin'

In [5]:
# Initialize the K-Means model with k=4.
# n_init is pinned explicitly: 10 is the default this notebook ran with, and
# stating it silences the scikit-learn warning about that default changing
# and keeps the clustering stable across library versions.
model = KMeans(n_clusters=4, n_init=10, random_state=42)

In [6]:
# Fit the K-Means model on the z-score feature frame
model.fit(df)

  super()._check_params_vs_input(X, default_n_init=10)


In [7]:
# Assign every row of the feature frame to one of the four fitted clusters
k_4 = model.predict(df)

In [8]:
# Work on a copy so the feature frame itself stays free of result columns
df_predictions = df.copy()

In [9]:
# Attach the predicted cluster id of each row as a `clusters` column
df_predictions = df_predictions.assign(clusters=k_4)

# Preview the first rows
df_predictions.head()

Unnamed: 0,zScore_form,zScore_bps,zScore_gpts,zScore_ict,zScore_vbps,zScore_vsea,zScore_cost,zScore_sbp,zScore_ictval,clusters
0,0.0,0.171925,1.208079,0.0,0.0,0.0,0.0,0.0,0.0,1
1,0.0,0.0,0.124541,0.0,0.0,0.0,0.331505,0.0,0.0,1
2,0.821717,0.809909,0.511519,0.0,1.229696,1.048308,0.0,1.446112,0.044136,3
3,1.678696,0.0,0.356728,0.518624,0.0,0.38731,1.90842,0.0,0.524859,2
4,1.923547,0.271129,0.975892,1.762494,0.0,0.04359,2.650498,0.124443,0.544889,2


In [10]:
# Count how many rows were assigned to each K-Means cluster
df_predictions['clusters'].value_counts()

clusters
1    338
3     76
2     61
0      6
Name: count, dtype: int64

In [11]:
# Size of each labelled `class`, for comparison with the cluster sizes above.
# NOTE(review): cluster ids are arbitrary, so a cluster id is not expected to
# equal the class value it resembles - compare sizes, not ids.
df_full['class'].value_counts()

class
0    353
1     72
2     41
3     15
Name: count, dtype: int64

## K-Means with PCA

In [12]:
# Create a PCA model that keeps two principal components (seeded for repeatability)
pca=PCA(n_components=2, random_state=42)

In [13]:
# Fit PCA on the feature frame and project it onto the two principal components
match_pca = pca.fit_transform(df)

In [14]:
# Share of the total variance captured by each of the two components
pca.explained_variance_ratio_

array([0.56543525, 0.17824122])

In [15]:
# Combined share of variance retained by the two principal components
total_explained_variance = pca.explained_variance_ratio_.sum()
total_explained_variance

0.7436764701041747

In [16]:
# Wrap the PCA output in a DataFrame, attach the labelled class of each row
# as `results`, and use that column as the row index.
df_pca = (
    pd.DataFrame(match_pca, columns=["pc1", "pc2"])
    .assign(results=y)
    .set_index('results')
)

# Preview the first rows
df_pca.head()

Unnamed: 0_level_0,pc1,pc2
results,Unnamed: 1_level_1,Unnamed: 2_level_1
0,-0.557078,0.308462
0,-0.888185,0.505069
1,0.757478,-0.708258
1,0.741551,0.916232
1,1.852684,1.642892


In [17]:
# Initialize a fresh K-Means model (k=4) for the PCA-reduced data.
# n_init is pinned explicitly: 10 is the default this notebook ran with, and
# stating it silences the scikit-learn warning about that default changing
# and keeps the clustering stable across library versions.
model = KMeans(n_clusters=4, n_init=10, random_state=42)

In [18]:
# Fit the K-Means model on the two PCA columns (pc1, pc2)
model.fit(df_pca)

  super()._check_params_vs_input(X, default_n_init=10)


In [19]:
# Assign every row of the PCA frame to one of the four fitted clusters
match_clusters = model.predict(df_pca)

In [20]:
# Copy of the PCA frame with the predicted cluster id of each row attached
# as a `clusters` column (df_pca itself is left unchanged)
df_pca_predictions = df_pca.assign(clusters=match_clusters)

# Preview the first rows
df_pca_predictions.head()

Unnamed: 0_level_0,pc1,pc2,clusters
results,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,-0.557078,0.308462,0
0,-0.888185,0.505069,0
1,0.757478,-0.708258,2
1,0.741551,0.916232,0
1,1.852684,1.642892,1


In [21]:
# Count how many rows were assigned to each cluster of the PCA-based model
df_pca_predictions['clusters'].value_counts()

clusters
0    341
2     95
1     40
3      5
Name: count, dtype: int64

In [22]:
# Size of each labelled `class`, for comparison with the cluster sizes above.
# NOTE(review): cluster ids are arbitrary, so a cluster id is not expected to
# equal the class value it resembles - compare sizes, not ids.
df_full['class'].value_counts()

class
0    353
1     72
2     41
3     15
Name: count, dtype: int64

In [23]:
# Attach the raw cluster ids from both models to the full dataset
df_full['kMeansClass'] = k_4
df_full['pcaKMeansClass'] = match_clusters

# Translate cluster ids into tier names. Series.map applies the classifier to
# the id column directly instead of materialising every row via apply(axis=1).
# NOTE(review): the id -> tier tables are hard-coded against one particular
# clustering run; re-check them whenever the data or the model settings change.
df_full['kMeans-cat'] = df_full['kMeansClass'].map(classifierKMEANS)
df_full['pcaKMeans-cat'] = df_full['pcaKMeansClass'].map(classifierPCA)

# This overwrites the same file the notebook loads. index=False keeps the row
# index out of the CSV; otherwise every re-run would read it back as an extra
# "Unnamed: 0" column and the file would grow by one column per run.
df_full.to_csv('elements.csv', index=False)