In [1]:
# Import required libraries and dependencies
import pandas as pd
from pathlib import Path
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the processed match data from disk
file_path = Path("../Processed_Data/mlAlgo.csv")
df_full = pd.read_csv(file_path)

# Separate the clustering inputs from the actual match outcome:
# `df` holds every column except "Result"; `y` holds the "Result" column itself.
df = df_full.drop(columns=["Result"])
y = df_full["Result"]

# K-Means

In [3]:
# Set up a K-Means model with three clusters; the fixed seed keeps the
# centroid initialisation (and therefore the cluster labels) repeatable.
# NOTE(review): k=3 presumably mirrors the three values of "Result" — no
# k-selection step is shown in this notebook; confirm.
model = KMeans(random_state=42, n_clusters=3)

In [4]:
# Learn the cluster centroids from the full (non-reduced) feature set
model.fit(X=df)

In [5]:
# Assign every match to its nearest centroid (one cluster label per row of `df`)
k_3 = model.predict(X=df)

In [6]:
# Work on an independent copy so that `df` itself never gains a cluster column
df_predictions = df.copy(deep=True)

In [7]:
# Attach each match's predicted cluster label as a new column
df_predictions["clusters"] = k_3

# Preview the first five rows
df_predictions.head(n=5)

Unnamed: 0,mktProb_homeWin,mktProb_Draw,mktProb_awayWin,eloProb_homeWin,eloProb_awayWin,AttackPowerHome,DefensePowerHome,goalDiffPowerHome,shotDiffPowerHome,formPowerHome,AttackPowerAway,DefensePowerAway,goalDiffPowerAway,shotDiffPowerAway,formPowerAway,clusters
0,0.72,0.2,0.08,0.55176,0.44824,0.164557,0.163265,1.625,1.612245,0.666667,0.117647,0.145161,0.666667,0.822581,0.5,0
1,0.62,0.24,0.14,0.440093,0.559907,0.058824,0.096154,1.0,1.634615,0.444444,0.177419,0.1,1.375,0.775,0.722222,0
2,0.54,0.26,0.2,0.490202,0.509798,0.145833,0.083333,1.4,0.8,0.444444,0.148148,0.104478,1.142857,0.80597,0.444444,0
3,0.45,0.28,0.28,0.504821,0.495179,0.044776,0.083333,0.428571,0.797619,0.277778,0.133333,0.135135,0.6,0.608108,0.333333,0
4,0.49,0.27,0.24,0.462538,0.537462,0.041667,0.1,0.5,1.2,0.333333,0.0875,0.142857,0.875,1.428571,0.444444,0


In [8]:
# Count how many matches fell into each predicted cluster
df_predictions['clusters'].value_counts()

0    5472
2     478
1     368
Name: clusters, dtype: int64

In [9]:
# Show how many matches had each actual result, to set against the cluster sizes above.
# Only the two distributions are compared here; cluster numbers are arbitrary labels
# and need not line up with the "Result" codes.
df_full['Result'].value_counts()

1    2936
2    1821
0    1561
Name: Result, dtype: int64

# K-Means with PCA

In [10]:
# Configure PCA to keep only the first two principal components
pca = PCA(n_components=2)

In [11]:
# Fit PCA on the feature columns and project every match onto the two components
match_pca = pca.fit_transform(X=df)

In [12]:
# Show the fraction of the total variance captured by each of the two principal components
pca.explained_variance_ratio_

array([0.49030582, 0.41600877])

In [13]:
# Add the per-component ratios to get the share of variance retained by the two components together
total_explained_variance = pca.explained_variance_ratio_.sum()
total_explained_variance

0.9063145912710524

In [14]:
# Build a DataFrame from the two principal components and index it by the
# actual match result, so the outcome travels with each row without being
# one of the columns passed to the clustering model.
df_pca = (
    pd.DataFrame(match_pca, columns=["pc1", "pc2"])
    .assign(results=y)
    .set_index("results")
)

# Preview the first rows
df_pca.head()

Unnamed: 0_level_0,pc1,pc2
results,Unnamed: 1_level_1,Unnamed: 2_level_1
1,-0.834001,0.384593
1,-0.073753,-0.201261
1,-0.333068,-0.011089
0,-0.834614,-1.019439
1,-0.408561,-0.824638


In [15]:
# Initialize the K-Means model for the PCA data with k=3.
# random_state is fixed (same seed as the earlier K-Means model on the full data)
# so that the cluster assignments and counts below are reproducible after
# Restart Kernel -> Run All; without it each run may label and split the
# clusters differently.
model = KMeans(n_clusters=3, random_state=42)

In [16]:
# Learn the centroids in PCA space. "results" is the index of `df_pca`, so only
# the pc1/pc2 columns are used as features.
model.fit(X=df_pca)

In [17]:
# Assign every match to its nearest centroid in the two-component PCA space
match_clusters = model.predict(X=df_pca)

In [18]:
# Derive a separate frame from the PCA data with the predicted cluster label
# appended as a column; `df_pca` itself is left untouched.
df_pca_predictions = df_pca.assign(clusters=match_clusters)

# Preview the first rows
df_pca_predictions.head()

Unnamed: 0_level_0,pc1,pc2,clusters
results,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,-0.834001,0.384593,0
1,-0.073753,-0.201261,0
1,-0.333068,-0.011089,0
0,-0.834614,-1.019439,0
1,-0.408561,-0.824638,0


In [19]:
# Count how many matches fell into each cluster predicted from the PCA data
df_pca_predictions['clusters'].value_counts()

0    5452
2     497
1     369
Name: clusters, dtype: int64

In [20]:
# Show how many matches had each actual result, to set against the PCA-based cluster sizes above.
# Only the two distributions are compared here; cluster numbers are arbitrary labels
# and need not line up with the "Result" codes.
df_full['Result'].value_counts()

1    2936
2    1821
0    1561
Name: Result, dtype: int64