In [1]:
import pandas as pd
from sklearn.cluster import KMeans
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import  cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import sklearn.tree as tree
from copy import deepcopy
from analyze_tools import *

In [2]:
untoucheddf = pd.read_csv('../data/clean_game_data.csv', index_col=0)

In [3]:

df = pd.read_csv('../data/clean_game_data.csv', index_col=0).drop(columns=['Year', 'Game ID'])
df['Victory'] = df['Victory'].astype(int)
top_df = deepcopy(df[df['Role'] == 'TOP'].select_dtypes(include='number')).astype('float64').drop(columns=['Victory'])
mid_df = deepcopy(df[df['Role'] == 'MID'].select_dtypes(include='number')).astype('float64').drop(columns=['Victory'])
jg_df =  deepcopy(df[df['Role'] == 'JUNGLE'].select_dtypes(include='number')).astype('float64').drop(columns=['Victory'])
adc_df =  deepcopy(df[df['Role'] == 'ADC'].select_dtypes(include='number')).astype('float64').drop(columns=['Victory'])
sup_df = deepcopy(df[df['Role'] == 'SUPPORT'].select_dtypes(include='number')).astype('float64').drop(columns=['Victory'])

scaler = StandardScaler()
top_df[:] = scaler.fit_transform(top_df)
mid_df[:] = scaler.fit_transform(mid_df)
jg_df[:] = scaler.fit_transform(jg_df)
adc_df[:] = scaler.fit_transform(adc_df)
sup_df[:] = scaler.fit_transform(sup_df)

pca3_mid = PCA(n_components=3)
pca3_top = PCA(n_components=3)
pca3_jg = PCA(n_components=3)
pca3_adc = PCA(n_components=3)
pca3_sup = PCA(n_components=3)

pca3_mid_df = pd.DataFrame(pca3_mid.fit_transform(mid_df), columns=['PC1', 'PC2', 'PC3'])
pca3_top_df = pd.DataFrame(pca3_top.fit_transform(top_df), columns=['PC1', 'PC2', 'PC3'])
pca3_jg_df = pd.DataFrame(pca3_jg.fit_transform(jg_df), columns=['PC1', 'PC2', 'PC3'])
pca3_adc_df = pd.DataFrame(pca3_adc.fit_transform(adc_df), columns=['PC1', 'PC2', 'PC3'])
pca3_sup_df = pd.DataFrame(pca3_sup.fit_transform(sup_df), columns=['PC1', 'PC2', 'PC3'])



In [4]:
n_clusters = 4
top_kmeans = KMeans(n_clusters=n_clusters,random_state=42)
pred = top_kmeans.fit_predict(pca3_top_df)
new_top_df = deepcopy(top_df)
new_top_df['Cluster'] = pred
new_top_df['Victory'] = df[df['Role'] == 'TOP']['Victory']
print(new_top_df.groupby('Cluster').mean()['Victory'] * 100)

mid_kmeans = KMeans(n_clusters=n_clusters,random_state=42)
pred = mid_kmeans.fit_predict(pca3_mid_df)
new_mid_df = deepcopy(mid_df)
new_mid_df['Cluster'] = pred
new_mid_df['Victory'] = df[df['Role'] == 'MID']['Victory']
print(new_mid_df.groupby('Cluster').mean()['Victory'] * 100)

jg_kmeans = KMeans(n_clusters=n_clusters,random_state=42)
pred = jg_kmeans.fit_predict(pca3_jg_df)
new_jg_df = deepcopy(jg_df)
new_jg_df['Cluster'] = pred
new_jg_df['Victory'] = df[df['Role'] == 'JUNGLE']['Victory']
print(new_jg_df.groupby('Cluster').mean()['Victory'] * 100)

adc_kmeans = KMeans(n_clusters=n_clusters,random_state=42)
pred = adc_kmeans.fit_predict(pca3_adc_df)
new_adc_df = deepcopy(adc_df)
new_adc_df['Cluster'] = pred
new_adc_df['Victory'] = df[df['Role'] == 'ADC']['Victory']
print(new_adc_df.groupby('Cluster').mean()['Victory'] * 100)

sup_kmeans = KMeans(n_clusters=n_clusters,random_state=42)
pred = sup_kmeans.fit_predict(pca3_sup_df)
new_sup_df = deepcopy(sup_df)
new_sup_df['Cluster'] = pred
new_sup_df['Victory'] = df[df['Role'] == 'SUPPORT']['Victory']
print(new_sup_df.groupby('Cluster').mean()['Victory'] * 100)


Cluster
0    51.174935
1    80.934809
2    19.049170
3    60.071514
Name: Victory, dtype: float64
Cluster
0    67.553957
1    19.852741
2    50.101523
3    84.953233
Name: Victory, dtype: float64
Cluster
0    65.429480
1    15.160891
2    59.164292
3    87.642153
Name: Victory, dtype: float64
Cluster
0    16.159180
1    67.427123
2    43.308200
3    90.076954
Name: Victory, dtype: float64
Cluster
0    54.947368
1    83.700081
2    71.197411
3    23.315259
Name: Victory, dtype: float64


In [5]:
# Assigning the cluster to the original dataframe for each player for each role in each game

clusterdf = pd.read_csv('../data/clean_game_data.csv', index_col=0)
clusterdf['Cluster'] = -1
clusterdf.loc[df['Role'] == 'TOP', 'Cluster'] = top_kmeans.predict(pca3_top_df)
clusterdf.loc[df['Role'] == 'MID', 'Cluster'] = mid_kmeans.predict(pca3_mid_df)
clusterdf.loc[df['Role'] == 'JUNGLE', 'Cluster'] = jg_kmeans.predict(pca3_jg_df)
clusterdf.loc[df['Role'] == 'ADC', 'Cluster'] = adc_kmeans.predict(pca3_adc_df)
clusterdf.loc[df['Role'] == 'SUPPORT', 'Cluster'] = sup_kmeans.predict(pca3_sup_df)

In [6]:
# Learning a model for each role

X_top = new_top_df.drop(columns=['Victory', 'Cluster'])
y_top = new_top_df['Cluster']
X_train_top, X_test_top, y_train_top, y_test_top = train_test_split(X_top, y_top, test_size=0.2, random_state=42)
svm_top = SVC()
svm_top.fit(X_train_top, y_train_top)
print('Top SVM:', cross_val_score(svm_top, X_train_top, y_train_top, cv=5).mean())

X_mid = new_mid_df.drop(columns=['Victory', 'Cluster'])
y_mid = new_mid_df['Cluster']
X_train_mid, X_test_mid, y_train_mid, y_test_mid = train_test_split(X_mid, y_mid, test_size=0.2, random_state=42)
svm_mid = SVC()
svm_mid.fit(X_train_mid, y_train_mid)
print('Mid SVM:', cross_val_score(svm_mid, X_train_mid, y_train_mid, cv=5).mean())

X_jg = new_jg_df.drop(columns=['Victory', 'Cluster'])
y_jg = new_jg_df['Cluster']
X_train_jg, X_test_jg, y_train_jg, y_test_jg = train_test_split(X_jg, y_jg, test_size=0.2, random_state=42)
svm_jg = SVC()
svm_jg.fit(X_train_jg, y_train_jg)
print('Jg SVM:', cross_val_score(svm_jg, X_train_jg, y_train_jg, cv=5).mean())

X_adc = new_adc_df.drop(columns=['Victory', 'Cluster'])
y_adc = new_adc_df['Cluster']
X_train_adc, X_test_adc, y_train_adc, y_test_adc = train_test_split(X_adc, y_adc, test_size=0.2, random_state=42)
svm_adc = SVC()
svm_adc.fit(X_train_adc, y_train_adc)
print('Adc SVM:', cross_val_score(svm_adc, X_train_adc, y_train_adc, cv=5).mean())

X_sup = new_sup_df.drop(columns=['Victory', 'Cluster'])
y_sup = new_sup_df['Cluster']
X_train_sup, X_test_sup, y_train_sup, y_test_sup = train_test_split(X_sup, y_sup, test_size=0.2, random_state=42)
svm_sup = SVC()
svm_sup.fit(X_train_sup, y_train_sup)
print('Sup SVM:', cross_val_score(svm_sup, X_train_sup, y_train_sup, cv=5).mean())

Top SVM: 0.9667901681687472
Mid SVM: 0.9652090536501989
Jg SVM: 0.9694256485348743
Adc SVM: 0.965340198001916
Sup SVM: 0.9749597664762033


Learning cluster interation, distribution for each team comp.

In [20]:

x = np.zeros((len(clusterdf['Game ID'].unique()), 10))
y = np.zeros(len(clusterdf['Game ID'].unique()))

np.random.seed(42)
for i, gameid in enumerate(clusterdf['Game ID'].unique()):
    game = clusterdf[clusterdf['Game ID'] == gameid]
    if len(game) != 10:
        raise ValueError('Game with id {} has {} players'.format(gameid, len(game)))
    
    top_blue = game[(game['Role'] == 'TOP') & (game['Side'] == 'Blue Side')]['Cluster'].values[0]
    mid_blue = game[(game['Role'] == 'MID') & (game['Side'] == 'Blue Side')]['Cluster'].values[0]
    jg_blue = game[(game['Role'] == 'JUNGLE') & (game['Side'] == 'Blue Side')]['Cluster'].values[0]
    adc_blue = game[(game['Role'] == 'ADC') & (game['Side'] == 'Blue Side')]['Cluster'].values[0]
    sup_blue = game[(game['Role'] == 'SUPPORT') & (game['Side'] == 'Blue Side')]['Cluster'].values[0]

    top_red = game[(game['Role'] == 'TOP') & (game['Side'] == 'Red Side')]['Cluster'].values[0]
    mid_red = game[(game['Role'] == 'MID') & (game['Side'] == 'Red Side')]['Cluster'].values[0]
    jg_red = game[(game['Role'] == 'JUNGLE') & (game['Side'] == 'Red Side')]['Cluster'].values[0]
    adc_red = game[(game['Role'] == 'ADC') & (game['Side'] == 'Red Side')]['Cluster'].values[0]
    sup_red = game[(game['Role'] == 'SUPPORT') & (game['Side'] == 'Red Side')]['Cluster'].values[0]

    # Formating into a vector of cluster attribution for each player in the game and the result of the game
    result = game[(game['Role'] == 'TOP') & (game['Side'] == 'Blue Side')]['Victory'].values[0].astype(int)
    # Randomizing the side attribution
    if np.random.rand() > 0.5:
        vector = np.array([top_red, mid_red, jg_red, adc_red, sup_red, top_blue, mid_blue, jg_blue, adc_blue, sup_blue])
        result = 1 - result
    else:
        vector = np.array([top_blue, mid_blue, jg_blue, adc_blue, sup_blue, top_red, mid_red, jg_red, adc_red, sup_red])
    x[i] = vector
    y[i] = result

    

In [15]:
# Learning a model to predict the result of the game based on the cluster attribution of the players

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
clf = RandomForestClassifier(random_state=42)
clf.fit(x_train, y_train)
print('Random Forest Classifier')
print('Train score:', clf.score(x_train, y_train))
print('Test score:', clf.score(x_test, y_test))
print('Cross validation score:', cross_val_score(clf, x, y).mean())


Random Forest Classifier
Train score: 0.989193463363205
Test score: 0.9209694415173867
Cross validation score: 0.9114506498068142


In [9]:
# Test on 2024 data

df2024 = pd.read_csv('../data/game_data2024.csv', index_col=0)
# Harmonizing columns with the previous dataframe (has possibly more unknown columns)
df2024 = df2024[untoucheddf.columns]
df2024predict = df2024.drop(columns=['Year', 'Game ID'])

df2024['Cluster'] = -1

# Using SVM learned models to predict the cluster of each player

df2024.loc[df2024['Role'] == 'TOP', 'Cluster'] = svm_top.predict(scaler.transform(df2024predict[df2024['Role'] == 'TOP'].select_dtypes(include='number')))
df2024.loc[df2024['Role'] == 'MID', 'Cluster'] = svm_mid.predict(scaler.transform(df2024predict[df2024['Role'] == 'MID'].select_dtypes(include='number')))
df2024.loc[df2024['Role'] == 'JUNGLE', 'Cluster'] = svm_jg.predict(scaler.transform(df2024predict[df2024['Role'] == 'JUNGLE'].select_dtypes(include='number')))
df2024.loc[df2024['Role'] == 'ADC', 'Cluster'] = svm_adc.predict(scaler.transform(df2024predict[df2024['Role'] == 'ADC'].select_dtypes(include='number')))
df2024.loc[df2024['Role'] == 'SUPPORT', 'Cluster'] = svm_sup.predict(scaler.transform(df2024predict[df2024['Role'] == 'SUPPORT'].select_dtypes(include='number')))

df2024




Unnamed: 0,Side,Region,Year,Game ID,Team,Victory,Game Duration,Player,Role,Kills,...,Penta kills,GD@15,CSD@15,XPD@15,LVLD@15,Damage dealt to turrets,Total heal,Time ccing others,Total damage taken,Cluster
0,Blue Side,LEC,2024,57862,G2 Esports,False,28.517,Brokenblade,TOP,1.0,...,0.0,-471.0,0.0,435.0,1.0,293.0,3226.0,43.0,33743.0,3
1,Blue Side,LEC,2024,57862,G2 Esports,False,28.517,Yike,JUNGLE,1.0,...,0.0,-387.0,-5.0,-281.0,0.0,2657.0,14693.0,15.0,30359.0,3
2,Blue Side,LEC,2024,57862,G2 Esports,False,28.517,Caps,MID,5.0,...,0.0,1125.0,-1.0,458.0,0.0,5220.0,3372.0,18.0,20231.0,0
3,Blue Side,LEC,2024,57862,G2 Esports,False,28.517,Hans sama,ADC,3.0,...,0.0,361.0,-1.0,204.0,0.0,5856.0,2847.0,3.0,16606.0,1
4,Blue Side,LEC,2024,57862,G2 Esports,False,28.517,Mikyx,SUPPORT,1.0,...,0.0,-528.0,4.0,-635.0,-1.0,2286.0,943.0,20.0,16227.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5,Red Side,LCS,2024,53709,100 Thieves,True,34.917,Sniper,TOP,2.0,...,0.0,312.0,18.0,-63.0,0.0,5756.0,5000.0,34.0,22879.0,3
6,Red Side,LCS,2024,53709,100 Thieves,True,34.917,River,JUNGLE,4.0,...,0.0,670.0,5.0,238.0,0.0,2016.0,18183.0,26.0,36494.0,3
7,Red Side,LCS,2024,53709,100 Thieves,True,34.917,Quid,MID,5.0,...,0.0,553.0,22.0,1264.0,1.0,6128.0,7223.0,19.0,22071.0,0
8,Red Side,LCS,2024,53709,100 Thieves,True,34.917,Meech,ADC,4.0,...,0.0,-977.0,-31.0,-1715.0,-2.0,3762.0,10294.0,32.0,14623.0,1


In [10]:
# Creating the input vector for 2024 data (to test the model)

x2024 = np.zeros((len(df2024['Game ID'].unique()), 10))
y2024 = np.zeros(len(df2024['Game ID'].unique()))

for i, gameid in enumerate(df2024['Game ID'].unique()):
    game = df2024[df2024['Game ID'] == gameid]
    if len(game) != 10:
        raise ValueError('Game with id {} has {} players'.format(gameid, len(game)))
    
    top_blue = game[(game['Role'] == 'TOP') & (game['Side'] == 'Blue Side')]['Cluster'].values[0]
    mid_blue = game[(game['Role'] == 'MID') & (game['Side'] == 'Blue Side')]['Cluster'].values[0]
    jg_blue = game[(game['Role'] == 'JUNGLE') & (game['Side'] == 'Blue Side')]['Cluster'].values[0]
    adc_blue = game[(game['Role'] == 'ADC') & (game['Side'] == 'Blue Side')]['Cluster'].values[0]
    sup_blue = game[(game['Role'] == 'SUPPORT') & (game['Side'] == 'Blue Side')]['Cluster'].values[0]

    top_red = game[(game['Role'] == 'TOP') & (game['Side'] == 'Red Side')]['Cluster'].values[0]
    mid_red = game[(game['Role'] == 'MID') & (game['Side'] == 'Red Side')]['Cluster'].values[0]
    jg_red = game[(game['Role'] == 'JUNGLE') & (game['Side'] == 'Red Side')]['Cluster'].values[0]
    adc_red = game[(game['Role'] == 'ADC') & (game['Side'] == 'Red Side')]['Cluster'].values[0]
    sup_red = game[(game['Role'] == 'SUPPORT') & (game['Side'] == 'Red Side')]['Cluster'].values[0]

    result = game[(game['Role'] == 'TOP') & (game['Side'] == 'Blue Side')]['Victory'].values[0].astype(int)
    vector = np.array([top_blue, mid_blue, jg_blue, adc_blue, sup_blue, top_red, mid_red, jg_red, adc_red, sup_red])
    x2024[i] = vector
    y2024[i] = result


In [21]:
from sklearn.linear_model import LogisticRegression

clf = RandomForestClassifier(random_state=42)
clf.fit(x, y)

clf_svm = SVC()
clf_svm.fit(x, y)

clf_log = LogisticRegression()
clf_log.fit(x, y)

print('Random Forest Classifier')
print('Test score:', clf.score(x2024, y2024))

print('SVM')
print('Test score:', clf_svm.score(x2024, y2024))

print('Logistic Regression')
print('Test score:', clf_log.score(x2024, y2024))

Random Forest Classifier
Test score: 0.6593625498007968
SVM
Test score: 0.601593625498008
Logistic Regression
Test score: 0.5756972111553785
