# League of Legends game outcome prediction

My goal here is to predict the outcome of a game knowing only what is available at the end of the draft phase, i.e. the champions selected, the summoner spells picked, and each team's bans. Knowing this would allow me to dodge games I am likely to lose, thus making climbing more efficient.

In [1]:
import pandas as pd
import sklearn
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

## Preparing the data

In [2]:
# Load the raw match data. A forward slash is used because it resolves on every
# OS, whereas the previous "data\games.csv" only worked on Windows and relied on
# "\g" being an unrecognised escape sequence (which Python warns about).
df = pd.read_csv("data/games.csv")

In [3]:
# Remove the column cap so the preview shows every column of the wide frame.
pd.set_option("display.max_columns", None)
df.head(5)

Unnamed: 0,gameId,creationTime,gameDuration,seasonId,winner,firstBlood,firstTower,firstInhibitor,firstBaron,firstDragon,firstRiftHerald,t1_champ1id,t1_champ1_sum1,t1_champ1_sum2,t1_champ2id,t1_champ2_sum1,t1_champ2_sum2,t1_champ3id,t1_champ3_sum1,t1_champ3_sum2,t1_champ4id,t1_champ4_sum1,t1_champ4_sum2,t1_champ5id,t1_champ5_sum1,t1_champ5_sum2,t1_towerKills,t1_inhibitorKills,t1_baronKills,t1_dragonKills,t1_riftHeraldKills,t1_ban1,t1_ban2,t1_ban3,t1_ban4,t1_ban5,t2_champ1id,t2_champ1_sum1,t2_champ1_sum2,t2_champ2id,t2_champ2_sum1,t2_champ2_sum2,t2_champ3id,t2_champ3_sum1,t2_champ3_sum2,t2_champ4id,t2_champ4_sum1,t2_champ4_sum2,t2_champ5id,t2_champ5_sum1,t2_champ5_sum2,t2_towerKills,t2_inhibitorKills,t2_baronKills,t2_dragonKills,t2_riftHeraldKills,t2_ban1,t2_ban2,t2_ban3,t2_ban4,t2_ban5
0,3326086514,1504279457970,1949,9,1,2,1,1,1,1,2,8,12,4,432,3,4,96,4,7,11,11,6,112,4,14,11,1,2,3,0,92,40,69,119,141,104,11,4,498,4,7,122,6,4,238,14,4,412,4,3,5,0,0,1,1,114,67,43,16,51
1,3229566029,1497848803862,1851,9,1,1,1,1,0,1,1,119,7,4,39,12,4,76,4,3,10,4,14,35,4,11,10,4,0,2,1,51,122,17,498,19,54,4,12,25,4,14,120,11,4,157,4,14,92,4,7,2,0,0,0,0,11,67,238,51,420
2,3327363504,1504360103310,1493,9,1,2,1,1,1,2,0,18,4,7,141,11,4,267,3,4,68,4,12,38,12,4,8,1,1,1,0,117,40,29,16,53,69,4,7,412,14,4,126,4,12,24,4,11,22,7,4,2,0,0,1,0,157,238,121,57,28
3,3326856598,1504348503996,1758,9,1,1,1,1,1,1,0,57,4,12,63,4,14,29,4,7,61,4,1,36,11,4,9,2,1,2,0,238,67,516,114,31,90,14,4,19,11,4,412,4,3,92,4,14,22,4,7,0,0,0,0,0,164,18,141,40,51
4,3330080762,1504554410899,2094,9,1,2,1,1,1,1,0,19,4,12,29,11,4,40,4,3,119,4,7,134,7,4,9,2,1,3,0,90,64,412,25,31,37,3,4,59,4,12,141,11,4,38,4,12,51,4,7,3,0,0,1,0,86,11,201,122,18


I can delete every column I don't plan on using. That's every column with information pertaining to the game after it has started, plus the game's identifier and timing metadata.

In [4]:
# Keep only the label and the draft-phase columns: identifiers/timing metadata
# and everything recorded after the game has started are removed.
game_metadata = ["gameId", "creationTime", "gameDuration", "seasonId"]
first_objectives = [
    "firstBlood",
    "firstTower",
    "firstInhibitor",
    "firstBaron",
    "firstDragon",
    "firstRiftHerald",
]
team_objective_counts = [
    f"{team}_{stat}"
    for team in ("t1", "t2")
    for stat in (
        "towerKills",
        "inhibitorKills",
        "baronKills",
        "dragonKills",
        "riftHeraldKills",
    )
]
df = df.drop(columns=game_metadata + first_objectives + team_objective_counts)

In [5]:
# Features are every remaining column except the label. Dropping "winner" by
# name does not depend on it being the first column, and drop() returns a new
# frame, so later column assignments on X do not write into a slice of df.
X = df.drop(columns=["winner"])
# Target: the winning team for each game.
y = df["winner"]

In [6]:
# Sanity check: the label is gone and only pick, summoner-spell and ban columns remain.
X.head()

Unnamed: 0,t1_champ1id,t1_champ1_sum1,t1_champ1_sum2,t1_champ2id,t1_champ2_sum1,t1_champ2_sum2,t1_champ3id,t1_champ3_sum1,t1_champ3_sum2,t1_champ4id,t1_champ4_sum1,t1_champ4_sum2,t1_champ5id,t1_champ5_sum1,t1_champ5_sum2,t1_ban1,t1_ban2,t1_ban3,t1_ban4,t1_ban5,t2_champ1id,t2_champ1_sum1,t2_champ1_sum2,t2_champ2id,t2_champ2_sum1,t2_champ2_sum2,t2_champ3id,t2_champ3_sum1,t2_champ3_sum2,t2_champ4id,t2_champ4_sum1,t2_champ4_sum2,t2_champ5id,t2_champ5_sum1,t2_champ5_sum2,t2_ban1,t2_ban2,t2_ban3,t2_ban4,t2_ban5
0,8,12,4,432,3,4,96,4,7,11,11,6,112,4,14,92,40,69,119,141,104,11,4,498,4,7,122,6,4,238,14,4,412,4,3,114,67,43,16,51
1,119,7,4,39,12,4,76,4,3,10,4,14,35,4,11,51,122,17,498,19,54,4,12,25,4,14,120,11,4,157,4,14,92,4,7,11,67,238,51,420
2,18,4,7,141,11,4,267,3,4,68,4,12,38,12,4,117,40,29,16,53,69,4,7,412,14,4,126,4,12,24,4,11,22,7,4,157,238,121,57,28
3,57,4,12,63,4,14,29,4,7,61,4,1,36,11,4,238,67,516,114,31,90,14,4,19,11,4,412,4,3,92,4,14,22,4,7,164,18,141,40,51
4,19,4,12,29,11,4,40,4,3,119,4,7,134,7,4,90,64,412,25,31,37,3,4,59,4,12,141,11,4,38,4,12,51,4,7,86,11,201,122,18


I want a one-hot encoding of the different combinations of champions and summoner spells, because the order in which the champions were picked doesn't really matter. I'll start by merging each champion column with its two summoner spell columns, giving 5 champion-and-spells combinations per team, then I'll gather those into a single list per team, ready for the one-hot encoding.

In [7]:
# Collapse the raw draft columns into four list-valued columns
# (t1_bans, t2_bans, t1_picks, t2_picks), one list of five entries per game,
# which is the input shape MultiLabelBinarizer expects.
TEAMS = ("t1", "t2")
SLOTS = range(1, 6)  # five picks and five bans per team


def _pick_labels(frame, team, slot):
    """Return a Series of "champion spell1 spell2" strings for one pick slot.

    The three ids are joined with single spaces so that each distinct
    champion/spell combination becomes one label for the one-hot encoding.
    """
    prefix = f"{team}_champ{slot}"
    return (
        frame[f"{prefix}id"].astype(str)
        + " "
        + frame[f"{prefix}_sum1"].astype(str)
        + " "
        + frame[f"{prefix}_sum2"].astype(str)
    )


grouped = {}   # new column name -> list of per-game lists
consumed = []  # raw columns that the grouped columns replace

# Bans first, then picks, so the resulting column order is
# t1_bans, t2_bans, t1_picks, t2_picks.
for team in TEAMS:
    ban_cols = [f"{team}_ban{slot}" for slot in SLOTS]
    grouped[f"{team}_bans"] = X[ban_cols].values.tolist()
    consumed += ban_cols

for team in TEAMS:
    labels = pd.concat([_pick_labels(X, team, slot) for slot in SLOTS], axis=1)
    grouped[f"{team}_picks"] = labels.values.tolist()
    consumed += [
        f"{team}_champ{slot}{suffix}"
        for slot in SLOTS
        for suffix in ("id", "_sum1", "_sum2")
    ]

# drop() and assign() both return new frames, so nothing is written into a
# slice of the original df.
X = X.drop(columns=consumed).assign(**grouped)

In [8]:
# Preview the grouped frame: one list of bans and one list of pick labels per team.
X.head()

Unnamed: 0,t1_bans,t2_bans,t1_picks,t2_picks
0,"[92, 40, 69, 119, 141]","[114, 67, 43, 16, 51]","[8 12 4, 432 3 4, 96 4 7, 11 11 6, 112 4 14]","[104 11 4, 498 4 7, 122 6 4, 238 14 4, 412 4 3]"
1,"[51, 122, 17, 498, 19]","[11, 67, 238, 51, 420]","[119 7 4, 39 12 4, 76 4 3, 10 4 14, 35 4 11]","[54 4 12, 25 4 14, 120 11 4, 157 4 14, 92 4 7]"
2,"[117, 40, 29, 16, 53]","[157, 238, 121, 57, 28]","[18 4 7, 141 11 4, 267 3 4, 68 4 12, 38 12 4]","[69 4 7, 412 14 4, 126 4 12, 24 4 11, 22 7 4]"
3,"[238, 67, 516, 114, 31]","[164, 18, 141, 40, 51]","[57 4 12, 63 4 14, 29 4 7, 61 4 1, 36 11 4]","[90 14 4, 19 11 4, 412 4 3, 92 4 14, 22 4 7]"
4,"[90, 64, 412, 25, 31]","[86, 11, 201, 122, 18]","[19 4 12, 29 11 4, 40 4 3, 119 4 7, 134 7 4]","[37 3 4, 59 4 12, 141 11 4, 38 4 12, 51 4 7]"


Now I one-hot encode the bans and the picks.

In [9]:
# Picks are "champion spell1 spell2" labels, so that encoder emits a sparse
# matrix; bans are plain champion ids and use the default dense output.
mlb_picks = MultiLabelBinarizer(sparse_output=True)
mlb_bans = MultiLabelBinarizer()

In [10]:
# Fit each encoder on both teams' values stacked together, so team 1 and
# team 2 share a single vocabulary (and therefore the same column layout).
all_picks = pd.concat([X["t1_picks"], X["t2_picks"]])
all_bans = pd.concat([X["t1_bans"], X["t2_bans"]])
mlb_picks.fit(all_picks)
mlb_bans.fit(all_bans)

MultiLabelBinarizer()

In [11]:
# Bans: the dense binarizer turns each team's list of ban ids into a 0/1 matrix.
Xt1_bans = mlb_bans.transform(X["t1_bans"])
Xt2_bans = mlb_bans.transform(X["t2_bans"])

# Picks: encode the whole column in a single batched call and densify once.
# This yields the same (n_games, n_pick_labels) array as transforming every
# row separately and concatenating the 1-row results, without one transform
# call per game.
Xt1_picks = mlb_picks.transform(X["t1_picks"]).toarray()
Xt2_picks = mlb_picks.transform(X["t2_picks"]).toarray()

In [13]:
# Stack the four encoded blocks side by side: one row per game, with the
# columns ordered as team-1 bans, team-2 bans, team-1 picks, team-2 picks.
encoded_blocks = [Xt1_bans, Xt2_bans, Xt1_picks, Xt2_picks]
X = np.concatenate(encoded_blocks, axis=1)

In [14]:
# (number of games, number of one-hot indicator columns)
X.shape

(51490, 6354)

## Training the model

In [15]:
# Hold out 20% of the games for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [20]:
# Check that the labels are stored as integers.
y_train.dtype

dtype('int64')

In [28]:
# Logistic regression on the one-hot draft features, using the liblinear
# solver with verbose solver output enabled.
clf = LogisticRegression(solver="liblinear", max_iter=1000, verbose=2)
clf = clf.fit(X_train, y_train)

[LibLinear]

## Showing model performance

In [29]:
# Accuracy on the data the model was fitted on.
sklearn.metrics.accuracy_score(y_true=y_train, y_pred=clf.predict(X_train))

0.6423091862497572

In [30]:
# Accuracy on the held-out games.
sklearn.metrics.accuracy_score(y_true=y_test, y_pred=clf.predict(X_test))

0.5383569625169936

In [24]:
# F1 score on the held-out games.
sklearn.metrics.f1_score(y_true=y_test, y_pred=clf.predict(X_test))

0.5469792262245091

In [25]:
# Confusion matrix on the held-out games (rows: true winner, columns: predicted winner).
sklearn.metrics.confusion_matrix(y_true=y_test, y_pred=clf.predict(X_test))

array([[2870, 2361],
       [2393, 2674]], dtype=int64)

We can see that there is both high bias and high variance: training accuracy is only about 64%, and it drops to about 54% on the test set. This is unsurprising for several reasons. Human performance is very low at this task, because a good guess would require data on the individual players, which is not available here and, since the beginning of season 12, is no longer available when a match starts.

Hence the algorithm looks to be barely better than random guessing. Integration of player data is crucial. A highly skilled player against a worse one is likely to carry the game, and even win alone despite the team compositions.