# The Best Champion in League of Legends

**Name(s)**: Trevan Nguyen

**Website Link**: https://sudosure.github.io/LeagueOfLegends/

## Code

In [1]:
import pandas as pd
import numpy as np
import os

import plotly.express as px
pd.options.plotting.backend = 'plotly'

from scipy.stats import chi2_contingency
from scipy.stats import ttest_ind

### Cleaning and EDA

#### Introduction and Question Identification
What is the best champion in the League of Legends? We will be analyzing different aspects of different champions including kills, assists, winrate, roles, etc.

#### Data Cleaning

In [2]:
# Load in the raw Dataframe
lol_raw = pd.read_csv('2022_LoL_esports_match_data_from_OraclesElixir.csv',low_memory=False)
lol_raw.head()

Unnamed: 0,gameid,datacompleteness,url,league,year,split,playoffs,date,game,patch,...,opp_csat15,golddiffat15,xpdiffat15,csdiffat15,killsat15,assistsat15,deathsat15,opp_killsat15,opp_assistsat15,opp_deathsat15
0,ESPORTSTMNT01_2690210,complete,,LCK CL,2022,Spring,0,2022-01-10 07:44:08,1,12.01,...,121.0,391.0,345.0,14.0,0.0,1.0,0.0,0.0,1.0,0.0
1,ESPORTSTMNT01_2690210,complete,,LCK CL,2022,Spring,0,2022-01-10 07:44:08,1,12.01,...,100.0,541.0,-275.0,-11.0,2.0,3.0,2.0,0.0,5.0,1.0
2,ESPORTSTMNT01_2690210,complete,,LCK CL,2022,Spring,0,2022-01-10 07:44:08,1,12.01,...,119.0,-475.0,153.0,1.0,0.0,3.0,0.0,3.0,3.0,2.0
3,ESPORTSTMNT01_2690210,complete,,LCK CL,2022,Spring,0,2022-01-10 07:44:08,1,12.01,...,149.0,-793.0,-1343.0,-34.0,2.0,1.0,2.0,3.0,3.0,0.0
4,ESPORTSTMNT01_2690210,complete,,LCK CL,2022,Spring,0,2022-01-10 07:44:08,1,12.01,...,21.0,443.0,-497.0,7.0,1.0,2.0,2.0,0.0,6.0,2.0


In [3]:
# Get the columns that will be useful for our analysis
lol = lol_raw[['gameid','datacompleteness','position','champion', 'result', 'kills','deaths','assists','doublekills'
         ,'triplekills','quadrakills','pentakills']]
lol.head()

Unnamed: 0,gameid,datacompleteness,position,champion,result,kills,deaths,assists,doublekills,triplekills,quadrakills,pentakills
0,ESPORTSTMNT01_2690210,complete,top,Renekton,0,2,3,2,0.0,0.0,0.0,0.0
1,ESPORTSTMNT01_2690210,complete,jng,Xin Zhao,0,2,5,6,0.0,0.0,0.0,0.0
2,ESPORTSTMNT01_2690210,complete,mid,LeBlanc,0,2,2,3,0.0,0.0,0.0,0.0
3,ESPORTSTMNT01_2690210,complete,bot,Samira,0,2,4,2,0.0,0.0,0.0,0.0
4,ESPORTSTMNT01_2690210,complete,sup,Leona,0,1,5,6,0.0,0.0,0.0,0.0


In [4]:
# Removed team matchid data to focus on individual player data
lol = lol[lol['position'] != 'team']

# Change result to True and False and stored them into separate columns
lol['win'] = lol['result'].apply(lambda x: True if x == 1 else False)
lol['lose'] = lol['result'].apply(lambda x: False if x == 1 else True)

# Fill the nan values in double, triple, quadra, penta kills because a nan value in this case means that that type kill
# was not achieved in that game as they are usually rare in professional games.
lol['doublekills'] = lol['doublekills'].fillna(0.0)
lol['triplekills'] = lol['triplekills'].fillna(0.0)
lol['quadrakills'] = lol['quadrakills'].fillna(0.0)
lol['pentakills'] = lol['pentakills'].fillna(0.0)

lol.head()

Unnamed: 0,gameid,datacompleteness,position,champion,result,kills,deaths,assists,doublekills,triplekills,quadrakills,pentakills,win,lose
0,ESPORTSTMNT01_2690210,complete,top,Renekton,0,2,3,2,0.0,0.0,0.0,0.0,False,True
1,ESPORTSTMNT01_2690210,complete,jng,Xin Zhao,0,2,5,6,0.0,0.0,0.0,0.0,False,True
2,ESPORTSTMNT01_2690210,complete,mid,LeBlanc,0,2,2,3,0.0,0.0,0.0,0.0,False,True
3,ESPORTSTMNT01_2690210,complete,bot,Samira,0,2,4,2,0.0,0.0,0.0,0.0,False,True
4,ESPORTSTMNT01_2690210,complete,sup,Leona,0,1,5,6,0.0,0.0,0.0,0.0,False,True


In [5]:
# lol markdown
print(lol.head().to_markdown(index=False))

| gameid                | datacompleteness   | position   | champion   |   result |   kills |   deaths |   assists |   doublekills |   triplekills |   quadrakills |   pentakills | win   | lose   |
|:----------------------|:-------------------|:-----------|:-----------|---------:|--------:|---------:|----------:|--------------:|--------------:|--------------:|-------------:|:------|:-------|
| ESPORTSTMNT01_2690210 | complete           | top        | Renekton   |        0 |       2 |        3 |         2 |             0 |             0 |             0 |            0 | False | True   |
| ESPORTSTMNT01_2690210 | complete           | jng        | Xin Zhao   |        0 |       2 |        5 |         6 |             0 |             0 |             0 |            0 | False | True   |
| ESPORTSTMNT01_2690210 | complete           | mid        | LeBlanc    |        0 |       2 |        2 |         3 |             0 |             0 |             0 |            0 | False | True   |
| ESPORTSTMNT01

#### Univariate Analysis

In [23]:
# Winrate of every champion with a sufficient amount of total games played
champ_winrate = (lol.groupby('champion')['win'].sum().apply(lambda x: x if x >= 50 else np.nan) \
 / (lol.groupby('champion')['lose'].sum().apply(lambda x: x if x >= 50 else np.nan) \
    + lol.groupby('champion')['win'].sum().apply(lambda x: x if x >= 50 else np.nan))) \
.reset_index().rename(columns = {0: 'winrate'})

# Distribution of winrate among all the champions
fig = px.histogram(champ_winrate*100, x='winrate', nbins=12, labels={'winrate':'winrate (%)'})
fig.show()

In [28]:
fig.write_html('file-name.html', include_plotlyjs='cdn')

<bound method BaseFigure.write_html of Figure({
    'data': [{'alignmentgroup': 'True',
              'bingroup': 'x',
              'hovertemplate': 'winrate (%)=%{x}<br>count=%{y}<extra></extra>',
              'legendgroup': '',
              'marker': {'color': '#636efa', 'pattern': {'shape': ''}},
              'name': '',
              'nbinsx': 12,
              'offsetgroup': '',
              'orientation': 'v',
              'showlegend': False,
              'type': 'histogram',
              'x': array([53.47025496, 53.07509344, 51.5898767 , 49.5049505 , 49.87893462,
                          50.42918455,         nan,         nan, 49.07384539, 48.5645933 ,
                                  nan, 49.94565217, 52.38095238, 51.54185022, 54.01785714,
                                  nan, 45.61403509, 52.55172414, 50.50641458, 46.90721649,
                                  nan, 51.66204986, 56.90607735, 47.6900149 ,         nan,
                          50.        ,         nan

In [29]:
# Winrate of every champion with a plentiful amount of total games played
champ_winrate_reliable = (lol.groupby('champion')['win'].sum().apply(lambda x: x if x >= 500 else np.nan) \
 / (lol.groupby('champion')['lose'].sum().apply(lambda x: x if x >= 500 else np.nan) \
    + lol.groupby('champion')['win'].sum().apply(lambda x: x if x >= 500 else np.nan))) \
.reset_index().rename(columns = {0: 'winrate'})

# Distribution of winrate among all the champions with more games played
fig = px.histogram(champ_winrate_reliable*100, x='winrate', nbins=12, labels={'winrate':'winrate (%)'})
fig.show()

#### Bivariate Analysis

In [30]:
# Raw winrate of each champion
winrate_raw = (lol.groupby('champion')['win'].sum() \
 / (lol.groupby('champion')['lose'].sum() \
    + lol.groupby('champion')['win'].sum())) \
.reset_index().rename(columns = {0: 'winrate'})

In [31]:
# Average amount of kills per champion
champ_avg_kills = lol.groupby('champion')['kills'].mean().sort_values(ascending=False)\
.reset_index().rename(columns = {'kills': 'average kills'})

# Merged DF with winrate and average kills
merged_win_kill = winrate_raw.merge(champ_avg_kills , left_on='champion', right_on='champion')

# Scatterplot of average kills compared with winrate
fig = px.scatter(x=merged_win_kill['average kills'], y=merged_win_kill['winrate'],\
                 labels={'x':'Average Kills','y':'Winrate (%)'})
fig.show()

In [32]:
fig.write_html('file-name.html', include_plotlyjs='cdn')

In [10]:
# Average amount of assists per champion
champ_avg_assists = lol.groupby('champion')['assists'].mean().sort_values(ascending=False).reset_index() \
.rename(columns = {'assists': 'average assists'})

# Merged DF with winrate and average assists
merged_win_assists = winrate_raw.merge(champ_avg_assists , left_on='champion', right_on='champion')

# Scatterplot of average kills compared with winrate
fig = px.scatter(x=merged_win_assists['average assists'], y=merged_win_kill['winrate'] \
                 , labels={'x':'Average Assists','y':'Winrate (%)'})
fig.show()

#### Interesting Aggregates

In [11]:
# Champions with the highest winrate with a sufficient amount of total games played
best_champs_win = (lol.groupby('champion')['win'].sum().apply(lambda x: x if x >= 50 else np.nan) \
 / (lol.groupby('champion')['lose'].sum().apply(lambda x: x if x >= 50 else np.nan) \
    + lol.groupby('champion')['win'].sum().apply(lambda x: x if x >= 50 else np.nan))) \
.sort_values(ascending=False).reset_index().rename(columns = {0: 'winrate'})

best_champs_win.head()

Unnamed: 0,champion,winrate
0,Darius,0.569061
1,Rell,0.561497
2,Wukong,0.560832
3,Twisted Fate,0.555126
4,Taliyah,0.552226


In [12]:
print(best_champs_win.head().to_markdown(index=False))

| champion     |   winrate |
|:-------------|----------:|
| Darius       |  0.569061 |
| Rell         |  0.561497 |
| Wukong       |  0.560832 |
| Twisted Fate |  0.555126 |
| Taliyah      |  0.552226 |


In [13]:
# Average amount of kills for each role depending if it was a lost or win
# Pivot table of "kills" column by grouping and pivoting based on the "position" and "result"
pivot_table = pd.pivot_table(lol, values='kills', index='position', columns='result', aggfunc='mean') \
.rename(columns={0:"lost average kills", 1:"win average kills"})
pivot_table

result,lost average kills,win average kills
position,Unnamed: 1_level_1,Unnamed: 2_level_1
bot,2.580067,5.938062
jng,2.103116,4.011086
mid,2.253373,4.74992
sup,0.630903,1.116003
top,1.803967,3.785829


### Assessment of Missingness

In [46]:
# Missingness between pentakills and champions
contingency_table = pd.crosstab(lol_raw['doublekills'].isnull(), lol_raw['champion'].notnull())

# Perform the chi-square test to determine the dependency
chi2, p_value, _, _ = chi2_contingency(contingency_table)

# Check if the dependency is significant
alpha = 0.05
if p_value > alpha:
    print("The missingness of 'doublekills' depends on the 'champion' column.")
else:
    print("The missingness of 'doublekills' does not depend on the 'champion' column.")

The missingness of 'doublekills' depends on the 'champion' column.


In [47]:
# Missingness between pentakills and positions
contingency_table = pd.crosstab(lol_raw['doublekills'].isnull(), lol_raw['position'])

# Perform the chi-square test to determine the dependency
chi2, p_value, _, _ = chi2_contingency(contingency_table)

# Check if the dependency is significant
if p_value < alpha:
    print("The missingness of 'doublekills' depends on the 'position' column.")
else:
    print("The missingness of 'doublekills' does not depend on the 'position' column.")

The missingness of 'doublekills' does not depend on the 'position' column.


In [53]:
# subset = lol_raw[['position', 'doublekills']]

# # Create two subsets: one where 'position' is missing and one where it is not missing
# missing_position = subset[subset['position'].isnull()]
# not_missing_position = subset[~subset['position'].isnull()]

# # Create the plot
# fig = px.histogram(subset, x='doublekills', color_discrete_sequence=['blue'], marginal='rug', nbins=15)
# #fig.update_traces(opacity=0.7, name='All Data')

# fig.add_vline(x=p_value, line_color='red')

# # Update layout
# fig.update_layout(
#     title="Distribution of 'doublekills' when 'position' is missing vs. not missing",
#     xaxis_title="Doublekills",
#     yaxis_title="Count",
#     barmode='overlay',
#     legend_title="Data"
# )

# # Show the plot
# fig.show()

In [50]:
fig.write_html('file-name.html', include_plotlyjs='cdn')

### Hypothesis Testing

Null Hypothesis: The average number of kills for top laners is the same as the average number of kills for mid laners.

Alternative Hypothesis: The average number of kills for top laners is different from the average number of kills for mid laners.

In [16]:
# Create dataframes for top and mid laners
top_laners = lol[lol['position'] == 'top']
mid_laners = lol[lol['position'] == 'mid']

# Extract the kills for each position
top_laner_kills = top_laners['kills']
mid_laner_kills = mid_laners['kills']

# Perform the independent t-test
t_stat, p_value = ttest_ind(top_laner_kills, mid_laner_kills, equal_var=False)

# Define the significance level
alpha = 0.05

# Compare the p-value to the significance level
if p_value < alpha:
    print(f'P-value: {p_value}')
    print("Reject the null hypothesis")
    print("There is a significant difference in the average number of kills between top laners and mid laners.")
else:
    print(f'P-value: {p_value}')
    print("Fail to reject the null hypothesis")
    print("There is no significant difference in the average number of kills between top laners and mid laners.")

P-value: 1.1057544086805869e-203
Reject the null hypothesis
There is a significant difference in the average number of kills between top laners and mid laners.
