# Notebook Imports

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import requests
from pydantic import BaseModel, ValidationError
from typing import List
from datetime import datetime
import pydantic_core
import concurrent.futures
import json

# Preprocessing Data

In [3]:
# Load gathered data
df_reshaped =pd.read_pickle('./saved_files/df_reshaped.pickle').copy()

In [4]:
# Drop rows with nan values and with missing values
df_reshaped.isna().sum()

left_pick_order_1    54
left_pick_order_2    54
left_pick_order_3    54
left_pick_order_4    54
left_pick_order_5    54
                     ..
right_job_cd_5       54
right_preban          0
right_postban         0
first_pick            0
is_win                0
Length: 76, dtype: int64

In [5]:
df_reshaped.dropna(inplace = True)
df_reshaped.reset_index(drop=True, inplace= True)
df_reshaped.isna().sum()

left_pick_order_1    0
left_pick_order_2    0
left_pick_order_3    0
left_pick_order_4    0
left_pick_order_5    0
                    ..
right_job_cd_5       0
right_preban         0
right_postban        0
first_pick           0
is_win               0
Length: 76, dtype: int64

In [6]:
# Filter out rows with missing values in columns: first_pick, left_postban and right_postban
df_reshaped = df_reshaped[(df_reshaped['first_pick'] != 'no_fp_data') & (df_reshaped['left_postban'] != 'no_preban_data') & (df_reshaped['right_postban'] != 'no_preban_data')]
df_reshaped

Unnamed: 0,left_pick_order_1,left_pick_order_2,left_pick_order_3,left_pick_order_4,left_pick_order_5,left_hero_code_1,left_hero_code_2,left_hero_code_3,left_hero_code_4,left_hero_code_5,...,right_attribute_cd_5,right_job_cd_1,right_job_cd_2,right_job_cd_3,right_job_cd_4,right_job_cd_5,right_preban,right_postban,first_pick,is_win
0,1,2,3,4,5,c1159,c2039,c6037,c1096,c1038,...,light,manauser,knight,assassin,warrior,assassin,"[c2112, c2066]",c2042,1,1
1,1,2,3,4,5,c2112,c2042,c1019,c5016,c1151,...,light,knight,warrior,assassin,manauser,knight,"[c2039, c1055]",c2090,0,1
2,1,2,3,4,5,c2112,c2039,c1129,c2069,c2102,...,wind,knight,assassin,assassin,mage,assassin,"[c1118, c2066]",c1014,1,1
3,1,2,3,4,5,c1159,c2039,c6037,c1096,c1135,...,ice,manauser,knight,warrior,assassin,mage,"[c2112, c1125]",c1103,1,1
4,1,2,3,4,5,c1159,c2042,c6037,c1135,c1096,...,ice,manauser,manauser,knight,warrior,mage,"[c1151, c2112]",c2022,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9941,1,2,3,4,5,c1159,c2016,c2008,c2066,c2106,...,fire,warrior,knight,manauser,mage,warrior,"[c1118, c2066]",c2088,0,2
9942,1,2,3,4,5,c2090,c1159,c2008,c5082,c1151,...,fire,knight,assassin,warrior,assassin,assassin,"[c2112, c1055]",c2090,0,2
9943,1,2,3,4,5,c2090,c1159,c2106,c2101,c2089,...,fire,manauser,assassin,manauser,warrior,assassin,"[c1055, c2066]",c5089,1,1
9944,1,2,3,4,5,c1159,c2016,c2008,c1117,c2106,...,dark,manauser,mage,assassin,assassin,warrior,"[c2109, c1117]",c2066,0,2


In [None]:
# Change hero codes in df_reshaped to hero names
heroes_df = pd.read_pickle('./saved_files/heroes_names.pickle')
heroes_df

In [None]:
# Convert gathered data into a dictionary to convert hero codes into names
heroes_dic = heroes_df.to_dict()
print(heroes_dic[0])
print(type(heroes_dic))

In [None]:
# Add information on prebans to the df
df_reshaped[['left_preban_1', 'left_preban_2']] = pd.DataFrame(df_reshaped.left_preban.to_list(), index= df_reshaped.index)
#df_reshaped[['left_preban_1', 'left_preban_2', 'left_preban']]
df_reshaped[['right_preban_1', 'right_preban_2']] = pd.DataFrame(df_reshaped.right_preban.to_list(), index= df_reshaped.index)
df_reshaped.drop(['left_preban', 'right_preban'], axis = 1, inplace= True)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_reshaped[['left_preban_1', 'left_preban_2']] = pd.DataFrame(df_reshaped.left_preban.to_list(), index= df_reshaped.index)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_reshaped[['left_preban_1', 'left_preban_2']] = pd.DataFrame(df_reshaped.left_preban.to_list(), index= df_reshaped.index)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/inde

In [None]:
df_reshaped[['left_preban_1', 'left_preban_2', 'right_preban_1', 'right_preban_2']]

Unnamed: 0,left_preban_1,left_preban_2,right_preban_1,right_preban_2
0,c1133,c2066,c2112,c2066
1,c1133,c2066,c2039,c1055
2,c1133,c2066,c1118,c2066
3,c1133,c2066,c2112,c1125
4,c2066,c1133,c1151,c2112
...,...,...,...,...
9941,c2112,c1055,c1118,c2066
9942,c2112,c1055,c2112,c1055
9943,c2112,c1055,c1055,c2066
9944,c2112,c1055,c2109,c1117


In [None]:
# Replace hero codes with names
df_reshaped.replace(to_replace= heroes_dic[0], inplace= True)
df_reshaped

NameError: name 'heroes_dic' is not defined

In [None]:
df_reshaped.columns

Index(['left_pick_order_1', 'left_pick_order_2', 'left_pick_order_3',
       'left_pick_order_4', 'left_pick_order_5', 'left_hero_code_1',
       'left_hero_code_2', 'left_hero_code_3', 'left_hero_code_4',
       'left_hero_code_5', 'left_attack_damage_1', 'left_attack_damage_2',
       'left_attack_damage_3', 'left_attack_damage_4', 'left_attack_damage_5',
       'left_receive_damage_1', 'left_receive_damage_2',
       'left_receive_damage_3', 'left_receive_damage_4',
       'left_receive_damage_5', 'left_kill_count_1', 'left_kill_count_2',
       'left_kill_count_3', 'left_kill_count_4', 'left_kill_count_5',
       'left_attribute_cd_1', 'left_attribute_cd_2', 'left_attribute_cd_3',
       'left_attribute_cd_4', 'left_attribute_cd_5', 'left_job_cd_1',
       'left_job_cd_2', 'left_job_cd_3', 'left_job_cd_4', 'left_job_cd_5',
       'left_postban', 'right_pick_order_1', 'right_pick_order_2',
       'right_pick_order_3', 'right_pick_order_4', 'right_pick_order_5',
       'right_hero_co

In [None]:
# Change df values to relevant dtypes
# df_reshaped.convert_dtypes().dtypes

# integers
df_reshaped[['left_pick_order_1', 'left_pick_order_2', 'left_pick_order_3',
       'left_pick_order_4', 'left_pick_order_5', 'left_attack_damage_1', 'left_attack_damage_2',
       'left_attack_damage_3', 'left_attack_damage_4', 'left_attack_damage_5',
       'left_receive_damage_1', 'left_receive_damage_2',
       'left_receive_damage_3', 'left_receive_damage_4',
       'left_receive_damage_5', 'left_kill_count_1', 'left_kill_count_2',
       'left_kill_count_3', 'left_kill_count_4', 'left_kill_count_5',
       'right_pick_order_1', 'right_pick_order_2',
       'right_pick_order_3', 'right_pick_order_4', 'right_pick_order_5', 'right_attack_damage_1',
       'right_attack_damage_2', 'right_attack_damage_3',
       'right_attack_damage_4', 'right_attack_damage_5',
       'right_receive_damage_1', 'right_receive_damage_2',
       'right_receive_damage_3', 'right_receive_damage_4',
       'right_receive_damage_5', 'right_kill_count_1', 'right_kill_count_2',
       'right_kill_count_3', 'right_kill_count_4', 'right_kill_count_5',
       'first_pick', 'is_win']] = df_reshaped[['left_pick_order_1', 'left_pick_order_2', 'left_pick_order_3',
       'left_pick_order_4', 'left_pick_order_5', 'left_attack_damage_1', 'left_attack_damage_2',
       'left_attack_damage_3', 'left_attack_damage_4', 'left_attack_damage_5',
       'left_receive_damage_1', 'left_receive_damage_2',
       'left_receive_damage_3', 'left_receive_damage_4',
       'left_receive_damage_5', 'left_kill_count_1', 'left_kill_count_2',
       'left_kill_count_3', 'left_kill_count_4', 'left_kill_count_5',
       'right_pick_order_1', 'right_pick_order_2',
       'right_pick_order_3', 'right_pick_order_4', 'right_pick_order_5', 'right_attack_damage_1',
       'right_attack_damage_2', 'right_attack_damage_3',
       'right_attack_damage_4', 'right_attack_damage_5',
       'right_receive_damage_1', 'right_receive_damage_2',
       'right_receive_damage_3', 'right_receive_damage_4',
       'right_receive_damage_5', 'right_kill_count_1', 'right_kill_count_2',
       'right_kill_count_3', 'right_kill_count_4', 'right_kill_count_5',
       'first_pick', 'is_win']].astype(int)


#strings
df_reshaped[['left_hero_code_1', 'left_hero_code_2', 'left_hero_code_3', 'left_hero_code_4',
       'left_hero_code_5', 'left_attribute_cd_1', 'left_attribute_cd_2', 'left_attribute_cd_3',
       'left_attribute_cd_4', 'left_attribute_cd_5', 'left_job_cd_1',
       'left_job_cd_2', 'left_job_cd_3', 'left_job_cd_4', 'left_job_cd_5',
       'left_postban','right_hero_code_1', 'right_hero_code_2', 'right_hero_code_3',
       'right_hero_code_4', 'right_hero_code_5', 'right_attribute_cd_1', 'right_attribute_cd_2', 'right_attribute_cd_3',
       'right_attribute_cd_4', 'right_attribute_cd_5', 'right_job_cd_1',
       'right_job_cd_2', 'right_job_cd_3', 'right_job_cd_4', 'right_job_cd_5',
       'right_postban', 'left_preban_1',
       'left_preban_2', 'right_preban_1', 'right_preban_2']] = df_reshaped[['left_hero_code_1', 'left_hero_code_2', 'left_hero_code_3', 'left_hero_code_4',
       'left_hero_code_5', 'left_attribute_cd_1', 'left_attribute_cd_2', 'left_attribute_cd_3',
       'left_attribute_cd_4', 'left_attribute_cd_5', 'left_job_cd_1',
       'left_job_cd_2', 'left_job_cd_3', 'left_job_cd_4', 'left_job_cd_5',
       'left_postban','right_hero_code_1', 'right_hero_code_2', 'right_hero_code_3',
       'right_hero_code_4', 'right_hero_code_5', 'right_attribute_cd_1', 'right_attribute_cd_2', 'right_attribute_cd_3',
       'right_attribute_cd_4', 'right_attribute_cd_5', 'right_job_cd_1',
       'right_job_cd_2', 'right_job_cd_3', 'right_job_cd_4', 'right_job_cd_5',
       'right_postban', 'left_preban_1',
       'left_preban_2', 'right_preban_1', 'right_preban_2']].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_reshaped[['left_pick_order_1', 'left_pick_order_2', 'left_pick_order_3',
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_reshaped[['left_hero_code_1', 'left_hero_code_2', 'left_hero_code_3', 'left_hero_code_4',


In [None]:
df_reshaped.describe()

Unnamed: 0,left_pick_order_1,left_pick_order_2,left_pick_order_3,left_pick_order_4,left_pick_order_5,left_attack_damage_1,left_attack_damage_2,left_attack_damage_3,left_attack_damage_4,left_attack_damage_5,...,right_receive_damage_3,right_receive_damage_4,right_receive_damage_5,right_kill_count_1,right_kill_count_2,right_kill_count_3,right_kill_count_4,right_kill_count_5,first_pick,is_win
count,9892.0,9892.0,9892.0,9892.0,9892.0,9892.0,9892.0,9892.0,9892.0,9892.0,...,9892.0,9892.0,9892.0,9892.0,9892.0,9892.0,9892.0,9892.0,9892.0,9892.0
mean,1.0,2.0,3.0,4.0,5.0,8358.296401,11651.846846,11073.439143,13027.751415,10190.808532,...,39982.515063,25037.167711,17982.807521,0.16023,0.230894,0.235342,0.290841,0.236454,0.496563,1.391832
std,0.0,0.0,0.0,0.0,0.0,12387.267146,15707.541664,16893.497811,18449.573722,16816.526785,...,37798.12589,32752.530231,28656.323167,0.461985,0.593552,0.609974,0.706874,0.635289,0.500013,0.488184
min,1.0,2.0,3.0,4.0,5.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,1.0,2.0,3.0,4.0,5.0,0.0,0.0,0.0,0.0,0.0,...,13374.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,1.0,2.0,3.0,4.0,5.0,2463.5,5411.5,3934.5,3230.5,0.0,...,27567.0,15180.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,1.0,2.0,3.0,4.0,5.0,12761.75,17441.25,14322.0,21650.75,15992.0,...,57568.25,36902.0,25727.5,0.0,0.0,0.0,0.0,0.0,1.0,2.0
max,1.0,2.0,3.0,4.0,5.0,111032.0,132986.0,148284.0,125165.0,125145.0,...,310896.0,288966.0,293721.0,6.0,6.0,5.0,7.0,5.0,1.0,2.0


In [None]:
# Save preprocessed df in a pickle format
df_reshaped.to_pickle('./saved_files/data_final.pickle')

# Explolatory Data Analysis

In [None]:
df_reshaped = pd.read_pickle('./data_final.pickle')

In [None]:
# Quick EDA by utilising ProfileReport
from ydata_profiling import ProfileReport

profile = ProfileReport(df_reshaped)
profile.to_file('./saved_files/EDA.json')
profile.to_file('./saved_files/EDA.html')

# Numeric data requires sclaling

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

(using `df.profile_report(correlations={"auto": {"calculate": False}})`
If this is problematic for your use case, please report this as an issue:
https://github.com/ydataai/ydata-profiling/issues
(include the error message: 'could not convert string to float: 'wind'')


Render JSON:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]