## Introduction to Data Project Phase 2
__Parham Javan 810800008<br>__
__Yaser Azad 810800003__

# Environment Setup: Install & Import Libraries

In [40]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns
import math
from scipy import stats

from scipy.stats import norm

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader, random_split
from torch.autograd import Variable
from tqdm import tqdm

RANDOM_SEED = 0
MAX_TIME_STEP = 30

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA


### Load the Data

In [41]:
df = pd.read_csv('League of Legends competitive matches between 2015-2017 Dataset/LeagueofLegends.csv', sep=',')
df = df[df['gamelength'] >= MAX_TIME_STEP]
df.reset_index(drop = True, inplace = True)
matches = len(df)
print(f'Number of matches: {matches}')

Number of matches: 6384


### Drop unnecessary columns

In [42]:
columns_to_drop = ['League', 'Type', 'blueTeamTag', 'redTeamTag', 'rResult']
df.drop(columns_to_drop, axis=1, inplace=True)

In [43]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6384 entries, 0 to 6383
Data columns (total 52 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Year              6384 non-null   int64 
 1   Season            6384 non-null   object
 2   bResult           6384 non-null   int64 
 3   gamelength        6384 non-null   int64 
 4   golddiff          6384 non-null   object
 5   goldblue          6384 non-null   object
 6   bKills            6384 non-null   object
 7   bTowers           6384 non-null   object
 8   bInhibs           6384 non-null   object
 9   bDragons          6384 non-null   object
 10  bBarons           6384 non-null   object
 11  bHeralds          6384 non-null   object
 12  goldred           6384 non-null   object
 13  rKills            6384 non-null   object
 14  rTowers           6384 non-null   object
 15  rInhibs           6384 non-null   object
 16  rDragons          6384 non-null   object
 17  rBarons       

In [44]:
# droping specific lane diffrences
df.drop(df.columns[19:53], axis=1, inplace=True)

In [45]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6384 entries, 0 to 6383
Data columns (total 19 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Year        6384 non-null   int64 
 1   Season      6384 non-null   object
 2   bResult     6384 non-null   int64 
 3   gamelength  6384 non-null   int64 
 4   golddiff    6384 non-null   object
 5   goldblue    6384 non-null   object
 6   bKills      6384 non-null   object
 7   bTowers     6384 non-null   object
 8   bInhibs     6384 non-null   object
 9   bDragons    6384 non-null   object
 10  bBarons     6384 non-null   object
 11  bHeralds    6384 non-null   object
 12  goldred     6384 non-null   object
 13  rKills      6384 non-null   object
 14  rTowers     6384 non-null   object
 15  rInhibs     6384 non-null   object
 16  rDragons    6384 non-null   object
 17  rBarons     6384 non-null   object
 18  rHeralds    6384 non-null   object
dtypes: int64(3), object(16)
memory usage: 947.8+ KB


### Missing values

In [46]:
# Count missing values in each column
missing_values_per_column = df.isnull().sum()
print(missing_values_per_column)

Year          0
Season        0
bResult       0
gamelength    0
golddiff      0
goldblue      0
bKills        0
bTowers       0
bInhibs       0
bDragons      0
bBarons       0
bHeralds      0
goldred       0
rKills        0
rTowers       0
rInhibs       0
rDragons      0
rBarons       0
rHeralds      0
dtype: int64


no missing values

### Converting str to list and Converting team specific to diffrence

In [47]:
# After applying literal_eval, each element in the golddiff column is converted from a string to an actual list.
from ast import literal_eval
df['golddiff'] = df['golddiff'].apply(literal_eval)
df[['golddiff']].head()

Unnamed: 0,golddiff
0,"[0, 0, -14, -65, -268, -431, -488, -789, -494,..."
1,"[0, 0, -26, -18, 147, 237, -152, 18, 88, -242,..."
2,"[0, 0, 10, -60, 34, 37, 589, 1064, 1258, 913, ..."
3,"[0, 0, -15, 25, 228, -6, -243, 175, -346, 16, ..."
4,"[40, 40, 44, -36, 113, 158, -121, -191, 23, 20..."


In [48]:
# counts the items that occur up to and including MAX_TIME_STEP time step.
def count_item(items):
    count = np.zeros(MAX_TIME_STEP, dtype=np.int8)
    for timestep in range(MAX_TIME_STEP) :
        for item in items:
            if item[0] <= timestep + 1:
                count[timestep] += 1
    return count

df['bDragons'] = df['bDragons'].apply(literal_eval)
df['rDragons'] = df['rDragons'].apply(literal_eval)

df['bDragons'] = df['bDragons'].apply(count_item)
df['rDragons'] = df['rDragons'].apply(count_item)
df['dragondiff'] = df['bDragons'] - df['rDragons']

df[['dragondiff']].tail()

Unnamed: 0,dragondiff
6379,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1,..."
6380,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1,..."
6381,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, ..."
6382,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, ..."
6383,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [49]:
df['bBarons'] = df['bBarons'].apply(literal_eval)
df['rBarons'] = df['rBarons'].apply(literal_eval)

df['bBarons'] = df['bBarons'].apply(count_item)
df['rBarons'] = df['rBarons'].apply(count_item)
df['barondiff'] = df['bBarons'] - df['rBarons']

df[['barondiff']].head()

Unnamed: 0,barondiff
0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [50]:
df['bHeralds'] = df['bHeralds'].apply(literal_eval)
df['rHeralds'] = df['rHeralds'].apply(literal_eval)

df['bHeralds'] = df['bHeralds'].apply(count_item)
df['rHeralds'] = df['rHeralds'].apply(count_item)
df['heralddiff'] = df['bHeralds'] - df['rHeralds']

df[['heralddiff']].head()

Unnamed: 0,heralddiff
0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [51]:
df['bTowers'] = df['bTowers'].apply(literal_eval)
df['rTowers'] = df['rTowers'].apply(literal_eval)

df['bTowers'] = df['bTowers'].apply(count_item)
df['rTowers'] = df['rTowers'].apply(count_item)
df['towerdiff'] = df['bTowers'] - df['rTowers']

df[['towerdiff']].head()

Unnamed: 0,towerdiff
0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -2, -2, ..."


In [52]:
df['bInhibs'] = df['bInhibs'].apply(literal_eval)
df['rInhibs'] = df['rInhibs'].apply(literal_eval)

df['bInhibs'] = df['bInhibs'].apply(count_item)
df['rInhibs'] = df['rInhibs'].apply(count_item)
df['inhibitordiff'] = df['bInhibs'] - df['rInhibs']

df[['inhibitordiff']].head()

Unnamed: 0,inhibitordiff
0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [53]:
df['bKills'] = df['bKills'].apply(literal_eval)
df['rKills'] = df['rKills'].apply(literal_eval)

df['bKills'] = df['bKills'].apply(count_item)
df['rKills'] = df['rKills'].apply(count_item)
df['killdiff'] = df['bKills'] - df['rKills']

df[['killdiff']].head()

Unnamed: 0,killdiff
0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, ..."
1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, ..."
2,"[0, 0, 0, 0, 0, 1, 2, 2, 1, 1, 2, 2, 5, 5, 5, ..."
3,"[0, 0, 0, 0, 0, 0, 0, -1, 0, -1, -1, -1, -1, -..."
4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [54]:
stats = ['golddiff','dragondiff', 'barondiff', 'heralddiff', 'towerdiff', 'inhibitordiff', 'killdiff']
x = df[stats]
y = df['bResult']

x.tail()

Unnamed: 0,golddiff,dragondiff,barondiff,heralddiff,towerdiff,inhibitordiff,killdiff
6379,"[0, -8, -187, -37, -92, -164, -229, -424, -256...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -3, -3, -..."
6380,"[0, 0, -18, -95, 45, -87, -117, 199, 126, 92, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 0, 0, 0, ..."
6381,"[0, 0, -86, -39, -207, -349, -60, -140, 187, -...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -2,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, -1, -1, -1, -1, -1, -2, -2, -2, -4, ..."
6382,"[0, 0, -97, 33, 351, 284, 299, 263, 403, 623, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
6383,"[0, 0, -8, -225, -36, 73, 464, 184, 1171, 1409...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, -1, -1, 0..."


do not need these anymore

In [55]:
columns_to_drop = ['goldblue', 'bKills', 'bTowers', 'bInhibs', 'bDragons', 'bBarons', 'bHeralds']
df.drop(columns_to_drop, axis=1, inplace=True)

In [56]:
columns_to_drop = ['goldred', 'rKills', 'rTowers', 'rInhibs', 'rDragons', 'rBarons', 'rHeralds']
df.drop(columns_to_drop, axis=1, inplace=True)

In [57]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6384 entries, 0 to 6383
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Year           6384 non-null   int64 
 1   Season         6384 non-null   object
 2   bResult        6384 non-null   int64 
 3   gamelength     6384 non-null   int64 
 4   golddiff       6384 non-null   object
 5   dragondiff     6384 non-null   object
 6   barondiff      6384 non-null   object
 7   heralddiff     6384 non-null   object
 8   towerdiff      6384 non-null   object
 9   inhibitordiff  6384 non-null   object
 10  killdiff       6384 non-null   object
dtypes: int64(3), object(8)
memory usage: 548.8+ KB
None


In [58]:
df.describe()


Unnamed: 0,Year,bResult,gamelength
count,6384.0,6384.0,6384.0
mean,2016.260965,0.533365,38.985276
std,0.856199,0.498925,7.159522
min,2014.0,0.0,30.0
25%,2016.0,0.0,34.0
50%,2016.0,1.0,38.0
75%,2017.0,1.0,43.0
max,2018.0,1.0,95.0


In [59]:
print(df.head(4))

   Year  Season  bResult  gamelength  \
0  2015  Spring        1          40   
1  2015  Spring        0          38   
2  2015  Spring        1          40   
3  2015  Spring        0          41   

                                            golddiff  \
0  [0, 0, -14, -65, -268, -431, -488, -789, -494,...   
1  [0, 0, -26, -18, 147, 237, -152, 18, 88, -242,...   
2  [0, 0, 10, -60, 34, 37, 589, 1064, 1258, 913, ...   
3  [0, 0, -15, 25, 228, -6, -243, 175, -346, 16, ...   

                                          dragondiff  \
0  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...   
1  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -...   
2  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...   
3  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1,...   

                                           barondiff  \
0  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...   
1  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...   
2  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...   
3  [0, 0, 0, 

### Get last item (new columns)

In [60]:
# Add an integer column containing the value of the last item of each list
stats_last = []
for col in df.columns:
    if col.endswith('diff'):  # Check if column name ends with 'diff'
        stats_last.append(col + '_last')
        df[col + '_last'] = df[col].apply(lambda x: x[-1])

In [61]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6384 entries, 0 to 6383
Data columns (total 18 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Year                6384 non-null   int64 
 1   Season              6384 non-null   object
 2   bResult             6384 non-null   int64 
 3   gamelength          6384 non-null   int64 
 4   golddiff            6384 non-null   object
 5   dragondiff          6384 non-null   object
 6   barondiff           6384 non-null   object
 7   heralddiff          6384 non-null   object
 8   towerdiff           6384 non-null   object
 9   inhibitordiff       6384 non-null   object
 10  killdiff            6384 non-null   object
 11  golddiff_last       6384 non-null   int64 
 12  dragondiff_last     6384 non-null   int8  
 13  barondiff_last      6384 non-null   int8  
 14  heralddiff_last     6384 non-null   int8  
 15  towerdiff_last      6384 non-null   int8  
 16  inhibitordiff_last  6384

In [62]:
print(df['golddiff'].head(4).iloc[0][-5:])

[4473, 4639, 4762, 4686, 6057]


In [63]:

print(df[stats_last].head(4))

   golddiff_last  dragondiff_last  barondiff_last  heralddiff_last  \
0           6057               -2              -1                0   
1           1165                1               1                0   
2          13915                2               0                0   
3          -4507               -3               0                0   

   towerdiff_last  inhibitordiff_last  killdiff_last  
0               3                   1              4  
1               0                   0              1  
2               5                   0              8  
3              -1                   0             -6  


In [64]:
columns_to_analyze = [stat for stat in stats_last]

# Calculate mean and standard deviation for each column
means = df[columns_to_analyze].mean()
stds = df[columns_to_analyze].std()

# Create a DataFrame to store the results
summary_df = pd.DataFrame({'Mean': means, 'Standard Deviation': stds})

# Print the summary DataFrame
print(summary_df)


                          Mean  Standard Deviation
golddiff_last       528.956767        10913.377710
dragondiff_last      -0.108709            2.127554
barondiff_last       -0.005482            0.853044
heralddiff_last       0.070959            0.765140
towerdiff_last        0.447995            3.989268
inhibitordiff_last    0.068139            0.941278
killdiff_last         0.246711            6.586110


In [65]:
df.describe()


Unnamed: 0,Year,bResult,gamelength,golddiff_last,dragondiff_last,barondiff_last,heralddiff_last,towerdiff_last,inhibitordiff_last,killdiff_last
count,6384.0,6384.0,6384.0,6384.0,6384.0,6384.0,6384.0,6384.0,6384.0,6384.0
mean,2016.260965,0.533365,38.985276,528.956767,-0.108709,-0.005482,0.070959,0.447995,0.068139,0.246711
std,0.856199,0.498925,7.159522,10913.37771,2.127554,0.853044,0.76514,3.989268,0.941278,6.58611
min,2014.0,0.0,30.0,-25974.0,-4.0,-2.0,-3.0,-11.0,-5.0,-26.0
25%,2016.0,0.0,34.0,-9732.25,-2.0,-1.0,0.0,-2.0,0.0,-4.0
50%,2016.0,1.0,38.0,1748.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,2017.0,1.0,43.0,10433.5,1.0,1.0,1.0,3.0,0.0,5.0
max,2018.0,1.0,95.0,23772.0,5.0,2.0,3.0,11.0,6.0,28.0


Save Preprocessed Data

In [66]:
df.to_csv('Preprocessed Data/Preprocessed Data.csv', index=False)

### Normalizing Dataset

In [67]:
# Initialize dictionaries for scalers and normalized data
scalers = {}
normalized_data = {}

# Normalize columns with lists of numerical values
for stat in stats:
    scalers[stat] = StandardScaler()
    
    # Partial fit the scaler on each row of the statistic
    for row in df[stat]:
        scalers[stat].partial_fit(np.asarray(row).reshape(-1, 1))
    
    # Transform and normalize each row of the statistic
    normalized_data[stat] = [scalers[stat].transform(np.asarray(row).reshape(-1, 1)).reshape(-1) for row in df[stat]]

# Normalize columns with single numerical values
for stat in stats_last:
    scalers[stat] = StandardScaler()
    
    # Partial fit the scaler on the column data
    scalers[stat].fit(df[[stat]])
    
    # Transform and normalize the column data
    normalized_data[stat] = scalers[stat].transform(df[[stat]]).reshape(-1)

# Create a new DataFrame 'normaldf' 
# Specify columns to add to normaldf
columns_to_add = ['Year', 'Season', 'bResult', 'gamelength']
normaldf = df[columns_to_add].copy()

# Add normalized data to normaldf
for stat in stats:
    normaldf[stat] = normalized_data[stat]

for stat in stats_last:
    normaldf[stat] = normalized_data[stat]

# Save the normalized DataFrame to a CSV file
normaldf.to_csv('Preprocessed Data/Preprocessed Data normalized.csv', index=False)


In [68]:
print(normaldf.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6384 entries, 0 to 6383
Data columns (total 18 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Year                6384 non-null   int64  
 1   Season              6384 non-null   object 
 2   bResult             6384 non-null   int64  
 3   gamelength          6384 non-null   int64  
 4   golddiff            6384 non-null   object 
 5   dragondiff          6384 non-null   object 
 6   barondiff           6384 non-null   object 
 7   heralddiff          6384 non-null   object 
 8   towerdiff           6384 non-null   object 
 9   inhibitordiff       6384 non-null   object 
 10  killdiff            6384 non-null   object 
 11  golddiff_last       6384 non-null   float64
 12  dragondiff_last     6384 non-null   float64
 13  barondiff_last      6384 non-null   float64
 14  heralddiff_last     6384 non-null   float64
 15  towerdiff_last      6384 non-null   float64
 16  inhibi

In [69]:
normaldf.describe()


Unnamed: 0,Year,bResult,gamelength,golddiff_last,dragondiff_last,barondiff_last,heralddiff_last,towerdiff_last,inhibitordiff_last,killdiff_last
count,6384.0,6384.0,6384.0,6384.0,6384.0,6384.0,6384.0,6384.0,6384.0,6384.0
mean,2016.260965,0.533365,38.985276,-2.97729e-17,-2.8938140000000004e-17,1.279956e-17,-4.452022e-18,-2.7825139999999995e-19,-1.2243060000000002e-17,1.1130060000000002e-17
std,0.856199,0.498925,7.159522,1.000078,1.000078,1.000078,1.000078,1.000078,1.000078,1.000078
min,2014.0,0.0,30.0,-2.428674,-1.829141,-2.338301,-4.013906,-2.869923,-5.384741,-3.985473
25%,2016.0,0.0,34.0,-0.9403148,-0.8890205,-1.165937,-0.09274673,-0.6136932,-0.07239569,-0.6448485
50%,2016.0,1.0,38.0,0.1117105,0.0510999,0.006427435,-0.09274673,-0.1123088,-0.07239569,-0.03746215
75%,2017.0,1.0,43.0,0.9076309,0.5211601,1.178792,1.214306,0.6397677,-0.07239569,0.7217707
max,2018.0,1.0,95.0,2.129942,2.401401,2.351156,3.828413,2.645305,6.302419,4.214242


## Part 3: Dimensionality Reduction

In [70]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns
import math
from scipy import stats

from scipy.stats import norm

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader, random_split
from torch.autograd import Variable
from tqdm import tqdm

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [71]:
stats = ['golddiff','dragondiff', 'barondiff', 'heralddiff', 'towerdiff', 'inhibitordiff', 'killdiff']
# Add an integer column containing the value of the last item of each list
stats_last = []
for col in df.columns:
    if col.endswith('diff'):  # Check if column name ends with 'diff'
        stats_last.append(col + '_last')
        df[col + '_last'] = df[col].apply(lambda x: x[-1])


convert each Diff column list to 30 new columns

In [72]:
for col in stats:
    # Extracting the first 30 values from each row in the current column
    df[f'{col}_first_30'] = df[col].apply(lambda x: x[:30])

    # Creating new columns for the first 30 values (assuming they're always there)
    for i in range(30):
        df[f'{col}_{i+1}'] = df[f'{col}_first_30'].apply(lambda x: x[i] if i < len(x) else None)

    # Dropping the intermediate column '{col}_first_30'
    df.drop(columns=[f'{col}_first_30'], inplace=True)
    
df.drop(stats, axis=1, inplace=True)
df.drop(stats_last, axis=1, inplace=True)
columns_to_drop = ['Season', 'Year']
df.drop(columns_to_drop, axis=1, inplace=True)

df.to_csv('Preprocessed Data/df_extract.csv', index=False)

  df[f'{col}_{i+1}'] = df[f'{col}_first_30'].apply(lambda x: x[i] if i < len(x) else None)
  df[f'{col}_{i+1}'] = df[f'{col}_first_30'].apply(lambda x: x[i] if i < len(x) else None)
  df[f'{col}_{i+1}'] = df[f'{col}_first_30'].apply(lambda x: x[i] if i < len(x) else None)
  df[f'{col}_{i+1}'] = df[f'{col}_first_30'].apply(lambda x: x[i] if i < len(x) else None)
  df[f'{col}_{i+1}'] = df[f'{col}_first_30'].apply(lambda x: x[i] if i < len(x) else None)
  df[f'{col}_{i+1}'] = df[f'{col}_first_30'].apply(lambda x: x[i] if i < len(x) else None)
  df[f'{col}_{i+1}'] = df[f'{col}_first_30'].apply(lambda x: x[i] if i < len(x) else None)
  df[f'{col}_first_30'] = df[col].apply(lambda x: x[:30])
  df[f'{col}_{i+1}'] = df[f'{col}_first_30'].apply(lambda x: x[i] if i < len(x) else None)
  df[f'{col}_{i+1}'] = df[f'{col}_first_30'].apply(lambda x: x[i] if i < len(x) else None)
  df[f'{col}_{i+1}'] = df[f'{col}_first_30'].apply(lambda x: x[i] if i < len(x) else None)
  df[f'{col}_{i+1}'] = df[f'{col

In [73]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6384 entries, 0 to 6383
Columns: 212 entries, bResult to killdiff_30
dtypes: int64(32), int8(180)
memory usage: 2.7 MB
None


PCA for Variance 95%+

In [74]:
df_subset = df.iloc[:, 2:]


# Separate out the numerical columns
numerical_cols = df_subset.select_dtypes(include=['float64', 'int64']).columns

# Standardize the numerical data
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df[numerical_cols])

# Initialize PCA
pca = PCA()

# Fit PCA
pca.fit(scaled_data)

# Determine number of components to retain 95% variance
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
n_components = np.argmax(cumulative_variance >= 0.95) + 1

# Print the number of components and explained variance
print(f'Number of components to retain 95% variance: {n_components}')
print(f'Explained variance with {n_components} components: {cumulative_variance[n_components - 1]:.4f}')

# Use the determined number of components
pca = PCA(n_components=n_components)

# Fit and transform the data
pca_data = pca.fit_transform(scaled_data)

# Convert the transformed data into a dataframe if needed
pca_df = pd.DataFrame(data=pca_data, columns=[f'PC{i}' for i in range(1, n_components + 1)])

# Concatenate the PCA dataframe with the non-numerical columns from original dataframe
final_df = pd.concat([df['bResult'],df['gamelength'], pca_df], axis=1)

# Save the final dataframe to a CSV file
final_df.to_csv('Preprocessed Data/final_dataframe_with_PCA.csv', index=False)


# save the PCA components for potential future use
pca_components = pd.DataFrame(data=pca.components_, columns=numerical_cols)
pca_components.to_csv('Preprocessed Data/pca_components.csv', index=False)

print("\nFinal dataframe and PCA components saved successfully.")


Number of components to retain 95% variance: 10
Explained variance with 10 components: 0.9584

Final dataframe and PCA components saved successfully.


PCA for 2 component

In [76]:
# Step 1: Standardize the data
scaler = StandardScaler()
X = df.values  # Extract the values from the DataFrame
X_scaled = scaler.fit_transform(X)

# Step 2: Perform PCA
pca = PCA(n_components=2)
principal_components = pca.fit_transform(X_scaled)

# Print explained variance
explained_variance = pca.explained_variance_ratio_
print("Explained variance for each principal component:", explained_variance)

# Step 3: Save principal components into a DataFrame
principal_df = pd.DataFrame(data=principal_components, columns=['PC1', 'PC2'])

# Step 4: Save principal_df and df to CSV files
principal_df.to_csv('Preprocessed Data/2nd_principal_components.csv', index=False)
df.to_csv('Preprocessed Data/original_data.csv', index=False)

# Print a message confirming the saving process
print("Data saved successfully.")


Explained variance for each principal component: [0.31247929 0.09468971]
Data saved successfully.
