In [1]:
import pandas as pd
import numpy as np

# Import local CSV
df = pd.read_pickle('data/tft_games/tft_games_initial_clean_pickle.csv')

df

Unnamed: 0,level,placement,length,round,queue,lobby_rank,units,augment_1,augment_2,augment_3,...,unit_13_name,unit_13_tier,unit_13_item_1,unit_13_item_2,unit_13_item_3,unit_14_name,unit_14_tier,unit_14_item_1,unit_14_item_2,unit_14_item_3
0,10,2,2193,39,Ranked,Bronze I,"[{'name': 'azir', 'star': 1, 'items': ['infini...",theroadlesstraveled,jeweledlotus,capriciousforge,...,,,,,,,,,,
1,10,1,2117,35,Ranked,Bronze I,"[{'name': 'ahri', 'star': 3, 'items': ['spearo...",teambuilding,radiantrefactor,fatedcrown,...,,,,,,,,,,
2,7,8,1543,23,Ranked,Silver IV,"[{'name': 'bard', 'star': 1, 'items': [None]},...",buriedtreasuresii,mythiccrown,balancedbudget+,...,,,,,,,,,,
3,7,5,2023,32,Ranked,Bronze I,"[{'name': 'galio', 'star': 2, 'items': ['redem...",blisteringstrikes,biggrabbag,pandorasitemsii,...,,,,,,,,,,
4,8,6,1658,27,Ranked,,"[{'name': 'aatrox', 'star': 2, 'items': [None]...",pandorasitems,teambuilding,bruisercrest,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76671,8,2,2195,40,Ranked,Silver I,"[{'name': 'aatrox', 'star': 3, 'items': ['fist...",toomuchcandy,ghostlycrown,rollthedice,...,,,,,,,,,,
76672,7,8,1413,21,Ranked,Silver II,"[{'name': 'bard', 'star': 2, 'items': ['hextec...",epoch,teambuilding,finalascension,...,,,,,,,,,,
76673,9,3,2169,37,Ranked,Silver I,"[{'name': 'alune', 'star': 2, 'items': ['jewel...",buriedtreasuresiii,gargantuanresolve,rollthedice,...,,,,,,,,,,
76674,8,6,2006,30,Ranked,Silver II,"[{'name': 'bard', 'star': 3, 'items': ['redbuf...",escortquest,heroicgrabbag,stationarysupportii,...,,,,,,,,,,


Lets start with game summary info. Lets take a look again at what game summary looks like again

In [2]:
# # Replace any 'N/A' with np.nan
df.replace('N/A', np.nan, inplace=True)

# Summary statistics for level, length_seconds, rounds, and placement
summary_stats = df[['level', 'length', 'round', 'lobby_rank']].describe(include = 'all')

# Check for missing values in the relevant columns
missing_values = df[['level', 'length', 'round', 'lobby_rank']].isnull().sum()

summary_stats, missing_values

(               level        length         round lobby_rank
 count   76676.000000  76676.000000  76676.000000      75822
 unique           NaN           NaN           NaN         30
 top              NaN           NaN           NaN    Gold IV
 freq             NaN           NaN           NaN       9707
 mean        8.378671   1927.607491     31.103474        NaN
 std         0.894662    295.323242      5.899399        NaN
 min         4.000000    602.000000      6.000000        NaN
 25%         8.000000   1712.000000     27.000000        NaN
 50%         8.000000   1950.000000     32.000000        NaN
 75%         9.000000   2144.000000     35.000000        NaN
 max        10.000000   2914.000000     51.000000        NaN,
 level           0
 length          0
 round           0
 lobby_rank    854
 dtype: int64)

First, let's address the missing values in the lobby_rank column by removing the rows that contain them. In ranked games, players in the lobby with little to no experience in the set might have builds that are outliers compared to the rest of the data.

Next, we'll normalize the length and round columns, standardize the level column, and perform categorical encoding on lobby_rank, followed by normalizing it.


In [3]:
# Drops all nan in the lobby_rank column
df = df.dropna(subset=['lobby_rank'])

# Check for missing values in the relevant columns
missing_values = df[['level', 'length', 'round', 'lobby_rank']].isnull().sum()

missing_values

level         0
length        0
round         0
lobby_rank    0
dtype: int64

There are no longer any NaN values in the summary columns, so we can proceed with normalization, standardization, and categorical encoding.

In [4]:
summary_df = pd.DataFrame(df['placement'])

def normalize_column(column_to_normalize):
    # Get the minimum and maximum values of the column
    min_value = column_to_normalize.min()
    max_value = column_to_normalize.max()
    
    # Apply the Min-Max normalization formula
    column_to_normalize = (column_to_normalize - min_value) / (max_value - min_value)
    return column_to_normalize

# Normalize the 'length' and 'round' columns and add it to the new DataFrame
summary_df['length'] = normalize_column(df['length'])
summary_df['round'] = normalize_column(df['round'])

def standardize_column(column_to_standardize):
    # Get the mean and standard deviation of the column
    mean_value = column_to_standardize.mean()
    std_value = column_to_standardize.std()
    
    # Apply the standardization formula
    column_to_standardize = (column_to_standardize - mean_value) / std_value
    return column_to_standardize

# Standardize the 'level' column and add it to the new DataFrame
summary_df['level'] = standardize_column(df['level'])

summary_df

Unnamed: 0,placement,length,round,level
0,2,0.688149,0.733333,1.811366
1,1,0.655277,0.644444,1.811366
2,8,0.407007,0.377778,-1.542105
3,5,0.614619,0.577778,-1.542105
5,7,0.437716,0.422222,-0.424281
...,...,...,...,...
76671,2,0.689014,0.755556,-0.424281
76672,8,0.350779,0.333333,-1.542105
76673,3,0.677768,0.688889,0.693542
76674,6,0.607266,0.533333,-0.424281


In [5]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

# Initialize a new DataFrame with the 'placement' column from the original DataFrame
summary_df_sklearn = pd.DataFrame(df['placement'])

# Initialize the MinMaxScaler for normalization
scaler = MinMaxScaler()

# Normalize the 'length' and 'round' columns and add it to the new DataFrame
summary_df_sklearn['length'] = scaler.fit_transform(df[['length']])
summary_df_sklearn['round'] = scaler.fit_transform(df[['round']])

# Initialize the StandardScaler for standardization
scaler = StandardScaler()

# Standardize the 'level' column and add it to the new DataFrame
summary_df_sklearn['level'] = scaler.fit_transform(df[['level']])

summary_df_sklearn

Unnamed: 0,placement,length,round,level
0,2,0.688149,0.733333,1.811378
1,1,0.655277,0.644444,1.811378
2,8,0.407007,0.377778,-1.542115
3,5,0.614619,0.577778,-1.542115
5,7,0.437716,0.422222,-0.424284
...,...,...,...,...
76671,2,0.689014,0.755556,-0.424284
76672,8,0.350779,0.333333,-1.542115
76673,3,0.677768,0.688889,0.693547
76674,6,0.607266,0.533333,-0.424284


In [6]:
# Define the rank tiers and levels
rank_tiers = ["Iron", "Bronze", "Silver", "Gold", "Platinum", "Diamond", "Master", "Grandmaster", "Challenger"]
rank_levels = ["IV", "III", "II", "I"]

# Generate the rank mapping dictionary
rank_mapping = {
    f"{tier} {level}": index
    for index, (tier, level) in enumerate(
        (tier, level) for tier in rank_tiers for level in rank_levels
    )
}

# Add special cases without levels
rank_mapping["Master"] = len(rank_mapping)
rank_mapping["Grandmaster"] = len(rank_mapping) + 1
rank_mapping["Challenger"] = len(rank_mapping) + 2

# Function to preprocess ranks and remove LP details
def preprocess_rank(rank):
    if isinstance(rank, str):
        if "LP" in rank:
            rank = rank.split()[0]  # Only take the first part (e.g., "Master")
        return rank
    return None  # Return None for non-string values

# Apply the mapping to the DataFrame
ranking_stripped = df["lobby_rank"].apply(preprocess_rank)
summary_df["lobby_rank"] = normalize_column(ranking_stripped.map(rank_mapping))

summary_df

Unnamed: 0,placement,length,round,level,lobby_rank
0,2,0.688149,0.733333,1.811366,0.171429
1,1,0.655277,0.644444,1.811366,0.171429
2,8,0.407007,0.377778,-1.542105,0.200000
3,5,0.614619,0.577778,-1.542105,0.171429
5,7,0.437716,0.422222,-0.424281,0.171429
...,...,...,...,...,...
76671,2,0.689014,0.755556,-0.424281,0.285714
76672,8,0.350779,0.333333,-1.542105,0.257143
76673,3,0.677768,0.688889,0.693542,0.285714
76674,6,0.607266,0.533333,-0.424281,0.257143


Now, let's focus on the augments column and apply categorical encoding to its different values. First, let's take another look at the contents of the augments column.

In [7]:
# Identify columns related to augments
augment_columns = [col for col in df.columns if 'augment' in col.lower()]

# Summary statistics for augment columns
augment_summary_stats = df[augment_columns].describe(include='all')

# Get the count of nan columns for each augment column
augment_na = df[augment_columns].isnull().sum()

augment_summary_stats, augment_na

(               augment_1      augment_2   augment_3
 count              75822          75649       74608
 unique               225            229         196
 top     wanderingtrainer  pandorasitems  biggrabbag
 freq                1770           1744        2267,
 augment_1       0
 augment_2     173
 augment_3    1214
 dtype: int64)

In [8]:
# Fill missing values with None to represent missing Augment
augment_df = df[augment_columns].fillna('None')

# Get the count of nan columns for each augment column
augment_na = augment_df.isnull().sum()

augment_na

augment_1    0
augment_2    0
augment_3    0
dtype: int64

In [9]:
from category_encoders import TargetEncoder

# Create an instance of the TargetEncoder, specifying the columns to be encoded
te = TargetEncoder(cols=augment_columns)

# Extract the target column from the dataframe
target = df['placement']

# Fit the TargetEncoder to the augment_df and target, and transform the augment_df
encoded_augments = te.fit_transform(augment_df, target)

# Update the summary_df with the encoded values for the specified columns
summary_df[augment_columns] = encoded_augments

summary_df

Unnamed: 0,placement,length,round,level,lobby_rank,augment_1,augment_2,augment_3
0,2,0.688149,0.733333,1.811366,0.171429,4.274232,4.302792,4.384880
1,1,0.655277,0.644444,1.811366,0.171429,4.611421,4.565737,4.050157
2,8,0.407007,0.377778,-1.542105,0.200000,4.398335,4.248980,4.480667
3,5,0.614619,0.577778,-1.542105,0.171429,4.486739,4.296875,4.808094
5,7,0.437716,0.422222,-0.424281,0.171429,4.709890,4.584816,4.513920
...,...,...,...,...,...,...,...,...
76671,2,0.689014,0.755556,-0.424281,0.285714,4.189586,4.046243,4.487342
76672,8,0.350779,0.333333,-1.542105,0.257143,4.144089,4.465894,4.181019
76673,3,0.677768,0.688889,0.693542,0.285714,4.273902,4.490870,4.487342
76674,6,0.607266,0.533333,-0.424281,0.257143,4.299169,4.712225,4.258486


In [10]:
# Identify columns related to traits
trait_columns = [col for col in df.columns if 'trait' in col.lower()]

# Summary statistics for traits columns
trait_summary_stats = df[trait_columns].describe(include='all')

# Get the count of nan columns for each traits column
trait_na = df[trait_columns].isnull().sum()

trait_summary_stats, trait_na

(       trait_1_name  trait_1_count trait_2_name  trait_2_count trait_3_name  \
 count         75822   75822.000000        75362   75362.000000        71456   
 unique           27            NaN           27            NaN           27   
 top          mythic            NaN     arcanist            NaN     behemoth   
 freq          10847            NaN        10661            NaN        12759   
 mean            NaN       4.978304          NaN       2.774847          NaN   
 std             NaN       1.544380          NaN       0.861577          NaN   
 min             NaN       1.000000          NaN       1.000000          NaN   
 25%             NaN       4.000000          NaN       2.000000          NaN   
 50%             NaN       5.000000          NaN       3.000000          NaN   
 75%             NaN       6.000000          NaN       3.000000          NaN   
 max             NaN       9.000000          NaN       7.000000          NaN   
 
         trait_3_count trait_4_name  t

In [11]:
# Extract trait name and count columns
trait_names = [col for col in trait_columns if 'name' in col]
trait_counts = [col for col in trait_columns if 'count' in col]

# Initialize an empty DataFrame to hold the encoded values for each trait column
encoded_data = pd.DataFrame()

# Apply TargetEncoder
encoder = TargetEncoder()

# Loop through each pair of trait name and count columns
for name_col, count_col in zip(trait_names, trait_counts):
    temp_df = df[[name_col, count_col]].copy()
    temp_df.columns = ['trait_name', 'trait_count']

    # Apply TargetEncoder to the current trait column pair
    encoded_col = encoder.fit_transform(temp_df['trait_name'], temp_df['trait_count'])

    # Add the encoded values to the encoded_data DataFrame with the appropriate column names
    encoded_data[name_col] = encoded_col

# Fill NaN with 0 for the encoded value
encoded_traits = encoded_data.fillna(0)

# Create a new list of the column names based on the previous trait_columns list
trait_encoded_columns = [trait_columns[i].replace('_name', '') for i in range(0, len(trait_columns), 2)]

# Add the encoded_traits to the summary_df
summary_df[trait_encoded_columns] = encoded_traits

summary_df

Unnamed: 0,placement,length,round,level,lobby_rank,augment_1,augment_2,augment_3,trait_1,trait_2,trait_3,trait_4,trait_5
0,2,0.688149,0.733333,1.811366,0.171429,4.274232,4.302792,4.384880,3.903119,2.785139,2.054433,2.000000,2.000000
1,1,0.655277,0.644444,1.811366,0.171429,4.611421,4.565737,4.050157,5.813285,2.179101,2.008225,2.000000,2.000000
2,8,0.407007,0.377778,-1.542105,0.200000,4.398335,4.248980,4.480667,5.102517,2.233749,2.003449,2.000000,2.003122
3,5,0.614619,0.577778,-1.542105,0.171429,4.486739,4.296875,4.808094,5.895499,2.388684,2.008225,2.000710,2.000000
5,7,0.437716,0.422222,-0.424281,0.171429,4.709890,4.584816,4.513920,4.350622,2.233749,2.045893,2.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
76671,2,0.689014,0.755556,-0.424281,0.285714,4.189586,4.046243,4.487342,5.541109,3.120494,2.003449,2.001922,2.000000
76672,8,0.350779,0.333333,-1.542105,0.257143,4.144089,4.465894,4.181019,5.102517,3.091686,2.054433,2.000000,2.000000
76673,3,0.677768,0.688889,0.693542,0.285714,4.273902,4.490870,4.487342,5.385577,2.179101,2.125075,1.000000,0.000000
76674,6,0.607266,0.533333,-0.424281,0.257143,4.299169,4.712225,4.258486,5.895499,2.388684,2.008225,2.057957,2.000000


In [12]:
# Identify columns related to units
unit_columns = [col for col in df.columns if 'unit_' in col.lower()]

# Summary statistics for unit columns
unit_summary_stats = df[unit_columns].describe(include='all')

# Get the count of nan columns for each unit column
unit_na = df[unit_columns].isnull().sum()

unit_summary_stats, unit_na

(       unit_1_name   unit_1_tier      unit_1_item_1      unit_1_item_2  \
 count        75822  75822.000000              49446              39887   
 unique          43           NaN                138                138   
 top         aatrox           NaN  guinsoosrageblade  guinsoosrageblade   
 freq         11189           NaN               3621               3029   
 mean           NaN      2.027327                NaN                NaN   
 std            NaN      0.607382                NaN                NaN   
 min            NaN      1.000000                NaN                NaN   
 25%            NaN      2.000000                NaN                NaN   
 50%            NaN      2.000000                NaN                NaN   
 75%            NaN      2.000000                NaN                NaN   
 max            NaN      3.000000                NaN                NaN   
 
             unit_1_item_3 unit_2_name   unit_2_tier      unit_2_item_1  \
 count               31

In [13]:
# Load the power data files
tft_units_power_df = pd.read_csv('data/tft_units/tft_units_power.csv')
tft_items_power_df = pd.read_csv('data/tft_items/tft_items_power.csv')

# Create lookup dictionaries for unit and item power levels
unit_power_dict = {(row['Name'], row['Tier']): row['Power Level'] for _, row in tft_units_power_df.iterrows()}
item_power_dict = {row['Name']: row['Power Level'] for _, row in tft_items_power_df.iterrows()}

# Define a function to calculate the total power score for a list of units
def calculate_total_unit_power(units):
    total_scores = []
    for unit in units:
        unit_score = unit_power_dict.get((unit['name'], unit['star']), 0)
        item_scores = sum(item_power_dict.get(item, 0) for item in unit['items'] if item)
        total_scores.append(unit_score + item_scores)
    return total_scores

# Calculate the power scores for each unit in the dataframe
units_power_scores = [calculate_total_unit_power(item) for item in df['units']]

# Determine the maximum number of units in any game
max_units = max(len(x) for x in units_power_scores)

# Create a DataFrame for unit power scores and add the scores to it
units_encoded_df = pd.DataFrame(units_power_scores, columns=[f'unit_{i}_powerscore' for i in range(1, max_units + 1)])

# Fill na values with 0
units_encoded_df = units_encoded_df.fillna(0)

# Ensure index alignment before merging
units_encoded_df = units_encoded_df.reset_index(drop=True)
summary_df = summary_df.reset_index(drop=True)

# Merge the encoded units DataFrame to the main DataFrame
summary_df[units_encoded_df.columns] = units_encoded_df

summary_df

Unnamed: 0,placement,length,round,level,lobby_rank,augment_1,augment_2,augment_3,trait_1,trait_2,...,unit_5_powerscore,unit_6_powerscore,unit_7_powerscore,unit_8_powerscore,unit_9_powerscore,unit_10_powerscore,unit_11_powerscore,unit_12_powerscore,unit_13_powerscore,unit_14_powerscore
0,2,0.688149,0.733333,1.811366,0.171429,4.274232,4.302792,4.384880,3.903119,2.785139,...,2.943637,5.507338,5.566097,1.950610,3.831156,5.255435,1.865984,0.0,0.0,0.0
1,1,0.655277,0.644444,1.811366,0.171429,4.611421,4.565737,4.050157,5.813285,2.179101,...,4.279802,5.892107,3.404246,3.205034,4.317366,3.851147,0.000000,0.0,0.0,0.0
2,8,0.407007,0.377778,-1.542105,0.200000,4.398335,4.248980,4.480667,5.102517,2.233749,...,3.194145,3.111792,4.180305,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0
3,5,0.614619,0.577778,-1.542105,0.171429,4.486739,4.296875,4.808094,5.895499,2.388684,...,4.732673,2.816261,2.687025,3.395754,0.000000,0.000000,0.000000,0.0,0.0,0.0
4,7,0.437716,0.422222,-0.424281,0.171429,4.709890,4.584816,4.513920,4.350622,2.233749,...,4.678123,2.625831,3.326468,2.000300,0.000000,0.000000,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75817,2,0.689014,0.755556,-0.424281,0.285714,4.189586,4.046243,4.487342,5.541109,3.120494,...,3.418789,4.100293,3.931018,4.350483,0.000000,0.000000,0.000000,0.0,0.0,0.0
75818,8,0.350779,0.333333,-1.542105,0.257143,4.144089,4.465894,4.181019,5.102517,3.091686,...,1.675136,3.268155,1.706192,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0
75819,3,0.677768,0.688889,0.693542,0.285714,4.273902,4.490870,4.487342,5.385577,2.179101,...,5.699499,6.774194,4.387030,4.223557,3.032323,0.000000,0.000000,0.0,0.0,0.0
75820,6,0.607266,0.533333,-0.424281,0.257143,4.299169,4.712225,4.258486,5.895499,2.388684,...,4.261845,2.601094,3.847003,2.000300,0.000000,0.000000,0.000000,0.0,0.0,0.0


In [14]:
summary_df.to_csv('data/tft_games/tft_games_after_dp.csv', index=False)