In [1]:
# Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from category_encoders import TargetEncoder

In [2]:
# Load the players DataFrame from disk and report missing values per column

# NOTE(review): unpickling can execute arbitrary code; this assumes the file was
# produced by this project's own cleaning step — confirm before loading a copy
# obtained from anywhere else.
data_path = '../CleanData/all_players_df.pkl'
df = pd.read_pickle(data_path)

# Count of missing entries in each column
null_counts = df.isna().sum()
print(null_counts)

player_name                                0
real_name                                  0
team                                       0
age                                        0
rating                                     0
                                          ..
utility_damage_per_round                   0
utility_kills_per_100_rounds               0
utility_flashes_thrown_per_round           0
utility_flash_assists_per_round            0
utility_time_opponent_flashed_per_round    0
Length: 69, dtype: int64


In [3]:
# The 'age' values in the current data contain an error, so the column is removed
# for now (TODO: fix the age data and bring the column back).
# errors='ignore' keeps this cell safe to re-run after the column is already gone.
df = df.drop(columns=['age'], errors='ignore')

In [4]:
# Columns retained after feature selection, listed one per line
columns_to_keep = [
    # Numeric features
    'kd_ratio',
    'firepower_damage_per_round_win',
    'kills_per_round',
    'firepower_score',
    'impact',
    'trading_damage_per_kill',
    'kast',
    'entrying_support_rounds',
    'utility_time_opponent_flashed_per_round',

    # Categorical features
    'team',
    'player_name',
    'real_name',

    # Target variable
    'rating',
]

# Restrict the frame to the selected columns (raises KeyError if any is missing)
df = df.loc[:, columns_to_keep]
print(f"DataFrame shape after selecting columns: {df.shape}")

DataFrame shape after selecting columns: (968, 13)


In [5]:
# Separate the target ('rating') from the predictors, then hold out 20% of the rows
y = df['rating']
X = df.drop(columns='rating')

# A fixed random_state keeps the split reproducible between runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [7]:
# One-hot encode the 'team' column.
# The encoder is fitted on the training split only; with handle_unknown='ignore',
# a team that was not seen during fitting is encoded as all zeros in the test
# split instead of raising an error.
onehot = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
onehot.fit(X_train[['team']])

team_encoded_train = onehot.transform(X_train[['team']])
team_encoded_test = onehot.transform(X_test[['team']])

# Names of the generated one-hot columns for 'team'
team_columns = onehot.get_feature_names_out(['team'])

In [8]:
# Target-encode the two name columns.
# The encoder is fitted using the training target only and is then applied,
# without any target, to the test split.
# NOTE(review): if these names are (nearly) unique per row, the encoding may carry
# little usable signal for players absent from the training split — verify.
name_columns = ['player_name', 'real_name']

te = TargetEncoder()
player_encoded_train = te.fit_transform(X_train[name_columns], y_train)
player_encoded_test = te.transform(X_test[name_columns])