In [1]:
# Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from category_encoders import TargetEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Load the players DataFrame from the pickle under ../CleanData and report
# how many missing values each column contains.
# NOTE: unpickling can execute arbitrary code, so only load pickle files that
# come from a trusted source.

data_path = '../CleanData/all_players_df.pkl'
df = pd.read_pickle(data_path)

missing_per_column = df.isnull().sum()
print(missing_per_column)

player_name                                0
real_name                                  0
team                                       0
age                                        0
rating                                     0
                                          ..
utility_damage_per_round                   0
utility_kills_per_100_rounds               0
utility_flashes_thrown_per_round           0
utility_flash_assists_per_round            0
utility_time_opponent_flashed_per_round    0
Length: 69, dtype: int64


In [3]:
# The 'age' values in the current data are known to be wrong, so the column is
# excluded for now. errors='ignore' means no error is raised if the column is
# already absent (e.g. when this cell is run a second time).
# TODO: fix the age data and restore the feature.
df = df.drop(columns='age', errors='ignore')

In [4]:
# Columns retained after feature selection, grouped by role.
# The order is: numeric features, categorical features, target.
columns_to_keep = (
    # Numeric features
    [
        'kd_ratio',
        'firepower_damage_per_round_win',
        'kills_per_round',
        'firepower_score',
        'impact',
        'trading_damage_per_kill',
        'kast',
        'entrying_support_rounds',
        'utility_time_opponent_flashed_per_round',
    ]
    # Categorical features
    + ['team', 'player_name', 'real_name']
    # Target variable
    + ['rating']
)

# Narrow the frame to the chosen features plus the target, then report the
# resulting (rows, columns) shape.
df = df[columns_to_keep]
shape_after_selection = df.shape
print(f"DataFrame shape after selecting columns: {shape_after_selection}")

DataFrame shape after selecting columns: (968, 13)


In [5]:
# Separate the target from the predictors, then hold out 20% of the rows for
# testing. The fixed random_state makes the split reproducible.
y = df['rating']
X = df.drop(columns='rating')

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [7]:
# One-hot encode the team column.
# The encoder is fitted on the training rows only; with handle_unknown='ignore'
# a team that appears only in the test rows is encoded as all zeros instead of
# raising an error.
onehot = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

team_train = X_train[['team']]
team_test = X_test[['team']]
team_encoded_train = onehot.fit_transform(team_train)
team_encoded_test = onehot.transform(team_test)

# Names of the generated indicator columns for the team feature
team_columns = onehot.get_feature_names_out(['team'])

In [8]:
# Target-encode the two name columns.
# The encoder is fitted on the training rows and y_train only; the test rows
# are transformed with that fitted encoder.
# NOTE(review): if most players occur only once in the data, the training
# encoding may be driven largely by each row's own rating — confirm this is
# intended before trusting train-set scores.
name_columns = ['player_name', 'real_name']

te = TargetEncoder()
player_encoded_train = te.fit_transform(X_train[name_columns], y_train)
player_encoded_test = te.transform(X_test[name_columns])

In [9]:
# Assemble the model input: the raw numeric columns, then the one-hot team
# columns, then the target-encoded name columns, stacked side by side.
numeric_features = [
    'kd_ratio',
    'firepower_damage_per_round_win',
    'kills_per_round',
    'firepower_score',
    'impact',
    'trading_damage_per_kill',
    'kast',
    'entrying_support_rounds',
    'utility_time_opponent_flashed_per_round',
]

train_blocks = [X_train[numeric_features].values, team_encoded_train, player_encoded_train]
test_blocks = [X_test[numeric_features].values, team_encoded_test, player_encoded_test]

X_train_encoded = np.hstack(train_blocks)
X_test_encoded = np.hstack(test_blocks)

In [10]:
# Column labels for the stacked matrix, in the same order the pieces were
# concatenated: numeric features, one-hot team columns, then the two
# target-encoded name columns.
feature_names = numeric_features + list(team_columns) + ['player_name_encoded', 'real_name_encoded']

# Wrap the arrays in DataFrames, carrying over the original row labels.
# y_train / y_test keep the index of the source frame after the split, so
# without index=... the features would get a fresh 0..n-1 index and any
# index-aligned pandas operation against the targets (concat, assign,
# arithmetic between Series) would pair up the wrong rows. Positional
# consumers such as estimator .fit/.predict are unaffected by the index.
X_train_encoded = pd.DataFrame(X_train_encoded, columns=feature_names, index=X_train.index)
X_test_encoded = pd.DataFrame(X_test_encoded, columns=feature_names, index=X_test.index)

print("Shape of encoded training data:", X_train_encoded.shape)
print("Shape of encoded test data:", X_test_encoded.shape)
print("Feature names:", feature_names)

Shape of encoded training data: (774, 186)
Shape of encoded test data: (194, 186)
Feature names: ['kd_ratio', 'firepower_damage_per_round_win', 'kills_per_round', 'firepower_score', 'impact', 'trading_damage_per_kill', 'kast', 'entrying_support_rounds', 'utility_time_opponent_flashed_per_round', 'team_1win', 'team_359', 'team_3dmax', 'team_500', 'team_9 pandas', 'team_9ine', 'team_9ine academy', 'team_alliance', 'team_alter ego', 'team_alternate attax', 'team_ambush', 'team_amkal', 'team_anonymo', 'team_antic', 'team_apogee', 'team_arcred', 'team_astralis', 'team_atox', 'team_aurora', 'team_b8', 'team_bad news chickens', 'team_bc.game', 'team_bestia', 'team_betboom', 'team_betera', 'team_big', 'team_bleed', 'team_blue otter karma', 'team_bogatyri', 'team_boss', 'team_bounty hunters', 'team_case', 'team_cloud9', 'team_complexity', 'team_cph wolves', 'team_dash', 'team_dofg', 'team_drillas', 'team_dusty roots', 'team_dxa', 'team_e-xolos lazer', 'team_eclot', 'team_ecstatic', 'team_ence',

### Model Time

Time to train the models: linear regression, random forest, and XGBoost.



In [None]:
# Candidate regressors keyed by display name.
# The two tree-based models receive a fixed random_state so their results are
# repeatable between runs.
models = {}
models['Linear Regression'] = LinearRegression()
models['Random Forest'] = RandomForestRegressor(random_state=42)
models['XGBoost'] = XGBRegressor(random_state=42)