In [12]:
# Imports
import numpy as np
import pandas as pd
from category_encoders import TargetEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from xgboost import XGBRegressor

In [13]:
# Load the cleaned player table and report how many values are missing per column.
# NOTE: unpickling can execute arbitrary code, so only load pickle files
# produced by this project's own pipeline.
data_path = '../CleanData/all_players_df.pkl'
df = pd.read_pickle(data_path)

missing_per_column = df.isna().sum()
print(missing_per_column)

player_name                                0
real_name                                  0
team                                       0
age                                        0
rating                                     0
                                          ..
utility_damage_per_round                   0
utility_kills_per_100_rounds               0
utility_flashes_thrown_per_round           0
utility_flash_assists_per_round            0
utility_time_opponent_flashed_per_round    0
Length: 69, dtype: int64


In [14]:
# The 'age' values in the current data are known to be wrong, so the column is
# left out for now. TODO: fix the age data and bring the feature back.
# errors='ignore' means a missing 'age' column is not an error, so the cell can be re-run.
df = df.drop(columns='age', errors='ignore')

In [15]:
# Columns selected via feature selection, grouped by the role they play
numeric_columns = [
    'kd_ratio', 'firepower_damage_per_round_win', 'kills_per_round',
    'firepower_score', 'impact', 'trading_damage_per_kill', 'kast',
    'entrying_support_rounds', 'utility_time_opponent_flashed_per_round',
]
categorical_columns = ['team']
target_columns = ['rating']

# Order matters downstream: numeric features, then categorical, then the target
columns_to_keep = numeric_columns + categorical_columns + target_columns

# Keep the selected columns, plus the two name columns (for reference only)
df = df.loc[:, columns_to_keep + ['player_name', 'real_name']]
print(f"DataFrame shape after selecting columns: {df.shape}")

DataFrame shape after selecting columns: (968, 13)


In [16]:
# Separate predictors from the target; the two name columns are identifiers,
# not features, so they are removed from X as well
X = df.drop(columns=['rating', 'player_name', 'real_name'])
y = df['rating']

# Hold out 20% of the rows as a test set (fixed seed keeps the split reproducible)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Recover the player names of each split through the row index, for later reference
train_names, test_names = (
    df.loc[split.index, ['player_name', 'real_name']]
    for split in (X_train, X_test)
)

In [17]:
# Target Encoding for 'team'
# The encoder is fitted on the training split only (training features plus
# training ratings); the already-fitted encoder is then applied to the test
# split, so the test ratings are never passed to it.
# NOTE(review): how teams that appear only in the test split are encoded depends
# on the encoder's unknown-category handling — confirm against category_encoders docs.
te_team = TargetEncoder(cols=['team'])
X_train_encoded = te_team.fit_transform(X_train, y_train)
X_test_encoded = te_team.transform(X_test)

In [18]:
# Every feature except 'team' gets standardized; 'team' keeps the value
# produced by the target encoder
numeric_features = X_train.columns[X_train.columns != 'team'].tolist()

# Learn the scaling statistics from the training data only
scaler = StandardScaler()
scaler.fit(X_train_encoded[numeric_features])

# Apply the same fitted scaler to both splits
X_train_encoded[numeric_features] = scaler.transform(X_train_encoded[numeric_features])
X_test_encoded[numeric_features] = scaler.transform(X_test_encoded[numeric_features])

print("Shape of encoded training data:", X_train_encoded.shape)
print("Shape of encoded test data:", X_test_encoded.shape)
print("Feature names:", X_train_encoded.columns.tolist())

Shape of encoded training data: (774, 10)
Shape of encoded test data: (194, 10)
Feature names: ['kd_ratio', 'firepower_damage_per_round_win', 'kills_per_round', 'firepower_score', 'impact', 'trading_damage_per_kill', 'kast', 'entrying_support_rounds', 'utility_time_opponent_flashed_per_round', 'team']


### Model Time

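Train three regressors (Linear Regression, Random Forest, and XGBoost) on the encoded, scaled training features and compare them on the held-out test set.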



In [19]:
# Candidate regressors, keyed by the display name used when reporting results.
# The tree ensembles share the same seed and number of estimators.
models = {}
models['Linear Regression'] = LinearRegression()
models['Random Forest'] = RandomForestRegressor(n_estimators=100, random_state=42)
models['XGBoost'] = XGBRegressor(n_estimators=100, random_state=42)

In [20]:
# Train every candidate model and score it on the held-out test split.
# `results` maps model name -> {'model': fitted estimator, 'mse', 'mae', 'r2'};
# later cells read the fitted estimator and the R-squared from it.
results = {}
for name, model in models.items():
    # Fit on the encoded + scaled training data
    model.fit(X_train_encoded, y_train)

    # Predict the ratings of the test players
    y_pred = model.predict(X_test_encoded)

    # Error metrics on the test split (lower MSE/MAE and higher R-squared are better)
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Keep the fitted model together with its scores
    results[name] = {'model': model, 'mse': mse, 'mae': mae, 'r2': r2}

    print(f"{name}:")
    print(f"  MSE: {mse:.4f}")
    print(f"  MAE: {mae:.4f}")
    print(f"  R-squared: {r2:.4f}\n")

Linear Regression:
  MSE: 0.0020
  MAE: 0.0374
  R-squared: 0.8853

Random Forest:
  MSE: 0.0018
  MAE: 0.0314
  R-squared: 0.9003

XGBoost:
  MSE: 0.0021
  MAE: 0.0344
  R-squared: 0.8804



In [21]:
# The best model is the one with the highest test-set R-squared
r2_by_model = {model_name: entry['r2'] for model_name, entry in results.items()}
best_model = max(r2_by_model, key=r2_by_model.get)
print(f"Best model: {best_model}")

Best model: Random Forest


In [22]:
# Fetch the fitted estimator stored for the winning model and re-predict the
# test set with it (y_pred from the training loop belongs to the last model fitted)
best_model_name = best_model
best_entry = results[best_model_name]
best_model_object = best_entry['model']
y_pred = best_model_object.predict(X_test_encoded)

In [23]:
# Create a DataFrame with predictions and actual values, one row per test player.
# The name, team and actual-rating columns are pandas objects that carry the
# test-set row index, while y_pred comes straight from predict() and is assumed
# to be a plain array in X_test row order. Wrapping it in a Series with the
# X_test index makes every column align by label instead of relying on
# positional order.
predictions_df = pd.DataFrame({
    'player_name': test_names['player_name'],
    'real_name': test_names['real_name'],
    'team': X_test['team'],
    'actual_rating': y_test,
    'predicted_rating': pd.Series(y_pred, index=X_test.index),
})

In [24]:
# Signed error: positive means the model predicted a higher rating than the actual one
predictions_df['rating_difference'] = predictions_df['predicted_rating'].sub(
    predictions_df['actual_rating']
)

# Error magnitude, used to rank predictions from most to least accurate
predictions_df['abs_difference'] = predictions_df['rating_difference'].abs()
predictions_df_sorted = predictions_df.sort_values(by='abs_difference')

print("Top 10 most accurate predictions:")
print(predictions_df_sorted.head(10))

print("\nTop 10 least accurate predictions:")
print(predictions_df_sorted.tail(10))


Top 10 most accurate predictions:
    player_name           real_name        team  actual_rating  \
876       AdreN  Dauren Kystaubayev     no team       0.333333   
923     SEMINTE      Valentin Bodea     no team       0.274510   
626      f0rest     Patrik Lindberg     no team       0.568627   
76       regali       Iulian Harjău    entropiq       0.666667   
450        eraa       Sean Knutsson  cph wolves       0.509804   
260      interz    Timofey Yakushin      cloud9       0.352941   
892    oskarish  Oskar Stenborowski     no team       0.313725   
231        asap      Tyson Paterson     rooster       0.647059   
755       tarik         Tarik Celik     no team       0.431373   
535    innocent         Paweł Mocek      rebels       0.352941   

     predicted_rating  rating_difference  abs_difference  
876          0.333137          -0.000196        0.000196  
923          0.274314          -0.000196        0.000196  
626          0.568039          -0.000588        0.000588  
76 

## Predictions are good

With solid predictions, a clear best model (Random Forest), and a close second (Linear Regression, ranked by R-squared), I am going to save the data and models and move on to optimization.