In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split, GroupKFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, classification_report

In [17]:
# Read 'VCT_2024.csv' into a DataFrame (relative path: resolved against the working directory)
data = pd.read_csv(filepath_or_buffer='VCT_2024.csv')

In [None]:
# Remove columns that are not used as model inputs, leaving a leaner feature table.
# Rationale recorded by the original author:
#   - Player: dropped so the model cannot memorise an individual player's performance pattern
#   - Region / Event: considered not useful for the prediction
#   - CL / R: described as tied to individual performance patterns
# NOTE: `data` is reassigned here, so re-running this cell without reloading the CSV
# raises KeyError because the listed columns are already gone.
drop_cols = ['Region', 'Player', 'Team Abbreviated', 'Event', 'CL', 'R']
data = data.drop(labels=drop_cols, axis=1)

In [19]:
# Columns that the imputation and scaling cells operate on.
# ACS is deliberately left out because the target label is derived from it.
numeric_cols = [
    'K:D', 'KAST', 'ADR', 'KPR', 'APR', 'FKPR',
    'FDPR', 'HS%', 'CL%', 'CW', 'CP',
]

In [20]:
# --- Binary target: 1 when ACS is at or above the dataset median, otherwise 0 ---
# The column is named 'Win', but as coded it flags above/below-median ACS rather than
# a match result. The original author chose the median over the mean as the threshold
# for robustness against outlier performances.
# The median is taken over every row currently in `data`.
# ACS is only used to build the label; it is removed from the feature matrix so the
# model cannot directly read the metric the target was derived from.
data['Win'] = data['ACS'].ge(data['ACS'].median()).astype(int)
X = data.drop(['Win', 'ACS'], axis=1)
y = data['Win']

In [21]:
# --- Preprocessing ---
# Step 1: one-hot encode the categorical 'Team' column.
# drop_first=True keeps k-1 indicator columns for k teams (e.g. 10 teams -> 9 columns),
# which avoids a linearly dependent set of dummy columns in the feature matrix.
X = pd.get_dummies(data=X, columns=['Team'], drop_first=True)

In [22]:
# Step 2: fill missing values in the listed numeric columns with each column's median.
# The median was chosen over the mean by the original author because it is less
# affected by extreme values and long-tailed (skewed) metrics.
# NOTE(review): the imputer is fitted on all rows, before the train/test split further
# down, so the medians are computed from rows that later end up in the test set.
imputer = SimpleImputer(strategy='median')
imputer.fit(X[numeric_cols])
X[numeric_cols] = imputer.transform(X[numeric_cols])

In [23]:
# Step 3: rescale the listed numeric columns with MinMaxScaler (default range [0, 1]).
# The original author notes that Random Forest is scale-invariant; the step is kept so
# the same matrix could be reused for scale-sensitive models (e.g. logistic regression).
# NOTE(review): the scaler is fitted on all rows, before the train/test split further
# down, so the min/max values are taken from rows that later end up in the test set.
scaler = MinMaxScaler()
scaler.fit(X[numeric_cols])
X[numeric_cols] = scaler.transform(X[numeric_cols])

In [24]:
# Sanity check: count missing values across the whole feature matrix (all columns,
# not only the imputed ones)
print("Remaining NaNs:", X.isna().to_numpy().sum())

Remaining NaNs: 0


In [25]:
# --- Train/test split (stratified random hold-out) ---
# `groups` holds each row's Team. It is NOT passed to train_test_split below, so rows
# from the same team can appear in both the training and the test partition; it is only
# consumed by the grouped cross-validation later in the notebook.
# (The Player column was dropped earlier, so it is not available as a grouping key here.)
groups = data['Team']
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    stratify=y,        # keep the 0/1 label ratio the same in both partitions
    random_state=42,   # fixed seed so the split can be reproduced
    test_size=0.3,     # 30% of the rows are held out for evaluation
)


In [26]:
# --- Random Forest with conservative (anti-overfitting) hyperparameters ---
# The classifier is the pipeline's only step; wrapping it keeps the sklearn estimator
# API and leaves room to add preprocessing steps later.
rf_pipeline = make_pipeline(
    RandomForestClassifier(
        class_weight='balanced',   # re-weight classes to handle class imbalance
        random_state=42,           # fixed seed for reproducible forests
        max_features='sqrt',       # default for Random Forest
        min_samples_split=10,      # a node needs at least 10 samples to be split
        max_depth=5,               # shallow trees
        n_estimators=50,           # reduced from 100 by the original author
    )
)

In [27]:
# --- Evaluation on the hold-out set ---
# Fit on the training partition, then predict the test partition in one chained call
# (fit returns the fitted pipeline itself).
y_pred = rf_pipeline.fit(X_train, y_train).predict(X_test)

# Report overall accuracy plus per-class precision / recall / F1
print("\n=== Model Performance ===")
print("Test Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))
print(classification_report(y_true=y_test, y_pred=y_pred))



=== Model Performance ===
Test Accuracy: 0.9356223175965666
              precision    recall  f1-score   support

           0       0.95      0.91      0.93       116
           1       0.92      0.96      0.94       117

    accuracy                           0.94       233
   macro avg       0.94      0.94      0.94       233
weighted avg       0.94      0.94      0.94       233



In [28]:
# 5-fold cross-validation where folds are formed by `groups` (Team), so all rows of a
# team fall into the same fold. No `scoring` is given, so the estimator's own score
# (accuracy for classifiers) is used.
# NOTE(review): X was imputed and scaled on all rows beforehand, so those preprocessing
# statistics are shared across folds.
print("\n=== Cross-Validation ===")
cv_scores = cross_val_score(
    estimator=rf_pipeline,
    X=X,
    y=y,
    groups=groups,
    cv=GroupKFold(n_splits=5),
)
print(f"CV Accuracy: {cv_scores.mean():.2%} (±{cv_scores.std():.2%})")


=== Cross-Validation ===
CV Accuracy: 92.81% (±1.20%)


In [29]:
# Show the ten features with the highest importance reported by the pipeline's final
# step. The guard skips the report when that step exposes no `feature_importances_`.
if hasattr(rf_pipeline.steps[-1][1], 'feature_importances_'):
    features = X.columns
    importances = rf_pipeline.steps[-1][1].feature_importances_
    feature_importance = pd.DataFrame({'Feature': features, 'Importance': importances})

    print("\n=== Top 10 Features ===")
    print(feature_importance.sort_values(by='Importance', ascending=False).iloc[:10])


=== Top 10 Features ===
   Feature  Importance
4      KPR    0.284247
3      ADR    0.226599
1      K:D    0.132462
12    KMax    0.059116
6     FKPR    0.057496
16      FK    0.054551
17      FD    0.028166
13       K    0.019549
14       D    0.017872
5      APR    0.017710


In [30]:
# Compare accuracy on the training partition with accuracy on the hold-out partition;
# a large gap suggests the model has overfit the training rows.
print("\n=== Overfitting Check ===")
test_accuracy = accuracy_score(y_test, y_pred)
train_accuracy = accuracy_score(y_train, rf_pipeline.predict(X_train))
print(f"Train Accuracy: {train_accuracy:.2%}")
print(f"Test Accuracy: {test_accuracy:.2%}")
print(f"Gap: {abs(train_accuracy - test_accuracy):.2%} (should be <5%)")


=== Overfitting Check ===
Train Accuracy: 95.76%
Test Accuracy: 93.56%
Gap: 2.19% (should be <5%)
