In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# Load data
data = pd.read_csv('VCT_2024.csv')

# Drop irrelevant columns
drop_cols = ['Region', 'Player', 'Team Abbreviated', 'Event', 'CL', 'R']
data = data.drop(columns=drop_cols)

# Define numeric columns (including ACS)
numeric_cols = ['ACS', 'K:D', 'KAST', 'ADR', 'KPR', 'APR', 'FKPR', 'FDPR', 'HS%', 'CL%', 'CW', 'CP']

# --- Define Binary Target (Win=1, Loss=0) ---
data['Win'] = (data['ACS'] >= data['ACS'].median()).astype(int)
y = data['Win']
X = data.drop(columns=['Win']);,

SyntaxError: invalid syntax (367092213.py, line 14)

In [None]:
# --- Preprocessing ---
# 1. Encode categorical variables (Team)
X = pd.get_dummies(X, columns=['Team'], drop_first=True)

# 2. Handle missing values using SimpleImputer
imputer = SimpleImputer(strategy='mean')  # Replace NaNs with mean
X[numeric_cols] = imputer.fit_transform(X[numeric_cols])

In [None]:
# 3. Normalize numeric features
scaler = MinMaxScaler()
X[numeric_cols] = scaler.fit_transform(X[numeric_cols])

In [None]:
# 4. Final check for NaNs (should print 0)
print("Remaining NaNs:", X.isna().sum().sum())

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# --- Train Random Forest ---
# Using a pipeline to ensure no data leakage
rf_pipeline = make_pipeline(
    RandomForestClassifier(n_estimators=100, random_state=42, max_depth=3, max_features='sqrt', min_samples_split=10)
)
rf_pipeline.fit(X_train, y_train)

# Evaluate
y_pred = rf_pipeline.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Feature importance (Random Forest specific)
if hasattr(rf_pipeline.steps[-1][1], 'feature_importances_'):
    importances = rf_pipeline.steps[-1][1].feature_importances_
    features = X.columns
    feature_importance = pd.DataFrame({'Feature': features, 'Importance': importances})
    print("\nFeature Importance:")
    print(feature_importance.sort_values('Importance', ascending=False))

In [None]:
train_accuracy = rf_pipeline.score(X_train, y_train)  
test_accuracy = accuracy_score(y_test, y_pred)  
print(f"Train Accuracy: {train_accuracy:.2%}")  
print(f"Test Accuracy: {test_accuracy:.2%}")  

In [None]:
importances = rf_pipeline.steps[-1][1].feature_importances_
features = X.columns
pd.DataFrame({"Feature": features, "Importance": importances}).sort_values("Importance", ascending=False)

In [None]:
from sklearn.model_selection import cross_val_score
cv_scores = cross_val_score(rf_pipeline, X, y, cv=5)
print(f"CV Accuracy: {cv_scores.mean():.2%} (Â±{cv_scores.std():.2%})")