In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler

In [38]:
#1. Data Collection
# Load the dataset
data = pd.read_csv('VCT_2024.csv')

# Display the first few rows of the dataset
columns = ['Region', 'Player', 'Team Abbreviated', 'Event', 'CL']
data = data.drop(columns=columns)
nanColumns = ['R', 'KAST', 'ADR', 'FKPR', 'FDPR', 'HS%', 'CL%', 'CW', 'CP']
for nan in nanColumns:
    data[[nan]] = data[[nan]].fillna(data[nan].mean())

data.head()

data.info()
print(data['R'].describe())
print(data['R'].value_counts())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 775 entries, 0 to 774
Data columns (total 21 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Team    775 non-null    object 
 1   Rnd     775 non-null    int64  
 2   R       775 non-null    float64
 3   ACS     775 non-null    float64
 4   K:D     775 non-null    float64
 5   KAST    775 non-null    float64
 6   ADR     775 non-null    float64
 7   KPR     775 non-null    float64
 8   APR     775 non-null    float64
 9   FKPR    775 non-null    float64
 10  FDPR    775 non-null    float64
 11  HS%     775 non-null    float64
 12  CL%     775 non-null    float64
 13  CW      775 non-null    float64
 14  CP      775 non-null    float64
 15  KMax    775 non-null    int64  
 16  K       775 non-null    int64  
 17  D       775 non-null    int64  
 18  A       775 non-null    int64  
 19  FK      775 non-null    int64  
 20  FD      775 non-null    int64  
dtypes: float64(13), int64(7), object(1)
mem

In [None]:
#2. Data Preprocessing
# Normalize selected numeric columns
num_cols = ['ACS', 'K:D', 'KAST', 'ADR', 'KPR', 'APR', 'FKPR', 'FDPR', 'HS%', 'CL%', 'CW', 'CP']
scaler = MinMaxScaler()
data[num_cols] = scaler.fit_transform(data[num_cols])

# Encoding categorical columns
data = pd.get_dummies(data, columns=['Team'], drop_first=True)

# Choose target variable - assuming 'R' is the target for prediction
target_column = 'R'  # Adjust this if you have a specific target for winning
X = data.drop(columns=[target_column])
y = data[target_column]


In [36]:
#3. Train/Test Split
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [37]:
#4. Model Training and Evaluation
# Initialize SVM model
svm_model = SVC(kernel='linear', C=1, probability=True)
svm_model.fit(X_train, y_train)

# Initialize Random Forest Classifier
rf_model = RandomForestClassifier(random_state=42)

# Hyperparameter tuning for Random Forest
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30]
}
grid_search = GridSearchCV(rf_model, param_grid, cv=3)
grid_search.fit(X_train, y_train)
best_rf_model = grid_search.best_estimator_

print("Best parameters for Random Forest:", grid_search.best_params_)


ValueError: Unknown label type: continuous. Maybe you are trying to fit a classifier, which expects discrete classes on a regression target with continuous values.

In [None]:
#5. Model Evaluation
# Predict using SVM
y_pred_svm = svm_model.predict(X_test)

# Predict using Random Forest
y_pred_rf = best_rf_model.predict(X_test)

# Calculate metrics for SVM
svm_accuracy = accuracy_score(y_test, y_pred_svm)
svm_precision = precision_score(y_test, y_pred_svm, average='weighted', zero_division=0)
svm_recall = recall_score(y_test, y_pred_svm, average='weighted')
svm_f1 = f1_score(y_test, y_pred_svm, average='weighted')

# Calculate metrics for Random Forest
rf_accuracy = accuracy_score(y_test, y_pred_rf)
rf_precision = precision_score(y_test, y_pred_rf, average='weighted', zero_division=0)
rf_recall = recall_score(y_test, y_pred_rf, average='weighted')
rf_f1 = f1_score(y_test, y_pred_rf, average='weighted')

# Display results
results_df = pd.DataFrame({
    'Metric': ['Accuracy', 'Precision', 'Recall', 'F1 Score'],
    'SVM': [svm_accuracy, svm_precision, svm_recall, svm_f1],
    'Random Forest': [rf_accuracy, rf_precision, rf_recall, rf_f1]
})

print(results_df)


In [None]:
#6. Visualization of Results
# Plotting performance comparison
results_df.set_index('Metric').plot(kind='bar', figsize=(10, 6))
plt.title('Model Performance Comparison')
plt.ylabel('Score')
plt.xticks(rotation=0)
plt.ylim(0, 1)
plt.show()
