Setup

In [None]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import torch
import torch.nn
import torch.optim as optim
import sqlite3
import matplotlib.pyplot as plt

from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss
from sklearn.preprocessing import normalize

# Model imports to test
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB


from DataPreprocessing import DataPreprocesser
from MyDataset import MyDataset

In [None]:
# Open the match database and hand the connection/cursor to the preprocessing helper.
#
# A plain sqlite3.connect('dota2.db') silently creates a brand-new, empty database
# when the file is missing (for example when the notebook is started from a
# different working directory), so the mistake only surfaces later, far from its
# cause. Opening through a URI with mode=rw keeps normal read/write access but
# raises sqlite3.OperationalError right here if the file does not already exist.
DB_PATH = 'dota2.db'
connection = sqlite3.connect(f'file:{DB_PATH}?mode=rw', uri=True)
cursor = connection.cursor()
MyProcesser = DataPreprocesser(connection, cursor)

Load Data

In [None]:
# Load the data into dataframes, then merge them into a single table.
# NOTE(review): to_dataframes is invoked through the class with the instance passed
# explicitly; the call is kept exactly as written because its definition is not
# visible from this notebook.
DataPreprocesser.to_dataframes(MyProcesser)
data = MyProcesser.merge_data()

# Separate the label column from the remaining (feature) columns.
target_col = 'radiant_win'
y = data[target_col]
X = data.drop(columns=[target_col])

View Data

In [None]:
# Percentage of matches falling into each label value.
classes, counts = np.unique(y, return_counts=True)
percentages = 100 * counts / y.size

# Draw on the current axes through the explicit Axes interface.
ax = plt.gca()
bars = ax.bar([0, 1], percentages, color=['skyblue', 'salmon'])

# Print each percentage just above its bar.
for rect, pct in zip(bars, percentages):
    ax.text(
        rect.get_x() + rect.get_width() / 2,
        rect.get_height() + 1,
        f'{pct:.2f}%',
        ha='center',
        va='bottom',
    )

# Title, axis labels and one legend entry per bar.
ax.set_title('Percent Distribution of Radiant vs Dire Wins')
ax.set_xlabel('Value')
ax.set_ylabel('Percentage (%)')
for rect, legend_text in zip(bars, ('Dire Won', 'Radiant Won')):
    rect.set_label(legend_text)

ax.legend()
ax.set_xticks([0, 1])  # 0 is seen as a dire win, 1 is seen as a radiant win

plt.show()

Data Setup

In [None]:
# Hold out a fixed fraction of the rows for evaluation; the fixed seed makes the
# split identical on every run.
TEST_FRACTION = 0.2
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
)

In [None]:
# Wrap each split in the project's dataset class (no transforms applied),
# training split first, then the held-out split.
train_data, test_data = (
    MyDataset(features, labels, transforms=None)
    for features, labels in ((X_train, y_train), (X_test, y_test))
)

In [None]:
# Batch both datasets for mini-batch iteration.
BATCH_SIZE = 32

# The training loader reshuffles every epoch. Without an explicit generator the
# order depends on torch's global RNG state, so two runs of the notebook see
# different batches. A dedicated generator seeded with 42 (the same seed value the
# train/test split uses) makes the batch order reproducible; re-running this cell
# recreates the generator and therefore restarts the same sequence.
train_dataloader = DataLoader(
    train_data,
    batch_size=BATCH_SIZE,
    shuffle=True,
    generator=torch.Generator().manual_seed(42),
)

# Evaluation order does not matter, so the held-out data is never shuffled.
test_dataloader = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=False)

Setting Up Models

In [None]:
# Candidate classifiers to compare; the best-performing one will be employed.

# Hyperparameters stay at their defaults for now. The only argument added is
# random_state on the estimators that use randomness internally (SVC's probability
# calibration, tree construction, bootstrapping/subsampling in the ensembles).
# Without it the reported metrics change from run to run even though the
# train/test split itself is seeded. LogisticRegression (default solver) and
# GaussianNB do not use a random state.
MODEL_SEED = 42
models = {
    "Logistic Regression": LogisticRegression(),
    "SVM": SVC(probability=True, random_state=MODEL_SEED),  # probability=True enables predict_proba
    "Decision Tree": DecisionTreeClassifier(random_state=MODEL_SEED),
    "Random Forest": RandomForestClassifier(random_state=MODEL_SEED),
    "Gradient Boosting": GradientBoostingClassifier(random_state=MODEL_SEED),
    "Naive Bayes": GaussianNB()
}

# Neural networks to be added later; no architecture is defined yet, so the entry
# is a None placeholder.
neural_networks = {
    "MLP": None,
}

Train Models & Neural Networks

In [None]:
# Fit every candidate on the training split and report its hold-out metrics.
# The numbers are also stored in model_metrics (keyed by model name) so later
# cells can compare the models without retraining.
model_metrics = {}

for name, model in models.items():
    model.fit(X_train, y_train)  # Does the training itself
    y_pred = model.predict(X_test)

    # Class probabilities are computed once and shared by ROC AUC and log loss;
    # previously predict_proba ran twice per model on the same data.
    class_proba = model.predict_proba(X_test)
    # Columns follow model.classes_, so column 1 is the probability of the
    # second (positive) class.
    y_pred_proba = class_proba[:, 1]

    # Accuracy
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name} Accuracy: {accuracy * 100:.2f}%")

    # ROC_AUC
    roc_auc = roc_auc_score(y_test, y_pred_proba)
    print(f"ROC AUC Score: {roc_auc:.2f}")

    # Log loss; labels are passed explicitly so the probability columns are
    # matched against the classes the model was trained on rather than against
    # whatever labels happen to appear in y_test.
    loss = log_loss(y_test, class_proba, labels=model.classes_)
    print(f"Log Loss: {loss:.2f}")

    model_metrics[name] = {
        "accuracy": accuracy,
        "roc_auc": roc_auc,
        "log_loss": loss,
    }

In [None]:
# TODO: neural networks require a more complex training loop than the fit/predict
# calls used for the scikit-learn models; it will be set up later. Until then each
# entry is simply skipped.
for name, item in neural_networks.items():
    continue

Evaluate Models

Performance Analysis

Save Data