In [20]:
# IMPORTS
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import MinMaxScaler

Read in dset

In [2]:
df = pd.read_csv('../data/processed/total.csv')

Drop na's, duplicates

In [9]:
df = df.dropna()
df = df.drop_duplicates()

Select features to use

In [33]:
# Specifying which features are being used
X_cols = [col for col in df.columns if 
             "_0" in col 
             or "_1" in col 
             or "_3" in col 
             or "_5" in col 
             or "_10" in col 
             or "_20" in col
             or "IS_HOME" in col]
print("Features used:")

X = df[X_cols].copy()
print(X.columns)
y = df["TEAM_WINS_A"].copy()

Features used:
Index(['TEAM_WINS_prev_0_A', 'PTS_for_prev_0_A', 'PTS_against_prev_0_A',
       'FG_PCT_for_prev_0_A', 'FG_PCT_against_prev_0_A',
       'FG3_PCT_for_prev_0_A', 'FG3_PCT_against_prev_0_A', 'AST_for_prev_0_A',
       'AST_against_prev_0_A', 'REB_for_prev_0_A',
       ...
       'PTS_against_prev_20_B', 'FG_PCT_for_prev_20_B',
       'FG_PCT_against_prev_20_B', 'FG3_PCT_for_prev_20_B',
       'FG3_PCT_against_prev_20_B', 'AST_for_prev_20_B',
       'AST_against_prev_20_B', 'REB_for_prev_20_B', 'REB_against_prev_20_B',
       'IS_HOME_B'],
      dtype='object', length=134)


Change data to np array or tensor, AFTER feature selection stuff

In [34]:
X = X.values
y = y.values

Scale X

In [35]:
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

Split the data into train test

In [86]:
# Splitting data into training and testing data
X_train, X_test, y_train, y_true = train_test_split(X, y, test_size=0.2, random_state=2)
print('Split complete')

Split complete


Define model, train

In [131]:
# Model training
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train, y_train)
print("Trained!")

# Get training accuracy
print("Accuracy: {:.2f}".format(1 - mean_absolute_error(lr_model.predict(X_train), y_train)))

Trained!
Accuracy: 0.66


Evaluate model performance

In [88]:
# Tests Logistic Regression Model
y_pred = lr_model.predict(X_test)
y_pred = y_pred.round()
print("ACCURACY:")
print(1 - mean_absolute_error(y_pred, y_true))

ACCURACY:
0.6524565092716498


In [89]:
# Confusion matrix
from sklearn.metrics import confusion_matrix
conf_matrix = confusion_matrix(y_true, y_pred)
conf_matrix

array([[3414, 1840],
       [1796, 3412]])

In [24]:
# Shows how often home team is predicted vs away team
# TODO


Test with torch model...

In [90]:
import torch
import torch.nn as nn
from sklearn.metrics import accuracy_score

Redefine data as torch tensor

In [91]:
X_tr = torch.from_numpy(X_train.astype(np.float32))
y_tr= torch.from_numpy(y_train.astype(np.float32)).view(-1, 1)
X_te = torch.from_numpy(X_test.astype(np.float32))
y_te = torch.from_numpy(y_true.astype(np.float32)).view(-1, 1)

In [107]:
# Model
class LogisticRegressionModel(nn.Module):
    def __init__(self, input_size):
        super(LogisticRegressionModel, self).__init__()
        self.linear = nn.Linear(input_size, 1)
        """self.linear2 = nn.Linear(input_size * 2, input_size * 2)
        self.linear3 = nn.Linear(input_size * 2, 1)"""
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        out = self.linear(x)
        """out = self.linear2(out)
        out = self.linear3(out)"""
        out = self.sigmoid(out)
        return out

# Instantiate the model
input_size = X_tr.shape[1]
model = LogisticRegressionModel(input_size)

# Define loss function and optimizer
criterion = nn.BCELoss()  # Binary Cross Entropy Loss
optimizer = torch.optim.LBFGS(model.parameters(), lr=0.01, max_iter=100)
#optimizer = torch.optim.SGD(model.parameters(), lr=0.3)  

def closure():
    optimizer.zero_grad()  # Zero the gradients to clear the previous values
    y_pred = model(X_tr)  # Forward pass to compute predictions
    loss = criterion(y_pred, y_tr)  # Compute the main loss

    # Add L2 regularization (weight decay)
    l2_lambda = 0.01  # Regularization strength (adjust as needed)
    l2_reg = sum(p.pow(2.0).sum() for p in model.parameters())  # Compute L2 norm of model parameters
    loss += l2_lambda * l2_reg  # Add L2 regularization term to the loss

    loss.backward()  # Backward pass to compute gradients
    return loss  # Return the total loss (including regularization term)

In [108]:
# Training the model
epochs = 10
for epoch in range(epochs):
    optimizer.step(closure)  # Update the model's parameters by one step

    if True:
        print(f'Epoch [{epoch}/{epochs}], Loss: {closure().item():.4f}')
    lowest_loss = min(100, closure().item())

Epoch [0/10], Loss: 0.6582
Epoch [1/10], Loss: 0.6508
Epoch [2/10], Loss: 0.6497
Epoch [3/10], Loss: 0.6496
Epoch [4/10], Loss: 0.6496
Epoch [5/10], Loss: 0.6496
Epoch [6/10], Loss: 0.6496
Epoch [7/10], Loss: 0.6496
Epoch [8/10], Loss: 0.6496
Epoch [9/10], Loss: 0.6496


In [127]:
# Testing the model
with torch.no_grad():
    logit_pred = model(X_te)
    y_pred = (y_pred >= 0.5).float()
    
    # Evaluate the accuracy
    accuracy = accuracy_score(y_pred, y_true)
    print("Accuracy:", accuracy)
    print("Lowest Loss:", lowest_loss)

Accuracy: 0.6356337220416747
Lowest Loss: 0.6495757102966309


TESTING RANDOM FOREST MODEL

In [111]:
from sklearn.ensemble import RandomForestClassifier

In [128]:
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)

In [129]:
# Evaluate the model
# Predict on the test set
y_pred = rf_classifier.predict(X_test)

accuracy = accuracy_score(y_true, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.6426113553813803


GRADIENT BOOSTING MACHINE

In [117]:
from sklearn.ensemble import GradientBoostingClassifier

In [118]:
gb_classifier = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42)
gb_classifier.fit(X_train, y_train)

# Predict on the test set
y_pred = gb_classifier.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_true, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.6499713247944944


SUPPORT VECTOR MACHINES

In [132]:
from sklearn.svm import SVC

In [135]:
svm_classifier = SVC(kernel='rbf', C=1.0, gamma='scale', probability=True, random_state=42)

# Train the SVM classifier
svm_classifier.fit(X_train, y_train)

In [134]:
# Evaluation
y_pred = svm_classifier.predict(X_test)
accuracy = accuracy_score(y_true, y_pred)
print("Accuracy:", accuracy)

NotFittedError: This SVC instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.