<a href="https://colab.research.google.com/github/RonanD10/Tennis-Match-Prediction/blob/main/match_winner_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import clear_output
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC

import torch
import torch.nn as nn
import torch.nn.functional as F

from google.colab import drive, files
drive.mount('/content/drive')

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
features = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Tennis Project/features_processed_random.csv')

In [13]:
# Overall data
X = features.drop(['A_won', 'slam_round'], axis=1)
y = features['A_won']
X_train = X[:int(0.8 * len(X))]
y_train = y[:int(0.8 * len(y))]
X_test = X[int(0.8 * len(y)):]
y_test = y[int(0.8 * len(y)):]

# Slam data
slam_features = features[features['slam_round'] > 0]
X_test_slam = features[int(0.8 * len(y)):]
X_test_slam = X_test_slam[X_test_slam['slam_round'] > 0]
y_test_slam = X_test_slam['A_won']
X_test_slam = X_test_slam.drop(['A_won', 'slam_round'], axis=1)

# Scale data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
X_test_slam = scaler.transform(X_test_slam)

In [None]:
# Comparing models

models = [LogisticRegression(max_iter=10000), RandomForestClassifier(), SVC()]

# Overall predictions
for model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f'{model.__class__.__name__} accuracy: {accuracy}')

# Slam predictions only
for model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test_slam)
    accuracy = accuracy_score(y_test_slam, y_pred)
    print(f'{model.__class__.__name__} accuracy: {accuracy}')

# Slam predictions, trained only on slam data
X_train_slam = slam_features[:int(0.8 * len(slam_features))]
y_train_slam = X_train_slam['A_won']
X_train_slam = X_train_slam.drop(['A_won', 'slam_round'], axis=1)

for model in models:
    model.fit(X_train_slam, y_train_slam)
    y_pred = model.predict(X_test_slam)
    accuracy = accuracy_score(y_test_slam, y_pred)
    print(f'{model.__class__.__name__} accuracy: {accuracy}')


In [6]:
# Accuracy per slam round

def slam_accuracy(features, round, model):

    X = features.drop(['A_won', 'slam_round'], axis=1)
    y = features['A_won']
    X_train = X[:int(0.8 * len(X))]
    y_train = y[:int(0.8 * len(y))]

    test = features[int(0.8 * len(y)):]
    test_slams = test[test['slam_round'] == round]
    X_test = test_slams.drop(['A_won', 'slam_round'], axis=1)
    y_test = test_slams['A_won']

    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    return accuracy

model = RandomForestClassifier()
for i in range(1, 8):
    accuracy = slam_accuracy(features, i, model)
    print(f'Accuracy for slam round {i}: {accuracy}')


Accuracy for slam round 1: 0.8402041617589321
Accuracy for slam round 2: 0.8056206088992974
Accuracy for slam round 3: 0.7667682926829268
Accuracy for slam round 4: 0.7530487804878049
Accuracy for slam round 5: 0.75
Accuracy for slam round 6: 0.8048780487804879
Accuracy for slam round 7: 0.7560975609756098


In [12]:
# Slam upsets (lower ranked wins)

slam_features_upset_A = slam_features[(slam_features['A_rank'] > slam_features['B_rank']) & (slam_features['A_won'] == 1)]
slam_features_upset_B = slam_features[(slam_features['A_rank'] < slam_features['B_rank']) & (slam_features['A_won'] == 0)]
slam_features_upset = pd.concat([slam_features_upset_A, slam_features_upset_B])
y_test_slam_upset = slam_features_upset['A_won']
slam_features_upset = slam_features_upset.drop(['A_won', 'slam_round'], axis=1)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
slam_features_upset = scaler.transform(slam_features_upset)

model.fit(X_train, y_train)
y_pred = model.predict(slam_features_upset)
accuracy = accuracy_score(y_test_slam_upset, y_pred)
print(f'Upset prediction accuracy: {accuracy}')

Upset prediction accuracy: 0.5914718019257221


In [7]:
# Analyse model coeffs -- graphs and explanations
# Compare classification models (gridsearchCV) & explain why better and worse
#

## Deep Learning Model

In [None]:
class Model(nn.Module):
  def __init__(self, in_features=21, h1=24, h2=24, out_features=2):
    super().__init__()
    self.fc1 = nn.Linear(in_features, h1)
    self.fc2 = nn.Linear(h1, h2)
    self.out = nn.Linear(h2, out_features)

  def forward(self, x):
    x = F.relu(self.fc1(x))
    x = F.relu(self.fc2(x))
    x = self.out(x)

    return x

model = Model()

X_train = torch.FloatTensor(X_train)
X_test = torch.FloatTensor(X_test)

y_train = torch.LongTensor(y_train)
y_test = torch.LongTensor(list(y_test))

In [None]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

epochs = 1000
losses = []
for i in range(epochs):
  y_pred = model.forward(X_train)
  loss = criterion(y_pred, y_train)
  if loss < 5:
    losses.append(loss.detach().numpy())

  if i % 50 == 0:
    print(f'Epoch: {i} and loss: {loss}')

  optimizer.zero_grad()
  loss.backward()
  optimizer.step()

Epoch: 0 and loss: 0.41927477717399597
Epoch: 50 and loss: 0.4226013123989105
Epoch: 100 and loss: 0.42005351185798645
Epoch: 150 and loss: 0.41951823234558105
Epoch: 200 and loss: 0.4192602336406708
Epoch: 250 and loss: 0.4191246032714844
Epoch: 300 and loss: 0.41901645064353943
Epoch: 350 and loss: 0.41892099380493164
Epoch: 400 and loss: 0.41883841156959534
Epoch: 450 and loss: 0.4187612235546112
Epoch: 500 and loss: 0.41867920756340027
Epoch: 550 and loss: 0.41860705614089966
Epoch: 600 and loss: 0.4185396432876587
Epoch: 650 and loss: 0.4184684157371521
Epoch: 700 and loss: 0.41840142011642456
Epoch: 750 and loss: 0.4183393716812134
Epoch: 800 and loss: 0.41828030347824097
Epoch: 850 and loss: 0.4182570278644562
Epoch: 900 and loss: 0.41818875074386597
Epoch: 950 and loss: 0.41809308528900146


In [None]:
with torch.no_grad():
  y_eval = model.forward(X_test)
  loss = criterion(y_eval, y_test)

print(f'Loss: {loss}')
print(f'Accuracy: {accuracy_score(y_test, torch.argmax(y_eval, dim=1))}')
print(sum(torch.argmax(y_eval, dim=1) != y_test))
print(len(y_test))

Loss: 0.40476226806640625
Accuracy: 0.8095705040203962
tensor(971)
5099


In [None]:
# Accuracy per slam round

In [None]:
plt.plot(range(len(losses)), losses)
plt.ylabel("loss/error")
plt.xlabel('Epoch')

In [None]:
# Predicting outcome based only on ranking
correct = 0
for i in range(len(features)):
    if features.loc[i, 'A_rank'] < features.loc[i, 'B_rank'] and features.loc[i, 'A_won'] == 1:
        correct += 1
    elif features.loc[i, 'A_rank'] > features.loc[i, 'B_rank'] and features.loc[i, 'A_won'] == 0:
        correct += 1
    else:
        pass

print('Rank accuracy: ', correct/len(features))

# Predicting outcome based only on Elo
correct = 0
for i in range(len(features)):
    if features.loc[i, 'A_elo'] > features.loc[i, 'B_elo'] and features.loc[i, 'A_won'] == 1:
        correct += 1
    elif features.loc[i, 'A_elo'] < features.loc[i, 'B_elo'] and features.loc[i, 'A_won'] == 0:
        correct += 1
    else:
        pass

print('Elo accuracy: ', correct/len(features))

# Predicting outcome based only on 12 month form
correct = 0
for i in range(len(features)):
    if features.loc[i, 'A_12month_form'] > features.loc[i, 'B_12month_form'] and features.loc[i, 'A_won'] == 1:
        correct += 1
    elif features.loc[i, 'A_12month_form'] < features.loc[i, 'B_12month_form'] and features.loc[i, 'A_won'] == 0:
        correct += 1
    else:
        pass

print('12-month form accuracy: ', correct/len(features))

Rank accuracy:  0.6595385753496636
Elo accuracy:  0.6650834145700676
12-month form accuracy:  0.718582125152967
