In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

In [3]:
from src.dataset import load_team_data, load_agg_player_data
from src.preprocessing import impute_missing_values, split_data, remove_name_columns, encode_target_variable, remove_na_columns, find_knee_point

# --- Load the raw tables and assemble the feature matrix ---
team_statistics, y = load_team_data()
player_statistics = load_agg_player_data()

# Column-wise concatenation; join='inner' keeps only index labels present in both frames.
# NOTE(review): y is not re-indexed after this join -- confirm that split_data
# aligns x and y when the inner join drops rows.
x = remove_name_columns(
    pd.concat([team_statistics, player_statistics], axis=1, join='inner')
)
y = encode_target_variable(y)
(x_train, y_train), (x_val, y_val), (x_test, y_test) = split_data(x, y)

# --- Imputation ---
# The imputer and numeric column list returned for the training split are
# passed on to the validation and test calls.
x_train, imputer, numeric_columns = impute_missing_values(x_train)
x_val, x_test = (
    impute_missing_values(frame, imputer=imputer, numeric_columns=numeric_columns)[0]
    for frame in (x_val, x_test)
)

# --- NA-column removal ---
# The column list returned for the training split is reused for validation and test.
x_train, non_na_columns = remove_na_columns(x_train)
x_val, x_test = (
    remove_na_columns(frame, non_na_columns=non_na_columns)[0]
    for frame in (x_val, x_test)
)

# --- Targets as flat NumPy arrays ---
y_train, y_val, y_test = (
    labels.to_numpy().flatten() for labels in (y_train, y_val, y_test)
)

In [4]:
# Snapshot the preprocessed splits. The following cell restores x_train / x_val /
# x_test from these copies, so they must exist on a fresh kernel (Restart & Run All).
# Run this cell before any cell that overwrites x_train / x_val / x_test;
# re-running it afterwards would snapshot the modified frames instead.
x_train_clean = x_train.copy()
x_val_clean = x_val.copy()
x_test_clean = x_test.copy()

In [29]:
# Reset the working splits to fresh copies of the *_clean snapshots, so the
# cells below can be re-run from the same starting frames.
x_train, x_val, x_test = (
    snapshot.copy() for snapshot in (x_train_clean, x_val_clean, x_test_clean)
)

In [30]:
# === Load mutual information feature ===
# Scores are read from disk and used below to index x_train.columns.
# NOTE(review): presumably one score per column of x_train, in column order -- confirm.
scores = np.load("features_importance_mutual_info_based.npy")

# Indices that rank the scores from highest to lowest.
order = np.argsort(scores)[::-1]
scores_sorted = scores[order]

# Collect k knee points: each new one is searched in the tail of the sorted
# scores that starts at the previous knee, then shifted back to a global index.
k = 10
knee_indices = [find_knee_point(scores_sorted)]

for i in range(k - 1):
    last_knee = knee_indices[-1]
    knee_indices.append(find_knee_point(scores_sorted[last_knee:]) + last_knee)
# =======================================

# === Extract best features ===
# Keep every column ranked before the knee point at position index_knee.
index_knee = 2
columns_selected = x_train.columns[order[:knee_indices[index_knee]]]

features = list(columns_selected)
# Drop the first five characters of each name.
# NOTE(review): assumes every column starts with "HOME_" or "AWAY_" -- confirm.
features = set([feature[5:] for feature in features])

# Iterate in sorted order: iteration order of a set of strings can change between
# kernel sessions, which would make the column order irreproducible.
columns_to_keep = ["HOME_" + feature for feature in sorted(features)] + ["AWAY_" + feature for feature in sorted(features)]

# The selection is not applied to x_train / x_val / x_test in this cell;
# the frames are subset later, once columns_to_keep has been extended.
# =============================

In [31]:
from src.preprocessing import data_augmentation

# Feature names read from CSV and flattened to a 1-D array.
best_features = pd.read_csv('best_features_team_agg_based.csv').values.flatten()

# Apply the same augmentation to each split.
# NOTE: this overwrites x_train / x_val / x_test, so re-running the cell without
# first restoring the clean snapshots augments already-augmented frames.
x_train, x_val, x_test = (
    data_augmentation(frame, best_features) for frame in (x_train, x_val, x_test)
)

# From here on best_features is a plain Python list.
best_features = list(best_features)

In [32]:
# Extend the selection with, for every best feature, its "_DIFF" column plus its
# "HOME_" and "AWAY_" columns.
# NOTE(review): the "_DIFF" columns are presumably created by data_augmentation -- confirm.
columns_to_keep = (
    columns_to_keep
    + [best_feature + "_DIFF" for best_feature in best_features]
    + ["HOME_" + best_feature for best_feature in best_features]
    + ["AWAY_" + best_feature for best_feature in best_features]
)
# De-duplicate and sort: list(set(...)) alone yields an order that can change between
# kernel sessions, so the column order fed to the models would not be reproducible.
columns_to_keep = sorted(set(columns_to_keep))

In [33]:
# Restrict every split to the selected columns, in the order given by columns_to_keep.
x_train, x_val, x_test = (
    frame[columns_to_keep] for frame in (x_train, x_val, x_test)
)

In [35]:
from sklearn.svm import SVC

# SVC with a linear kernel, C=1.
# NOTE(review): the test split is scored for every configuration in this notebook;
# consider reserving it for the finally selected model.
model = SVC(C=1, kernel='linear', random_state=42)
model.fit(x_train, y_train)

# Report the score on each split, in train / validation / test order.
for split_label, split_x, split_y in (
    ("Train accuracy: ", x_train, y_train),
    ("Validation accuracy: ", x_val, y_val),
    ("Test accuracy: ", x_test, y_test),
):
    print(split_label, model.score(split_x, split_y))

Train accuracy:  0.4985393115711927
Validation accuracy:  0.505332656170645
Test accuracy:  0.5026412027631044


In [None]:
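# NOTE(review): scores kept from an earlier run; the configuration that produced
# them is not recorded here (presumably the linear-kernel SVC) -- confirm or remove.
# Train accuracy:  0.4921884923155087
# Validation accuracy:  0.5063484002031488
# Test accuracy:  0.4965461194636327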

In [36]:
from sklearn.svm import SVC

# SVC with an RBF kernel, C=1.
model = SVC(C=1, kernel='rbf', random_state=42)
model.fit(x_train, y_train)

# Report the score on each split, in train / validation / test order.
for split_label, split_x, split_y in (
    ("Train accuracy: ", x_train, y_train),
    ("Validation accuracy: ", x_val, y_val),
    ("Test accuracy: ", x_test, y_test),
):
    print(split_label, model.score(split_x, split_y))

Train accuracy:  0.4946018036326686
Validation accuracy:  0.5038090401218893
Test accuracy:  0.4973587972368956


In [37]:
from sklearn.svm import SVC

# SVC with a polynomial kernel of degree 2, C=1.
model = SVC(C=1, kernel='poly', degree=2, random_state=42)
model.fit(x_train, y_train)

# Report the score on each split, in train / validation / test order.
for split_label, split_x, split_y in (
    ("Train accuracy: ", x_train, y_train),
    ("Validation accuracy: ", x_val, y_val),
    ("Test accuracy: ", x_test, y_test),
):
    print(split_label, model.score(split_x, split_y))

Train accuracy:  0.49193445954528137
Validation accuracy:  0.505332656170645
Test accuracy:  0.4973587972368956


In [38]:
from sklearn.svm import SVC

# SVC with a polynomial kernel of degree 3, C=1.
model = SVC(C=1, kernel='poly', degree=3, random_state=42)
model.fit(x_train, y_train)

# Report the score on each split, in train / validation / test order.
for split_label, split_x, split_y in (
    ("Train accuracy: ", x_train, y_train),
    ("Validation accuracy: ", x_val, y_val),
    ("Test accuracy: ", x_test, y_test),
):
    print(split_label, model.score(split_x, split_y))

Train accuracy:  0.4915534103899403
Validation accuracy:  0.5007618080243779
Test accuracy:  0.4924827305973182


In [39]:
from sklearn.svm import SVC

# SVC with a polynomial kernel of degree 5, C=1.
model = SVC(C=1, kernel='poly', degree=5, random_state=42)
model.fit(x_train, y_train)

# Report the score on each split, in train / validation / test order.
for split_label, split_x, split_y in (
    ("Train accuracy: ", x_train, y_train),
    ("Validation accuracy: ", x_val, y_val),
    ("Test accuracy: ", x_test, y_test),
):
    print(split_label, model.score(split_x, split_y))

Train accuracy:  0.5085736059951734
Validation accuracy:  0.494667343829355
Test accuracy:  0.4880130028443722


In [41]:
from sklearn.svm import SVC

# SVC with a polynomial kernel of degree 1, C=1.
model = SVC(C=1, kernel='poly', degree=1, random_state=42)
model.fit(x_train, y_train)

# Report the score on each split, in train / validation / test order.
for split_label, split_x, split_y in (
    ("Train accuracy: ", x_train, y_train),
    ("Validation accuracy: ", x_val, y_val),
    ("Test accuracy: ", x_test, y_test),
):
    print(split_label, model.score(split_x, split_y))

Train accuracy:  0.4966340657944875
Validation accuracy:  0.505840528186897
Test accuracy:  0.4997968305566843


In [42]:
from sklearn.svm import SVC

# SVC with a sigmoid kernel, C=1.
model = SVC(C=1, kernel='sigmoid', random_state=42)
model.fit(x_train, y_train)

# Report the score on each split, in train / validation / test order.
for split_label, split_x, split_y in (
    ("Train accuracy: ", x_train, y_train),
    ("Validation accuracy: ", x_val, y_val),
    ("Test accuracy: ", x_test, y_test),
):
    print(split_label, model.score(split_x, split_y))

Train accuracy:  0.40492823574241077
Validation accuracy:  0.40832910106653125
Test accuracy:  0.41202763104429097
