In [42]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Embedding, Flatten, Concatenate
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
pd.set_option('display.width', 1000)
pd.set_option('display.max_columns', None)

In [85]:
df = pd.read_csv('../data/ALLDATA_v2.csv')
print(df.head())

In [11]:
#KNN for computing Pos_avgs
from sklearn.impute import KNNImputer

# Select features for KNN
features = ['HS', 'AS', 'HST', 'AST', 'Hpts', 'Apts', 'Home_Form_Points', 'Away_Form_Points']
target_columns = ["HTPos_avg", "ATPos_avg"]

# Create a mask to identify rows with missing values
missing_mask = df[target_columns].isnull()

# Add missingness indicators to the DataFrame
for col in target_columns:
    df[f"{col}_missing"] = missing_mask[col].astype(int)

# Create a copy of the data for imputation
imputation_data = df[features + target_columns].copy()

# Apply KNN imputation
knn_imputer = KNNImputer(n_neighbors=5)
imputed_data = knn_imputer.fit_transform(imputation_data)

# Convert imputed data back to a DataFrame
imputed_df = pd.DataFrame(imputed_data, columns=features + target_columns)

# Replace only the missing values in the original DataFrame
for col in target_columns:
    df.loc[missing_mask[col], col] = imputed_df.loc[missing_mask[col], col]

In [12]:
# import matplotlib.pyplot as plt

# # Visualize imputed vs. non-imputed values
# plt.hist(df['HTPos_avg'], bins=30, alpha=0.5, label='Home Pos Avg')
# plt.hist(df.loc[missing_mask['HTPos_avg'], 'HTPos_avg'], bins=30, alpha=0.5, label='Imputed Home Pos Avg')
# plt.legend()
# plt.show()

In [13]:
df = df.drop(df.columns[0], axis=1)
df = df.drop(columns=['Date', 'FTHG', 'FTAG', 'HTHG', 'HTAG', 'HTR', 'HS', 'AS', 'HST', 'AST', 'HC', 'AC', 'HF', 'AF', 'HY', 'AY', 'HR', 'AR', 'Attendance', "HTV($m)",	"ATV($m)", "HTPos_avg", "ATPos_avg", "HSPE (%)", "HPE (%)", "ASPE (%)", "APE (%)"],axis=1)
print(df.head())

In [14]:
one_hot_encoded_hometeam = pd.get_dummies(df['HomeTeam'], prefix='HomeTeam')
one_hot_encoded_awayteam = pd.get_dummies(df['AwayTeam'], prefix='AwayTeam')
one_hot_encoded_referee = pd.get_dummies(df['Referee'], prefix='Referee')
one_hot_encoded_ftr = pd.get_dummies(df['FTR'], prefix='FTR')
df = pd.concat([df, one_hot_encoded_hometeam, one_hot_encoded_awayteam, one_hot_encoded_referee, one_hot_encoded_ftr], axis=1)
df = df.drop(columns=['HomeTeam', 'AwayTeam', 'Referee', 'FTR'], axis=1)
print(df.head())

In [15]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import confusion_matrix, classification_report
from tensorflow.keras.layers import BatchNormalization
import keras_tuner

X = df.drop(columns=['FTR_A', 'FTR_D', 'FTR_H'])
y = df[['FTR_A', 'FTR_D', 'FTR_H']]

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.arange(y.shape[1]),
    y=np.argmax(y.values, axis=1)
)
class_weights = dict(enumerate(class_weights))

model = Sequential([
    Dense(512, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.4),
    Dense(256, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.4),
    Dense(128, activation='relu'),
    Dropout(0.4),
    Dense(64, activation='relu'),
    Dense(y_train.shape[1], activation='softmax')
])

model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

early_stopping = EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True, verbose=1)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=10, verbose=1)

history = model.fit(
    X_train, y_train,
    epochs=100,
    batch_size=32,
    validation_split=0.2,
    class_weight=class_weights,
    callbacks=[early_stopping, reduce_lr],
    verbose=1
)

test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_accuracy:.2f}")

y_pred = model.predict(X_test)
y_pred_original = np.argmax(y_pred, axis=1)
y_test_original = np.argmax(y_test.values, axis=1)

print("Confusion Matrix:")
print(confusion_matrix(y_test_original, y_pred_original))

print("Classification Report:")
print(classification_report(y_test_original, y_pred_original))

Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m183/183[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.3912 - loss: 1.1347 - val_accuracy: 0.4808 - val_loss: 1.0380 - learning_rate: 0.0010
Epoch 2/100
[1m183/183[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.4687 - loss: 1.0509 - val_accuracy: 0.4938 - val_loss: 1.0113 - learning_rate: 0.0010
Epoch 3/100
[1m183/183[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.4788 - loss: 1.0316 - val_accuracy: 0.4658 - val_loss: 1.0263 - learning_rate: 0.0010
Epoch 4/100
[1m183/183[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.5011 - loss: 1.0226 - val_accuracy: 0.4904 - val_loss: 1.0133 - learning_rate: 0.0010
Epoch 5/100
[1m183/183[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.5207 - loss: 1.0040 - val_accuracy: 0.4733 - val_loss: 1.0277 - learning_rate: 0.0010
Epoch 6/100
[1m183/183[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m