In [160]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.experimental import enable_iterative_imputer  
from sklearn.impute import IterativeImputer

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import confusion_matrix, classification_report

from xgboost import XGBRegressor

In [161]:
# Widen the printed output and lift the column cap so wide frames are shown in full.
for option_name, option_value in (('display.width', 1000), ('display.max_columns', None)):
    pd.set_option(option_name, option_value)

In [None]:
# Read the dataset (path is relative to the notebook's working directory) and preview it.
df = pd.read_csv(filepath_or_buffer='../data/ALLDATA_v2.csv')
print(df.head(5))

In [163]:
# KNN imputation of the HTPos_avg / ATPos_avg columns
from sklearn.impute import KNNImputer

# Columns the imputer uses (together with the targets) to find neighbouring rows
features = ['HS', 'AS', 'HST', 'AST', 'Hpts', 'Apts', 'Home_Form_Points', 'Away_Form_Points']
target_columns = ["HTPos_avg", "ATPos_avg"]

# Boolean frame marking which target entries are missing before imputation
missing_mask = df[target_columns].isnull()

# Keep a 0/1 indicator per target so the fact that a value was imputed
# stays available as a column afterwards
for col in target_columns:
    df[f"{col}_missing"] = missing_mask[col].astype(int)

# Work on a copy so the imputer never touches the original feature columns
knn_columns = features + target_columns
imputation_data = df[knn_columns].copy()

# Apply KNN imputation
knn_imputer = KNNImputer(n_neighbors=5)
imputed_data = knn_imputer.fit_transform(imputation_data)

# Rebuild a DataFrame on df's own index. With the default RangeIndex the
# boolean masks below (which are indexed like df) would only line up when
# df happens to carry a 0..n-1 index.
imputed_df = pd.DataFrame(imputed_data, columns=knn_columns, index=df.index)

# Replace only the originally-missing entries; observed values are left as they were.
# The right-hand side is passed as a plain array so the write is positional.
for col in target_columns:
    col_missing = missing_mask[col]
    df.loc[col_missing, col] = imputed_df.loc[col_missing, col].to_numpy()

In [164]:
# import matplotlib.pyplot as plt

# # Visualize imputed vs. non-imputed values
# plt.hist(df['HTPos_avg'], bins=30, alpha=0.5, label='Home Pos Avg')
# plt.hist(df.loc[missing_mask['HTPos_avg'], 'HTPos_avg'], bins=30, alpha=0.5, label='Imputed Home Pos Avg')
# plt.legend()
# plt.show()

In [165]:
# TODO: check whether this imputation is actually useful to include. Accuracy seems to dip sometimes because of it: an observed fluctuation in accuracy between 0.46 and 0.53.
# Use a combination of a random forest regressor and an iterative imputer to fill in missing values for HSPE and ASPE.

# Random Forest Regression
def random_forest_impute(df, target_col, feature_cols):
    """
    Fill the null entries of ``df[target_col]`` with RandomForestRegressor predictions.

    The forest is fitted on the rows where ``target_col`` is present, using
    ``feature_cols`` as predictors, and then predicts the rows where it is null.

    Parameters
    ----------
    df : DataFrame holding both ``target_col`` and ``feature_cols``.
    target_col : name of the column whose null entries are filled.
    feature_cols : list of column names used as predictors.

    Returns
    -------
    The same ``df`` object; it is modified in place. When ``target_col`` has
    no null entries a message is printed and ``df`` is returned untouched.
    """
    is_missing = df[target_col].isnull()

    # Nothing to fill: report it and hand the frame back unchanged.
    if not is_missing.any():
        print(f"No missing values for {target_col}. Skipping RF imputation.")
        return df

    observed_rows = df.loc[~is_missing]
    rows_to_fill = df.loc[is_missing]

    forest_params = dict(
        n_estimators=500,
        max_depth=20,
        min_samples_leaf=5,
        random_state=42,
        n_jobs=-1,
    )
    forest = RandomForestRegressor(**forest_params)
    forest.fit(observed_rows[feature_cols], observed_rows[target_col])

    # Write the predictions straight into the rows that were null.
    df.loc[is_missing, target_col] = forest.predict(rows_to_fill[feature_cols])
    return df

# Predictor columns shared by the HSPE (%) and ASPE (%) random-forest imputations
rf_features = [
    'Hpts', 'Apts',
    'Home_Form_Points', 'Away_Form_Points',
    'Home_H2H_Win_Rate', 'Away_H2H_Win_Rate',
    'HTS', 'ATS',
]

# indicator column -> column whose null entries get imputed
spe_targets = {"HSPE_missing": "HSPE (%)", "ASPE_missing": "ASPE (%)"}

# Record missingness for both columns before any value is filled in
for flag_col, spe_col in spe_targets.items():
    df[flag_col] = df[spe_col].isnull().astype(int)

# Impute each column in turn with the shared predictor set
for spe_col in spe_targets.values():
    df = random_forest_impute(df=df, target_col=spe_col, feature_cols=rf_features)


In [166]:
# Random forest followed by iterative imputation, to fill the missing values of
# HPE (%) and APE (%) so both can be included as features.

# 0/1 indicators recorded before anything is filled in
df["HPE_missing"] = df["HPE (%)"].isnull().astype(int)
df["APE_missing"] = df["APE (%)"].isnull().astype(int)

rf_features_for_hpe_ape = [
    'Hpts', 'Apts',
    'Home_Form_Points', 'Away_Form_Points',
    'Home_H2H_Win_Rate', 'Away_H2H_Win_Rate',
    'HTS', 'ATS',
]

df = random_forest_impute(
    df=df,
    target_col='HPE (%)',
    feature_cols=rf_features_for_hpe_ape
)

df = random_forest_impute(
    df=df,
    target_col='APE (%)',
    feature_cols=rf_features_for_hpe_ape
)

# NOTE(review): IterativeImputer only fills entries that are still NaN. The two
# random_forest_impute calls above have already filled every NaN in HPE (%) and
# APE (%), so this pass appears to leave those two columns unchanged while still
# fitting the forests. Confirm whether it is meant to replace, not follow, the
# random-forest step.
impute_cols = (
    rf_features_for_hpe_ape +
    ["HPE (%)", "APE (%)"]
)

# Remove duplicates from the combined list (order-preserving)
impute_cols = list(dict.fromkeys(impute_cols))

# Matrix handed to the imputer
iter_data = df[impute_cols].copy()

# Snapshot of the same columns, used below to undo any change the imputer
# makes to the predictor columns
original_features = df[impute_cols].copy()

# Initialize IterativeImputer
iter_imputer = IterativeImputer(
    estimator=RandomForestRegressor(
        n_estimators=500,
        max_depth=20,
        min_samples_leaf=5,
        random_state=42,
        n_jobs=-1
    ),
    max_iter=5,
    random_state=42
)

# Fit-transform. The result is rebuilt on df's own index: column assignment
# aligns on index labels, so a default RangeIndex here would inject NaN
# whenever df is not indexed 0..n-1.
imputed_array = iter_imputer.fit_transform(iter_data)
imputed_iter_df = pd.DataFrame(imputed_array, columns=impute_cols, index=df.index)

df['HPE (%)'] = imputed_iter_df['HPE (%)']
df['APE (%)'] = imputed_iter_df['APE (%)']

# Revert the other features to their originals (in case the imputer changed them).
# Iterating the list rather than a set keeps the order deterministic.
for col in impute_cols:
    if col not in ("HPE (%)", "APE (%)"):
        df[col] = original_features[col]


In [None]:
# Remove the leading column (selected by position) and then the listed columns.
# NOTE: the positional drop is not idempotent; re-running this cell removes another column.
first_column = df.columns[0]
columns_to_discard = [
    'Date', 'FTHG', 'FTAG', 'HTHG', 'HTAG', 'HTR',
    'HS', 'AS', 'HST', 'AST', 'HC', 'AC',
    'HF', 'AF', 'HY', 'AY', 'HR', 'AR',
    'Attendance',
]
df = df.drop(columns=first_column).drop(columns=columns_to_discard)
print(df.head())

In [None]:
# One-hot encode the categorical columns; each dummy column is prefixed with its source column name.
categorical_cols = ['HomeTeam', 'AwayTeam', 'Referee', 'FTR']
(
    one_hot_encoded_hometeam,
    one_hot_encoded_awayteam,
    one_hot_encoded_referee,
    one_hot_encoded_ftr,
) = (pd.get_dummies(df[name], prefix=name) for name in categorical_cols)

# Swap the raw categorical columns for their dummy expansions (appended on the right, in the order above).
df = pd.concat(
    [
        df.drop(columns=categorical_cols),
        one_hot_encoded_hometeam,
        one_hot_encoded_awayteam,
        one_hot_encoded_referee,
        one_hot_encoded_ftr,
    ],
    axis=1,
)
print(df.head())

In [169]:
# indicator column -> column whose null entries it flags (1 = null, 0 = present)
valuation_flags = {"HTV_missing": "HTV($m)", "ATV_missing": "ATV($m)"}
for flag_col, value_col in valuation_flags.items():
    df[flag_col] = df[value_col].isnull().astype(int)

# Predictor columns for the HTV($m) / ATV($m) imputation
valuation_features = [
    "Season",
    "Round",
    "Hpts",
    "Apts",
    "Home_Form_Points",
    "Away_Form_Points",
    "Home_Win_Streak",
    "Away_Win_Streak",
    "Home_H2H_Win_Rate",
    "Away_H2H_Win_Rate",
]

def xgb_impute(df, target_col, feature_cols):
    """
    Fill the null entries of ``df[target_col]`` with XGBRegressor predictions.

    The regressor is fitted on the rows where ``target_col`` is present, using
    ``feature_cols`` as predictors, and then predicts the rows where it is null.

    Parameters
    ----------
    df : DataFrame holding both ``target_col`` and ``feature_cols``.
    target_col : name of the column whose null entries are filled.
    feature_cols : list of column names used as predictors.

    Returns
    -------
    The same ``df`` object; it is modified in place. When ``target_col`` has
    no null entries a message is printed and ``df`` is returned untouched.
    """
    is_missing = df[target_col].isnull()

    # Nothing to fill: report it and hand the frame back unchanged.
    if not is_missing.any():
        print(f"No missing values for {target_col}. Skipping XGBoost imputation.")
        return df

    # Independent copies of the two row subsets
    train_rows = df.loc[~is_missing].copy()
    predict_rows = df.loc[is_missing].copy()

    # Hyper-parameters are fixed here; tune as needed
    regressor_params = dict(
        n_estimators=300,
        max_depth=6,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        n_jobs=-1,
    )
    regressor = XGBRegressor(**regressor_params)

    # Learn from the rows with a known target only
    regressor.fit(train_rows[feature_cols], train_rows[target_col])

    # Write the predictions straight into the rows that were null
    df.loc[is_missing, target_col] = regressor.predict(predict_rows[feature_cols])
    return df

# Impute each valuation column in turn with the shared predictor set
for valuation_col in ("HTV($m)", "ATV($m)"):
    df = xgb_impute(df, target_col=valuation_col, feature_cols=valuation_features)



In [170]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import confusion_matrix, classification_report
from tensorflow.keras.layers import BatchNormalization
import keras_tuner

# The three FTR_* dummy columns form the target; every other column is a model input.
outcome_cols = ['FTR_A', 'FTR_D', 'FTR_H']
y = df[outcome_cols]
X = df.drop(columns=outcome_cols)

In [None]:
# Seed Python's `random`, NumPy and TensorFlow in one call so weight
# initialisation, dropout and batch shuffling repeat across runs. Without it
# the reported accuracy changes from one execution to the next.
tf.keras.utils.set_random_seed(42)

# Hold out 20% of the rows for the final evaluation.
# NOTE(review): this is a random split; if rows are ordered chronologically,
# a time-based split may be more appropriate. Confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training rows only and reuse its statistics for the test rows
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Balanced class weights derived from the TRAINING labels only, so the
# held-out labels play no part in training. Classes are the column positions
# of the one-hot target.
y_train_labels = np.argmax(y_train.values, axis=1)
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.arange(y.shape[1]),
    y=y_train_labels
)
class_weights = dict(enumerate(class_weights))

# Fully connected network: five hidden ReLU layers of decreasing width with
# dropout after the first four, and a softmax output with one unit per target column
model = Sequential([
    Dense(512, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.4),
    Dense(256, activation='relu'),
    Dropout(0.4),
    Dense(128, activation='relu'),
    Dropout(0.4),
    Dense(64, activation='relu'),
    Dropout(0.4),
    Dense(32, activation='relu'),
    Dense(y_train.shape[1], activation='softmax')
])


model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

# Stop once val_loss has not improved for 20 epochs (restoring the best weights),
# and halve the learning rate after 10 epochs without improvement
early_stopping = EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True, verbose=1)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=10, verbose=1)

# 20% of the training rows are set aside by Keras as the validation set
history = model.fit(
    X_train, y_train,
    epochs=100,
    batch_size=32,
    validation_split=0.2,
    class_weight=class_weights,
    callbacks=[early_stopping, reduce_lr],
    verbose=1
)

test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_accuracy:.2f}")

# Convert predicted probabilities and one-hot targets back to class indices
y_pred = model.predict(X_test)
y_pred_original = np.argmax(y_pred, axis=1)
y_test_original = np.argmax(y_test.values, axis=1)

print("Confusion Matrix:")
print(confusion_matrix(y_test_original, y_pred_original))

print("Classification Report:")
print(classification_report(y_test_original, y_pred_original))