In [75]:
# === Step 0: Imports ===
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils import class_weight
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
import joblib
import json

# === Step 1: Load & Prepare Dataset ===
with open('LFC_matches.csv', 'r', encoding='utf-8-sig') as f:
    lines = [line.strip() for line in f if line.strip()]

column_names = ['Matchday', 'Opponent', 'Venue', 'Result', 'Score', 'Starting XI']
data = [line.split(',', 5) for line in lines]
df = pd.DataFrame(data[1:], columns=column_names)

df['Starting XI'] = df['Starting XI'].str.strip('"').apply(lambda x: [p.strip() for p in x.split(';')])
all_players = sorted(set(player for lineup in df['Starting XI'] for player in lineup))
bool_cols = all_players

for player in all_players:
    df[player] = df['Starting XI'].apply(lambda lineup: player in lineup)

df.drop(columns=['Starting XI'], inplace=True)
df[bool_cols] = df[bool_cols].astype('float32')
df['Home'] = df['Venue'].apply(lambda x: 1 if x == 'H' else 0).astype('float32')
df['Outcome_Class'] = df['Result'].map({'Loss': 0, 'Draw': 1, 'Win': 2})
y = to_categorical(df['Outcome_Class'])

df.drop(columns=['Matchday', 'Opponent', 'Venue', 'Result', 'Score'], inplace=True)

# Save bool_cols for later
with open("bool_cols.json", "w") as f:
    json.dump(bool_cols, f)

# === Step 2: Feature Engineering ===
X = df[bool_cols + ['Home']].values.astype('float32')
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# === Step 3: Train/Test Split ===
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, stratify=np.argmax(y, axis=1)
)

# === Step 4: Handle Class Imbalance ===
class_weights = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=np.unique(np.argmax(y, axis=1)),
    y=np.argmax(y, axis=1)
)
class_weight_dict = dict(enumerate(class_weights))

# === Step 5: Build & Train Model ===
model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.4),
    Dense(3, activation='softmax')
])
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

early_stop = EarlyStopping(patience=10, restore_best_weights=True)
history = model.fit(
    X_train, y_train,
    validation_split=0.2,
    epochs=100,
    batch_size=8,
    class_weight=class_weight_dict,
    callbacks=[early_stop],
    verbose=1
)

# === Step 6: Evaluate & Save Model ===
loss, acc = model.evaluate(X_test, y_test)
print(f"\nTest Accuracy: {acc:.3f}")

model.save("lfc_match_predictor_softmax.keras")
joblib.dump(scaler, "lfc_input_scaler.pkl")

# === Step 7: Preview Predictions ===
labels = ['Loss', 'Draw', 'Win']
probs = model.predict(X_test)
for i, prob in enumerate(probs[:5]):
    result = dict(zip(labels, np.round(prob, 3)))
    predicted = labels[np.argmax(prob)]
    print(f"Game {i+1}: Prediction → {predicted}, Probabilities → {result}")

# === Step 8: Custom Match Prediction Function ===
def predict_lineup(lineup, is_home, model, scaler, bool_cols):
    vector = np.zeros(len(bool_cols) + 1, dtype='float32')
    for i, player in enumerate(bool_cols):
        if player in lineup:
            vector[i] = 1.0
    vector[-1] = 1.0 if is_home else 0.0
    scaled = scaler.transform([vector])
    probs = model.predict(scaled)[0]
    prediction = labels[np.argmax(probs)]
    probabilities = dict(zip(labels, np.round(probs, 3)))
    return prediction, probabilities

model.save("lfc_match_predictor_softmax.keras")




Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 153ms/step - accuracy: 0.3281 - loss: 1.9619 - val_accuracy: 0.0000e+00 - val_loss: 1.5462
Epoch 2/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step - accuracy: 0.2552 - loss: 1.2507 - val_accuracy: 0.0000e+00 - val_loss: 1.5294
Epoch 3/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step - accuracy: 0.3698 - loss: 1.1369 - val_accuracy: 0.1667 - val_loss: 1.5165
Epoch 4/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step - accuracy: 0.2344 - loss: 1.6971 - val_accuracy: 0.0000e+00 - val_loss: 1.5058
Epoch 5/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step - accuracy: 0.3802 - loss: 1.2254 - val_accuracy: 0.0000e+00 - val_loss: 1.4933
Epoch 6/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step - accuracy: 0.4531 - loss: 0.9403 - val_accuracy: 0.0000e+00 - val_loss: 1.4787
Epoch 7/100
[1m3/3[0m [32m━━