In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

# --- 1. Load and Prepare the Data ---
try:
    df_perf = pd.read_csv('Full Dataset (league standings with players stats).csv')
    df_perf = df_perf.loc[:, ~df_perf.columns.str.contains('^Unnamed')]
    df_perf['fullname'] = df_perf['firstname'] + ' ' + df_perf['lastname']
    player_name = "James Philip Milner"
    df_player = df_perf[df_perf['fullname'] == player_name].copy()
    df_player = df_player.sort_values('season').dropna(subset=['totalGoals', 'goalAssists', 'totalShots'])
    print(f"✅ Data prepared for {player_name}")
except FileNotFoundError:
    print("❌ File not found.")
    exit()

# --- 2. Create Time-Series Sequences ---
features = ['totalGoals', 'goalAssists', 'totalShots']
target = 'totalGoals'
lookback = 2
X, y = [], []
for i in range(lookback, len(df_player)):
    X.append(df_player[features].iloc[i-lookback:i].values)
    y.append(df_player[target].iloc[i])

X = np.array(X)
y = np.array(y)

# Use the same train/test split for both models to ensure a fair comparison
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# --- 3. Evaluate the LSTM Model ---
print("\n--- Evaluating LSTM Model ---")
# Scale the data for the LSTM
scaler_X = MinMaxScaler(feature_range=(0, 1))
scaler_y = MinMaxScaler(feature_range=(0, 1))
# Reshape for scaling
X_train_flat = X_train.reshape(-1, X_train.shape[2])
X_test_flat = X_test.reshape(-1, X_test.shape[2])
y_train_flat = y_train.reshape(-1, 1)
y_test_flat = y_test.reshape(-1, 1)
# Fit and transform
X_train_scaled_flat = scaler_X.fit_transform(X_train_flat)
X_test_scaled_flat = scaler_X.transform(X_test_flat)
y_train_scaled_flat = scaler_y.fit_transform(y_train_flat)
# Reshape back to sequences
X_train_scaled = X_train_scaled_flat.reshape(X_train.shape)
X_test_scaled = X_test_scaled_flat.reshape(X_test.shape)

# Build and train LSTM
lstm_model = Sequential([
    LSTM(units=50, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])),
    LSTM(units=50),
    Dense(units=1)
])
lstm_model.compile(optimizer='adam', loss='mean_squared_error')
lstm_model.fit(X_train_scaled, y_train_scaled_flat, epochs=100, batch_size=1, verbose=0)

# Make predictions and inverse scale them
predictions_scaled = lstm_model.predict(X_test_scaled)
predictions_lstm = scaler_y.inverse_transform(predictions_scaled)
rmse_lstm = np.sqrt(mean_squared_error(y_test, predictions_lstm))
print("✅ LSTM Evaluation Complete.")

# --- 4. Evaluate the XGBoost Model ---
print("\n--- Evaluating XGBoost Model ---")
# XGBoost doesn't need scaling, but the data needs to be flat
X_train_xgb = X_train.reshape(X_train.shape[0], -1)
X_test_xgb = X_test.reshape(X_test.shape[0], -1)

# Build and train XGBoost
xgbr = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100)
xgbr.fit(X_train_xgb, y_train)

# Make predictions
predictions_xgb = xgbr.predict(X_test_xgb)
rmse_xgb = np.sqrt(mean_squared_error(y_test, predictions_xgb))
print("✅ XGBoost Evaluation Complete.")

# --- 5. Compare Results ---
print("\n--- 🏆 Model Comparison 🏆 ---")
print(f"LSTM Model RMSE: {rmse_lstm:.2f} goals")
print(f"XGBoost Model RMSE: {rmse_xgb:.2f} goals")

if rmse_lstm < rmse_xgb:
    print("\nResult: The LSTM model was more accurate.")
else:
    print("\nResult: The XGBoost model was more accurate.")

✅ Data prepared for James Philip Milner

--- Evaluating LSTM Model ---


  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 208ms/step
✅ LSTM Evaluation Complete.

--- Evaluating XGBoost Model ---
✅ XGBoost Evaluation Complete.

--- 🏆 Model Comparison 🏆 ---
LSTM Model RMSE: 3.09 goals
XGBoost Model RMSE: 3.53 goals

Result: The LSTM model was more accurate.
