In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# --- 1. Load the Performance Dataset ---
try:
    df_perf = pd.read_csv('Full Dataset (league standings with players stats).csv')
    print("✅ Successfully loaded the performance dataset.")
except FileNotFoundError:
    print("❌ File not found. Make sure 'Full Dataset...' is in your project folder.")
    exit()

# --- 2. Clean Data and Select a Player ---
df_perf = df_perf.loc[:, ~df_perf.columns.str.contains('^Unnamed')]
df_perf['fullname'] = df_perf['firstname'] + ' ' + df_perf['lastname']

player_name = "James Philip Milner"
df_player = df_perf[df_perf['fullname'] == player_name].copy()
df_player = df_player.sort_values('season').dropna(subset=['totalGoals', 'goalAssists', 'totalShots'])

print(f"\n--- Preparing data for {player_name} ---")

# --- 3. Prepare Data for XGBoost (Different from LSTM) ---
# We want to use past seasons' stats to predict the next season's goals.
features = ['totalGoals', 'goalAssists', 'totalShots']
target = 'totalGoals'

X, y = [], []
# We'll use a lookback of 2 seasons
lookback = 2 

for i in range(lookback, len(df_player)):
    # Flatten the features from the past 'lookback' seasons into a single row
    feature_vector = df_player[features].iloc[i-lookback:i].values.flatten()
    X.append(feature_vector)
    # The target is the goal count of the current season
    y.append(df_player[target].iloc[i])

X = np.array(X)
y = np.array(y)

print(f"Generated {X.shape[0]} training examples.")

# --- 4. Build and Train the XGBoost Model ---
# Split data into training and a small test set for evaluation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize and train the model
xgbr = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, learning_rate=0.1)
print("\nTraining the XGBoost model...")
xgbr.fit(X_train, y_train)

# --- 5. Evaluate the Model ---
# Make predictions on the unseen test data
predictions = xgbr.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, predictions))

print(f"\n✅ XGBoost model training complete!")
print(f"The model's prediction error (RMSE) on the test data is: {rmse:.2f} goals")

# Predict goals for the next season using the very last sequence of data
last_sequence = df_player[features].iloc[-lookback:].values.flatten().reshape(1, -1)
next_season_prediction = xgbr.predict(last_sequence)

print(f"\n--- Model Prediction ---")
print(f"The XGBoost model predicts {next_season_prediction[0]:.2f} goals for the next season.")

✅ Successfully loaded the performance dataset.

--- Preparing data for James Philip Milner ---
Generated 4 training examples.

Training the XGBoost model...

✅ XGBoost model training complete!
The model's prediction error (RMSE) on the test data is: 3.53 goals

--- Model Prediction ---
The XGBoost model predicts 1.99 goals for the next season.
