In [30]:
import tensorflow as tf
import numpy as np
import pandas as pd
import tensorflowjs as tfjs
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

tf.keras.backend.clear_session()

# Load data
competitions = pd.read_csv('match_data/competitions.csv')
games = pd.read_csv('match_data/games.csv')
appearances = pd.read_csv('match_data/appearances.csv')
lineups = pd.read_csv('match_data/game_lineups.csv')

# Define the competition IDs for the top 5 major leagues
top_5_league_ids = ['GB1', 'FR1', 'L1', 'IT1', 'ES1']

# Filter games to include only those from the top 5 leagues
games = games[games['competition_id'].isin(top_5_league_ids)]

games[['home_score', 'away_score']] = games['aggregate'].str.split(':', expand=True)
games['home_score'] = games['home_score'].astype(int)
games['away_score'] = games['away_score'].astype(int)

lineups['date'] = pd.to_datetime(lineups['date'])

positions = ['Defensive Midfield', 'Centre-Back', 'Left Winger', 'Left-Back',
             'Right Winger', 'Goalkeeper', 'Attacking Midfield',
             'Centre-Forward', 'Central Midfield', 'Right-Back',
             'Right Midfield', 'Left Midfield', 'Second Striker', 'Defender',
             'midfield', 'Attack', 'Sweeper']
position_mapping = {position: idx for idx, position in enumerate(positions)}
lineups['position_id'] = lineups['position'].map(position_mapping).fillna(-1).astype(int)

# Normalize appearance metrics
scaler = StandardScaler()
appearances[['n_goals', 'n_assists', 'n_yellow_cards', 'n_red_cards', 'n_minutes_played']] = scaler.fit_transform(
    appearances[['goals', 'assists', 'yellow_cards', 'red_cards', 'minutes_played']]
)

# Ensure appearances have the performance score calculated
appearances['performance_score'] = (
    appearances['n_goals'] +
    appearances['n_assists'] +
    appearances['n_yellow_cards'] +
    appearances['n_red_cards'] +
    appearances['n_minutes_played']
)

# Merge normalized metrics into lineups
lineups = pd.merge(lineups, appearances[['game_id', 'player_id', 'n_goals', 'n_assists', 'n_yellow_cards', 'n_red_cards', 'n_minutes_played', 'performance_score']], on=['game_id', 'player_id'], how='left')
lineups = lineups.fillna(0)

# Verify the merge worked correctly
assert 'performance_score' in lineups.columns, "performance_score column is missing in lineups DataFrame"

# Aggregate team performance scores and other metrics
team_performance_score_sum = lineups.groupby(['game_id', 'club_id']).agg({
    'performance_score': 'sum',
    'n_goals': 'sum',
    'n_assists': 'sum',
    'n_yellow_cards': 'sum',
    'n_red_cards': 'sum',
    'n_minutes_played': 'mean'
}).reset_index()

# Merge aggregated metrics back into the games DataFrame for home and away teams
games = games.merge(team_performance_score_sum, left_on=['game_id', 'home_club_id'], right_on=['game_id', 'club_id'], how='left')
games = games.rename(columns={
    'performance_score': 'home_club_performance',
    'n_goals': 'home_n_goals',
    'n_assists': 'home_n_assists',
    'n_yellow_cards': 'home_n_yellow_cards',
    'n_red_cards': 'home_n_red_cards',
    'n_minutes_played': 'home_n_minutes_played'
}).drop(columns=['club_id'])

games = games.merge(team_performance_score_sum, left_on=['game_id', 'away_club_id'], right_on=['game_id', 'club_id'], how='left')
games = games.rename(columns={
    'performance_score': 'away_club_performance',
    'n_goals': 'away_n_goals',
    'n_assists': 'away_n_assists',
    'n_yellow_cards': 'away_n_yellow_cards',
    'n_red_cards': 'away_n_red_cards',
    'n_minutes_played': 'away_n_minutes_played'
}).drop(columns=['club_id'])

# Features and target
features = [
    'home_club_id', 'away_club_id', 'home_club_goals', 'away_club_goals',
    'home_club_position', 'away_club_position',
    'home_n_goals', 'home_n_assists', 'home_n_yellow_cards', 'home_n_red_cards', 'home_n_minutes_played',
    'away_n_goals', 'away_n_assists', 'away_n_yellow_cards', 'away_n_red_cards', 'away_n_minutes_played'
]
target_home = 'home_score'
target_away = 'away_score'

X = games[features]
y_home = games[target_home]
y_away = games[target_away]

# Preprocessing for numerical features
numerical_features = [
    'home_club_goals', 'away_club_goals', 'home_club_position',
    'away_club_position', 'home_club_id', 'away_club_id',
    'home_n_goals', 'home_n_assists', 'home_n_yellow_cards', 'home_n_red_cards', 'home_n_minutes_played',
    'away_n_goals', 'away_n_assists', 'away_n_yellow_cards', 'away_n_red_cards', 'away_n_minutes_played'
]

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
    ]
)

# Transform the features
X_processed = preprocessor.fit_transform(X)

# Ensure no NaN or infinite values
X_df = pd.DataFrame(X_processed.toarray() if hasattr(X_processed, 'toarray') else X_processed)
X_df.fillna(0, inplace=True)
X_df.replace([np.inf, -np.inf], 0, inplace=True)

# Check only numeric columns
numeric_columns = X_df.select_dtypes(include=[np.number]).columns
assert np.all(np.isfinite(X_df[numeric_columns])), "Data contains NaN or infinite values"

# Split the data into training and testing sets
X_train, X_test, y_train_home, y_test_home, y_train_away, y_test_away = train_test_split(
    X_df, y_home, y_away, test_size=0.2, random_state=42)

# Define early stopping callback
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Define the model using Sequential
def build_model(input_shape):
    model = Sequential([
        Dense(256, activation='relu', input_shape=(input_shape,), name='dense'),
        Dropout(0.5, name='dropout'),
        Dense(128, activation='relu', name='dense_1'),
        Dropout(0.5, name='dropout_1'),
        Dense(64, activation='relu', name='dense_2'),
        Dropout(0.5, name='dropout_2'),
        Dense(32, activation='relu', name='dense_3'),
        Dropout(0.5, name='dropout_3'),
        Dense(1, name='dense_4')
    ])
    model.compile(optimizer=Adam(learning_rate=0.0005), loss='mae', metrics=['mae'])
    return model

# Build and train the models
input_shape = X_train.shape[1]
print(f'Input shape: {input_shape}')
model_home = build_model(input_shape)
model_away = build_model(input_shape)

# Ensure no NaN values in training data
for data in [X_train, y_train_home, y_train_away]:
    assert np.all(np.isfinite(data)), f"NaN values found in training data {data}"

# Train the model for home score
history_home = model_home.fit(
    X_train, y_train_home, 
    epochs=100, 
    batch_size=32, 
    validation_split=0.2, 
    callbacks=[early_stopping]
)

# Train the model for away score
history_away = model_away.fit(
    X_train, y_train_away, 
    epochs=100, 
    batch_size=32, 
    validation_split=0.2, 
    callbacks=[early_stopping]
)

# # Evaluate the model for home score
# loss_home = model_home.evaluate(
#     [X_test_home_goals, X_test_home_assists, X_test_home_yellow_cards, X_test_home_red_cards, X_test_home_minutes_played,
#      X_test_away_goals, X_test_away_assists, X_test_away_yellow_cards, X_test_away_red_cards, X_test_away_minutes_played, X_test_additional], 
#     y_test_home
# )
# print(f'Mean Absolute Error for Home Score: {loss_home[1]}')

# # Evaluate the model for away score
# loss_away = model_away.evaluate(
#     [X_test_home_goals, X_test_home_assists, X_test_home_yellow_cards, X_test_home_red_cards, X_test_home_minutes_played,
#      X_test_away_goals, X_test_away_assists, X_test_away_yellow_cards, X_test_away_red_cards, X_test_away_minutes_played, X_test_additional], 
#     y_test_away
# )
# print(f'Mean Absolute Error for Away Score: {loss_away[1]}')

# Evaluate the model for home score
# Evaluate the model for home score
loss_home = model_home.evaluate(X_test, y_test_home)
print(f'Mean Absolute Error for Home Score: {loss_home[1]}')

# Evaluate the model for away score
loss_away = model_away.evaluate(X_test, y_test_away)
print(f'Mean Absolute Error for Away Score: {loss_away[1]}')

# # Print predictions right after training
# y_pred_home_train = model_home.predict(
#     [X_train_home_goals, X_train_home_assists, X_train_home_yellow_cards, X_train_home_red_cards, X_train_home_minutes_played,
#      X_train_away_goals, X_train_away_assists, X_train_away_yellow_cards, X_train_away_red_cards, X_train_away_minutes_played, X_train_additional]
# )
# y_pred_away_train = model_away.predict(
#     [X_train_home_goals, X_train_home_assists, X_train_home_yellow_cards, X_train_home_red_cards, X_train_home_minutes_played,
#      X_train_away_goals, X_train_away_assists, X_train_away_yellow_cards, X_train_away_red_cards, X_train_away_minutes_played, X_train_additional]
# )

# # Print predictions right after training
# y_pred_home_train = model_home.predict(
#     [X_train_additional, X_train_home_goals, X_train_home_assists, X_train_home_yellow_cards, X_train_home_red_cards, X_train_home_minutes_played,
#      X_train_away_goals, X_train_away_assists, X_train_away_yellow_cards, X_train_away_red_cards, X_train_away_minutes_played]
# )
# y_pred_away_train = model_away.predict(
#     [X_train_additional, X_train_home_goals, X_train_home_assists, X_train_home_yellow_cards, X_train_home_red_cards, X_train_home_minutes_played,
#      X_train_away_goals, X_train_away_assists, X_train_away_yellow_cards, X_train_away_red_cards, X_train_away_minutes_played]
# )

# print("Training Predicted Home Scores:", y_pred_home_train[:10].flatten())
# print("Training Predicted Away Scores:", y_pred_away_train[:10].flatten())


# # Predict with a specified batch size
# y_pred_home_test = model_home.predict(
#     [X_test_home_goals, X_test_home_assists, X_test_home_yellow_cards, X_test_home_red_cards, X_test_home_minutes_played,
#      X_test_away_goals, X_test_away_assists, X_test_away_yellow_cards, X_test_away_red_cards, X_test_away_minutes_played, X_test_additional],
# )
# y_pred_away_test = model_away.predict(
#     [X_test_home_goals, X_test_home_assists, X_test_home_yellow_cards, X_test_home_red_cards, X_test_home_minutes_played,
#      X_test_away_goals, X_test_away_assists, X_test_away_yellow_cards, X_test_away_red_cards, X_test_away_minutes_played, X_test_additional],
# )

# # Predict with a specified batch size
# y_pred_home_test = model_home.predict(
#     [X_test_additional, X_test_home_goals, X_test_home_assists, X_test_home_yellow_cards, X_test_home_red_cards, X_test_home_minutes_played,
#      X_test_away_goals, X_test_away_assists, X_test_away_yellow_cards, X_test_away_red_cards, X_test_away_minutes_played]
# )
# y_pred_away_test = model_away.predict(
#     [X_test_additional, X_test_home_goals, X_test_home_assists, X_test_home_yellow_cards, X_test_home_red_cards, X_test_home_minutes_played,
#      X_test_away_goals, X_test_away_assists, X_test_away_yellow_cards, X_test_away_red_cards, X_test_away_minutes_played]
# )

# print("Test Predicted Home Scores:", y_pred_home_test[:10].flatten())
# print("Test Predicted Away Scores:", y_pred_away_test[:10].flatten())

# model_home.summary()
# model_away.summary()

# Print predictions right after training
y_pred_home_train = model_home.predict(X_train)
y_pred_away_train = model_away.predict(X_train)

print("Training Predicted Home Scores:", y_pred_home_train[:10].flatten())
print("Training Predicted Away Scores:", y_pred_away_train[:10].flatten())

# Predict with a specified batch size
y_pred_home_test = model_home.predict(X_test)
y_pred_away_test = model_away.predict(X_test)

print("Test Predicted Home Scores:", y_pred_home_test[:10].flatten())
print("Test Predicted Away Scores:", y_pred_away_test[:10].flatten())

model_home.summary()
model_away.summary()

# # After training your models and before saving them
# model_home.save('model_home.h5')
# model_away.save('model_away.h5')



Input shape: 16
Epoch 1/100


  kernel_constraint: Constraint function applied to


[1m435/435[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 900us/step - loss: 0.9620 - mae: 0.9620 - val_loss: 0.6543 - val_mae: 0.6543
Epoch 2/100
[1m435/435[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 748us/step - loss: 0.5800 - mae: 0.5800 - val_loss: 0.5415 - val_mae: 0.5415
Epoch 3/100
[1m435/435[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 781us/step - loss: 0.4852 - mae: 0.4852 - val_loss: 0.4449 - val_mae: 0.4449
Epoch 4/100
[1m435/435[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 762us/step - loss: 0.4271 - mae: 0.4271 - val_loss: 0.3484 - val_mae: 0.3484
Epoch 5/100
[1m435/435[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 730us/step - loss: 0.3880 - mae: 0.3880 - val_loss: 0.3605 - val_mae: 0.3605
Epoch 6/100
[1m435/435[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 745us/step - loss: 0.3617 - mae: 0.3617 - val_loss: 0.3461 - val_mae: 0.3461
Epoch 7/100
[1m435/435[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 813u

In [34]:
import json

scaler = preprocessor.named_transformers_['num']
scaler_params = {
    'means': scaler.mean_.tolist(),
    'stds': scaler.scale_.tolist()
}

# Save to a JSON file
with open('scaler_params.json', 'w') as f:
    json.dump(scaler_params, f)


In [32]:
print(X_test)

             0         1         2         3         4         5         6   \
2862   0.345606 -0.184184  0.137617  1.686896 -0.289762 -0.072772  0.415355   
640   -1.179482 -1.041881  1.724080  1.334891 -0.126417 -0.015068 -1.143698   
16293  0.345606  0.673513 -0.038656 -0.601136 -0.316623 -0.389238  0.265327   
6669  -0.416938 -0.184184  1.195259  0.278876 -0.216438 -0.164953 -0.439186   
4770   1.108151 -0.184184 -0.391204  1.686896 -0.055635 -0.034665  1.044854   
...         ...       ...       ...       ...       ...       ...       ...   
19218 -0.416938  0.673513  1.724080  1.158889  1.077253  0.551088 -1.143698   
967    0.345606  0.673513 -0.214930  0.806884 -0.211357  5.658461 -0.364172   
11451  0.345606 -1.041881 -0.743751  0.278876 -0.383050 -0.020148 -0.364172   
7477  -0.416938 -0.184184  1.195259 -0.777138 -0.059990 -0.292338 -0.514200   
1947  -1.179482  0.673513 -1.096298  0.278876  0.909552  0.014692 -1.143698   

             7         8         9         10      

In [31]:
tfjs.converters.save_keras_model(model_home, 'assets/models/home/')
tfjs.converters.save_keras_model(model_away, 'assets/models/away/')



failed to lookup keras version from the file,
    this is likely a weight only file
failed to lookup keras version from the file,
    this is likely a weight only file
