In [35]:
import tensorflow as tf
import numpy as np
import pandas as pd
import random
import re
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Dropout, Input, Concatenate, Layer
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

# Set random seed for reproducibility
random.seed(40)
np.random.seed(40)
tf.random.set_seed(40)

# Load data
competitions = pd.read_csv('match_data/competitions.csv')
games = pd.read_csv('match_data/games.csv')
appearances = pd.read_csv('match_data/appearances.csv')
lineups = pd.read_csv('match_data/game_lineups.csv')

# Define the competition IDs for the top 5 major leagues
top_5_league_ids = ['GB1', 'FR1', 'DE1', 'IT1', 'ES1']

# Filter games to include only those from the top 5 leagues
games = games[games['competition_id'].isin(top_5_league_ids)]

# Data preprocessing
games[['home_score', 'away_score']] = games['aggregate'].str.split(':', expand=True)
games['home_score'] = games['home_score'].astype(int)
games['away_score'] = games['away_score'].astype(int)

lineups['date'] = pd.to_datetime(lineups['date'])

positions = ['Defensive Midfield', 'Centre-Back', 'Left Winger', 'Left-Back',
             'Right Winger', 'Goalkeeper', 'Attacking Midfield',
             'Centre-Forward', 'Central Midfield', 'Right-Back',
             'Right Midfield', 'Left Midfield', 'Second Striker', 'Defender',
             'midfield', 'Attack', 'Sweeper']
position_mapping = {position: idx for idx, position in enumerate(positions)}
lineups['position_id'] = lineups['position'].map(position_mapping).fillna(-1).astype(int)

# Normalize appearance metrics
scaler = MinMaxScaler()
appearances[['n_goals', 'n_assists', 'n_yellow_cards', 'n_red_cards', 'n_minutes_played']] = scaler.fit_transform(
    appearances[['goals', 'assists', 'yellow_cards', 'red_cards', 'minutes_played']])

class PerformanceScoreLayer(Layer):
    def __init__(self, **kwargs):
        super(PerformanceScoreLayer, self).__init__(**kwargs)
        self.w_goals = self.add_weight(name='w_goals', initializer='random_normal', trainable=True)
        self.w_assists = self.add_weight(name='w_assists', initializer='random_normal', trainable=True)
        self.w_yellow_cards = self.add_weight(name='w_yellow_cards', initializer='random_normal', trainable=True)
        self.w_red_cards = self.add_weight(name='w_red_cards', initializer='random_normal', trainable=True)
        self.w_minutes_played = self.add_weight(name='w_minutes_played', initializer='random_normal', trainable=True)
    
    def call(self, inputs):
        n_goals, n_assists, n_yellow_cards, n_red_cards, n_minutes_played = inputs
        performance_score = (
            self.w_goals * n_goals +
            self.w_assists * n_assists +
            self.w_yellow_cards * n_yellow_cards +
            self.w_red_cards * n_red_cards +
            self.w_minutes_played * n_minutes_played
        )
        return performance_score

# Ensure appearances have the performance score calculated
appearances['performance_score'] = (
    appearances['n_goals'] +
    appearances['n_assists'] +
    appearances['n_yellow_cards'] +
    appearances['n_red_cards'] +
    appearances['n_minutes_played']
)

# Merge normalized metrics into lineups
lineups = pd.merge(lineups, appearances[['game_id', 'player_id', 'n_goals', 'n_assists', 'n_yellow_cards', 'n_red_cards', 'n_minutes_played', 'performance_score']], on=['game_id', 'player_id'], how='left')
lineups = lineups.fillna(0)

# Verify the merge worked correctly
assert 'performance_score' in lineups.columns, "performance_score column is missing in lineups DataFrame"

# Aggregate team performance scores and other metrics
team_performance_score_sum = lineups.groupby(['game_id', 'club_id']).agg({
    'performance_score': 'sum',
    'n_goals': 'sum',
    'n_assists': 'sum',
    'n_yellow_cards': 'sum',
    'n_red_cards': 'sum',
    'n_minutes_played': 'sum'
}).reset_index()

# Merge aggregated metrics back into the games DataFrame for home and away teams
games = games.merge(team_performance_score_sum, left_on=['game_id', 'home_club_id'], right_on=['game_id', 'club_id'], how='left')
games = games.rename(columns={
    'performance_score': 'home_club_performance',
    'n_goals': 'home_n_goals',
    'n_assists': 'home_n_assists',
    'n_yellow_cards': 'home_n_yellow_cards',
    'n_red_cards': 'home_n_red_cards',
    'n_minutes_played': 'home_n_minutes_played'
}).drop(columns=['club_id'])

games = games.merge(team_performance_score_sum, left_on=['game_id', 'away_club_id'], right_on=['game_id', 'club_id'], how='left')
games = games.rename(columns={
    'performance_score': 'away_club_performance',
    'n_goals': 'away_n_goals',
    'n_assists': 'away_n_assists',
    'n_yellow_cards': 'away_n_yellow_cards',
    'n_red_cards': 'away_n_red_cards',
    'n_minutes_played': 'away_n_minutes_played'
}).drop(columns=['club_id'])

# Preprocess formations
games['home_club_formation'] = games['home_club_formation'].fillna('') \
    .str.replace('Starting Line-up: ', '', regex=False) \
    .str.replace(' Diamond', '', regex=False) \
    .str.replace(' double 6', '', regex=False) \
    .str.replace(' Attacking', '', regex=False) \
    .str.replace(' Defending', '', regex=False) \
    .str.replace(' flat', '', regex=False) \
    .apply(lambda x: re.sub(r'(\d+)/(\d+)/(\d+)', r'\1-\2-\3', x)) \
    .apply(lambda x: re.sub(r'(\d+)/(\d+)/(\d+)/(\d+)', r'\1-\2-\3-\4', x))

games['away_club_formation'] = games['away_club_formation'].fillna('') \
    .str.replace('Starting Line-up: ', '', regex=False) \
    .str.replace(' Diamond', '', regex=False) \
    .str.replace(' double 6', '', regex=False) \
    .str.replace(' Attacking', '', regex=False) \
    .str.replace(' Defending', '', regex=False) \
    .str.replace(' flat', '', regex=False) \
    .apply(lambda x: re.sub(r'(\d+)/(\d+)/(\d+)', r'\1-\2-\3', x)) \
    .apply(lambda x: re.sub(r'(\d+)/(\d+)/(\d+)/(\d+)', r'\1-\2-\3-\4', x))


# Features and target
features = [
    'home_club_id', 'away_club_id', 'home_club_goals', 'away_club_goals',
    'home_club_position', 'away_club_position', 'home_club_formation', 
    'away_club_formation', 'home_club_performance', 'away_club_performance',
    'home_n_goals', 'home_n_assists', 'home_n_yellow_cards', 'home_n_red_cards', 'home_n_minutes_played',
    'away_n_goals', 'away_n_assists', 'away_n_yellow_cards', 'away_n_red_cards', 'away_n_minutes_played'
]
target_home = 'home_score'
target_away = 'away_score'

X = games[features]
y_home = games[target_home]
y_away = games[target_away]

# Preprocessing for numerical and categorical features
numerical_features = [
    'home_club_goals', 'away_club_goals', 'home_club_position', 
    'away_club_position', 'home_club_performance', 'away_club_performance',
    'home_club_id', 'away_club_id'
]
categorical_features = ['home_club_formation', 'away_club_formation']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(), categorical_features)
    ]
)

# Transform the features
X_additional = X.drop(columns=[
    'home_n_goals', 'home_n_assists', 'home_n_yellow_cards', 'home_n_red_cards', 'home_n_minutes_played',
    'away_n_goals', 'away_n_assists', 'away_n_yellow_cards', 'away_n_red_cards', 'away_n_minutes_played'
])
X_additional_processed = preprocessor.fit_transform(X_additional)

# Ensure no NaN or infinite values
X_additional_df = pd.DataFrame(X_additional_processed.toarray() if hasattr(X_additional_processed, 'toarray') else X_additional_processed)
X_additional_df.fillna(0, inplace=True)
X_additional_df.replace([np.inf, -np.inf], 0, inplace=True)

# Check only numeric columns
numeric_columns = X_additional_df.select_dtypes(include=[np.number]).columns
assert np.all(np.isfinite(X_additional_df[numeric_columns])), "Data contains NaN or infinite values"

# Fill NaN values in performance data using .loc
X.loc[:, 'home_n_goals'] = X['home_n_goals'].fillna(0)
X.loc[:, 'home_n_assists'] = X['home_n_assists'].fillna(0)
X.loc[:, 'home_n_yellow_cards'] = X['home_n_yellow_cards'].fillna(0)
X.loc[:, 'home_n_red_cards'] = X['home_n_red_cards'].fillna(0)
X.loc[:, 'home_n_minutes_played'] = X['home_n_minutes_played'].fillna(0)

X.loc[:, 'away_n_goals'] = X['away_n_goals'].fillna(0)
X.loc[:, 'away_n_assists'] = X['away_n_assists'].fillna(0)
X.loc[:, 'away_n_yellow_cards'] = X['away_n_yellow_cards'].fillna(0)
X.loc[:, 'away_n_red_cards'] = X['away_n_red_cards'].fillna(0)
X.loc[:, 'away_n_minutes_played'] = X['away_n_minutes_played'].fillna(0)

# Split the data into training and testing sets
X_train_additional, X_test_additional, y_train_home, y_test_home, y_train_away, y_test_away = train_test_split(
    X_additional_df, y_home, y_away, test_size=0.2, random_state=42)
X_train_home_goals, X_test_home_goals = train_test_split(X['home_n_goals'], test_size=0.2, random_state=42)
X_train_home_assists, X_test_home_assists = train_test_split(X['home_n_assists'], test_size=0.2, random_state=42)
X_train_home_yellow_cards, X_test_home_yellow_cards = train_test_split(X['home_n_yellow_cards'], test_size=0.2, random_state=42)
X_train_home_red_cards, X_test_home_red_cards = train_test_split(X['home_n_red_cards'], test_size=0.2, random_state=42)
X_train_home_minutes_played, X_test_home_minutes_played = train_test_split(X['home_n_minutes_played'], test_size=0.2, random_state=42)

X_train_away_goals, X_test_away_goals = train_test_split(X['away_n_goals'], test_size=0.2, random_state=42)
X_train_away_assists, X_test_away_assists = train_test_split(X['away_n_assists'], test_size=0.2, random_state=42)
X_train_away_yellow_cards, X_test_away_yellow_cards = train_test_split(X['away_n_yellow_cards'], test_size=0.2, random_state=42)
X_train_away_red_cards, X_test_away_red_cards = train_test_split(X['away_n_red_cards'], test_size=0.2, random_state=42)
X_train_away_minutes_played, X_test_away_minutes_played = train_test_split(X['away_n_minutes_played'], test_size=0.2, random_state=42)

# Define early stopping callback
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Define the model
def build_model(input_shape):
    input_home_goals = Input(shape=(1,), name='home_n_goals')
    input_home_assists = Input(shape=(1,), name='home_n_assists')
    input_home_yellow_cards = Input(shape=(1,), name='home_n_yellow_cards')
    input_home_red_cards = Input(shape=(1,), name='home_n_red_cards')
    input_home_minutes_played = Input(shape=(1,), name='home_n_minutes_played')
    
    input_away_goals = Input(shape=(1,), name='away_n_goals')
    input_away_assists = Input(shape=(1,), name='away_n_assists')
    input_away_yellow_cards = Input(shape=(1,), name='away_n_yellow_cards')
    input_away_red_cards = Input(shape=(1,), name='away_n_red_cards')
    input_away_minutes_played = Input(shape=(1,), name='away_n_minutes_played')
    
    additional_features = Input(shape=(input_shape,), name='additional_features')
    
    home_performance_score = PerformanceScoreLayer()([input_home_goals, input_home_assists, input_home_yellow_cards, input_home_red_cards, input_home_minutes_played])
    away_performance_score = PerformanceScoreLayer()([input_away_goals, input_away_assists, input_away_yellow_cards, input_away_red_cards, input_away_minutes_played])
    
    combined = Concatenate()([home_performance_score, away_performance_score, additional_features])
    
    x = Dense(256, activation='relu')(combined)
    x = Dropout(0.5)(x)
    x = Dense(128, activation='relu')(x)
    x = Dropout(0.5)(x)
    x = Dense(64, activation='relu')(x)
    x = Dropout(0.5)(x)
    x = Dense(32, activation='relu')(x)
    x = Dropout(0.5)(x)
    output = Dense(1)(x)
    
    model = Model(inputs=[input_home_goals, input_home_assists, input_home_yellow_cards, input_home_red_cards, input_home_minutes_played,
                          input_away_goals, input_away_assists, input_away_yellow_cards, input_away_red_cards, input_away_minutes_played,
                          additional_features], outputs=output)
    model.compile(optimizer=Adam(learning_rate=0.0001), loss='mae', metrics=['mae'])
    
    return model

# Build and train the models
input_shape = X_train_additional.shape[1]
model_home = build_model(input_shape)
model_away = build_model(input_shape)

# Ensure no NaN values in training data
for data in [X_train_home_goals, X_train_home_assists, X_train_home_yellow_cards, X_train_home_red_cards, X_train_home_minutes_played,
             X_train_away_goals, X_train_away_assists, X_train_away_yellow_cards, X_train_away_red_cards, X_train_away_minutes_played, 
             X_train_additional, y_train_home, y_train_away]:
    assert np.all(np.isfinite(data)), f"NaN values found in training data {data}"

# Train the model for home score
history_home = model_home.fit(
    [X_train_home_goals, X_train_home_assists, X_train_home_yellow_cards, X_train_home_red_cards, X_train_home_minutes_played,
     X_train_away_goals, X_train_away_assists, X_train_away_yellow_cards, X_train_away_red_cards, X_train_away_minutes_played, X_train_additional],
    y_train_home, 
    epochs=100, 
    batch_size=32, 
    validation_split=0.2, 
    callbacks=[early_stopping]
)

# Train the model for away score
history_away = model_away.fit(
    [X_train_home_goals, X_train_home_assists, X_train_home_yellow_cards, X_train_home_red_cards, X_train_home_minutes_played,
     X_train_away_goals, X_train_away_assists, X_train_away_yellow_cards, X_train_away_red_cards, X_train_away_minutes_played, X_train_additional],
    y_train_away, 
    epochs=100, 
    batch_size=32, 
    validation_split=0.2, 
    callbacks=[early_stopping]
)

# Evaluate the model for home score
loss_home = model_home.evaluate(
    [X_test_home_goals, X_test_home_assists, X_test_home_yellow_cards, X_test_home_red_cards, X_test_home_minutes_played,
     X_test_away_goals, X_test_away_assists, X_test_away_yellow_cards, X_test_away_red_cards, X_test_away_minutes_played, X_test_additional], 
    y_test_home
)
print(f'Mean Absolute Error for Home Score: {loss_home[1]}')

# Evaluate the model for away score
loss_away = model_away.evaluate(
    [X_test_home_goals, X_test_home_assists, X_test_home_yellow_cards, X_test_home_red_cards, X_test_home_minutes_played,
     X_test_away_goals, X_test_away_assists, X_test_away_yellow_cards, X_test_away_red_cards, X_test_away_minutes_played, X_test_additional], 
    y_test_away
)
print(f'Mean Absolute Error for Away Score: {loss_away[1]}')

# Check a sample of the training data
print("Training Data - Goals:", X_train_home_goals[:10])
print("Training Data - Assists:", X_train_home_assists[:10])
print("Training Data - Yellow Cards:", X_train_home_yellow_cards[:10])
print("Training Data - Red Cards:", X_train_home_red_cards[:10])
print("Training Data - Minutes Played:", X_train_home_minutes_played[:10])

# Check a sample of the test data
print("Test Data - Goals:", X_test_home_goals[:10])
print("Test Data - Assists:", X_test_home_assists[:10])
print("Test Data - Yellow Cards:", X_test_home_yellow_cards[:10])
print("Test Data - Red Cards:", X_test_home_red_cards[:10])
print("Test Data - Minutes Played:", X_test_home_minutes_played[:10])


# Print predictions right after training
y_pred_home_train = model_home.predict(
    [X_train_home_goals, X_train_home_assists, X_train_home_yellow_cards, X_train_home_red_cards, X_train_home_minutes_played,
     X_train_away_goals, X_train_away_assists, X_train_away_yellow_cards, X_train_away_red_cards, X_train_away_minutes_played, X_train_additional]
)
y_pred_away_train = model_away.predict(
    [X_train_home_goals, X_train_home_assists, X_train_home_yellow_cards, X_train_home_red_cards, X_train_home_minutes_played,
     X_train_away_goals, X_train_away_assists, X_train_away_yellow_cards, X_train_away_red_cards, X_train_away_minutes_played, X_train_additional]
)
print("Training Predicted Home Scores:", y_pred_home_train[:10].flatten())
print("Training Predicted Away Scores:", y_pred_away_train[:10].flatten())

# Predict with a specified batch size
y_pred_home_test = model_home.predict(
    [X_test_home_goals, X_test_home_assists, X_test_home_yellow_cards, X_test_home_red_cards, X_test_home_minutes_played,
     X_test_away_goals, X_test_away_assists, X_test_away_yellow_cards, X_test_away_red_cards, X_test_away_minutes_played, X_test_additional],
)
y_pred_away_test = model_away.predict(
    [X_test_home_goals, X_test_home_assists, X_test_home_yellow_cards, X_test_home_red_cards, X_test_home_minutes_played,
     X_test_away_goals, X_test_away_assists, X_test_away_yellow_cards, X_test_away_red_cards, X_test_away_minutes_played, X_test_additional],
)
print("Test Predicted Home Scores:", y_pred_home_test[:10].flatten())
print("Test Predicted Away Scores:", y_pred_away_test[:10].flatten())



Epoch 1/100
[1m362/362[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 1.2357 - mae: 1.2357 - val_loss: 0.7334 - val_mae: 0.7334
Epoch 2/100
[1m362/362[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 991us/step - loss: 0.7954 - mae: 0.7954 - val_loss: 0.5748 - val_mae: 0.5748
Epoch 3/100
[1m362/362[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.7160 - mae: 0.7160 - val_loss: 0.5265 - val_mae: 0.5265
Epoch 4/100
[1m362/362[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.6816 - mae: 0.6816 - val_loss: 0.4928 - val_mae: 0.4928
Epoch 5/100
[1m362/362[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 984us/step - loss: 0.6241 - mae: 0.6241 - val_loss: 0.4966 - val_mae: 0.4966
Epoch 6/100
[1m362/362[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.5911 - mae: 0.5911 - val_loss: 0.4764 - val_mae: 0.4764
Epoch 7/100
[1m362/362[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 