In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import pandas as pd
import numpy as np
import os

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# *DATA PREPROCESSING AND FEATURE ENGINEERING* 

In [None]:
def preprocess_data(path, window=5):
    """
    Load the match-level CSV and derive a per-team, per-match feature table.

    Steps:
      1. Read the CSV, parse ``date`` and sort the matches chronologically.
      2. Encode ``result`` as points ('w' -> 3, 'd' -> 1, 'l' -> 0); values that
         cannot be parsed as a number become 0.
      3. Build one row per team per match (a home view and an away view), each
         with a weighted "possession proxy" made of the team's share of xG,
         shots and deep completions in that match.
      4. Add rolling means of every stat over the previous ``window`` matches of
         the same team. The rolling mean is shifted by one match so a row never
         includes its own match; rows without history are filled with 0.

    Parameters
    ----------
    path : str
        CSV file to read. Columns used: ``date``, ``result``, ``home_team``,
        ``away_team``, ``home_final_pos``, ``away_final_pos``, ``xG_home``,
        ``xG_away`` and ``{home,away}_{shots,shotOnTarget,deep}``.
    window : int, default 5
        Number of previous matches in each rolling mean.

    Returns
    -------
    (df_team, df) : tuple of DataFrame
        ``df_team`` holds one row per team per match sorted by (team, date);
        ``df`` is the cleaned match-level frame sorted by date.
    """
    df = pd.read_csv(path)

    # Parse the dates BEFORE sorting: sorting the raw strings is chronological
    # only for ISO-formatted dates.
    df['date'] = pd.to_datetime(df['date'])
    df = df.sort_values('date', kind='mergesort').reset_index(drop=True)

    points = {'w': 3, 'd': 1, 'l': 0}
    df['result'] = pd.to_numeric(
        df['result'].map(lambda value: points.get(value, value)),
        errors='coerce',
    )
    df['result'] = df['result'].fillna(0).round().astype(int)

    def safe_divide(a, b):
        """Element-wise ``a / b`` as float64, giving 0 wherever ``b`` is 0."""
        # Work on float arrays so the output buffer can hold fractions even
        # when the inputs are integer count columns.
        numerator = np.asarray(a, dtype=float)
        denominator = np.asarray(b, dtype=float)
        quotient = np.zeros_like(numerator)
        np.divide(numerator, denominator, out=quotient, where=denominator != 0)
        return quotient

    team_cols = ['date', 'team', 'team_form', 'xG', 'shots', 'shotOnTarget', 'deep',
                 'is_home', 'possession_proxy', 'final_pos']

    def share(own_col, opponent_col):
        """Fraction of the match total of a stat that belongs to ``own_col``."""
        return safe_divide(df[own_col], df[own_col] + df[opponent_col])

    def team_view(side, form, is_home):
        """Return the match table seen from ``side`` ('home' or 'away')."""
        other = 'away' if side == 'home' else 'home'
        return df.assign(
            team=df[f'{side}_team'],
            team_form=form,
            is_home=is_home,
            final_pos=df[f'{side}_final_pos'],
            possession_proxy=(
                0.3 * share(f'xG_{side}', f'xG_{other}')
                + 0.3 * share(f'{side}_shots', f'{other}_shots')
                + 0.4 * share(f'{side}_deep', f'{other}_deep')
            ),
        ).rename(columns={
            f'xG_{side}': 'xG',
            f'{side}_shots': 'shots',
            f'{side}_shotOnTarget': 'shotOnTarget',
            f'{side}_deep': 'deep',
        })[team_cols]

    df_home = team_view('home', df['result'], 1)
    # The away view uses the mirrored result: 3 <-> 0, with 1 left unchanged.
    df_away = team_view('away', df['result'].replace({3: 0, 0: 3}), 0)

    # Combine both views and order each team's matches chronologically.
    df_team = pd.concat([df_home, df_away]).sort_values(['team', 'date']).reset_index(drop=True)

    # Rolling features: mean over the previous `window` matches of the same team.
    stats = ['xG', 'shots', 'shotOnTarget', 'deep', 'team_form', 'possession_proxy']
    for stat in stats:
        df_team[f'rolling_{stat}'] = (
            df_team.groupby('team')[stat]
            .transform(lambda x: x.rolling(window=window, min_periods=1).mean().shift(1))
            # A team's first match has no history; 0 marks it for the later imputation step.
            .fillna(0)
        )
    return df_team, df


In [None]:
# Build the per-team feature table and the cleaned match table from the EPL CSV.
team_df, match_df = preprocess_data('/kaggle/input/epl-with-labels-matchday/EPL.csv')
# NOTE(review): only the last expression of a cell is rendered, so this
# per-team preview is computed and then discarded.
team_df.groupby('team', as_index=False).head(50)
match_df.head(5)

In [None]:
def merge(match_df, team_df):
    """Attach each side's rolling features to the match-level table.

    The home team's rolling columns are joined first and keep their plain
    ``rolling_<stat>`` names; the away team's columns are joined second and
    receive the ``_away`` suffix. Both joins are inner joins on
    (date, team name), so a match without a row for either side is dropped.
    The join key from ``team_df`` stays in the result as ``team`` (home) and
    ``team_away`` (away).
    """
    rolling_cols = [
        f'rolling_{stat}'
        for stat in ('xG', 'shots', 'shotOnTarget', 'deep', 'team_form', 'possession_proxy')
    ]
    wanted = ['date', 'team'] + rolling_cols

    # (is_home flag, match column holding the team name, suffix for clashing columns)
    join_plan = (
        (1, 'home_team', '_home'),
        (0, 'away_team', '_away'),
    )

    prepared_df = match_df
    for home_flag, name_col, clash_suffix in join_plan:
        side_rows = team_df.loc[team_df['is_home'] == home_flag, wanted]
        prepared_df = prepared_df.merge(
            side_rows,
            left_on=['date', name_col],
            right_on=['date', 'team'],
            suffixes=('', clash_suffix),
        )
    return prepared_df



In [None]:
# Join the home/away rolling features onto the match table. This runs before
# the Elo-based imputation below, so first-match rows still hold the 0 placeholder.
prepared_df = merge(match_df, team_df)
# First non-null value of every column per team (team_df is ordered by team, date).
team_df.groupby('team').first()

In [None]:
# Load the club Elo rating history used later to impute first-match rolling stats.
elo_df = pd.read_csv('/kaggle/input/club-football-match-data-2000-2025/EloRatings.csv')
elo_df['date'] = pd.to_datetime(elo_df['date'])  # Convert first if needed
# Spot check: rows for country "ENG", club "Tottenham", dated in 2020.
elo_df[
    (elo_df['country'] == "ENG") & 
    (elo_df['date'].dt.year == 2020) & (elo_df['club'] == "Tottenham")
]


In [None]:
from scipy.special import expit as sigmoid

# Configuration
k = 400  # Elo rating scale factor
elo_baseline = 1500  # Baseline Elo rating
stats_cols = ['xG', 'shots', 'shotOnTarget', 'deep', 'team_form', 'possession_proxy']
# 1. Prepare the base statistics (using year-specific means)
def get_year_specific_base_stats(year):
    """Return the mean of every rolling stat over the ``team_df`` rows dated in ``year``.

    The result is keyed by the plain stat name (without the ``rolling_`` prefix).
    """
    in_year = team_df['date'].dt.year == year
    stat_names = ('xG', 'shots', 'shotOnTarget', 'deep', 'team_form', 'possession_proxy')
    return {
        name: team_df.loc[in_year, f'rolling_{name}'].mean()
        for name in stat_names
    }

# 2. Define the imputation function
def elo_impute(base_stat, elo_team, k=400, elo_baseline=1500):
    """Scale ``base_stat`` by a factor between 1 and 2 that grows with Elo.

    The factor is ``1 + sigmoid((elo_team - elo_baseline) / k)``, so a team
    rated exactly ``elo_baseline`` gets 1.5 times the base value.
    """
    rating_gap = (elo_team - elo_baseline) / k
    multiplier = 1 + sigmoid(rating_gap)
    return base_stat * multiplier

# 3. Process each row that needs imputation
# 3. Impute the rows whose rolling stats are ALL zero. preprocess_data fills a
#    team's first match (no history) with 0 in every rolling column, so
#    "all six are zero" identifies those placeholder rows.
rolling_cols = [f'rolling_{col}' for col in stats_cols]
needs_imputation = team_df[rolling_cols].eq(0).all(axis=1)

# Take the per-year baselines once, before anything is overwritten. Recomputing
# them inside the loop would mix already-imputed values into later baselines,
# making the result depend on row order.
# NOTE(review): the yearly means still include the 0 placeholders themselves,
# which pulls the baseline down slightly - confirm whether that is intended.
base_stats_by_year = {
    int(year): get_year_specific_base_stats(year)
    for year in team_df.loc[needs_imputation, 'date'].dt.year.unique()
}

# Sort the rating history once and split it per club instead of filtering and
# sorting the whole frame for every row.
# NOTE(review): the lookup matches on the club name only (no country filter),
# and a name spelled differently in the two datasets silently falls back to
# the baseline rating - verify the team names line up.
elo_by_club = {club: history for club, history in elo_df.sort_values('date').groupby('club')}

for idx in team_df.index[needs_imputation]:
    match_date = team_df.at[idx, 'date']
    team = team_df.at[idx, 'team']

    # Latest rating on or before the match date; baseline when none exists.
    team_elo = elo_baseline
    history = elo_by_club.get(team)
    if history is not None:
        past_ratings = history.loc[history['date'] <= match_date, 'elo']
        if not past_ratings.empty:
            team_elo = past_ratings.iloc[-1]

    base_stats = base_stats_by_year[match_date.year]
    for stat in stats_cols:
        team_df.at[idx, f'rolling_{stat}'] = elo_impute(base_stats[stat], team_elo, k, elo_baseline)

In [None]:
# Inspect the rolling columns after the imputation loop above.
team_df.head(50)

In [None]:
# Rebuild the match-level table so it picks up the imputed rolling values.
prepared_df = merge(match_df, team_df)
# First non-null value of every column per home team.
prepared_df.groupby('home_team').first()

In [None]:
# List the merged columns. The join keys `team` / `team_away` added by merge()
# are still present; the disabled line below was an attempt to drop them.
#prepared_df = prepared_df.drop(['team_away', 'is_home'], axis=1)
prepared_df.columns


In [None]:
# Check the join keys: `team` (added by the home-side merge) should equal `home_team`.
prepared_df.sort_values(by=['team', 'date']).head(20).filter(['date', 'home_team', 'away_team', 'team'])

In [None]:
# Same ordering applied to the team-level table, for side-by-side comparison.
team_df.sort_values(by=['team', 'date']).head(20)

In [None]:
# Home-side view: for every team row, attach the away opponent of the match the
# team played AT HOME on that date. The join is a left join on
# (date, team) == (date, home_team), so rows where the team played away get NaN
# in all opponent columns.
away_side_cols = (
    ['date', 'home_team', 'away_team', 'xG_away']
    + [f'away_{stat}' for stat in ['shots', 'shotOnTarget', 'deep']]
    + [f'rolling_{stat}_away'
       for stat in ['xG', 'shots', 'shotOnTarget', 'deep', 'team_form', 'possession_proxy']]
)
train1_df = team_df.merge(
    prepared_df[away_side_cols],
    left_on=['date', 'team'],
    right_on=['date', 'home_team'],
    how='left'
)

# Rename to side-neutral names. Only columns that exist after the merge are
# listed; the original map also carried keys that could never match.
rename_dict = {
    'team_form': 'form',
    'away_team': 'opponent',
    'xG_away': 'opponent_xG',
    **{f'away_{stat}': f'opponent_{stat}' for stat in ['shots', 'shotOnTarget', 'deep']},
    **{f'rolling_{stat}_away': f'opponent_rolling_{stat}'
       for stat in ['xG', 'shots', 'shotOnTarget', 'deep', 'team_form', 'possession_proxy']},
}
train1_df = train1_df.rename(columns=rename_dict)

# `home_team` is deliberately kept. The original `train1_df.drop(columns=['home_team'])`
# discarded its result and so never removed anything; a later cell drops the
# column from final_df and expects it to still exist.
train1_df.columns

In [None]:
# Spot check one team; rows for away matches have NaN opponent columns at this stage.
train1_df[train1_df['team']== 'Crystal Palace'].head(5)

In [None]:
# Away-side view: for every team row, attach the home opponent of the match the
# team played AWAY on that date. It is a left join, so rows where the team
# played at home keep NaN in the opponent columns.
rolling_stat_names = ['xG', 'shots', 'shotOnTarget', 'deep', 'team_form', 'possession_proxy']
home_side_cols = ['date', 'home_team', 'away_team', 'xG_home']
home_side_cols += [f'home_{stat}' for stat in ['shots', 'shotOnTarget', 'deep']]
home_side_cols += [f'rolling_{stat}' for stat in rolling_stat_names]

train2_df = team_df.merge(
    prepared_df[home_side_cols],
    how='left',
    left_on=['date', 'team'],
    right_on=['date', 'away_team'],
    # team_df already has rolling_<stat>; the opponent's copies get the _home suffix.
    suffixes=('', '_home'),
)

# Side-neutral names, mirroring the home-side table.
rename_dict = {
    'team_form': 'form',
    'home_team': 'opponent',
    'xG_home': 'opponent_xG',
}
rename_dict.update(
    (f'home_{stat}', f'opponent_{stat}')
    for stat in ['shots', 'shotOnTarget', 'deep', 'team_form', 'possession_proxy']
)
rename_dict.update(
    (f'rolling_{stat}_home', f'opponent_rolling_{stat}')
    for stat in rolling_stat_names
)
train2_df = train2_df.rename(columns=rename_dict)
train2_df.head(5)

In [None]:
# Combine the two views. fillna with a DataFrame aligns on index and column
# labels, so every NaN left in the home-side table is filled from the same
# row and column of the away-side table. The result is a new frame;
# train1_df is not modified.
final_df = train1_df.fillna(train2_df)

In [None]:
# Preview the combined table.
final_df.head()

In [None]:
# Persist the combined table (relative path, row index included by default).
final_df.to_csv('csv_file.csv')

In [None]:
# Group matches by season rather than calendar year: a season is labelled by
# the year it starts in, and matches played before August belong to the season
# that started the previous year.
final_df['date'] = pd.to_datetime(final_df['date'])
match_dates = final_df['date'].dt
final_df['season'] = match_dates.year - (match_dates.month < 8).astype(int)

final_df = final_df.sort_values(['season', 'team', 'date'])

# First two rows of every team as a quick sanity check.
final_df.groupby('team').head(2)

In [None]:
# Order by season, team and date and renumber the rows. drop=True discards the
# old row labels instead of inserting them as a stray `index` column (and a
# `level_0` column if the cell is run again).
final_df = final_df.sort_values(["season", "team", "date"]).reset_index(drop=True)

In [None]:
# First row of every season, restricted to the identifying columns.
# NOTE(review): this expression is not the last one in the cell, so its result
# is not rendered - only the column list below is displayed.
final_df.filter([
    'season',
    'team',
    'opponent',
    'final_pos'
]).groupby(
    'season'
).head(1)
final_df.columns

In [None]:
# `home_team` is only filled for home rows and is superseded by `opponent`.
# errors='ignore' keeps the cell re-runnable once the column is already gone.
final_df = final_df.drop(columns=['home_team'], errors='ignore')

In [None]:
# Confirm the column set that feeds the scaling step.
final_df.columns


In [None]:
!pip install scikit-learn
# !pip install --upgrade tensorflow

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
# Continuous columns that are standardised; is_home stays as a 0/1 flag.
scale_features = [
        'form', 'xG', 'shots', 'shotOnTarget', 'deep',
       'possession_proxy', 'rolling_xG', 'rolling_shots',
       'rolling_shotOnTarget', 'rolling_deep', 'rolling_team_form',
       'rolling_possession_proxy', 'opponent_xG', 'opponent_shots',
       'opponent_shotOnTarget', 'opponent_deep', 'opponent_rolling_xG',
       'opponent_rolling_shots', 'opponent_rolling_shotOnTarget',
       'opponent_rolling_deep', 'opponent_rolling_team_form',
       'opponent_rolling_possession_proxy'
]
# Columns carried through unscaled next to the scaled features.
passthrough_cols = ['is_home', 'final_pos', 'date', 'season', 'team', 'opponent']

# Turn final_pos into 0-based class labels for the sparse categorical loss.
# The guard makes the cell idempotent: without it every re-run would subtract
# 1 again and corrupt the labels.
# NOTE(review): assumes the raw positions start at 1 - confirm against the data.
if final_df['final_pos'].min() >= 1:
    final_df['final_pos'] = final_df['final_pos'] - 1

# Training set: 2014-2020 (7 seasons)
train_df = final_df[final_df["season"].isin(range(2014, 2021))]  # 2014 to 2020 inclusive

# Validation set: 2021 (1 season) - For hyperparameter tuning
val_df = final_df[final_df["season"] == 2021]

# Test set: 2022 (most recent season) - Final evaluation
test_df = final_df[final_df["season"] == 2022]

# Fit the scaler on the TRAINING split only so that no validation/test
# statistics leak into the features.
scaler = StandardScaler()
scaler.fit(train_df[scale_features])


def _scale_split(split_df):
    """Return ``split_df`` with ``scale_features`` standardised by the fitted
    scaler and the passthrough columns appended, keeping the original index."""
    scaled = pd.DataFrame(
        scaler.transform(split_df[scale_features]),
        columns=scale_features,
        index=split_df.index,
    )
    return pd.concat([scaled, split_df[passthrough_cols]], axis=1)


final_train_df = _scale_split(train_df)
final_val_df = _scale_split(val_df)
final_test_df = _scale_split(test_df)

In [None]:
# Preview the standardised training split.
final_train_df.head(5)


In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Per-timestep model inputs: the scaled features plus the raw is_home flag.
feature_cols = [
        'form', 'xG', 'shots', 'shotOnTarget', 'deep',
       'possession_proxy', 'is_home', 'rolling_xG', 'rolling_shots',
       'rolling_shotOnTarget', 'rolling_deep', 'rolling_team_form',
       'rolling_possession_proxy', 'opponent_xG', 'opponent_shots',
       'opponent_shotOnTarget', 'opponent_deep', 'opponent_rolling_xG',
       'opponent_rolling_shots', 'opponent_rolling_shotOnTarget',
       'opponent_rolling_deep', 'opponent_rolling_team_form',
       'opponent_rolling_possession_proxy'
]
def create_sequences(data, min_length=5, max_length=34, features=None, target_col='final_pos'):
    """Build growing-prefix match sequences per (team, season).

    For every team-season the matches are ordered by date and one sample is
    emitted per prefix length ``n`` from ``min_length`` upwards (first n
    matches). All samples of a team-season share the same target: the
    ``target_col`` value of the group's first row.

    Parameters
    ----------
    data : DataFrame
        Must contain ``team``, ``season``, ``date``, ``target_col`` and the
        feature columns.
    min_length : int, default 5
        Shortest prefix emitted.
    max_length : int, default 34
        Length every sequence is zero-padded to (at the end). No prefix longer
        than this is emitted, because end-truncation would make all of them
        identical to the ``max_length`` prefix.
        NOTE(review): if a season has more than ``max_length`` matches per
        team, the later matches never reach the model - confirm 34 is right
        for this dataset.
    features : list of str, optional
        Feature columns per timestep; defaults to the module-level ``feature_cols``.
    target_col : str, default 'final_pos'
        Column holding the label.

    Returns
    -------
    (X, y) : X is float32 of shape (n_samples, max_length, n_features); y has
        one label per sample.
    """
    if features is None:
        features = feature_cols

    sequences, targets = [], []
    for _, group in data.groupby(['team', 'season']):
        group = group.sort_values('date')
        X = group[features].values  # 2-D array: (matches in the season, n_features)
        y = group[target_col].iloc[0]

        longest_prefix = min(len(group), max_length)
        for n in range(min_length, longest_prefix + 1):
            sequences.append(X[:n])
            targets.append(y)

    padded_sequences = pad_sequences(
        sequences, maxlen=max_length, padding="post", truncating='post', dtype='float32'
    )
    return padded_sequences, np.array(targets)

In [None]:
# Generate the train / validation / test sequences from the scaled splits.
# Each X is (samples, padded timesteps, features); each y holds one label per sample.
X_train, y_train = create_sequences(final_train_df)
X_val, y_val = create_sequences(final_val_df)
X_test, y_test = create_sequences(final_test_df)


In [None]:
!pip install matplotlib seaborn
import matplotlib.pyplot as plt
import seaborn as sns


***DATA VISUALIZATION FOR SEQUENCES / TEMPORAL PATTERN***


In [None]:

# Build the whole mask from final_train_df itself. The original mixed in
# train_df['team'], which only worked because both frames share an index.
team_data = final_train_df[
    (final_train_df['team'] == "Chelsea") & (final_train_df['season'] == 2014)
].sort_values(by='date')

plt.figure(figsize=(15, 8))  # Single figure with subplots

# One stacked panel per rolling feature (values are standardised).
for i, feature in enumerate(['rolling_team_form', 'rolling_xG', 'rolling_deep'], 1):
    plt.subplot(3, 1, i)  # 3 rows, 1 column, position i
    sns.lineplot(data=team_data, x='date', y=feature, marker='o', color='blue')
    plt.title(f"Chelsea 2014: {feature}")
    plt.xticks(rotation=45)
    plt.grid(True)

plt.tight_layout()  # Prevent overlapping labels
plt.show()

In [None]:

# Teams and features to compare (values come from the standardised training split)
teams = ["Chelsea", "Manchester City", "Liverpool"]  # Add more teams if needed
features = ['form', 'xG', 'deep']
season = 2014
palette = ['#FF6B6B', '#4ECDC4', '#45B7D1']  # one colour per feature column

# One row per team, one column per feature. squeeze=False keeps `axes` 2-D even
# with a single team or feature, so axes[i, j] always works.
fig, axes = plt.subplots(
    len(teams), len(features), figsize=(18, 10), sharex=True, squeeze=False
)

for i, team in enumerate(teams):
    # Filter team data for the season
    team_data = final_train_df[
        (final_train_df['team'] == team) &
        (final_train_df['season'] == season)
    ].sort_values(by='date')

    for j, feature in enumerate(features):
        ax = axes[i, j]
        sns.lineplot(
            data=team_data,
            x='date',
            y=feature,
            ax=ax,
            marker='o',
            color=palette[j % len(palette)]  # cycle if there are more features than colours
        )
        ax.set_title(f"{team} {season}: {feature}")
        ax.set_ylabel(feature)
        ax.tick_params(axis='x', rotation=45)
        ax.grid(True)

plt.tight_layout()
plt.show()

# Count infinite values per numeric column of the training split
numeric_cols = final_train_df.select_dtypes(include=[np.number]).columns
inf_counts = (np.isinf(final_train_df[numeric_cols])).sum()
print("Infinite values in numeric columns:")
print(inf_counts)

In [None]:
def plot_feature_distributions(features=['xG', 'shots', 'is_home']):
    """Overlay the KDE of each feature for the train, validation and test splits.

    Reads the unscaled ``train_df`` / ``val_df`` / ``test_df`` globals and draws
    one panel per entry of ``features`` in a single row.
    """
    splits = (
        ('Train', train_df),
        ('Validation', val_df),
        ('Test', test_df),
    )
    panel_count = len(features)

    plt.figure(figsize=(15, 4))
    for position, feature in enumerate(features, start=1):
        plt.subplot(1, panel_count, position)
        for split_label, split_frame in splits:
            sns.kdeplot(split_frame[feature], label=split_label)
        plt.title(feature)
        plt.legend()
    plt.tight_layout()
    plt.show()

plot_feature_distributions()

In [None]:
def plot_sequence(sequence_idx, max_timesteps=10, n_features=5):
    """Heatmap of the leading timesteps and features of one padded training sequence.

    Rows are the first ``n_features`` entries of ``feature_cols``; columns are
    the first ``max_timesteps`` timesteps of ``X_train[sequence_idx]``.
    """
    window = X_train[sequence_idx][:max_timesteps, :n_features]
    row_labels = feature_cols[:n_features]

    plt.figure(figsize=(12, 6))
    # Transpose so that features run down the y-axis and timesteps along the x-axis.
    sns.heatmap(window.T, annot=True, cmap='viridis',
                yticklabels=row_labels)
    plt.title(f"Sequence {sequence_idx} (First {max_timesteps} Games)")
    plt.xlabel("Timesteps (Games)")
    plt.ylabel("Features")
    plt.show()

# Example: sequence index 32, first 5 timesteps of the first 5 features
plot_sequence(32, max_timesteps=5, n_features=5)

In [None]:
# Show the statistics the scaler learned for the first few features.
print("Training mean:", scaler.mean_[:5])  # First 5 features' means
print("Training std:", scaler.scale_[:5])  # First 5 features' std devs

# The scaler's means must equal the training split's column means, i.e. it was
# fit on train_df only and validation/test rows did not influence the scaling.
assert np.allclose(scaler.mean_, train_df[scale_features].mean()), "Data leakage!"

In [None]:
# Class balance of the training labels. Every team-season contributes one
# sample per prefix length, so counts reflect samples, not team-seasons.
plt.figure(figsize=(10, 4))
sns.countplot(x=y_train)
plt.title("Distribution of Final Positions (Training Set)")
plt.show()

In [None]:
# (samples, padded timesteps, features per timestep)
X_train.shape

In [None]:
# Peek at the labels and the set of distinct classes.
print("y_train:", y_train[:35])
print("\nUnique targets:", np.unique(y_train))# Labels were shifted by -1 earlier, so presumably 0-19 - confirm against the output
y_train.shape

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Masking
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Dimensions of the padded training tensor.
m = X_train.shape[0]           # number of training sequences
time_steps = X_train.shape[1]  # padded sequence length
n = X_train.shape[2]           # features per timestep

In [None]:
# Sequences are zero-padded at the end, so the last timestep of a short sequence is all zeros.
print("Last timestep (should be 0):", X_train[0][-1, :3])  # First 3 features

In [None]:
# Sequence classifier: masked LSTM over the padded match sequences followed by
# a 20-way softmax (presumably one class per final league position - confirm).
model = Sequential([
    # Skip timesteps whose features are all exactly 0.0, i.e. the padding rows.
    Masking(mask_value=0.0, input_shape=(time_steps, n)),
    LSTM(64, return_sequences = False, kernel_regularizer=l2(0.01)),  # L2 (ridge) penalty on the input kernel
    Dropout(0.3),  # randomly zero 30% of the LSTM outputs during training to reduce overfitting
    Dense(20, activation= 'softmax')
])

model.compile(
    optimizer= Adam(learning_rate=0.001),
    # Sparse loss: targets are integer class ids, not one-hot vectors.
    loss = SparseCategoricalCrossentropy(),
    metrics = ['accuracy']
)

model.summary()

os.makedirs('/kaggle/working/checkpoints', exist_ok=True)

# EarlyStopping and ModelCheckpoint callbacks

# Stop after 20 epochs without a val_accuracy improvement and roll back to the best weights.
early_stopping = EarlyStopping(monitor='val_accuracy', patience=20, mode='max', restore_best_weights=True)

# NOTE(review): with save_best_only=False a full model file is written after
# EVERY epoch and `monitor` has no effect on what gets saved - confirm this is
# intended rather than keeping only the best model.
checkpoint = ModelCheckpoint(
    filepath='/kaggle/working/checkpoints/model_epoch{epoch:02d}.keras',
    monitor='val_accuracy',
    save_best_only=False,
    save_weights_only=False,
    verbose=1,
)

# Model training. Keep the returned History so the loss/accuracy curves can be
# inspected afterwards instead of being lost with the cell output.
history = model.fit(
    X_train,
    y_train,
    epochs=80,
    batch_size=32,
    validation_data=(X_val, y_val),
    callbacks=[early_stopping, checkpoint]
)

