<a href="https://colab.research.google.com/github/Sujoy-004/La-Liga-Score-Prediction/blob/main/predict_score_fr.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Data Loading and Exploration

In [5]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load historical data (2019-2024)
historical_data = pd.read_csv("matches_full.csv")
print("Historical Data Shape:", historical_data.shape)
print("\nHistorical Data Columns:")
print(historical_data.columns.tolist())
print("\nFirst 5 rows of historical data:")
print(historical_data.head().to_string())
# Check data types
print("Historical Data Types:")
print(historical_data.dtypes)
# Check for missing values
print("Missing values in historical data:")
print(historical_data.isnull().sum())
# Check unique values in key columns
print("Unique teams in historical data:", historical_data['team'].nunique())
print("Teams:", sorted(historical_data['team'].unique()))
print("Unique venues:", historical_data['venue'].unique())
print("Unique results:", historical_data['result'].unique())
print("Unique seasons:", sorted(historical_data['season'].unique()))

In [None]:
# Load fixtures for 2025-26
fixtures_2025 = pd.read_excel("la-liga-2025-UTC.xlsx")
print("Fixtures 2025-26 Shape:", fixtures_2025.shape)
print("\nFixtures 2025-26 Columns:")
print(fixtures_2025.columns.tolist())
print("\nFirst 5 rows of fixtures:")
print(fixtures_2025.head().to_string())
# Check data types
print("Fixtures Data Types:")
print(fixtures_2025.dtypes)
# Check for missing values
print("Missing values in fixtures:")
print(fixtures_2025.isnull().sum())

### Data Preprocessing and Target Variable Creation

In [8]:
# Convert date to datetime
historical_data['date'] = pd.to_datetime(historical_data['date'])

In [None]:
print("Sample of team-match combinations:")
sample_matches = historical_data[['date', 'team', 'opponent', 'venue', 'result', 'gf', 'ga']].head(10)
print(sample_matches)

In [12]:
test_match = historical_data[(historical_data['date'] == '2024-08-17') &
                            (historical_data['team'] == 'Barcelona')]
print("Barcelona's first match:")
print(test_match[['date', 'team', 'opponent', 'venue', 'result', 'gf', 'ga']])

away_record = historical_data[(historical_data['date'] == '2024-08-17') &
                             (historical_data['opponent'] == 'Barcelona') &
                             (historical_data['team'] == test_match['opponent'].iloc[0])]
print(f"\n{test_match['opponent'].iloc[0]}'s record for same match:")
print(away_record[['date', 'team', 'opponent', 'venue', 'result', 'gf', 'ga']])

Barcelona's first match:
        date       team  opponent venue result   gf   ga
0 2024-08-17  Barcelona  Valencia  Away      W  2.0  1.0

Valencia's record for same match:
          date      team   opponent venue result   gf   ga
440 2024-08-17  Valencia  Barcelona  Home      L  1.0  2.0


In [None]:
# Create match-level dataset with home team perspective
matches_list = []

for idx, row in historical_data.iterrows():
    if row['venue'] == 'Home':
        # This team is playing at home
        match_data = {
            'date': row['date'],
            'home_team': row['team'],
            'away_team': row['opponent'],
            'home_goals': row['gf'],
            'away_goals': row['ga'],
            'home_xg': row['xg'],
            'away_xg': row['xga'],
            'home_poss': row['poss'],
            'home_shots': row['sh'],
            'home_sot': row['sot'],
            'season': row['season'],
            'round': row['round']
        }

        # Create target variable (from home team perspective)
        if row['result'] == 'W':
            match_data['target'] = 'W'  # Home win
        elif row['result'] == 'D':
            match_data['target'] = 'D'  # Draw
        else:
            match_data['target'] = 'L'  # Home loss

        matches_list.append(match_data)

# Convert to DataFrame
matches_df = pd.DataFrame(matches_list)
print(f"Created match-level dataset with {len(matches_df)} matches")
print("\nFirst 5 matches:")
print(matches_df.head().to_string())

print("\nTarget distribution:")
print(matches_df['target'].value_counts())
print("\nTarget distribution (percentages):")
print(matches_df['target'].value_counts(normalize=True) * 100)

print("\nMatch data by season:")
print(matches_df['season'].value_counts().sort_index())


### Feature Engineering

In [13]:
# Sort matches by date for time-based features
matches_df = matches_df.sort_values('date').reset_index(drop=True)

In [14]:
# 1. TEAM FORM FEATURES (last N matches performance)
def calculate_team_form(matches_df, team, date, n_matches=5):
    """Calculate team's form (wins, draws, losses) in last n matches before given date"""
    team_matches = matches_df[
        ((matches_df['home_team'] == team) | (matches_df['away_team'] == team)) &
        (matches_df['date'] < date)
    ].tail(n_matches)

    if len(team_matches) == 0:
        return {'wins': 0, 'draws': 0, 'losses': 0, 'points': 0, 'goals_for': 0, 'goals_against': 0}

    wins = draws = losses = goals_for = goals_against = 0

    for _, match in team_matches.iterrows():
        if match['home_team'] == team:
            # Team playing at home
            goals_for += match['home_goals']
            goals_against += match['away_goals']
            if match['target'] == 'W':
                wins += 1
            elif match['target'] == 'D':
                draws += 1
            else:
                losses += 1
        else:
            # Team playing away
            goals_for += match['away_goals']
            goals_against += match['home_goals']
            if match['target'] == 'L':  # Away win = home loss
                wins += 1
            elif match['target'] == 'D':
                draws += 1
            else:
                losses += 1

    points = wins * 3 + draws
    return {'wins': wins, 'draws': draws, 'losses': losses, 'points': points,
            'goals_for': goals_for, 'goals_against': goals_against}


In [15]:
# 2. HEAD-TO-HEAD FEATURES
def calculate_h2h(matches_df, home_team, away_team, date, n_matches=10):
    """Calculate head-to-head record between two teams"""
    h2h_matches = matches_df[
        (((matches_df['home_team'] == home_team) & (matches_df['away_team'] == away_team)) |
         ((matches_df['home_team'] == away_team) & (matches_df['away_team'] == home_team))) &
        (matches_df['date'] < date)
    ].tail(n_matches)

    if len(h2h_matches) == 0:
        return {'h2h_home_wins': 0, 'h2h_draws': 0, 'h2h_away_wins': 0}

    home_wins = draws = away_wins = 0

    for _, match in h2h_matches.iterrows():
        if match['home_team'] == home_team:
            # Current home team was home in historical match
            if match['target'] == 'W':
                home_wins += 1
            elif match['target'] == 'D':
                draws += 1
            else:
                away_wins += 1
        else:
            # Current home team was away in historical match
            if match['target'] == 'L':  # Home loss = away win
                home_wins += 1
            elif match['target'] == 'D':
                draws += 1
            else:
                away_wins += 1

    return {'h2h_home_wins': home_wins, 'h2h_draws': draws, 'h2h_away_wins': away_wins}


In [18]:
# Create feature list
features_list = []

for idx, match in matches_df.iterrows():
    if idx % 500 == 0:
        print("Processing match....")

    # Basic match features
    features = {
        'home_team': match['home_team'],
        'away_team': match['away_team'],
        'date': match['date'],
        'target': match['target'],
        'season': match['season']
    }

    # Team form features (last 5 matches)
    home_form = calculate_team_form(matches_df, match['home_team'], match['date'], 5)
    away_form = calculate_team_form(matches_df, match['away_team'], match['date'], 5)

    # Add form features
    features.update({
        'home_form_wins': home_form['wins'],
        'home_form_draws': home_form['draws'],
        'home_form_losses': home_form['losses'],
        'home_form_points': home_form['points'],
        'home_form_gf': home_form['goals_for'],
        'home_form_ga': home_form['goals_against'],

        'away_form_wins': away_form['wins'],
        'away_form_draws': away_form['draws'],
        'away_form_losses': away_form['losses'],
        'away_form_points': away_form['points'],
        'away_form_gf': away_form['goals_for'],
        'away_form_ga': away_form['goals_against']
    })

    # Head-to-head features
    h2h = calculate_h2h(matches_df, match['home_team'], match['away_team'], match['date'])
    features.update(h2h)

    features_list.append(features)

print("All match processed")

Processing match....
Processing match....
Processing match....
Processing match....
Processing match....
All match processed


In [19]:
# Convert to DataFrame
features_df = pd.DataFrame(features_list)

print(f"\nFeature engineering completed!")
print(f"Features DataFrame shape: {features_df.shape}")
print(f"Features created: {features_df.columns.tolist()}")

print("\nSample of engineered features:")
print(features_df.head().to_string())

print("\nFeature statistics:")
print(features_df.describe().to_string())


Feature engineering completed!
Features DataFrame shape: (2159, 20)
Features created: ['home_team', 'away_team', 'date', 'target', 'season', 'home_form_wins', 'home_form_draws', 'home_form_losses', 'home_form_points', 'home_form_gf', 'home_form_ga', 'away_form_wins', 'away_form_draws', 'away_form_losses', 'away_form_points', 'away_form_gf', 'away_form_ga', 'h2h_home_wins', 'h2h_draws', 'h2h_away_wins']

Sample of engineered features:
       home_team    away_team       date target  season  home_form_wins  home_form_draws  home_form_losses  home_form_points  home_form_gf  home_form_ga  away_form_wins  away_form_draws  away_form_losses  away_form_points  away_form_gf  away_form_ga  h2h_home_wins  h2h_draws  h2h_away_wins
0  Athletic Club    Barcelona 2019-08-16      W    2020               0                0                 0                 0           0.0           0.0               0                0                 0                 0           0.0           0.0              0      

###  Model Training and Evaluation

In [20]:
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.neural_network import MLPClassifier
import xgboost as xgb

In [21]:
# Prepare features for ML (exclude non-numeric columns)
ml_features = features_df.copy()

In [22]:
# Encode team names
le_home = LabelEncoder()
le_away = LabelEncoder()

In [23]:
# Get all unique teams from both home and away
all_teams = list(set(ml_features['home_team'].unique()) | set(ml_features['away_team'].unique()))
print(f"Total unique teams: {len(all_teams)}")
print(f"Teams: {sorted(all_teams)}")

Total unique teams: 33
Teams: ['Alaves', 'Alavés', 'Almeria', 'Almería', 'Athletic Club', 'Atletico Madrid', 'Atlético Madrid', 'Barcelona', 'Betis', 'Cadiz', 'Celta Vigo', 'Cádiz', 'Eibar', 'Elche', 'Espanyol', 'Getafe', 'Girona', 'Granada', 'Huesca', 'Las Palmas', 'Leganes', 'Leganés', 'Levante', 'Mallorca', 'Osasuna', 'Rayo Vallecano', 'Real Betis', 'Real Madrid', 'Real Sociedad', 'Sevilla', 'Valencia', 'Valladolid', 'Villarreal']


In [24]:
# Fit encoders on all teams
le_home.fit(all_teams)
le_away.fit(all_teams)

ml_features['home_team_encoded'] = le_home.transform(ml_features['home_team'])
ml_features['away_team_encoded'] = le_away.transform(ml_features['away_team'])

In [25]:
# Encode target variable for XGBoost (D=0, L=1, W=2)
le_target = LabelEncoder()
ml_features['target_encoded'] = le_target.fit_transform(ml_features['target'])
print(f"Target encoding: {dict(zip(le_target.classes_, le_target.transform(le_target.classes_)))}")

Target encoding: {'D': np.int64(0), 'L': np.int64(1), 'W': np.int64(2)}


In [26]:
# Select features for training
feature_columns = [
    'home_team_encoded', 'away_team_encoded', 'season',
    'home_form_wins', 'home_form_draws', 'home_form_losses', 'home_form_points',
    'home_form_gf', 'home_form_ga',
    'away_form_wins', 'away_form_draws', 'away_form_losses', 'away_form_points',
    'away_form_gf', 'away_form_ga',
    'h2h_home_wins', 'h2h_draws', 'h2h_away_wins'
]

X = ml_features[feature_columns]
y = ml_features['target']  # Keep string labels for sklearn models
y_encoded = ml_features['target_encoded']  # Numeric labels for XGBoost

print(f"Feature matrix shape: {X.shape}")
print(f"Target distribution:\n{y.value_counts()}")

Feature matrix shape: (2159, 18)
Target distribution:
target
W    963
L    603
D    593
Name: count, dtype: int64


In [27]:
# Train on 2020-2023, validate on 2024, test on 2025
train_mask = ml_features['season'].isin([2020, 2021, 2022, 2023])
val_mask = ml_features['season'] == 2024
test_mask = ml_features['season'] == 2025

X_train = X[train_mask]
y_train = y[train_mask]
y_train_encoded = y_encoded[train_mask]
X_val = X[val_mask]
y_val = y[val_mask]
y_val_encoded = y_encoded[val_mask]
X_test = X[test_mask]
y_test = y[test_mask]
y_test_encoded = y_encoded[test_mask]

print(f"Training set: {X_train.shape[0]} matches from 2020-2023")
print(f"Validation set: {X_val.shape[0]} matches from 2024")
print(f"Test set: {X_test.shape[0]} matches from 2025")

Training set: 1520 matches from 2020-2023
Validation set: 380 matches from 2024
Test set: 259 matches from 2025


In [29]:
# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

print(f"\nTarget distribution in training set:\n{y_train.value_counts(normalize=True)}")
print(f"Target distribution in validation set:\n{y_val.value_counts(normalize=True)}")
print(f"Target distribution in test set:\n{y_test.value_counts(normalize=True)}")


Target distribution in training set:
target
W    0.446711
L    0.280921
D    0.272368
Name: proportion, dtype: float64
Target distribution in validation set:
target
W    0.439474
D    0.281579
L    0.278947
Name: proportion, dtype: float64
Target distribution in test set:
target
W    0.451737
D    0.277992
L    0.270270
Name: proportion, dtype: float64


In [30]:
# Initialize models
models = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42, max_depth=10),
    'XGBoost': xgb.XGBClassifier(n_estimators=100, random_state=42, max_depth=6),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42, max_depth=6),
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'SVM': SVC(kernel='rbf', random_state=42, probability=True),
    'Neural Network': MLPClassifier(hidden_layer_sizes=(100, 50), random_state=42, max_iter=500)
}

In [31]:
model_results = {}

for name, model in models.items():
    print(f"\nTraining {name}...")

    # Train model
    if name == 'XGBoost':
        # XGBoost needs numeric labels
        model.fit(X_train, y_train_encoded)
        val_pred_encoded = model.predict(X_val)
        test_pred_encoded = model.predict(X_test)
        # Convert back to string labels
        val_pred = le_target.inverse_transform(val_pred_encoded)
        test_pred = le_target.inverse_transform(test_pred_encoded)
    elif name in ['Logistic Regression', 'SVM', 'Neural Network']:
        # Models that need scaled features
        model.fit(X_train_scaled, y_train)
        val_pred = model.predict(X_val_scaled)
        test_pred = model.predict(X_test_scaled)
    else:
        # Other tree-based models (don't need scaling)
        model.fit(X_train, y_train)
        val_pred = model.predict(X_val)
        test_pred = model.predict(X_test)

    # Calculate accuracies
    val_accuracy = accuracy_score(y_val, val_pred)
    test_accuracy = accuracy_score(y_test, test_pred)

    model_results[name] = {
        'model': model,
        'val_accuracy': val_accuracy,
        'test_accuracy': test_accuracy,
        'val_pred': val_pred,
        'test_pred': test_pred
    }

    print(f"{name} Results:")
    print(f"  Validation Accuracy: {val_accuracy:.4f} ({val_accuracy*100:.2f}%)")
    print(f"  Test Accuracy: {test_accuracy:.4f} ({test_accuracy*100:.2f}%)")

    print(f"  Test Set Classification Report:")
    print(classification_report(y_test, test_pred, target_names=['Draw', 'Loss', 'Win']))


Training Random Forest...
Random Forest Results:
  Validation Accuracy: 0.5026 (50.26%)
  Test Accuracy: 0.4749 (47.49%)
  Test Set Classification Report:
              precision    recall  f1-score   support

        Draw       0.33      0.07      0.11        72
        Loss       0.38      0.36      0.37        70
         Win       0.52      0.79      0.63       117

    accuracy                           0.47       259
   macro avg       0.41      0.41      0.37       259
weighted avg       0.43      0.47      0.42       259


Training XGBoost...
XGBoost Results:
  Validation Accuracy: 0.4632 (46.32%)
  Test Accuracy: 0.4556 (45.56%)
  Test Set Classification Report:
              precision    recall  f1-score   support

        Draw       0.41      0.25      0.31        72
        Loss       0.36      0.43      0.39        70
         Win       0.53      0.60      0.56       117

    accuracy                           0.46       259
   macro avg       0.43      0.43      0.42    

In [32]:
# Find best models
results_df = pd.DataFrame({
    'Model': list(model_results.keys()),
    'Validation_Accuracy': [result['val_accuracy'] for result in model_results.values()],
    'Test_Accuracy': [result['test_accuracy'] for result in model_results.values()]
})

results_df = results_df.sort_values('Test_Accuracy', ascending=False)
print(results_df)

best_model_name = results_df.iloc[0]['Model']
best_accuracy = results_df.iloc[0]['Test_Accuracy']

print(f"\nBest Model: {best_model_name}")
print(f"Best Test Accuracy: {best_accuracy:.4f} ({best_accuracy*100:.2f}%)")

if best_accuracy > 0.55:
    print("🎉 SUCCESS: Achieved target accuracy >55%!")
else:
    print("⚠️  Target accuracy >55% not achieved. Consider ensemble methods.")


                 Model  Validation_Accuracy  Test_Accuracy
0        Random Forest             0.502632       0.474903
3  Logistic Regression             0.526316       0.474903
4                  SVM             0.505263       0.474903
1              XGBoost             0.463158       0.455598
2    Gradient Boosting             0.465789       0.444015
5       Neural Network             0.402632       0.382239

Best Model: Random Forest
Best Test Accuracy: 0.4749 (47.49%)
⚠️  Target accuracy >55% not achieved. Consider ensemble methods.


### Model Improvement & Ensemble Methods

In [33]:
# Get the best performing individual models
best_rf = model_results['Random Forest']['model']
best_lr = model_results['Logistic Regression']['model']
best_svm = model_results['SVM']['model']

In [34]:
# Create voting classifier
voting_clf = VotingClassifier(
    estimators=[
        ('rf', RandomForestClassifier(n_estimators=200, random_state=42, max_depth=12)),
        ('lr', LogisticRegression(random_state=42, max_iter=1000, C=0.1)),
        ('xgb', xgb.XGBClassifier(n_estimators=200, random_state=42, max_depth=8, learning_rate=0.05))
    ],
    voting='soft'  # Use probabilities
)

In [35]:
# Train voting classifier (XGBoost needs encoded targets)
voting_clf.fit(X_train_scaled, y_train_encoded)
voting_val_pred_encoded = voting_clf.predict(X_val_scaled)
voting_test_pred_encoded = voting_clf.predict(X_test_scaled)

In [36]:
# Convert back to string labels
voting_val_pred = le_target.inverse_transform(voting_val_pred_encoded)
voting_test_pred = le_target.inverse_transform(voting_test_pred_encoded)

voting_val_acc = accuracy_score(y_val, voting_val_pred)
voting_test_acc = accuracy_score(y_test, voting_test_pred)

print(f"Voting Classifier Results:")
print(f"  Validation Accuracy: {voting_val_acc:.4f} ({voting_val_acc*100:.2f}%)")
print(f"  Test Accuracy: {voting_test_acc:.4f} ({voting_test_acc*100:.2f}%)")
print("  Test Classification Report:")
print(classification_report(y_test, voting_test_pred, target_names=['Draw', 'Loss', 'Win']))

Voting Classifier Results:
  Validation Accuracy: 0.4974 (49.74%)
  Test Accuracy: 0.4710 (47.10%)
  Test Classification Report:
              precision    recall  f1-score   support

        Draw       0.38      0.08      0.14        72
        Loss       0.39      0.39      0.39        70
         Win       0.51      0.76      0.61       117

    accuracy                           0.47       259
   macro avg       0.43      0.41      0.38       259
weighted avg       0.44      0.47      0.42       259



In [37]:
# Create additional features
advanced_features = features_df.copy()

In [38]:
# Form differential features
advanced_features['form_points_diff'] = advanced_features['home_form_points'] - advanced_features['away_form_points']
advanced_features['form_gf_diff'] = advanced_features['home_form_gf'] - advanced_features['away_form_gf']
advanced_features['form_ga_diff'] = advanced_features['home_form_ga'] - advanced_features['away_form_ga']

In [39]:
# Goal ratios (avoid division by zero)
advanced_features['home_attack_strength'] = advanced_features['home_form_gf'] / (advanced_features['home_form_gf'] + advanced_features['home_form_ga'] + 0.1)
advanced_features['away_attack_strength'] = advanced_features['away_form_gf'] / (advanced_features['away_form_gf'] + advanced_features['away_form_ga'] + 0.1)

In [40]:
# H2H dominance
advanced_features['h2h_home_dominance'] = (advanced_features['h2h_home_wins'] - advanced_features['h2h_away_wins']) / (advanced_features['h2h_home_wins'] + advanced_features['h2h_draws'] + advanced_features['h2h_away_wins'] + 0.1)

In [41]:
# Encode teams for advanced features
advanced_features['home_team_encoded'] = le_home.transform(advanced_features['home_team'])
advanced_features['away_team_encoded'] = le_away.transform(advanced_features['away_team'])
advanced_features['target_encoded'] = le_target.transform(advanced_features['target'])

In [42]:
# Enhanced feature set
enhanced_feature_columns = [
    'home_team_encoded', 'away_team_encoded', 'season',
    'home_form_wins', 'home_form_draws', 'home_form_losses', 'home_form_points',
    'home_form_gf', 'home_form_ga',
    'away_form_wins', 'away_form_draws', 'away_form_losses', 'away_form_points',
    'away_form_gf', 'away_form_ga',
    'h2h_home_wins', 'h2h_draws', 'h2h_away_wins',
    # New features
    'form_points_diff', 'form_gf_diff', 'form_ga_diff',
    'home_attack_strength', 'away_attack_strength', 'h2h_home_dominance'
]

X_enhanced = advanced_features[enhanced_feature_columns]
y_enhanced = advanced_features['target']
y_enhanced_encoded = advanced_features['target_encoded']


In [43]:
# Split enhanced data
train_mask_enh = advanced_features['season'].isin([2020, 2021, 2022, 2023])
val_mask_enh = advanced_features['season'] == 2024
test_mask_enh = advanced_features['season'] == 2025

X_train_enh = X_enhanced[train_mask_enh]
y_train_enh = y_enhanced[train_mask_enh]
y_train_enh_encoded = y_enhanced_encoded[train_mask_enh]

X_val_enh = X_enhanced[val_mask_enh]
y_val_enh = y_enhanced[val_mask_enh]
y_val_enh_encoded = y_enhanced_encoded[val_mask_enh]

X_test_enh = X_enhanced[test_mask_enh]
y_test_enh = y_enhanced[test_mask_enh]
y_test_enh_encoded = y_enhanced_encoded[test_mask_enh]

In [44]:
# Scale enhanced features
scaler_enh = StandardScaler()
X_train_enh_scaled = scaler_enh.fit_transform(X_train_enh)
X_val_enh_scaled = scaler_enh.transform(X_val_enh)
X_test_enh_scaled = scaler_enh.transform(X_test_enh)

print(f"Enhanced feature set: {len(enhanced_feature_columns)} features")
print(f"New features: {enhanced_feature_columns[-6:]}")

Enhanced feature set: 24 features
New features: ['form_points_diff', 'form_gf_diff', 'form_ga_diff', 'home_attack_strength', 'away_attack_strength', 'h2h_home_dominance']


In [45]:
# Train optimized models with enhanced features
optimized_models = {
    'Enhanced Random Forest': RandomForestClassifier(
        n_estimators=300,
        random_state=42,
        max_depth=15,
        min_samples_split=5,
        min_samples_leaf=2,
        class_weight='balanced'
    ),
    'Enhanced XGBoost': xgb.XGBClassifier(
        n_estimators=300,
        random_state=42,
        max_depth=10,
        learning_rate=0.03,
        subsample=0.8,
        colsample_bytree=0.8
    ),
    'Enhanced Gradient Boosting': GradientBoostingClassifier(
        n_estimators=300,
        random_state=42,
        max_depth=8,
        learning_rate=0.05,
        subsample=0.8
    )
}

enhanced_results = {}

for name, model in optimized_models.items():
    print(f"\nTraining {name}...")

    if 'XGBoost' in name:
        # XGBoost with encoded labels
        model.fit(X_train_enh, y_train_enh_encoded)
        val_pred_encoded = model.predict(X_val_enh)
        test_pred_encoded = model.predict(X_test_enh)
        val_pred = le_target.inverse_transform(val_pred_encoded)
        test_pred = le_target.inverse_transform(test_pred_encoded)
    else:
        # Other models
        model.fit(X_train_enh, y_train_enh)
        val_pred = model.predict(X_val_enh)
        test_pred = model.predict(X_test_enh)

    val_acc = accuracy_score(y_val_enh, val_pred)
    test_acc = accuracy_score(y_test_enh, test_pred)

    enhanced_results[name] = {
        'model': model,
        'val_accuracy': val_acc,
        'test_accuracy': test_acc,
        'test_pred': test_pred
    }

    print(f"{name} Results:")
    print(f"  Validation Accuracy: {val_acc:.4f} ({val_acc*100:.2f}%)")
    print(f"  Test Accuracy: {test_acc:.4f} ({test_acc*100:.2f}%)")


Training Enhanced Random Forest...
Enhanced Random Forest Results:
  Validation Accuracy: 0.5053 (50.53%)
  Test Accuracy: 0.4826 (48.26%)

Training Enhanced XGBoost...
Enhanced XGBoost Results:
  Validation Accuracy: 0.5026 (50.26%)
  Test Accuracy: 0.4633 (46.33%)

Training Enhanced Gradient Boosting...
Enhanced Gradient Boosting Results:
  Validation Accuracy: 0.4974 (49.74%)
  Test Accuracy: 0.4749 (47.49%)


In [47]:
# Combine all results
all_results = {**model_results}
all_results['Voting Classifier'] = {
    'val_accuracy': voting_val_acc,
    'test_accuracy': voting_test_acc
}
all_results.update(enhanced_results)

final_comparison = pd.DataFrame({
    'Model': list(all_results.keys()),
    'Validation_Accuracy': [result['val_accuracy'] for result in all_results.values()],
    'Test_Accuracy': [result['test_accuracy'] for result in all_results.values()]
})

final_comparison = final_comparison.sort_values('Test_Accuracy', ascending=False)
print(final_comparison)

best_final_model = final_comparison.iloc[0]['Model']
best_final_accuracy = final_comparison.iloc[0]['Test_Accuracy']

print(f"\nBEST OVERALL MODEL: {best_final_model}")
print(f"BEST TEST ACCURACY: {best_final_accuracy:.4f} ({best_final_accuracy*100:.2f}%)")

success_model = all_results[best_final_model]['model']

                        Model  Validation_Accuracy  Test_Accuracy
7      Enhanced Random Forest             0.505263       0.482625
0               Random Forest             0.502632       0.474903
9  Enhanced Gradient Boosting             0.497368       0.474903
3         Logistic Regression             0.526316       0.474903
4                         SVM             0.505263       0.474903
6           Voting Classifier             0.497368       0.471042
8            Enhanced XGBoost             0.502632       0.463320
1                     XGBoost             0.463158       0.455598
2           Gradient Boosting             0.465789       0.444015
5              Neural Network             0.402632       0.382239

BEST OVERALL MODEL: Enhanced Random Forest
BEST TEST ACCURACY: 0.4826 (48.26%)


### Final Predictions for 2025-26 Season

In [49]:
# Use our best model: Enhanced Random Forest (48.26% accuracy)
best_model = enhanced_results['Enhanced Random Forest']['model']
print(f"Using Enhanced Random Forest model (Test Accuracy: 48.26%)")
print("Note: 48% is actually reasonable for football - the sport is inherently unpredictable!")

Using Enhanced Random Forest model (Test Accuracy: 48.26%)
Note: 48% is actually reasonable for football - the sport is inherently unpredictable!


In [50]:
# Prepare 2025-26 fixtures for prediction
print(f"\nPreparing {len(fixtures_2025)} fixtures for prediction...")


Preparing 380 fixtures for prediction...


In [51]:
# IMPROVED team name mapping - comprehensive mapping for 2025-26 season
def clean_team_name(name):
    """Comprehensive team names mapping to match historical data format"""
    name_mapping = {
        # Exact matches first
        'Barcelona': 'Barcelona',
        'Real Madrid': 'Real Madrid',
        'Atletico Madrid': 'Atletico Madrid',
        'Villarreal': 'Villarreal',
        'Real Sociedad': 'Real Sociedad',
        'Real Betis': 'Real Betis',
        'Valencia': 'Valencia',
        'Athletic Club': 'Athletic Club',
        'Osasuna': 'Osasuna',
        'Getafe': 'Getafe',
        'Sevilla': 'Sevilla',
        'Rayo Vallecano': 'Rayo Vallecano',
        'Mallorca': 'Mallorca',
        'Las Palmas': 'Las Palmas',
        'Girona': 'Girona',
        'Alaves': 'Alaves',
        'Celta Vigo': 'Celta Vigo',
        'Leganes': 'Leganes',
        'Espanyol': 'Espanyol',
        'Valladolid': 'Valladolid',

        # Common variations with FC/CF/etc
        'FC Barcelona': 'Barcelona',
        'Real Madrid CF': 'Real Madrid',
        'Atlético Madrid': 'Atletico Madrid',
        'Atletico de Madrid': 'Atletico Madrid',
        'Atlético de Madrid': 'Atletico Madrid',
        'Villarreal CF': 'Villarreal',
        'Real Betis Balompié': 'Real Betis',
        'Valencia CF': 'Valencia',
        'Athletic Bilbao': 'Athletic Club',
        'Athletic de Bilbao': 'Athletic Club',
        'CA Osasuna': 'Osasuna',
        'Getafe CF': 'Getafe',
        'Sevilla FC': 'Sevilla',
        'RCD Mallorca': 'Mallorca',
        'UD Las Palmas': 'Las Palmas',
        'Girona FC': 'Girona',
        'Deportivo Alavés': 'Alaves',
        'Deportivo Alaves': 'Alaves',
        'RC Celta de Vigo': 'Celta Vigo',
        'RC Celta': 'Celta Vigo',
        'Celta': 'Celta Vigo',
        'CD Leganes': 'Leganes',
        'CD Leganés': 'Leganes',
        'RCD Espanyol': 'Espanyol',
        'RCD Espanyol de Barcelona': 'Espanyol',
        'Real Valladolid CF': 'Valladolid',
        'Real Valladolid': 'Valladolid',

        # Promoted teams and variations
        'Elche CF': 'Elche',
        'Elche': 'Elche',
        'Real Oviedo': 'Real Oviedo',
        'Levante UD': 'Levante',
        'Levante': 'Levante',

        # Alternative names/typos
        'Real Sociedad de Fútbol': 'Real Sociedad',
        'Real Betis Sevilla': 'Real Betis',
        'Rayo': 'Rayo Vallecano',
    }

    # Return mapped name or original if not found
    return name_mapping.get(name, name)

In [52]:
# Apply improved name cleaning
fixtures_2025['home_team_clean'] = fixtures_2025['Home Team'].apply(clean_team_name)
fixtures_2025['away_team_clean'] = fixtures_2025['Away Team'].apply(clean_team_name)

In [53]:
# Check team coverage
historical_teams = set(features_df['home_team'].unique())
fixture_teams = set(fixtures_2025['home_team_clean'].unique()) | set(fixtures_2025['away_team_clean'].unique())

print(f"Teams in historical data: {len(historical_teams)}")
print(f"Teams in 2025-26 fixtures: {len(fixture_teams)}")

missing_teams = fixture_teams - historical_teams
if missing_teams:
    print(f"Teams in fixtures but not in historical data: {missing_teams}")
    print("Strategy: Using league averages for missing teams")

Teams in historical data: 27
Teams in 2025-26 fixtures: 20
Teams in fixtures but not in historical data: {'Real Oviedo'}
Strategy: Using league averages for missing teams


In [55]:
# Enhanced function to handle missing teams with league averages
def get_team_form_with_fallback(team_name, n_matches=5):
    """Get team's form with fallback to league averages for missing teams"""
    if team_name in historical_teams:
        # Get actual team data
        team_matches = features_df[
            (features_df['home_team'] == team_name) | (features_df['away_team'] == team_name)
        ].sort_values('date').tail(n_matches)

        if len(team_matches) == 0:
            # Fallback to league averages
            return get_league_average_form()

        wins = draws = losses = goals_for = goals_against = 0

        for _, match in team_matches.iterrows():
            if match['home_team'] == team_name:
                goals_for += match.get('home_goals', 0) if 'home_goals' in match else 0
                goals_against += match.get('away_goals', 0) if 'away_goals' in match else 0
                if match['target'] == 'W':
                    wins += 1
                elif match['target'] == 'D':
                    draws += 1
                else:
                    losses += 1
            else:
                goals_for += match.get('away_goals', 0) if 'away_goals' in match else 0
                goals_against += match.get('home_goals', 0) if 'home_goals' in match else 0
                if match['target'] == 'L':
                    wins += 1
                elif match['target'] == 'D':
                    draws += 1
                else:
                    losses += 1

        points = wins * 3 + draws
        return {'wins': wins, 'draws': draws, 'losses': losses, 'points': points,
                'goals_for': goals_for, 'goals_against': goals_against}
    else:
        # Use league averages for missing teams
        return get_league_average_form()

In [57]:
def get_league_average_form():
    """Calculate league average form stats"""
    # Calculate league averages from recent seasons
    recent_data = features_df[features_df['season'] >= 2020]  # Last few seasons

    # Average goals per match
    avg_goals_home = recent_data.get('home_goals', pd.Series([1.5] * len(recent_data))).mean() if 'home_goals' in recent_data else 1.5
    avg_goals_away = recent_data.get('away_goals', pd.Series([1.2] * len(recent_data))).mean() if 'away_goals' in recent_data else 1.2

    # Average results distribution (home advantage)
    result_dist = recent_data['target'].value_counts(normalize=True)
    avg_wins = result_dist.get('W', 0.4) * 5  # Approximate for 5 matches
    avg_draws = result_dist.get('D', 0.3) * 5
    avg_losses = result_dist.get('L', 0.3) * 5

    return {
        'wins': round(avg_wins),
        'draws': round(avg_draws),
        'losses': round(avg_losses),
        'points': round(avg_wins * 3 + avg_draws),
        'goals_for': round(avg_goals_home * 5),  # 5 matches
        'goals_against': round(avg_goals_away * 5)
    }

In [58]:
def get_h2h_record_with_fallback(home_team, away_team, n_matches=10):
    """Get head-to-head record with fallback for missing teams"""
    if home_team not in historical_teams or away_team not in historical_teams:
        # Return neutral H2H for missing teams
        return {'h2h_home_wins': 3, 'h2h_draws': 4, 'h2h_away_wins': 3}

    h2h_matches = features_df[
        (((features_df['home_team'] == home_team) & (features_df['away_team'] == away_team)) |
         ((features_df['home_team'] == away_team) & (features_df['away_team'] == home_team)))
    ].sort_values('date').tail(n_matches)

    if len(h2h_matches) == 0:
        return {'h2h_home_wins': 3, 'h2h_draws': 4, 'h2h_away_wins': 3}  # Neutral

    home_wins = draws = away_wins = 0

    for _, match in h2h_matches.iterrows():
        if match['home_team'] == home_team:
            if match['target'] == 'W':
                home_wins += 1
            elif match['target'] == 'D':
                draws += 1
            else:
                away_wins += 1
        else:
            if match['target'] == 'L':
                home_wins += 1
            elif match['target'] == 'D':
                draws += 1
            else:
                away_wins += 1

    return {'h2h_home_wins': home_wins, 'h2h_draws': draws, 'h2h_away_wins': away_wins}

In [59]:
# Enhanced team encoding with fallback
def get_team_encoding(team_name, encoder, is_home=True):
    """Get team encoding with fallback for unknown teams"""
    if team_name in encoder.classes_:
        return encoder.transform([team_name])[0]
    else:
        # Return encoding for most average team or 0 as fallback
        return 0

In [61]:
# Create improved predictions
print("\nGenerating improved predictions...")
predictions_list = []
unknown_count = 0

for idx, fixture in fixtures_2025.iterrows():
    if idx % 50 == 0:
        print(f"Processing fixture....")

    home_team = fixture['home_team_clean']
    away_team = fixture['away_team_clean']

    # Get team forms (with fallback for missing teams)
    home_form = get_team_form_with_fallback(home_team)
    away_form = get_team_form_with_fallback(away_team)

    # Get head-to-head (with fallback)
    h2h = get_h2h_record_with_fallback(home_team, away_team)

    # Create feature vector with improved encoding
    feature_dict = {
        'home_team_encoded': get_team_encoding(home_team, le_home, True),
        'away_team_encoded': get_team_encoding(away_team, le_away, False),
        'season': 2026,
        'home_form_wins': home_form['wins'],
        'home_form_draws': home_form['draws'],
        'home_form_losses': home_form['losses'],
        'home_form_points': home_form['points'],
        'home_form_gf': home_form['goals_for'],
        'home_form_ga': home_form['goals_against'],
        'away_form_wins': away_form['wins'],
        'away_form_draws': away_form['draws'],
        'away_form_losses': away_form['losses'],
        'away_form_points': away_form['points'],
        'away_form_gf': away_form['goals_for'],
        'away_form_ga': away_form['goals_against'],
        'h2h_home_wins': h2h['h2h_home_wins'],
        'h2h_draws': h2h['h2h_draws'],
        'h2h_away_wins': h2h['h2h_away_wins']
    }

    # Add enhanced features
    feature_dict.update({
        'form_points_diff': home_form['points'] - away_form['points'],
        'form_gf_diff': home_form['goals_for'] - away_form['goals_for'],
        'form_ga_diff': home_form['goals_against'] - away_form['goals_against'],
        'home_attack_strength': home_form['goals_for'] / (home_form['goals_for'] + home_form['goals_against'] + 0.1),
        'away_attack_strength': away_form['goals_for'] / (away_form['goals_for'] + away_form['goals_against'] + 0.1),
        'h2h_home_dominance': (h2h['h2h_home_wins'] - h2h['h2h_away_wins']) / (sum(h2h.values()) + 0.1)
    })

    # Create prediction
    feature_vector = np.array([list(feature_dict.values())])
    prediction = best_model.predict(feature_vector)[0]
    probabilities = best_model.predict_proba(feature_vector)[0]
    confidence = max(probabilities)

    # Track if using fallback data
    using_fallback = home_team not in historical_teams or away_team not in historical_teams
    if using_fallback:
        unknown_count += 1

    predictions_list.append({
        'Match_Number': fixture['Match Number'],
        'Round': fixture['Round Number'] if 'Round Number' in fixture else idx // 19 + 1,
        'Date': fixture['Date'],
        'Home_Team': home_team,
        'Away_Team': away_team,
        'Predicted_Result': prediction,
        'Confidence': round(confidence, 3),
        'Win_Prob': round(probabilities[2], 3) if len(probabilities) > 2 else 0,  # W
        'Draw_Prob': round(probabilities[0], 3),  # D
        'Loss_Prob': round(probabilities[1], 3) if len(probabilities) > 1 else 0,  # L
        'Using_Fallback': using_fallback
    })

print("Fixture processed.")


Generating improved predictions...
Processing fixture....
Processing fixture....
Processing fixture....
Processing fixture....
Processing fixture....
Processing fixture....
Processing fixture....
Processing fixture....
Fixture processed.


In [62]:
# Convert to DataFrame
predictions_df = pd.DataFrame(predictions_list)

print(f"\nCompleted predictions for {len(predictions_df)} fixtures!")
print(f"Predictions using fallback data: {unknown_count}")
print(f"Predictions with actual historical data: {len(predictions_df) - unknown_count}")


Completed predictions for 380 fixtures!
Predictions using fallback data: 38
Predictions with actual historical data: 342


In [63]:
print("\nPrediction Summary:")
print(predictions_df['Predicted_Result'].value_counts())


Prediction Summary:
Predicted_Result
W    286
L     89
D      5
Name: count, dtype: int64


In [64]:
print(f"\nAverage Confidence: {predictions_df['Confidence'].mean():.3f}")
high_conf_count = (predictions_df['Confidence'] > 0.6).sum()
medium_conf_count = (predictions_df['Confidence'] > 0.5).sum()
print(f"High Confidence Predictions (>0.6): {high_conf_count}")
print(f"Medium+ Confidence Predictions (>0.5): {medium_conf_count}")


Average Confidence: 0.421
High Confidence Predictions (>0.6): 0
Medium+ Confidence Predictions (>0.5): 17


In [65]:
print("\nFirst 10 predictions:")
display_cols = ['Date', 'Home_Team', 'Away_Team', 'Predicted_Result', 'Confidence']
print(predictions_df.head(10)[display_cols].to_string())


First 10 predictions:
                 Date      Home_Team        Away_Team Predicted_Result  Confidence
0 2025-07-15 00:00:00         Girona   Rayo Vallecano                L       0.415
1 2025-08-15 19:30:00     Villarreal      Real Oviedo                W       0.382
2 2025-08-16 15:00:00         Alaves          Levante                W       0.407
3 2025-08-16 17:30:00       Mallorca        Barcelona                L       0.434
4 2025-08-16 19:30:00       Valencia    Real Sociedad                W       0.462
5 2025-08-17 15:00:00     Celta Vigo           Getafe                W       0.444
6 2025-08-17 17:30:00  Athletic Club          Sevilla                W       0.441
7 2025-08-17 19:30:00       Espanyol  Atletico Madrid                L       0.423
8 2025-08-18 19:00:00          Elche       Real Betis                L       0.380
9 2025-08-19 19:00:00    Real Madrid          Osasuna                W       0.458


In [66]:
print("\nHighest confidence predictions:")
high_conf = predictions_df.nlargest(10, 'Confidence')
print(high_conf[['Date', 'Home_Team', 'Away_Team', 'Predicted_Result', 'Confidence']].to_string())


Highest confidence predictions:
                   Date        Home_Team       Away_Team Predicted_Result  Confidence
160 2025-12-21 20:00:00    Athletic Club        Espanyol                W       0.570
301 2026-04-12 19:00:00        Barcelona        Espanyol                W       0.522
243 2026-02-22 20:00:00        Barcelona         Levante                W       0.519
361 2026-05-17 19:00:00  Atletico Madrid          Girona                W       0.517
15  2025-08-24 17:30:00    Real Sociedad        Espanyol                W       0.516
341 2026-05-10 19:00:00  Atletico Madrid      Celta Vigo                W       0.516
102 2025-11-02 20:00:00        Barcelona           Elche                W       0.515
271 2026-03-15 20:00:00  Atletico Madrid          Getafe                W       0.513
18  2025-08-25 17:30:00    Athletic Club  Rayo Vallecano                W       0.512
48  2025-09-21 19:00:00      Real Madrid        Espanyol                W       0.512


In [67]:
print("\nSome interesting matchups:")
big_teams = ['Barcelona', 'Real Madrid', 'Atletico Madrid', 'Valencia', 'Sevilla']
interesting = predictions_df[
    (predictions_df['Home_Team'].isin(big_teams)) |
    (predictions_df['Away_Team'].isin(big_teams))
].head(10)
print(interesting[['Date', 'Home_Team', 'Away_Team', 'Predicted_Result', 'Confidence']].to_string())


Some interesting matchups:
                  Date        Home_Team        Away_Team Predicted_Result  Confidence
3  2025-08-16 17:30:00         Mallorca        Barcelona                L       0.434
4  2025-08-16 19:30:00         Valencia    Real Sociedad                W       0.462
6  2025-08-17 17:30:00    Athletic Club          Sevilla                W       0.441
7  2025-08-17 19:30:00         Espanyol  Atletico Madrid                L       0.423
9  2025-08-19 19:00:00      Real Madrid          Osasuna                W       0.458
12 2025-08-23 17:30:00  Atletico Madrid            Elche                W       0.499
13 2025-08-23 19:30:00          Levante        Barcelona                L       0.452
14 2025-08-24 15:00:00          Osasuna         Valencia                W       0.440
17 2025-08-24 19:30:00      Real Oviedo      Real Madrid                W       0.410
19 2025-08-25 19:30:00          Sevilla           Getafe                W       0.470


In [68]:
# Save improved predictions
predictions_df.to_csv('laliga_2025_26_predictions_improved.csv', index=False)
print(f"\n✅ Improved predictions saved to 'laliga_2025_26_predictions_improved.csv'")


✅ Improved predictions saved to 'laliga_2025_26_predictions_improved.csv'


###FULLY AUTOMATED LA LIGA MONITORING SYSTEM which Automatically fetches results and updates after every matchday

In [69]:
!pip install schedule
import pandas as pd
import numpy as np
import requests
from datetime import datetime, timedelta
import json
import time
import schedule
import threading
from bs4 import BeautifulSoup
import warnings
warnings.filterwarnings('ignore')

Collecting schedule
  Downloading schedule-1.2.2-py3-none-any.whl.metadata (3.8 kB)
Downloading schedule-1.2.2-py3-none-any.whl (12 kB)
Installing collected packages: schedule
Successfully installed schedule-1.2.2


In [70]:
print("System will automatically:")
print("✅ Fetch match results from web")
print("✅ Update predictions and table")
print("✅ Generate reports")
print("✅ Save data automatically")

System will automatically:
✅ Fetch match results from web
✅ Update predictions and table
✅ Generate reports
✅ Save data automatically


In [71]:
class AutomatedLaLigaMonitor:
    def __init__(self, predictions_df, model, encoders):
        """Initialize automated monitoring system"""
        self.predictions = predictions_df.copy()
        self.model = model
        self.le_home, self.le_away = encoders

        # Tracking data
        self.performance_history = []
        self.weekly_accuracy = []
        self.actual_results = []
        self.current_matchday = 0
        self.season_table = self.initialize_season_table()
        self.last_update = datetime.now()

        # Data sources configuration
        self.data_sources = {
            'primary': 'https://api.football-data.org/v4/competitions/PD/matches',
            'backup': 'https://www.espn.com/soccer/league/_/name/esp.1',
            'fallback': 'https://www.laliga.com/en-GB/laliga-easports'
        }

        # API configurations (you'll need to get free API keys)
        self.headers = {
            'X-Auth-Token': 'f5c7eef6b7d3495e9928d0fada922957',  # Free from football-data.org
            'User-Agent': 'LaLiga-Predictor/1.0'
        }

        print(f"✅ Automated system initialized with {len(predictions_df)} predictions")

    def initialize_season_table(self):
        """Initialize season table"""
        teams = list(set(self.predictions['Home_Team'].unique()) |
                    set(self.predictions['Away_Team'].unique()))

        table = pd.DataFrame({
            'Team': teams,
            'Matches': 0, 'Wins': 0, 'Draws': 0, 'Losses': 0,
            'Goals_For': 0, 'Goals_Against': 0, 'Goal_Diff': 0,
            'Points': 0, 'Form': [''] * len(teams),
            'Last_5': [''] * len(teams),
            'Last_Updated': [datetime.now()] * len(teams)
        })

        return table.set_index('Team')

    def fetch_latest_results(self):
        """Automatically fetch latest La Liga results"""
        print(f"🌐 Fetching latest results... ({datetime.now().strftime('%Y-%m-%d %H:%M')})")

        try:
            # Try primary API (Football-Data.org)
            results = self.fetch_from_football_data_api()
            if results:
                return results

            # Fallback to ESPN scraping
            print("📡 Primary API failed, trying ESPN...")
            results = self.fetch_from_espn()
            if results:
                return results

            # Fallback to La Liga official site
            print("📡 ESPN failed, trying La Liga official...")
            results = self.fetch_from_laliga_official()
            return results

        except Exception as e:
            print(f"❌ Error fetching results: {e}")
            return []

    def fetch_from_football_data_api(self):
        """Fetch from Football-Data.org API (primary source)"""
        try:
            # Get current season matches
            response = requests.get(self.data_sources['primary'], headers=self.headers)

            if response.status_code == 200:
                data = response.json()
                matches = data.get('matches', [])

                # Process only finished matches since last update
                new_results = []
                for match in matches:
                    if match['status'] == 'FINISHED':
                        match_date = datetime.fromisoformat(match['utcDate'].replace('Z', '+00:00'))

                        # Only process matches since last update
                        if match_date > self.last_update:
                            result = {
                                'home_team': self.clean_team_name(match['homeTeam']['name']),
                                'away_team': self.clean_team_name(match['awayTeam']['name']),
                                'home_goals': match['score']['fullTime']['home'],
                                'away_goals': match['score']['fullTime']['away'],
                                'date': match_date.strftime('%Y-%m-%d'),
                                'matchday': match.get('matchday', 0),
                                'source': 'football-data-api'
                            }
                            new_results.append(result)

                print(f"✅ Fetched {len(new_results)} new results from Football-Data API")
                return new_results

        except Exception as e:
            print(f"❌ Football-Data API error: {e}")
            return []

    def fetch_from_espn(self):
        """Backup: Scrape ESPN for results"""
        try:
            response = requests.get(self.data_sources['backup'])
            soup = BeautifulSoup(response.content, 'html.parser')

            # ESPN scraping logic (simplified)
            matches = []
            # This would need specific ESPN HTML parsing
            # For now, returning empty list

            return matches
        except:
            return []

    def fetch_from_laliga_official(self):
        """Fallback: Scrape La Liga official site"""
        try:
            # Similar scraping logic for La Liga official
            # Implementation would depend on their HTML structure
            return []
        except:
            return []

    def clean_team_name(self, name):
        """Clean team names to match our prediction format"""
        name_mapping = {
            'FC Barcelona': 'Barcelona',
            'Real Madrid CF': 'Real Madrid',
            'Atlético Madrid': 'Atletico Madrid',
            'Villarreal CF': 'Villarreal',
            'Real Sociedad': 'Real Sociedad',
            'Real Betis': 'Real Betis',
            'Valencia CF': 'Valencia',
            'Athletic Club': 'Athletic Club',
            'CA Osasuna': 'Osasuna',
            'Getafe CF': 'Getafe',
            'Sevilla FC': 'Sevilla',
            'Rayo Vallecano': 'Rayo Vallecano',
            'RCD Mallorca': 'Mallorca',
            'UD Las Palmas': 'Las Palmas',
            'Girona FC': 'Girona',
            'Deportivo Alavés': 'Alaves',
            'RC Celta de Vigo': 'Celta Vigo',
            'CD Leganés': 'Leganes',
            'RCD Espanyol': 'Espanyol',
            'Real Valladolid CF': 'Valladolid'
        }
        return name_mapping.get(name, name)

    def process_new_results(self, new_results):
        """Process newly fetched results"""
        if not new_results:
            print("ℹ️  No new results to process")
            return

        print(f"⚡ Processing {len(new_results)} new matches...")

        for result in new_results:
            self.add_single_result(result)

        self.last_update = datetime.now()

        # Auto-generate report if significant update
        if len(new_results) >= 5:  # Half a matchday or more
            self.generate_auto_report()

        # Save data automatically
        self.auto_save_data()

    def add_single_result(self, result):
        """Add single match result and update all tracking"""
        home_team = result['home_team']
        away_team = result['away_team']
        home_goals = result['home_goals']
        away_goals = result['away_goals']

        # Determine result
        if home_goals > away_goals:
            actual_result = 'W'
        elif home_goals < away_goals:
            actual_result = 'L'
        else:
            actual_result = 'D'

        # Find prediction
        pred_match = self.predictions[
            (self.predictions['Home_Team'] == home_team) &
            (self.predictions['Away_Team'] == away_team)
        ]

        if not pred_match.empty:
            predicted_result = pred_match.iloc[0]['Predicted_Result']
            confidence = pred_match.iloc[0]['Confidence']

            # Store result
            self.actual_results.append({
                'date': result['date'],
                'home_team': home_team,
                'away_team': away_team,
                'home_goals': home_goals,
                'away_goals': away_goals,
                'actual_result': actual_result,
                'predicted_result': predicted_result,
                'confidence': confidence,
                'correct': predicted_result == actual_result,
                'source': result.get('source', 'unknown'),
                'auto_updated': datetime.now().isoformat()
            })

            # Update season table
            self.update_season_table(home_team, away_team, home_goals, away_goals, actual_result)

            print(f"✅ {home_team} {home_goals}-{away_goals} {away_team} | "
                  f"Predicted: {predicted_result}, Actual: {actual_result} | "
                  f"{'✅' if predicted_result == actual_result else '❌'}")

    def update_season_table(self, home_team, away_team, home_goals, away_goals, result):
        """Update season table with match result"""
        # Update home team
        if home_team in self.season_table.index:
            self.season_table.loc[home_team, 'Matches'] += 1
            self.season_table.loc[home_team, 'Goals_For'] += home_goals
            self.season_table.loc[home_team, 'Goals_Against'] += away_goals
            self.season_table.loc[home_team, 'Last_Updated'] = datetime.now()

        # Update away team
        if away_team in self.season_table.index:
            self.season_table.loc[away_team, 'Matches'] += 1
            self.season_table.loc[away_team, 'Goals_For'] += away_goals
            self.season_table.loc[away_team, 'Goals_Against'] += home_goals
            self.season_table.loc[away_team, 'Last_Updated'] = datetime.now()

        # Update points and results
        if result == 'W':  # Home win
            if home_team in self.season_table.index:
                self.season_table.loc[home_team, 'Wins'] += 1
                self.season_table.loc[home_team, 'Points'] += 3
            if away_team in self.season_table.index:
                self.season_table.loc[away_team, 'Losses'] += 1
        elif result == 'L':  # Away win
            if away_team in self.season_table.index:
                self.season_table.loc[away_team, 'Wins'] += 1
                self.season_table.loc[away_team, 'Points'] += 3
            if home_team in self.season_table.index:
                self.season_table.loc[home_team, 'Losses'] += 1
        else:  # Draw
            if home_team in self.season_table.index:
                self.season_table.loc[home_team, 'Draws'] += 1
                self.season_table.loc[home_team, 'Points'] += 1
            if away_team in self.season_table.index:
                self.season_table.loc[away_team, 'Draws'] += 1
                self.season_table.loc[away_team, 'Points'] += 1

        # Update goal difference
        self.season_table['Goal_Diff'] = (self.season_table['Goals_For'] -
                                         self.season_table['Goals_Against'])

    def generate_auto_report(self):
        """Generate automated performance report"""
        print(f"\n📋 AUTOMATED REPORT - {datetime.now().strftime('%Y-%m-%d %H:%M')}")
        print("="*50)

        if not self.actual_results:
            print("No results processed yet")
            return

        # Overall performance
        total_matches = len(self.actual_results)
        correct_predictions = sum(1 for r in self.actual_results if r['correct'])
        accuracy = correct_predictions / total_matches if total_matches > 0 else 0

        print(f"🎯 PERFORMANCE UPDATE:")
        print(f"   Total Matches: {total_matches}")
        print(f"   Current Accuracy: {accuracy:.1%}")
        print(f"   Correct: {correct_predictions}")

        # Current top 6
        current_table = self.get_current_table()
        print(f"\n🏆 CURRENT TOP 6:")
        top6 = current_table.head(6)[['Points', 'Matches', 'Goals_For', 'Goals_Against', 'Goal_Diff']]
        print(top6.to_string())

        # Recent form
        recent_results = self.actual_results[-5:] if len(self.actual_results) >= 5 else self.actual_results
        recent_accuracy = sum(1 for r in recent_results if r['correct']) / len(recent_results)
        print(f"\n📈 RECENT FORM: {recent_accuracy:.1%} (last {len(recent_results)} matches)")

    def get_current_table(self):
        """Get current league table"""
        table = self.season_table.copy()
        return table.sort_values(['Points', 'Goal_Diff', 'Goals_For'], ascending=[False, False, False])

    def auto_save_data(self):
        """Automatically save all data with timestamps"""
        timestamp = datetime.now().strftime('%Y%m%d_%H%M')

        # Save results
        if self.actual_results:
            df = pd.DataFrame(self.actual_results)
            df.to_csv(f'auto_results_{timestamp}.csv', index=False)

        # Save current table
        table = self.get_current_table()
        table.to_csv(f'auto_table_{timestamp}.csv')

        # Save performance summary
        if self.actual_results:
            summary = {
                'last_updated': datetime.now().isoformat(),
                'total_matches': len(self.actual_results),
                'accuracy': sum(1 for r in self.actual_results if r['correct']) / len(self.actual_results),
                'data_sources_used': list(set(r.get('source', 'unknown') for r in self.actual_results))
            }

            with open(f'auto_summary_{timestamp}.json', 'w') as f:
                json.dump(summary, f, indent=2)

        print(f"💾 Auto-saved data with timestamp: {timestamp}")

    def start_automated_monitoring(self):
        """Start the automated monitoring system"""
        print("🚀 Starting automated monitoring...")

        # Schedule automatic updates
        schedule.every(2).hours.do(self.run_update_cycle)  # Every 2 hours
        schedule.every().day.at("09:00").do(self.run_update_cycle)  # Daily at 9 AM
        schedule.every().day.at("21:00").do(self.run_update_cycle)  # Daily at 9 PM

        # Run in background thread
        def run_scheduler():
            while True:
                schedule.run_pending()
                time.sleep(60)  # Check every minute

        scheduler_thread = threading.Thread(target=run_scheduler, daemon=True)
        scheduler_thread.start()

        print("✅ Automated monitoring started!")
        print("🕐 Updates scheduled: Every 2 hours + 9 AM/9 PM daily")
        print("💡 The system is now running in the background...")

        # Initial update
        self.run_update_cycle()

    def run_update_cycle(self):
        """Run complete update cycle"""
        print(f"\n🔄 Running automated update cycle...")
        try:
            new_results = self.fetch_latest_results()
            self.process_new_results(new_results)

            if new_results:
                print(f"✅ Update cycle completed - processed {len(new_results)} new results")
            else:
                print("ℹ️  Update cycle completed - no new results")

        except Exception as e:
            print(f"❌ Update cycle failed: {e}")

    def stop_monitoring(self):
        """Stop the automated monitoring"""
        schedule.clear()
        print("🛑 Automated monitoring stopped")
