In [26]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

print("\n‚úÖ Google Drive mounted successfully!")
print("   You should see your Drive files at: /content/drive/MyDrive/")





Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

‚úÖ Google Drive mounted successfully!
   You should see your Drive files at: /content/drive/MyDrive/


In [27]:
import os
PROJECT_PATH = '/content/drive/MyDrive/final-year-project/'

os.chdir(PROJECT_PATH)
print(f"‚úÖ Working directory set to: {PROJECT_PATH}")
print(f"üìÇ Current folder contents: {os.listdir('.')}")

import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

print("\n‚úÖ Libraries imported and ready!")

‚úÖ Working directory set to: /content/drive/MyDrive/final-year-project/
üìÇ Current folder contents: ['final-year-project', '.git', '.gitignore', 'README.md', 'data', 'docs', 'notebooks', 'output', 'requirements.txt', 'src']

‚úÖ Libraries imported and ready!


In [30]:
DRIVE_DATA_PATH = '/content/drive/MyDrive/final-year-project/data/processed/all_leagues_clean.csv'

print(f"üîç Loading from Google Drive: {DRIVE_DATA_PATH}")

if os.path.exists(DRIVE_DATA_PATH):
    print("‚úÖ File found in Google Drive! Loading...")
    df = pd.read_csv(DRIVE_DATA_PATH)
    print(f"‚úÖ Successfully loaded {len(df):,} matches")
    print(f"üìä Dataset shape: {df.shape[0]} rows √ó {df.shape[1]} columns")
    print("\nüîç First 3 matches:")
    preview_cols = []
    for col in ['Date', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR', 'league']:
        if col in df.columns:
            preview_cols.append(col)

    if preview_cols:
        display(df[preview_cols].head(3))
    else:
        print("First few columns:", df.columns.tolist()[:10])

else:
    print(f"‚ùå File not found at Google Drive path.")
    print(f"üìÇ Let's check what's ACTUALLY in your Google Drive...")

    processed_folder = '/content/drive/MyDrive/final-year-project/data/processed/'

    if os.path.exists(processed_folder):
        print(f"‚úÖ Folder exists: {processed_folder}")
        files = os.listdir(processed_folder)
        print("üìÅ Files found in processed folder:")
        for file in files:
            file_path = os.path.join(processed_folder, file)
            size = os.path.getsize(file_path)
            print(f"  - {file} ({size:,} bytes)")

        cleaned_files = [f for f in files if 'clean' in f.lower()]
        if cleaned_files:
            print(f"\nüéØ Found cleaned files: {cleaned_files}")
            for file in cleaned_files:
                if file.endswith('.csv'):
                    file_to_load = os.path.join(processed_folder, file)
                    print(f"üìä Loading CSV: {file}")
                    df = pd.read_csv(file_to_load)
                    break
                elif file.endswith('.parquet'):
                    file_to_load = os.path.join(processed_folder, file)
                    print(f"üìä Loading Parquet: {file}")
                    df = pd.read_parquet(file_to_load)
                    break
                elif file.endswith('.pkl'):
                    file_to_load = os.path.join(processed_folder, file)
                    print(f"üìä Loading Pickle: {file}")
                    df = pd.read_pickle(file_to_load)
                    break

            print(f"‚úÖ Loaded {file} with {len(df):,} rows")
        else:
            print(f"\n‚ùå No 'clean' files found. Loading first available file...")
            if files:
                first_file = os.path.join(processed_folder, files[0])
                print(f"üìä Loading: {files[0]}")
                if files[0].endswith('.csv'):
                    df = pd.read_csv(first_file)
                elif files[0].endswith('.parquet'):
                    df = pd.read_parquet(first_file)
                elif files[0].endswith('.pkl'):
                    df = pd.read_pickle(first_file)
                else:
                    print(f"‚ö†Ô∏è Unsupported file type: {files[0]}")
                    df = None
            else:
                print("‚ùå No files in processed folder!")
                df = None
    else:
        print(f"‚ùå Processed folder doesn't exist: {processed_folder}")
        print("\nüìÅ Checking project structure...")

        project_folder = '/content/drive/MyDrive/final-year-project/'
        if os.path.exists(project_folder):
            print(f"‚úÖ Project folder exists: {project_folder}")
            print("Contents:")
            for item in os.listdir(project_folder):
                item_path = os.path.join(project_folder, item)
                if os.path.isdir(item_path):
                    print(f"  üìÅ {item}/")
                else:
                    print(f"  üìÑ {item}")
        else:
            print(f"‚ùå Project folder doesn't exist: {project_folder}")
            print("\nüí° Your Google Drive might be empty or mounted differently.")
            df = None

if 'df' not in locals() or df is None:
    print("\n‚ö†Ô∏è Creating sample data for testing...")
    np.random.seed(42)
    sample_data = {
        'Date': pd.date_range('2023-01-01', periods=100, freq='D'),
        'HomeTeam': [f'Team_{i%10}' for i in range(100)],
        'AwayTeam': [f'Team_{(i+5)%10}' for i in range(100)],
        'FTHG': np.random.randint(0, 4, 100),
        'FTAG': np.random.randint(0, 3, 100),
        'FTR': np.where(np.random.randint(0, 3, 100) == 0, 'H',
                       np.where(np.random.randint(0, 2, 100) == 0, 'D', 'A')),
        'league': ['Premier League'] * 50 + ['La Liga'] * 50
    }
    df = pd.DataFrame(sample_data)
    print("üìä Created 100 sample matches")

print(f"\n‚úÖ Data loaded. Shape: {df.shape}")

üîç Loading from Google Drive: /content/drive/MyDrive/final-year-project/data/processed/all_leagues_clean.csv
‚úÖ File found in Google Drive! Loading...
‚úÖ Successfully loaded 7,462 matches
üìä Dataset shape: 7462 rows √ó 17 columns

üîç First 3 matches:


Unnamed: 0,league
0,Bundesliga 2018
1,Bundesliga 2018
2,Bundesliga 2018



‚úÖ Data loaded. Shape: (7462, 17)


In [34]:
print("\n" + "="*60)
print("CREATING TEAM-PERFORMANCE DATAFRAME")
print("="*60)

team_performance_list = []

print("Processing each match to create team-level records...")

for idx, row in df.iterrows():
    team_performance_list.append({
        'Date': row['date'],
        'league': row.get('league', 'Unknown'),
        'Team': row['hometeam'],
        'Opponent': row['awayteam'],
        'is_home': 1,
        'Points': row.get('HomePoints', 0),
        'GoalsScored': row['fthg'],
        'GoalsConceded': row['ftag'],
        'GoalDifference': row.get('HomeGD', 0)
    })

    team_performance_list.append({
        'Date': row['date'],
        'league': row.get('league', 'Unknown'),
        'Team': row['awayteam'],
        'Opponent': row['hometeam'],
        'is_home': 0,
        'Points': row.get('AwayPoints', 0),
        'GoalsScored': row['ftag'],
        'GoalsConceded': row['fthg'],
        'GoalDifference': row.get('AwayGD', 0)
    })

team_df = pd.DataFrame(team_performance_list)
team_df['Date'] = pd.to_datetime(team_df['Date']) # Convert 'Date' column to datetime
team_df = team_df.sort_values(['Team', 'Date']).reset_index(drop=True)

print(f"‚úÖ Created team-performance dataframe")
print(f"   Total records: {len(team_df):,} (2 per match)")
print(f"   Unique teams: {team_df['Team'].nunique()}")
print(f"   Date range: {team_df['Date'].min().date()} to {team_df['Date'].max().date()}")

print("\nüîç Sample of team-performance data (first 4 rows):")
display(team_df.head(4))


CREATING TEAM-PERFORMANCE DATAFRAME
Processing each match to create team-level records...
‚úÖ Created team-performance dataframe
   Total records: 14,924 (2 per match)
   Unique teams: 84
   Date range: 2018-08-10 to 2025-05-25

üîç Sample of team-performance data (first 4 rows):


Unnamed: 0,Date,league,Team,Opponent,is_home,Points,GoalsScored,GoalsConceded,GoalDifference
0,2018-08-18,La Liga 2018,Alaves,Barcelona,0,0,0,3,0
1,2018-08-25,La Liga 2018,Alaves,Betis,1,0,0,0,0
2,2018-09-02,La Liga 2018,Alaves,Espanol,1,0,2,1,0
3,2018-09-16,La Liga 2018,Alaves,Valladolid,0,0,1,0,0


In [35]:
print("\n" + "="*60)
print("CALCULATING ROLLING FORM FEATURES")
print("="*60)

team_df = team_df.copy()

def calculate_team_form(group):
    """Calculate rolling averages for a team"""
    group = group.sort_values('Date')

    group['form_5'] = group['Points'].shift(1).rolling(5, min_periods=1).mean()

    group['avg_goals_scored_5'] = group['GoalsScored'].shift(1).rolling(5, min_periods=1).mean()
    group['avg_goals_conceded_5'] = group['GoalsConceded'].shift(1).rolling(5, min_periods=1).mean()
    group['avg_gd_5'] = group['GoalDifference'].shift(1).rolling(5, min_periods=1).mean()

    group['win'] = (group['Points'] == 3).astype(int)
    win_streak = []
    current = 0
    for w in group['win'].shift(1).fillna(0):
        if w == 1:
            current += 1
        else:
            current = 0
        win_streak.append(current)
    group['win_streak'] = win_streak

    group['unbeaten'] = (group['Points'] >= 1).astype(int)
    unbeaten_streak = []
    current = 0
    for u in group['unbeaten'].shift(1).fillna(0):
        if u == 1:
            current += 1
        else:
            current = 0
        unbeaten_streak.append(current)
    group['unbeaten_streak'] = unbeaten_streak

    return group

print("Calculating rolling features for each team...")
team_df = team_df.groupby('Team', group_keys=False).apply(calculate_team_form)

print("‚úÖ Rolling features calculated:")
print("   ‚Ä¢ form_5: Average points from last 5 matches")
print("   ‚Ä¢ avg_goals_scored_5: Average goals scored (last 5)")
print("   ‚Ä¢ avg_goals_conceded_5: Average goals conceded (last 5)")
print("   ‚Ä¢ avg_gd_5: Average goal difference (last 5)")
print("   ‚Ä¢ win_streak: Current win streak")
print("   ‚Ä¢ unbeaten_streak: Current unbeaten streak")

# Show sample for a specific team
sample_team = team_df['Team'].iloc[0]
print(f"\nüîç Sample for team '{sample_team}':")
team_sample = team_df[team_df['Team'] == sample_team].head(3)
display(team_sample[['Date', 'Team', 'Points', 'form_5', 'avg_gd_5', 'win_streak']])


CALCULATING ROLLING FORM FEATURES
Calculating rolling features for each team...
‚úÖ Rolling features calculated:
   ‚Ä¢ form_5: Average points from last 5 matches
   ‚Ä¢ avg_goals_scored_5: Average goals scored (last 5)
   ‚Ä¢ avg_goals_conceded_5: Average goals conceded (last 5)
   ‚Ä¢ avg_gd_5: Average goal difference (last 5)
   ‚Ä¢ win_streak: Current win streak
   ‚Ä¢ unbeaten_streak: Current unbeaten streak

üîç Sample for team 'Alaves':


Unnamed: 0,Date,Team,Points,form_5,avg_gd_5,win_streak
0,2018-08-18,Alaves,0,,,0
1,2018-08-25,Alaves,0,0.0,0.0,0
2,2018-09-02,Alaves,0,0.0,0.0,0


In [42]:
print("\n" + "="*60)
print("MERGING FEATURES BACK TO MATCH-LEVEL DATA")
print("="*60)

match_df = df.copy()

# Converting 'date' column in match_df to datetime for consistent merging
match_df['date'] = pd.to_datetime(match_df['date'])

print("Extracting home team features from team-performance data...")
home_features = team_df[team_df['is_home'] == 1].copy()

home_features = home_features.rename(columns={
    'form_5': 'home_form_5',
    'avg_goals_scored_5': 'home_avg_goals_scored_5',
    'avg_goals_conceded_5': 'home_avg_goals_conceded_5',
    'avg_gd_5': 'home_avg_gd_5',
    'win_streak': 'home_win_streak',
    'unbeaten_streak': 'home_unbeaten_streak'
})

print("Extracting away team features...")
away_features = team_df[team_df['is_home'] == 0].copy()

away_features = away_features.rename(columns={
    'form_5': 'away_form_5',
    'avg_goals_scored_5': 'away_avg_goals_scored_5',
    'avg_goals_conceded_5': 'away_avg_goals_conceded_5',
    'avg_gd_5': 'away_avg_gd_5',
    'win_streak': 'away_win_streak',
    'unbeaten_streak': 'away_unbeaten_streak'
})

print("Merging home team features...")
match_df = pd.merge(
    match_df,
    home_features[['Date', 'Team', 'home_form_5', 'home_avg_goals_scored_5',
                   'home_avg_goals_conceded_5', 'home_avg_gd_5',
                   'home_win_streak', 'home_unbeaten_streak']],
    left_on=['date', 'hometeam'],
    right_on=['Date', 'Team'],
    how='left'
).drop(columns=['Team'])

print("Merging away team features...")
match_df = pd.merge(
    match_df,
    away_features[['Date', 'Team', 'away_form_5', 'away_avg_goals_scored_5',
                   'away_avg_goals_conceded_5', 'away_avg_gd_5',
                   'away_win_streak', 'away_unbeaten_streak']],
    left_on=['date', 'awayteam'],
    right_on=['Date', 'Team'],
    how='left'
).drop(columns=['Team'])

print(f"‚úÖ Features merged successfully!")
print(f"   Original columns: {len(df.columns)}")
print(f"   New columns: {len(match_df.columns)}")
print(f"   New features added: {len(match_df.columns) - len(df.columns)}")

new_cols = [col for col in match_df.columns if col not in df.columns]
print(f"\nüîç New feature columns created:")
for i, col in enumerate(new_cols, 1):
    print(f"{i:2}. {col}")

print("\nüîç Sample match with new features:")
sample_idx = 0
sample_cols = ['date', 'hometeam', 'awayteam', 'FTR', 'home_form_5', 'away_form_5',
               'home_avg_gd_5', 'away_avg_gd_5', 'home_win_streak', 'away_win_streak']
available_cols = [col for col in sample_cols if col in match_df.columns]
display(match_df[available_cols].head(2))


MERGING FEATURES BACK TO MATCH-LEVEL DATA
Extracting home team features from team-performance data...
Extracting away team features...
Merging home team features...
Merging away team features...
‚úÖ Features merged successfully!
   Original columns: 17
   New columns: 31
   New features added: 14

üîç New feature columns created:
 1. Date_x
 2. home_form_5
 3. home_avg_goals_scored_5
 4. home_avg_goals_conceded_5
 5. home_avg_gd_5
 6. home_win_streak
 7. home_unbeaten_streak
 8. Date_y
 9. away_form_5
10. away_avg_goals_scored_5
11. away_avg_goals_conceded_5
12. away_avg_gd_5
13. away_win_streak
14. away_unbeaten_streak

üîç Sample match with new features:


Unnamed: 0,date,hometeam,awayteam,home_form_5,away_form_5,home_avg_gd_5,away_avg_gd_5,home_win_streak,away_win_streak
0,2018-08-24,Bayern Munich,Hoffenheim,,,,,0,0
1,2018-08-25,Fortuna Dusseldorf,Augsburg,,,,,0,0


In [44]:
print("\n" + "="*60)
print("CREATING DERIVED FEATURES")
print("="*60)

# Creating difference features
match_df['form_diff'] = match_df['home_form_5'] - match_df['away_form_5']
match_df['gd_diff'] = match_df['home_avg_gd_5'] - match_df['away_avg_gd_5']
match_df['win_streak_diff'] = match_df['home_win_streak'] - match_df['away_win_streak']

# Creating ratio features and adding a small constant
match_df['form_ratio'] = (match_df['home_form_5'] + 0.1) / (match_df['away_form_5'] + 0.1)
match_df['gd_ratio'] = (match_df['home_avg_gd_5'] + 1) / (match_df['away_avg_gd_5'] + 1)

match_df['home_advantage_form'] = match_df['home_form_5'] * 1.1
match_df['momentum_indicator'] = match_df['form_diff'] * match_df['win_streak_diff']

print("‚úÖ Derived features created:")
print("   ‚Ä¢ form_diff: Home form - Away form")
print("   ‚Ä¢ gd_diff: Home goal diff - Away goal diff")
print("   ‚Ä¢ win_streak_diff: Home streak - Away streak")
print("   ‚Ä¢ form_ratio: Home form / Away form")
print("   ‚Ä¢ gd_ratio: Home goal diff / Away goal diff")
print("   ‚Ä¢ home_advantage_form: Home form with 10% bonus")
print("   ‚Ä¢ momentum_indicator: form_diff √ó win_streak_diff")

# Showing statistics of new features
print("\nüìä Statistics of key derived features:")
derived_features = ['form_diff', 'gd_diff', 'form_ratio', 'gd_ratio']
for feature in derived_features:
    if feature in match_df.columns:
        print(f"\n{feature}:")
        print(f"  Mean: {match_df[feature].mean():.3f}")
        print(f"  Std:  {match_df[feature].std():.3f}")
        print(f"  Min:  {match_df[feature].min():.3f}")
        print(f"  Max:  {match_df[feature].max():.3f}")
        print(f"  NaN:  {match_df[feature].isnull().sum()}")

# Checking for any missing values
print(f"\nüîç Missing values check:")
missing_counts = match_df[new_cols + derived_features].isnull().sum()
missing_features = missing_counts[missing_counts > 0]
if len(missing_features) > 0:
    print("Features with missing values:")
    for feature, count in missing_features.items():
        percentage = (count / len(match_df)) * 100
        print(f"  ‚Ä¢ {feature}: {count} missing ({percentage:.1f}%)")
else:
    print("‚úÖ No missing values in new features!")


CREATING DERIVED FEATURES
‚úÖ Derived features created:
   ‚Ä¢ form_diff: Home form - Away form
   ‚Ä¢ gd_diff: Home goal diff - Away goal diff
   ‚Ä¢ win_streak_diff: Home streak - Away streak
   ‚Ä¢ form_ratio: Home form / Away form
   ‚Ä¢ gd_ratio: Home goal diff / Away goal diff
   ‚Ä¢ home_advantage_form: Home form with 10% bonus
   ‚Ä¢ momentum_indicator: form_diff √ó win_streak_diff

üìä Statistics of key derived features:

form_diff:
  Mean: 0.000
  Std:  0.000
  Min:  0.000
  Max:  0.000
  NaN:  55

gd_diff:
  Mean: 0.000
  Std:  0.000
  Min:  0.000
  Max:  0.000
  NaN:  55

form_ratio:
  Mean: 1.000
  Std:  0.000
  Min:  1.000
  Max:  1.000
  NaN:  55

gd_ratio:
  Mean: 1.000
  Std:  0.000
  Min:  1.000
  Max:  1.000
  NaN:  55

üîç Missing values check:
Features with missing values:
  ‚Ä¢ home_form_5: 39 missing (0.5%)
  ‚Ä¢ home_avg_goals_scored_5: 39 missing (0.5%)
  ‚Ä¢ home_avg_goals_conceded_5: 39 missing (0.5%)
  ‚Ä¢ home_avg_gd_5: 39 missing (0.5%)
  ‚Ä¢ away_form_

In [46]:
print("\n" + "="*60)
print("HANDLING MISSING VALUES & FINALISING DATASET")
print("="*60)

# Counting missing values before handling
print("Missing values before handling:")
missing_before = match_df.isnull().sum().sum()
print(f"  Total missing: {missing_before}")

# Filling missing values in rolling features
if missing_before > 0:
    print("\nüîß Handling missing values...")

    form_features = ['home_form_5', 'away_form_5', 'home_avg_gd_5', 'away_avg_gd_5',
                    'home_avg_goals_scored_5', 'away_avg_goals_scored_5',
                    'home_avg_goals_conceded_5', 'away_avg_goals_conceded_5']

    for feature in form_features:
        if feature in match_df.columns:
            match_df[feature] = match_df[feature].fillna(0)
            print(f"  ‚Ä¢ {feature}: filled with 0")

    remaining_missing = match_df.isnull().sum().sum()
    if remaining_missing > 0:
        print(f"\n‚ö†Ô∏è Still {remaining_missing} missing values after form fill.")
        print("Filling remaining with 0 or appropriate defaults...")
        match_df = match_df.fillna(0)

print(f"\n‚úÖ Missing values after handling: {match_df.isnull().sum().sum()}")

print("\nüéØ Creating final feature set for modeling...")

target_column = 'FTR'  # This is what we want to predict

base_columns = ['date', 'league', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', target_column]

all_features = [col for col in match_df.columns if col not in base_columns]

print(f"   Target variable: {target_column}")
print(f"   Total features available: {len(all_features)}")
print(f"   Total columns in dataset: {len(match_df.columns)}")

# Showing feature categories
print("\nüìä Feature categories created:")
feature_categories = {
    'Form Features': [f for f in all_features if 'form' in f.lower()],
    'Goal-based': [f for f in all_features if 'goal' in f.lower() or 'gd' in f],
    'Streak Features': [f for f in all_features if 'streak' in f.lower()],
    'Difference/Ratio': [f for f in all_features if 'diff' in f or 'ratio' in f]
}

for category, features in feature_categories.items():
    if features:
        print(f"  ‚Ä¢ {category}: {len(features)} features")
        if len(features) <= 5:  # Show all if few
            for f in features:
                print(f"    - {f}")
        else:
            print(f"    (e.g., {', '.join(features[:3])}...)")

print(f"\nüìà Final dataset ready for modeling:")
print(f"   Matches: {len(match_df):,}")
print(f"   Features: {len(all_features)}")
print(f"   Memory usage: {match_df.memory_usage(deep=True).sum() / 1e6:.1f} MB")

# Showing final columns
print(f"\nüîç First few columns of final dataset:")
columns_list = match_df.columns.tolist()
for i, col in enumerate(columns_list[:15], 1):
    print(f"{i:2}. {col}")
if len(columns_list) > 15:
    print(f"... and {len(columns_list)-15} more columns")


HANDLING MISSING VALUES & FINALISING DATASET
Missing values before handling:
  Total missing: 0

‚úÖ Missing values after handling: 0

üéØ Creating final feature set for modeling...
   Target variable: FTR
   Total features available: 36
   Total columns in dataset: 38

üìä Feature categories created:
  ‚Ä¢ Form Features: 5 features
    - home_form_5
    - away_form_5
    - form_diff
    - form_ratio
    - home_advantage_form
  ‚Ä¢ Goal-based: 10 features
    (e.g., goal_difference, total_goals, home_avg_goals_scored_5...)
  ‚Ä¢ Streak Features: 5 features
    - home_win_streak
    - home_unbeaten_streak
    - away_win_streak
    - away_unbeaten_streak
    - win_streak_diff
  ‚Ä¢ Difference/Ratio: 6 features
    (e.g., goal_difference, form_diff, gd_diff...)

üìà Final dataset ready for modeling:
   Matches: 7,462
   Features: 36
   Memory usage: 3.8 MB

üîç First few columns of final dataset:
 1. date
 2. hometeam
 3. awayteam
 4. fthg
 5. ftag
 6. hs
 7. as
 8. hst
 9. ast
10. h

In [49]:
print("\n" + "="*60)
print("SAVING MODELING DATASET")
print("="*60)

OUTPUT_PATH = '/content/drive/MyDrive/final-year-project/data/processed/modeling_dataset.csv'
print(f"üíæ Saving to: {OUTPUT_PATH}")

match_df.to_csv(OUTPUT_PATH, index=False)
print(f"‚úÖ Dataset saved successfully!")

SAMPLE_PATH = '/content/drive/MyDrive/final-year-project/data/processed/modeling_dataset_sample.csv'
match_df.head(100).to_csv(SAMPLE_PATH, index=False)
print(f"üìã Sample (first 100 matches) saved to: {SAMPLE_PATH}")

print("\nüéâ FEATURE ENGINEERING COMPLETE!")
print("="*60)
print(f"üìä Original dataset: {len(df):,} matches, {len(df.columns)} columns")
print(f"üöÄ Engineered dataset: {len(match_df):,} matches, {len(match_df.columns)} columns")
print(f"‚ú® New features created: {len(all_features)}")
print(f"üíæ Saved to: {OUTPUT_PATH}")


SAVING MODELING DATASET
üíæ Saving to: /content/drive/MyDrive/final-year-project/data/processed/modeling_dataset.csv
‚úÖ Dataset saved successfully!
üìã Sample (first 100 matches) saved to: /content/drive/MyDrive/final-year-project/data/processed/modeling_dataset_sample.csv

üéâ FEATURE ENGINEERING COMPLETE!
üìä Original dataset: 7,462 matches, 17 columns
üöÄ Engineered dataset: 7,462 matches, 38 columns
‚ú® New features created: 36
üíæ Saved to: /content/drive/MyDrive/final-year-project/data/processed/modeling_dataset.csv
