In [1]:
# Install required libraries
!pip install pandas numpy scikit-learn joblib



In [2]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report
import joblib
from google.colab import files

In [3]:
# Upload the four CSV files
uploaded = files.upload()

Saving former_names.csv to former_names.csv
Saving goalscorers.csv to goalscorers.csv
Saving results.csv to results.csv
Saving shootouts.csv to shootouts.csv


In [6]:
# Load the datasets
results_df = pd.read_csv('results.csv')
shootouts_df = pd.read_csv('shootouts.csv')
former_names_df = pd.read_csv('former_names.csv')
goalscorers_df = pd.read_csv('goalscorers.csv')

# Display shapes and column names to verify
print("Results Shape:", results_df.shape, "\nColumns:", results_df.columns.tolist())
print("Shootouts Shape:", shootouts_df.shape, "\nColumns:", shootouts_df.columns.tolist())
print("Former Names Shape:", former_names_df.shape, "\nColumns:", former_names_df.columns.tolist())
print("Goalscorers Shape:", goalscorers_df.shape, "\nColumns:", goalscorers_df.columns.tolist())

Results Shape: (48207, 9) 
Columns: ['date', 'home_team', 'away_team', 'home_score', 'away_score', 'tournament', 'city', 'country', 'neutral']
Shootouts Shape: (645, 5) 
Columns: ['date', 'home_team', 'away_team', 'winner', 'first_shooter']
Former Names Shape: (34, 4) 
Columns: ['current', 'former', 'start_date', 'end_date']
Goalscorers Shape: (44362, 8) 
Columns: ['date', 'home_team', 'away_team', 'team', 'scorer', 'minute', 'own_goal', 'penalty']


In [7]:
# Create mapping dictionary from former_names.csv using 'former' and 'current'
name_mapping = dict(zip(former_names_df['former'], former_names_df['current']))

# Function to replace former names with current names
def standardize_team_name(team):
    return name_mapping.get(team, team)

# Apply standardization to all relevant columns
for df in [results_df, shootouts_df, goalscorers_df]:
    for col in ['home_team', 'away_team']:
        df[col] = df[col].apply(standardize_team_name)
    if 'team' in df.columns:
        df['team'] = df['team'].apply(standardize_team_name)
    if 'winner' in df.columns:
        df['winner'] = df['winner'].apply(standardize_team_name)

# Verify a few rows
print("Standardized Results Head:\n", results_df.head())
print("Standardized Shootouts Head:\n", shootouts_df.head())
print("Standardized Goalscorers Head:\n", goalscorers_df.head())

Standardized Results Head:
          date home_team away_team  home_score  away_score tournament     city  \
0  1872-11-30  Scotland   England           0           0   Friendly  Glasgow   
1  1873-03-08   England  Scotland           4           2   Friendly   London   
2  1874-03-07  Scotland   England           2           1   Friendly  Glasgow   
3  1875-03-06   England  Scotland           2           2   Friendly   London   
4  1876-03-04  Scotland   England           3           0   Friendly  Glasgow   

    country  neutral  
0  Scotland    False  
1   England    False  
2  Scotland    False  
3   England    False  
4  Scotland    False  
Standardized Shootouts Head:
          date    home_team         away_team       winner first_shooter
0  1967-08-22        India            Taiwan       Taiwan           NaN
1  1971-11-14  South Korea  Vietnam Republic  South Korea           NaN
2  1972-05-07  South Korea              Iraq         Iraq           NaN
3  1972-05-17     Thailand   

In [8]:
# Merge results_df with shootouts_df (drop 'first_shooter' as it’s not needed for prediction)
combined_df = results_df.merge(shootouts_df.drop(columns=['first_shooter']),
                              on=['date', 'home_team', 'away_team'], how='left')

# Add outcome column
combined_df['outcome'] = np.where(combined_df['home_score'] > combined_df['away_score'], 1,
                                  np.where(combined_df['home_score'] < combined_df['away_score'], 2, 0))

# Adjust outcome for shootouts
combined_df.loc[combined_df['winner'].notna() & (combined_df['outcome'] == 0), 'outcome'] = \
    combined_df.apply(lambda row: 1 if row['winner'] == row['home_team'] else 2, axis=1)

# Drop unnecessary columns
combined_df = combined_df.drop(columns=['winner'])

# Merge with goalscorers_df
combined_df = combined_df.merge(goalscorers_df, on=['date', 'home_team', 'away_team'], how='left')

# Fill NaN in goal-related columns
combined_df[['team', 'scorer', 'minute', 'own_goal', 'penalty']] = combined_df[
    ['team', 'scorer', 'minute', 'own_goal', 'penalty']].fillna({'team': '', 'scorer': '', 'minute': 0,
                                                                 'own_goal': False, 'penalty': False})

# Convert date to datetime
combined_df['date'] = pd.to_datetime(combined_df['date'])

# Save combined dataset (optional)
combined_df.to_csv('combined_dataset.csv', index=False)
files.download('combined_dataset.csv')

# Display shape and head
("Combined Shape:", combined_df.shape)
"Combined Head:\n", combined_df.head()

  ['team', 'scorer', 'minute', 'own_goal', 'penalty']].fillna({'team': '', 'scorer': '', 'minute': 0,


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

('Combined Head:\n',
         date home_team away_team  home_score  away_score tournament     city  \
 0 1872-11-30  Scotland   England           0           0   Friendly  Glasgow   
 1 1873-03-08   England  Scotland           4           2   Friendly   London   
 2 1874-03-07  Scotland   England           2           1   Friendly  Glasgow   
 3 1875-03-06   England  Scotland           2           2   Friendly   London   
 4 1876-03-04  Scotland   England           3           0   Friendly  Glasgow   
 
     country  neutral  outcome team scorer  minute  own_goal  penalty  
 0  Scotland    False        0                 0.0     False    False  
 1   England    False        1                 0.0     False    False  
 2  Scotland    False        1                 0.0     False    False  
 3   England    False        0                 0.0     False    False  
 4  Scotland    False        1                 0.0     False    False  )

In [10]:
# Calculate team statistics
team_stats = {}
for team in set(combined_df['home_team']).union(combined_df['away_team']):
    home_matches = combined_df[combined_df['home_team'] == team]
    away_matches = combined_df[combined_df['away_team'] == team]
    total_matches = len(home_matches) + len(away_matches)
    wins = len(home_matches[home_matches['outcome'] == 1]) + len(away_matches[away_matches['outcome'] == 2])
    goals_scored = home_matches['home_score'].sum() + away_matches['away_score'].sum()
    goals_conceded = home_matches['away_score'].sum() + away_matches['home_score'].sum()

    team_stats[team] = {
        'win_rate': wins / total_matches if total_matches > 0 else 0,
        'avg_goals_scored': goals_scored / total_matches if total_matches > 0 else 0,
        'avg_goals_conceded': goals_conceded / total_matches if total_matches > 0 else 0
    }

# Add team stat features to combined_df
combined_df['home_win_rate'] = combined_df['home_team'].map(lambda x: team_stats[x]['win_rate'])
combined_df['away_win_rate'] = combined_df['away_team'].map(lambda x: team_stats[x]['win_rate'])
combined_df['home_avg_goals_scored'] = combined_df['home_team'].map(lambda x: team_stats[x]['avg_goals_scored'])
combined_df['away_avg_goals_scored'] = combined_df['away_team'].map(lambda x: team_stats[x]['avg_goals_scored'])
combined_df['home_avg_goals_conceded'] = combined_df['home_team'].map(lambda x: team_stats[x]['avg_goals_conceded'])
combined_df['away_avg_goals_conceded'] = combined_df['away_team'].map(lambda x: team_stats[x]['avg_goals_conceded'])

# Calculate goal counts separately and merge back
goal_counts = combined_df.groupby(['date', 'home_team', 'away_team', 'team'])['minute'].count().reset_index(name='goal_count')

# Home goals: filter where team is home_team
home_goals = goal_counts[goal_counts['team'] == goal_counts['home_team']].groupby(
    ['date', 'home_team', 'away_team'])['goal_count'].sum().reset_index(name='home_goals_count')

# Away goals: filter where team is away_team
away_goals = goal_counts[goal_counts['team'] == goal_counts['away_team']].groupby(
    ['date', 'home_team', 'away_team'])['goal_count'].sum().reset_index(name='away_goals_count')

# Merge goal counts back into combined_df
combined_df = combined_df.merge(home_goals[['date', 'home_team', 'away_team', 'home_goals_count']],
                               on=['date', 'home_team', 'away_team'], how='left')
combined_df = combined_df.merge(away_goals[['date', 'home_team', 'away_team', 'away_goals_count']],
                               on=['date', 'home_team', 'away_team'], how='left')

# Fill NaN goal counts with 0 (for matches with no goals recorded in goalscorers_df)
combined_df['home_goals_count'] = combined_df['home_goals_count'].fillna(0)
combined_df['away_goals_count'] = combined_df['away_goals_count'].fillna(0)

# Display sample features
print("Features Sample:\n", combined_df[['home_team', 'away_team', 'home_win_rate', 'away_win_rate',
                                         'home_avg_goals_scored', 'away_avg_goals_scored',
                                         'home_goals_count', 'away_goals_count']].head())

Features Sample:
   home_team away_team  home_win_rate  away_win_rate  home_avg_goals_scored  \
0  Scotland   England       0.494561       0.639152               1.964467   
1   England  Scotland       0.639152       0.494561               2.759621   
2  Scotland   England       0.494561       0.639152               1.964467   
3   England  Scotland       0.639152       0.494561               2.759621   
4  Scotland   England       0.494561       0.639152               1.964467   

   away_avg_goals_scored  home_goals_count  away_goals_count  
0               2.759621               0.0               0.0  
1               1.964467               0.0               0.0  
2               2.759621               0.0               0.0  
3               1.964467               0.0               0.0  
4               2.759621               0.0               0.0  


In [11]:
# Label encode categorical columns
le_home = LabelEncoder()
le_away = LabelEncoder()
combined_df['home_team_encoded'] = le_home.fit_transform(combined_df['home_team'])
combined_df['away_team_encoded'] = le_away.fit_transform(combined_df['away_team'])

# Save label encoders
joblib.dump(le_home, 'le_home.pkl')
joblib.dump(le_away, 'le_away.pkl')
files.download('le_home.pkl')
files.download('le_away.pkl')

# Select features for the model
features = ['home_team_encoded', 'away_team_encoded', 'home_win_rate', 'away_win_rate',
            'home_avg_goals_scored', 'away_avg_goals_scored', 'home_avg_goals_conceded',
            'away_avg_goals_conceded', 'neutral']
X = combined_df[features]
y = combined_df['outcome']

# Scale numerical features
scaler = StandardScaler()
X_scaled = X.copy()
numerical_cols = ['home_win_rate', 'away_win_rate', 'home_avg_goals_scored', 'away_avg_goals_scored',
                  'home_avg_goals_conceded', 'away_avg_goals_conceded']
X_scaled[numerical_cols] = scaler.fit_transform(X[numerical_cols])

# Save scaler
joblib.dump(scaler, 'scaler.pkl')
files.download('scaler.pkl')

# Display scaled features
print("Scaled Features Sample:\n", X_scaled.head())

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Scaled Features Sample:
    home_team_encoded  away_team_encoded  home_win_rate  away_win_rate  \
0                247                 87       0.472074       1.575067   
1                 86                241       1.432469       0.619721   
2                247                 87       0.472074       1.575067   
3                 86                241       1.432469       0.619721   
4                247                 87       0.472074       1.575067   

   home_avg_goals_scored  away_avg_goals_scored  home_avg_goals_conceded  \
0               0.159161               1.561127                -0.384908   
1               1.377888               0.305369                -0.915780   
2               0.159161               1.561127                -0.384908   
3               1.377888               0.305369                -0.915780   
4               0.159161               1.561127                -0.384908   

   away_avg_goals_conceded  neutral  
0                -0.914305    False  
1  

In [12]:
# Split data (train on data up to 2023, test on 2024)
train_df = combined_df[combined_df['date'].dt.year < 2024]
test_df = combined_df[combined_df['date'].dt.year >= 2024]

X_train = X_scaled.loc[train_df.index]
y_train = y.loc[train_df.index]
X_test = X_scaled.loc[test_df.index]
y_test = y.loc[test_df.index]

# Initialize and train Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Save the model
joblib.dump(rf_model, 'rf_model.pkl')
files.download('rf_model.pkl')

# Confirm training
print("Model trained. Train shape:", X_train.shape, "Test shape:", X_test.shape)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Model trained. Train shape: (76473, 9) Test shape: (1703, 9)


In [13]:
# Make predictions
y_pred = rf_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=['Draw', 'Home Win', 'Away Win'])

# Print results
print("Accuracy:", accuracy)
print("Classification Report:\n", report)

Accuracy: 0.5372871403405755
Classification Report:
               precision    recall  f1-score   support

        Draw       0.25      0.14      0.18       353
    Home Win       0.63      0.73      0.67       853
    Away Win       0.48      0.49      0.48       497

    accuracy                           0.54      1703
   macro avg       0.45      0.45      0.45      1703
weighted avg       0.51      0.54      0.52      1703

