In [None]:
import pandas as pd
import re
import joblib

# LOAD DATA
# Read the engineered match-feature table and parse its "date" column in one
# chained expression; values that cannot be parsed become NaT (errors='coerce').
df = pd.read_csv(
    "/Users/saamsani/Desktop/CMPT /soccer_score_prediction/data/matches_phase2_features_FINAL_WITH_FORM.csv"
).assign(date=lambda frame: pd.to_datetime(frame["date"], errors='coerce'))
print(f" Data loaded: {len(df)} matches")

# CLEAN NAME FUNCTION
def clean_name(name):
    """Normalise a raw club name into the short form used for lookups.

    Steps, in order:
      1. Missing values (as judged by ``pd.isna``) become the empty string.
      2. The value is stringified, trimmed, and any parenthesised run of
         upper-case letters (e.g. ``"(ENG)"``) is removed.
      3. If the lower-cased name contains one of the known long-form aliases,
         the matching canonical name is returned as-is (first match wins, in
         the order the aliases are listed).
      4. Otherwise common club affixes (FC, CF, AC, United, ...) are removed
         case-insensitively, whitespace is collapsed, and the result is
         title-cased.

    Args:
        name: Raw team name (any value; it is converted with ``str``).

    Returns:
        The normalised team name as a ``str``.
    """
    if pd.isna(name):
        return ""

    # Trim first, then drop bracketed upper-case tags.
    stripped = re.sub(r'\s*\([A-Z]+\)\s*', '', str(name).strip())

    # Long-form spellings mapped to the canonical short name. Matching is by
    # substring against the lower-cased input, so insertion order matters.
    known_aliases = {
        "internazionale milano": "Inter Milan",
        "atlético de madrid": "Atletico Madrid",
        "atletico de madrid": "Atletico Madrid",
        "sporting clube de braga": "Braga",
        "bayern münchen": "Bayern Munich",
        "real sociedad de fútbol": "Real Sociedad",
        "paris saint germain": "Paris Saint-Germain",
        "paris saint-germain fc": "Paris Saint-Germain",
        "sport lisboa e benfica": "Benfica",
        "feyenoord rotterdam": "Feyenoord",
        "1. fc union berlin": "Union Berlin",
        "fc københavn": "Copenhagen",
        "rb leipzig": "Leipzig",
        "ss lazio": "Lazio",
        "ssc napoli": "Napoli",
        "psv eindhoven": "PSV",
        "real madrid cf": "Real Madrid",
        "real madrid c.f.": "Real Madrid",
        "fc salzburg": "Salzburg",
        "fc red bull salzburg": "Salzburg",
        "manchester city fc": "Manchester City"
    }
    lowered = stripped.lower()
    canonical = next(
        (short for long_form, short in known_aliases.items() if long_form in lowered),
        None,
    )
    if canonical is not None:
        return canonical

    # No alias hit: strip generic club affixes, tidy whitespace, title-case.
    without_affixes = re.sub(
        r'\b(FC|C\.F\.|CF|AC|SSC|RCD|AFC|KV|SV|SC|SL|FF|United|Club|Sporting|Red Bull|RB)\b',
        '',
        stripped,
        flags=re.IGNORECASE,
    )
    return re.sub(r'\s+', ' ', without_affixes).strip().title()


In [None]:
# LOAD TRAINED MODELS
# One classifier for the match outcome and two regressors for the home/away
# goal counts, loaded in that order.
# NOTE: joblib.load unpickles the file contents, so only load model files
# that come from a trusted source.
rf_model, home_goal_model, away_goal_model = (
    joblib.load(model_path)
    for model_path in (
        "/Users/saamsani/Desktop/CMPT /soccer_score_prediction/models/rf_model_final.joblib",
        "/Users/saamsani/Desktop/CMPT /soccer_score_prediction/models/home_goal_model_final.joblib",
        "/Users/saamsani/Desktop/CMPT /soccer_score_prediction/models/away_goal_model_final.joblib",
    )
)


In [None]:
# FUNCTION: Build Feature Row for Prediction
def get_feature_row_with_recent_form(team1, team2, match_date):
    """Build the one-row feature frame used to predict ``team1`` vs ``team2``.

    Both names are normalised with ``clean_name`` and looked up in the
    module-level ``df``. Only matches dated strictly before ``match_date``
    contribute. As a side effect, a recent-form and head-to-head summary is
    printed.

    Head-to-head wins/draws/losses are reported from ``team1``'s point of
    view: for past fixtures where ``team1`` appears in the row's ``team2``
    column, the stored ``outcome`` is negated before counting, mirroring how
    ``team_strength_diff`` / ``win_rate_diff`` are negated for such rows.
    NOTE(review): confirm the training pipeline computed ``past_wins`` /
    ``past_losses`` with the same orientation.

    Args:
        team1: First (home) team name as entered by the user.
        team2: Second (away) team name as entered by the user.
        match_date: Anything ``pd.to_datetime`` accepts.

    Returns:
        A ``pd.DataFrame`` with exactly one row holding the strength/win-rate
        differences, head-to-head counts and recent-form statistics.

    Raises:
        ValueError: If ``match_date`` is empty or parses to a missing value
            (previously this silently produced a feature row with no history).
    """
    team1_c = clean_name(team1)
    team2_c = clean_name(team2)
    match_date = pd.to_datetime(match_date)
    if pd.isna(match_date):
        # e.g. an empty string parses to NaT; every "date < NaT" test would be
        # False and the prediction would be built from no history at all.
        raise ValueError("match_date is empty or could not be parsed as a date")

    # Past meetings between the two sides, in either orientation.
    is_same_order = (df["team1"] == team1_c) & (df["team2"] == team2_c)
    is_swapped = (df["team1"] == team2_c) & (df["team2"] == team1_c)
    matchup = df[(is_same_order | is_swapped) & (df["date"] < match_date)]

    # "outcome" is stored relative to the row's team1 column; flip it for
    # swapped fixtures so the counts are always relative to the queried team1.
    oriented_outcome = matchup["outcome"].where(
        matchup["team1"] == team1_c, -matchup["outcome"]
    )
    past_meetings = len(matchup)
    past_wins = int((oriented_outcome == 1).sum())
    past_draws = int((oriented_outcome == 0).sum())
    past_losses = int((oriented_outcome == -1).sum())

    # Strength / win-rate differences come from the most recent previous
    # meeting whose row has no missing values; default to 0 when none exists.
    strength_diff = 0
    win_rate_diff = 0
    complete_meetings = matchup.dropna().sort_values("date")
    if not complete_meetings.empty:
        latest = complete_meetings.iloc[-1]
        if latest["team1"] == team1_c:
            strength_diff = latest["team_strength_diff"]
            win_rate_diff = latest["win_rate_diff"]
        else:
            strength_diff = -latest["team_strength_diff"]
            win_rate_diff = -latest["win_rate_diff"]

    # Recent form stats for both teams
    t1_wins, t1_draws, t1_losses, t1_avg_scored, t1_avg_conceded = get_recent_form_stats(team1_c, match_date)
    t2_wins, t2_draws, t2_losses, t2_avg_scored, t2_avg_conceded = get_recent_form_stats(team2_c, match_date)

    features = {
        "team_strength_diff": strength_diff,
        "win_rate_diff": win_rate_diff,
        "past_meetings": past_meetings,
        "past_wins": past_wins,
        "past_draws": past_draws,
        "past_losses": past_losses,
        "team1_recent_wins": t1_wins,
        "team1_recent_draws": t1_draws,
        "team1_recent_losses": t1_losses,
        "team1_avg_goals_scored": t1_avg_scored,
        "team1_avg_goals_conceded": t1_avg_conceded,
        "team2_recent_wins": t2_wins,
        "team2_recent_draws": t2_draws,
        "team2_recent_losses": t2_losses,
        "team2_avg_goals_scored": t2_avg_scored,
        "team2_avg_goals_conceded": t2_avg_conceded
    }

    # Print recent form summary
    print("\n==============================")
    print(f"{team1} recent form (last 5):")
    print(f"    Wins: {t1_wins}, Draws: {t1_draws}, Losses: {t1_losses}")
    print(f"    Avg goals scored: {t1_avg_scored}")
    print(f"    Avg goals conceded: {t1_avg_conceded}")
    print("\n------------------------------")
    print(f"{team2} recent form (last 5):")
    print(f"    Wins: {t2_wins}, Draws: {t2_draws}, Losses: {t2_losses}")
    print(f"    Avg goals scored: {t2_avg_scored}")
    print(f"    Avg goals conceded: {t2_avg_conceded}")
    print("\n==============================")

    print("\nHead-to-Head:")
    print(f"    Past meetings: {past_meetings}")
    print(f"    Wins by {team1}: {past_wins}")
    print(f"    Draws: {past_draws}")
    print(f"    Losses by {team1}: {past_losses}")

    return pd.DataFrame([features])


In [None]:
# FUNCTION: Recent Form Stats
def get_recent_form_stats(team, match_date):
    """Summarise a team's five most recent matches before ``match_date``.

    Matches are taken from the module-level ``df`` where ``team`` appears in
    either the ``team1`` or ``team2`` column and ``date`` is strictly earlier
    than ``match_date``. When the team is in the ``team1`` column its goals
    are read from ``home_goals`` and the stored ``outcome`` is used as-is;
    otherwise goals come from ``away_goals`` and ``outcome`` is negated.

    Args:
        team: Team name, already normalised to match the values in ``df``.
        match_date: Cut-off date; only earlier matches are considered.

    Returns:
        Tuple ``(wins, draws, losses, avg_scored, avg_conceded)``. Any result
        other than 1 (win) or 0 (draw) is counted as a loss. Both averages
        are 0 when the team has no earlier matches.
    """
    involves_team = (df["team1"] == team) | (df["team2"] == team)
    played_before = df["date"] < match_date
    recent_games = (
        df[involves_team & played_before]
        .sort_values("date", ascending=False)
        .head(5)
    )

    tally = {"win": 0, "draw": 0, "loss": 0}
    total_for = 0
    total_against = 0

    for _, game in recent_games.iterrows():
        # Re-express goals and result from this team's point of view.
        if game["team1"] == team:
            scored_here, conceded_here = game["home_goals"], game["away_goals"]
            perspective_result = game["outcome"]
        else:
            scored_here, conceded_here = game["away_goals"], game["home_goals"]
            perspective_result = -game["outcome"]

        total_for += scored_here
        total_against += conceded_here

        if perspective_result == 1:
            tally["win"] += 1
        elif perspective_result == 0:
            tally["draw"] += 1
        else:
            tally["loss"] += 1

    n_games = len(recent_games)
    if n_games > 0:
        mean_for = total_for / n_games
        mean_against = total_against / n_games
    else:
        mean_for = 0
        mean_against = 0

    return tally["win"], tally["draw"], tally["loss"], mean_for, mean_against


In [None]:
# FINAL PREDICTION FUNCTION (Outcome + Score)
def final_prediction(team1, team2, match_date):
    """Predict and print the outcome and scoreline for ``team1`` vs ``team2``.

    The feature row is built by ``get_feature_row_with_recent_form`` and fed
    to the module-level ``rf_model`` (outcome label) and
    ``home_goal_model`` / ``away_goal_model`` (goal counts). The reported
    final outcome is always derived from the rounded scoreline; the outcome
    model's own label is only shown for comparison.

    Args:
        team1: First (home) team name.
        team2: Second (away) team name.
        match_date: Match date, in any form ``pd.to_datetime`` accepts.

    Returns:
        Tuple ``(final_outcome, goals1, goals2)`` where ``final_outcome`` is a
        human-readable string and the goal counts are non-negative ints.
    """
    # Get feature row
    feature_row = get_feature_row_with_recent_form(team1, team2, match_date)

    # Predict match outcome
    outcome_pred = rf_model.predict(feature_row)[0]
    outcome_label = {1: f"{team1} Win", 0: "Draw", -1: f"{team2} Win"}
    # Fall back to a readable placeholder instead of raising KeyError if the
    # model emits a label outside {1, 0, -1}.
    model_outcome = outcome_label.get(outcome_pred, f"Unknown ({outcome_pred})")

    # Predict goals for both teams. A regressor's raw output may dip below
    # zero, so clamp after rounding: a negative goal count is never valid.
    goals1 = max(0, int(round(home_goal_model.predict(feature_row)[0])))
    goals2 = max(0, int(round(away_goal_model.predict(feature_row)[0])))

    # The scoreline decides the reported outcome so the two never contradict.
    if goals1 > goals2:
        final_outcome = f"{team1} Win"
    elif goals1 < goals2:
        final_outcome = f"{team2} Win"
    else:
        final_outcome = "Draw"

    print("\n==============================")
    print(f"Match: {team1} vs {team2}")
    print(f"Predicted Outcome: {final_outcome} (Model originally predicted: {model_outcome})")
    print(f"Predicted Score: {goals1} - {goals2}")
    print("==============================")

    return final_outcome, goals1, goals2


In [None]:
# LOOP: Predict multiple matches until user quits
while True:
    # Banner plus a reminder of the expected date format.
    print(
        "\n--------------------------------------------",
        "Enter the teams and date to predict the outcome and score!",
        "NOTE: Enter date in this format: YYYY-MM-DD (e.g., 2025-05-15)",
        sep="\n",
    )

    # Prompt for the three inputs in order, trimming surrounding whitespace.
    home_team, away_team, match_date = (
        input(prompt).strip()
        for prompt in ("Home Team: ", "Away Team: ", "Match Date (YYYY-MM-DD): ")
    )

    # Report any failure and keep the session alive rather than crashing.
    try:
        final_prediction(home_team, away_team, match_date)
    except Exception as err:
        print(f" Error: {err}")

    # Only the exact answer "yes" (case-insensitive) continues the loop.
    cont = input("\nDo you want to predict another match? (yes/no): ").strip().lower()
    if cont == "yes":
        continue
    print(" Exiting prediction loop. Goodbye!")
    break
