In [None]:
import pandas as pd
import re

# ------------- LOAD FINAL CLEANED DATA ------------------
# Location of the match-level feature table that the rest of the notebook works on.
matches_csv = "/Users/saamsani/Desktop/CMPT /soccer_score_prediction/data/matches_phase2_features_FINAL_WITH_FORM.csv"

df = pd.read_csv(filepath_or_buffer=matches_csv)
print(f" Loaded {len(df)} matches")

# Parse the date column; values that cannot be parsed become NaT (errors="coerce")
# instead of raising, so later "date < match_date" comparisons simply skip them.
parsed_dates = pd.to_datetime(df["date"], errors="coerce")
df["date"] = parsed_dates



# ------------- TEAM NAME CLEANER (same as training) ------------------
def clean_name(name):
    """Normalise a raw team name to the canonical form used for lookups.

    Steps, in order:
      1. Missing values (``None`` / NaN) become the empty string.
      2. Bracketed upper-case codes such as ``(ESP)`` are removed.
      3. A name that already equals one of the canonical outputs
         (case-insensitive) is returned as that canonical output, so the
         function is idempotent: ``clean_name(clean_name(x)) == clean_name(x)``.
      4. If any known alias occurs as a *substring* of the lower-cased name,
         the alias's canonical output is returned.
      5. Otherwise generic club tokens (FC, CF, United, ...) are stripped,
         whitespace is collapsed and the result is title-cased.

    Parameters
    ----------
    name : object
        Raw team name; anything ``str()`` can convert, or a missing value.

    Returns
    -------
    str
        The cleaned team name ("" for missing input).
    """
    if pd.isna(name):
        return ""

    # Convert to string and strip whitespace
    name = str(name).strip()

    # Remove anything in brackets like (ESP) or (FRA). A single space is left
    # behind so that words on either side of the bracket do not fuse together
    # ("Real Madrid (ESP) CF" -> "Real Madrid CF"); the strip handles the
    # common case where the bracket is at the end of the name.
    name = re.sub(r'\s*\([A-Z]+\)\s*', ' ', name).strip()

    # Lower-case copy for case-insensitive matching
    name_lower = name.lower()

    # Known aliases (lower-case) -> canonical display name
    replacements = {
        "internazionale milano": "Inter Milan",
        "atlético de madrid": "Atletico Madrid",
        "atletico de madrid": "Atletico Madrid",
        "sporting clube de braga": "Braga",
        "bayern münchen": "Bayern Munich",
        "real sociedad de fútbol": "Real Sociedad",
        "paris saint germain": "Paris Saint-Germain",
        "paris saint-germain fc": "Paris Saint-Germain",
        "sport lisboa e benfica": "Benfica",
        "feyenoord rotterdam": "Feyenoord",
        "1. fc union berlin": "Union Berlin",
        "fc københavn": "Copenhagen",
        "rb leipzig": "Leipzig",
        "ss lazio": "Lazio",
        "ssc napoli": "Napoli",
        "psv eindhoven": "PSV",
        "real madrid cf": "Real Madrid",
        "real madrid c.f.": "Real Madrid",
        "fc salzburg": "Salzburg",
        "fc red bull salzburg": "Salzburg",
        "manchester city fc": "Manchester City",
        "leeds united fc": "Leeds United",
        "queens park rangers fc": "QPR",
        "coventry city fc": "Coventry City",
        "watford fc": "Watford",
        "portsmouth fc": "Portsmouth",
        "cardiff city fc": "Cardiff City",
        "norwich city fc": "Norwich City",
        "preston north end fc": "Preston North End",
        "derby county fc": "Derby County",
        "oxford united fc": "Oxford United"
    }

    # A name that is already canonical must come back unchanged. Without this
    # check, re-cleaning turns "PSV" into "Psv", "QPR" into "Qpr" and
    # "Leeds United" into "Leeds" (the generic-token strip and title-casing
    # below), so names cleaned twice stop matching names cleaned once.
    canonical = {value.lower(): value for value in replacements.values()}
    if name_lower in canonical:
        return canonical[name_lower]

    # Alias lookup: this is a substring test, so e.g. "FC Bayern München"
    # matches the "bayern münchen" alias.
    for key, value in replacements.items():
        if key in name_lower:
            return value

    # No alias matched: strip common club prefixes/suffixes
    name = re.sub(r'\b(FC|C\.F\.|CF|AC|SSC|RCD|AFC|KV|SV|SC|SL|FF|United|Club|Sporting|Red Bull|RB)\b', '', name, flags=re.IGNORECASE)

    # Collapse extra spaces and strip
    name = re.sub(r'\s+', ' ', name).strip()

    # Title case to normalize casing
    return name.title()


In [None]:
# ---- Function to get last 5 games stats ----
def get_recent_form_stats(team, match_date, matches=None, n_games=5):
    """Summarise a team's most recent results strictly before ``match_date``.

    Parameters
    ----------
    team : str
        Team name, compared for equality against the ``team1`` / ``team2``
        columns.
    match_date : datetime-like
        Only matches whose ``date`` is strictly earlier are considered.
    matches : pandas.DataFrame, optional
        Match table with columns ``team1``, ``team2``, ``date``,
        ``home_goals``, ``away_goals`` and ``outcome``. Defaults to the
        notebook-level ``df``.
    n_games : int, optional
        How many of the most recent matches to include (default 5).

    Returns
    -------
    tuple
        ``(wins, draws, losses, avg_goals_scored, avg_goals_conceded)``.
        All five values are 0 when the team has no earlier matches.

    Notes
    -----
    ``outcome`` is read from the ``team1`` side (1 / 0 / -1) and negated when
    ``team`` appears in ``team2``. A match whose outcome is missing is counted
    as neither win, draw nor loss (previously it fell through to "loss").
    Missing goal values still propagate as NaN into the averages.
    """
    if matches is None:
        matches = df

    involves_team = (matches["team1"] == team) | (matches["team2"] == team)
    recent = (
        matches[involves_team & (matches["date"] < match_date)]
        .sort_values("date", ascending=False)
        .head(n_games)
    )

    games_played = len(recent)
    if games_played == 0:
        return 0, 0, 0, 0, 0

    # Re-express every row from `team`'s point of view.
    listed_first = recent["team1"] == team
    scored = recent["home_goals"].where(listed_first, recent["away_goals"])
    conceded = recent["away_goals"].where(listed_first, recent["home_goals"])
    result = recent["outcome"].where(listed_first, -recent["outcome"])

    wins = int((result == 1).sum())
    draws = int((result == 0).sum())
    losses = int((result == -1).sum())

    # skipna=False keeps the original behaviour of surfacing missing goals as NaN.
    avg_goals_scored = float(scored.sum(skipna=False)) / games_played
    avg_goals_conceded = float(conceded.sum(skipna=False)) / games_played

    return wins, draws, losses, avg_goals_scored, avg_goals_conceded

# ---- MAIN FUNCTION ----
def get_feature_row_with_recent_form(team1, team2, match_date):
    """Build the single-row feature frame for a ``team1`` vs ``team2`` fixture.

    Both names are passed through ``clean_name`` and all lookups use the
    notebook-level ``df``, restricted to matches dated strictly before
    ``match_date``.

    Parameters
    ----------
    team1, team2 : str
        Raw team names; ``team1`` is the side the features are oriented to.
    match_date : datetime-like
        Fixture date (anything ``pd.to_datetime`` accepts).

    Returns
    -------
    pandas.DataFrame
        One row with the head-to-head counts, the last known strength and
        win-rate differences, and both teams' recent-form statistics.

    Notes
    -----
    Head-to-head wins/losses are counted from ``team1``'s point of view:
    ``outcome`` is negated for meetings where ``team1`` is listed in the
    ``team2`` column, mirroring how the strength/win-rate differences are
    negated for reversed fixtures.
    NOTE(review): confirm the stored ``past_wins`` / ``past_losses`` training
    columns use this same team1 perspective.

    A summary of the computed values is printed as a side effect.
    """
    team1_c = clean_name(team1)
    team2_c = clean_name(team2)
    match_date = pd.to_datetime(match_date)

    # Earlier meetings between the two sides, in either listing order
    same_order = (df["team1"] == team1_c) & (df["team2"] == team2_c)
    swapped_order = (df["team1"] == team2_c) & (df["team2"] == team1_c)
    matchup = df[(same_order | swapped_order) & (df["date"] < match_date)]

    # Flip the sign of `outcome` where team1 is listed second so that
    # 1 always means "team1 won" and -1 always means "team1 lost".
    team1_listed_first = matchup["team1"] == team1_c
    outcome_for_team1 = matchup["outcome"].where(team1_listed_first, -matchup["outcome"])

    past_meetings = len(matchup)
    past_wins = int((outcome_for_team1 == 1).sum())
    past_draws = int((outcome_for_team1 == 0).sum())
    past_losses = int((outcome_for_team1 == -1).sum())

    # Last known strength diff and win rate diff: taken from the most recent
    # earlier meeting that has both values. Only these two columns are
    # required to be present; a NaN in an unrelated column no longer
    # disqualifies the row.
    strength_diff = 0
    win_rate_diff = 0

    rated = matchup.dropna(subset=["team_strength_diff", "win_rate_diff"]).sort_values("date")
    if not rated.empty:
        latest = rated.iloc[-1]
        # Stored diffs are relative to the row's own team1; negate if reversed.
        sign = 1 if latest["team1"] == team1_c else -1
        strength_diff = sign * latest["team_strength_diff"]
        win_rate_diff = sign * latest["win_rate_diff"]

    # Recent form stats
    t1_wins, t1_draws, t1_losses, t1_avg_scored, t1_avg_conceded = get_recent_form_stats(team1_c, match_date)
    t2_wins, t2_draws, t2_losses, t2_avg_scored, t2_avg_conceded = get_recent_form_stats(team2_c, match_date)

    # Key order matches the `features` list the models are trained on.
    features = {
        "team_strength_diff": strength_diff,
        "win_rate_diff": win_rate_diff,
        "past_meetings": past_meetings,
        "past_wins": past_wins,
        "past_draws": past_draws,
        "past_losses": past_losses,
        "team1_recent_wins": t1_wins,
        "team1_recent_draws": t1_draws,
        "team1_recent_losses": t1_losses,
        "team1_avg_goals_scored": t1_avg_scored,
        "team1_avg_goals_conceded": t1_avg_conceded,
        "team2_recent_wins": t2_wins,
        "team2_recent_draws": t2_draws,
        "team2_recent_losses": t2_losses,
        "team2_avg_goals_scored": t2_avg_scored,
        "team2_avg_goals_conceded": t2_avg_conceded
    }

    # Print summary
    print("\n==============================")
    print(f"{team1} recent form (last 5):")
    print(f"    Wins: {t1_wins}, Draws: {t1_draws}, Losses: {t1_losses}")
    print(f"    Avg goals scored: {t1_avg_scored}")
    print(f"    Avg goals conceded: {t1_avg_conceded}")

    print("\n------------------------------")

    print(f"{team2} recent form (last 5):")
    print(f"    Wins: {t2_wins}, Draws: {t2_draws}, Losses: {t2_losses}")
    print(f"    Avg goals scored: {t2_avg_scored}")
    print(f"    Avg goals conceded: {t2_avg_conceded}")

    print("\n==============================")

    print("\nHead-to-Head:")
    print(f"    Past meetings: {past_meetings}")
    print(f"    Wins by {team1}: {past_wins}")
    print(f"    Draws: {past_draws}")
    print(f"    Losses by {team1}: {past_losses}")

    return pd.DataFrame([features])

 
    



In [None]:
# Add recent form stats to EVERY row in dataframe

# Re-apply the name cleaner so the stored names agree with what the lookup
# functions compare against.
df["team1"] = df["team1"].apply(clean_name)
df["team2"] = df["team2"].apply(clean_name)

# Output columns, listed in the same order get_recent_form_stats returns its
# values: first the five team1 statistics, then the five team2 statistics.
form_columns = [
    "team1_recent_wins",
    "team1_recent_draws",
    "team1_recent_losses",
    "team1_avg_goals_scored",
    "team1_avg_goals_conceded",
    "team2_recent_wins",
    "team2_recent_draws",
    "team2_recent_losses",
    "team2_avg_goals_scored",
    "team2_avg_goals_conceded",
]

# One 10-tuple per match: team1's form followed by team2's form, both computed
# from matches dated before this row's own date.
form_rows = []
for row_label, match in df.iterrows():
    first_team_form = get_recent_form_stats(match["team1"], match["date"])
    second_team_form = get_recent_form_stats(match["team2"], match["date"])
    form_rows.append(first_team_form + second_team_form)

print("Finished calculating recent form for all matches!")


# Add new columns to dataframe (plain lists, so assignment is positional)
for position, column in enumerate(form_columns):
    df[column] = [stats[position] for stats in form_rows]

print("Recent form columns added to dataframe!")


In [None]:
# Write the enriched match table (now including the recent-form columns) back
# to disk, without the DataFrame index.
final_path = "/Users/saamsani/Desktop/CMPT /soccer_score_prediction/data/matches_phase2_features_FINAL_WITH_FORM.csv"
df.to_csv(path_or_buf=final_path, index=False)
print("Final CSV with recent form stats saved:", final_path)


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import joblib


# Model inputs: the column order here is the order every model is fitted on.
features = [
    "team_strength_diff",
    "win_rate_diff",
    "past_meetings",
    "past_wins",
    "past_draws",
    "past_losses",
    "team1_recent_wins",
    "team1_recent_draws",
    "team1_recent_losses",
    "team1_avg_goals_scored",
    "team1_avg_goals_conceded",
    "team2_recent_wins",
    "team2_recent_draws",
    "team2_recent_losses",
    "team2_avg_goals_scored",
    "team2_avg_goals_conceded",
]

# Feature matrix and the match outcome to classify
X, y = df[features], df["outcome"]

# 80/20 split, stratified on the outcome so both parts keep the class mix;
# the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print("Training set size:", len(X_train))
print("Testing set size:", len(X_test))

# Outcome classifier; fit() returns the estimator itself, so the fitted model
# is bound directly to rf_model.
rf_model = RandomForestClassifier(
    n_estimators=300,
    max_depth=15,
    class_weight="balanced",
    random_state=42,
).fit(X_train, y_train)

print("\n Model training complete!")

# Score the held-out 20%
y_pred = rf_model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print("\nAccuracy:", acc)
print("\nClassification Report:\n", classification_report(y_test, y_pred))




In [None]:
def final_prediction(team1, team2, match_date):
    """Predict the result and scoreline of ``team1`` vs ``team2``.

    The feature row comes from ``get_feature_row_with_recent_form``. The
    classifier ``rf_model`` gives an outcome, and ``home_goal_model`` /
    ``away_goal_model`` give goal counts that are rounded to integers. The
    reported outcome is always derived from the rounded score; the
    classifier's own prediction is only shown in the printout.

    Returns
    -------
    tuple
        ``(final_outcome, goals1, goals2)`` where ``final_outcome`` is one of
        "Team 1 Win", "Draw" or "Team 2 Win".
    """
    outcome_label = {1: "Team 1 Win", 0: "Draw", -1: "Team 2 Win"}

    feature_row = get_feature_row_with_recent_form(team1, team2, match_date)

    # Classifier's view of the result (kept for the printout only)
    outcome_pred = rf_model.predict(feature_row)[0]

    # Regressors' view of the score, rounded to whole goals
    goals1, goals2 = (
        int(round(goal_model.predict(feature_row)[0]))
        for goal_model in (home_goal_model, away_goal_model)
    )

    # Sign of the goal difference: 1 if team 1 scored more, -1 if fewer, 0 if level
    score_sign = (goals1 > goals2) - (goals1 < goals2)
    final_outcome = outcome_label[score_sign]

    print("\n==============================")
    print(f"Match: {team1} vs {team2}")
    print(f"Predicted Outcome: {final_outcome} (Model originally predicted: {outcome_label[outcome_pred]})")
    print(f"Predicted Score: {goals1} - {goals2}")
    print("==============================")

    return final_outcome, goals1, goals2


In [None]:
from sklearn.ensemble import RandomForestRegressor

# Same inputs as the outcome classifier
X = df[features]

# Regression targets: goals for the team1 side and for the team2 side
y_home, y_away = df["home_goals"], df["away_goals"]

# One regressor per target, fitted on the full table; fit() returns the
# estimator, so each name is bound to an already-fitted model.
home_goal_model = RandomForestRegressor(random_state=42).fit(X, y_home)
away_goal_model = RandomForestRegressor(random_state=42).fit(X, y_away)

print(" Score prediction models trained!")



In [None]:
# Persist the outcome classifier
joblib.dump(value=rf_model, filename="/Users/saamsani/Desktop/CMPT /soccer_score_prediction/models/rf_model_final.joblib")

# Persist the two goal regressors
joblib.dump(value=home_goal_model, filename="/Users/saamsani/Desktop/CMPT /soccer_score_prediction/models/home_goal_model_final.joblib")
joblib.dump(value=away_goal_model, filename="/Users/saamsani/Desktop/CMPT /soccer_score_prediction/models/away_goal_model_final.joblib")

