## Step 3 — Count wins and compute a simple win rate for each team

In [None]:
# Step 3: count wins and compute a simple win rate for every team.
#
# Reads (defined in other cells):
#   matches      - DataFrame with home_team, away_team, home_score, away_score
#   MIN_MATCHES  - minimum number of matches a team needs to enter the filtered ranking
# Produces:
#   stats          - one row per team (Team, Matches, Wins, Win_Rate), best win rate first
#   filtered_stats - the rows of `stats` with at least MIN_MATCHES matches
#
# A team wins when its score is strictly higher than the opponent's, so a draw
# counts as a match played but not as a win.

# Missing team names are dropped up front: sorted() cannot order NaN against strings.
all_teams = sorted(
    set(matches['home_team'].dropna().unique())
    | set(matches['away_team'].dropna().unique())
)

rows = []
for team in all_teams:
    home_games = matches[matches['home_team'] == team]
    away_games = matches[matches['away_team'] == team]
    home_wins = (home_games['home_score'] > home_games['away_score']).sum()
    away_wins = (away_games['away_score'] > away_games['home_score']).sum()
    total_matches = len(home_games) + len(away_games)
    total_wins = int(home_wins + away_wins)
    # Guard the division; a team listed here normally has at least one match.
    win_rate = (total_wins / total_matches * 100) if total_matches > 0 else 0
    rows.append({'Team': team, 'Matches': total_matches, 'Wins': total_wins, 'Win_Rate': round(win_rate, 1)})

# Explicit columns keep the frame well-formed (and sortable) even when `rows` is empty.
# A stable sort makes ties deterministic: teams with the same win rate stay in the
# alphabetical order in which `rows` was built.
stats = (
    pd.DataFrame(rows, columns=['Team', 'Matches', 'Wins', 'Win_Rate'])
    .sort_values('Win_Rate', ascending=False, kind='stable')
    .reset_index(drop=True)
)
print('Top 10 teams by simple win rate (2022-present):')
print(stats.head(10).to_string(index=False))

# Apply the minimum-match filter to produce the ranking used for the prediction.
# `filtered_stats` is always bound so later cells can rely on it; with
# MIN_MATCHES <= 0 every row passes the threshold and it has the same rows as `stats`.
filtered_stats = stats[stats['Matches'] >= MIN_MATCHES].reset_index(drop=True)
if MIN_MATCHES > 0:
    print(f'\nTop teams with at least {MIN_MATCHES} matches:')
    if len(filtered_stats) > 0:
        print(filtered_stats.head(10).to_string(index=False))
    else:
        print('(No teams meet the minimum-match threshold — prediction will fall back to unfiltered list)')
{
    "cells": [
        {
            "cell_type": "markdown",
            "metadata": { "id": "cell-1", "language": "markdown" },
            "source": [
                "# FIFA 2026 Winner Prediction (Beginner Friendly)\n",
                "**By: Sahar Karimi** | CS 401 - Software Engineering\n",
                "\n",
                "This notebook is designed for a junior-level project: it uses recent match results (2022–2025) and a simple, explainable method to pick a predicted winner from the list of qualified teams in `worldcup_predictor_teams.csv`."
            ]
        },
        {
            "cell_type": "markdown",
            "metadata": { "id": "cell-2", "language": "markdown" },
            "source": ["## Step 1 — Imports (very small set of libraries)"]
        },
        {
            "cell_type": "code",
            "metadata": { "id": "cell-3", "language": "python" },
            "source": [
                "# We keep dependencies minimal to make this easy to run for graders\n",
                "import pandas as pd\n",
                "print('‚úì pandas imported')"
            ]
        },
        {
            "cell_type": "markdown",
            "metadata": { "id": "cell-4", "language": "markdown" },
            "source": ["## Step 2 — Load data\n", "We will restrict predictions to teams listed in `worldcup_predictor_teams.csv` (the qualified 32). This avoids predicting for teams that will not be in the tournament."]
        },
        {
            "cell_type": "code",
            "metadata": { "id": "cell-5", "language": "python" },
            "source": [
                "# Load the recent matches and the list of qualified teams\n",
                "matches = pd.read_csv('recent_wc_matches.csv')\n",
                "teams_df = pd.read_csv('worldcup_predictor_teams.csv')\n",
                "\n",
                "# Extract a set of qualified team names from the teams file\n",
                "qualified_teams = set(teams_df['team'].astype(str).str.strip()) if 'team' in teams_df.columns else set(teams_df.iloc[:,0].astype(str).str.strip())\n",
                "\n",
                "print(f'Loaded {len(matches)} recent matches')\n",
                "print(f'Found {len(qualified_teams)} qualified teams (from worldcup_predictor_teams.csv)')\n",
                "print('Sample matches:')\n",
                "print(matches[['date','home_team','away_team','home_score','away_score']].head())"
            ]
        },
        {
            "cell_type": "markdown",
            "metadata": { "id": "cell-6", "language": "markdown" },
            "source": ["## Step 3 — Compute win rates (only for qualified teams)\n", "We count wins (both home and away) and total matches played for each qualified team, then compute a win rate percentage."]
        },
        {
            "cell_type": "code",
            "metadata": { "id": "cell-7", "language": "python" },
            "source": [
                "# Build statistics only for teams in the qualified list\n",
                "stats_rows = []\n",
                "for team in sorted(qualified_teams):\n",
                "    # Select matches where this team played at home or away\n",
                "    home_games = matches[matches['home_team'] == team]\n",
                "    away_games = matches[matches['away_team'] == team]\n",
                "    # Wins at home: home_score > away_score\n",
                "    home_wins = (home_games['home_score'] > home_games['away_score']).sum()\n",
                "    # Wins away: away_score > home_score\n",
                "    away_wins = (away_games['away_score'] > away_games['home_score']).sum()\n",
                "    total_matches = len(home_games) + len(away_games)\n",
                "    total_wins = int(home_wins + away_wins)\n",
                "    win_rate = (total_wins / total_matches * 100) if total_matches > 0 else 0\n",
                "    stats_rows.append({'Team': team, 'Matches': total_matches, 'Wins': total_wins, 'Win_Rate': round(win_rate,1)})\n",
                "\n",
                "stats = pd.DataFrame(stats_rows).sort_values('Win_Rate', ascending=False).reset_index(drop=True)\n",
                "print('Ranking of qualified teams by win rate:')\n",
                "print(stats.head(15).to_string(index=False))"
            ]
        },
        {
            "cell_type": "markdown",
            "metadata": { "id": "cell-8", "language": "markdown" },
            "source": ["## Step 4 — Small-sample filter and pick the predicted winner\n", "Teams that played very few matches can show misleading 100% win rates. We filter out teams with fewer than `MIN_MATCHES` matches before picking the winner."]
        },
        {
            "cell_type": "code",
            "metadata": { "id": "cell-9", "language": "python" },
            "source": [
                "# Set minimum matches threshold (tweak as needed)\n",
                "MIN_MATCHES = 5  # change to 0 to disable filtering\n",
                "print(f'Using MIN_MATCHES = {MIN_MATCHES}')\n",
                "\n",
                "filtered = stats[stats['Matches'] >= MIN_MATCHES].reset_index(drop=True)\n",
                "if len(filtered) == 0:\n",
                "    print('No qualified teams meet the minimum-match threshold; using unfiltered ranking for prediction')\n",
                "    candidate = stats\n",
                "else:\n",
                "    candidate = filtered\n",
                "\n",
                "print('\\nTop teams used for prediction:')\n",
                "print(candidate.head(10).to_string(index=False))\n",
                "\n",
                "# Predicted winner = highest win rate among candidate teams\n",
                "if len(candidate) > 0:\n",
                "    winner = candidate.iloc[0]\n",
                "    print('\\n' + '='*40)\n",
                "    print('üèÜ Predicted 2026 Winner (simple win-rate heuristic)')\n",
                "    print('='*40)\n",
                "    print(f\"Predicted winner: {winner['Team']}\")\n",
                "    print(f\"Win Rate: {winner['Win_Rate']}% (from {int(winner['Matches'])} matches)\")\n",
                "else:\n",
                "    print('No teams available to predict.')"
            ]
        },
        {
            "cell_type": "markdown",
            "metadata": { "id": "cell-10", "language": "markdown" },
            "source": ["## Step 5 — Save results\n", "We save the ranking used for prediction and the full unfiltered ranking for reference."]
        },
        {
            "cell_type": "code",
            "metadata": { "id": "cell-11", "language": "python" },
            "source": [
                "# Save CSVs: filtered (used) and full reference\n",
                "candidate.head(32).to_csv('predictions_2026.csv', index=False)\n",
                "stats.head(32).to_csv('predictions_2026_full.csv', index=False)\n",
                "print('‚úì Saved predictions_2026.csv and predictions_2026_full.csv')"
            ]
        }
    ],
    "metadata": {
        "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" },
        "language_info": { "name": "python", "version": "3.11.0" }
    },
    "nbformat": 4,
    "nbformat_minor": 4
}

NameError: name 'matches' is not defined