# Get Injuries Information + Previous Season Statistics of the Team

In [None]:
import http.client
import json
import pandas as pd
import time
import numpy as np
from tqdm import tqdm
import concurrent.futures
import os
import random

#####################
# Configurable Variables
#####################
API_KEY = "xxx"   # Replace with your API key
SEASON = 2024                # Season whose injuries/fixtures are scanned
PREV_SEASON = SEASON - 1     # Season used for the "previous season" player stats

SEASON_START = f"{SEASON}-08-01"   # e.g. "2024-08-01"
FIXTURE_END = f"{SEASON+1}-05-31"  # e.g. "2025-05-31"

NUM_PREV_FIXTURES = 5        # How many earlier fixtures are looked up per team
# Candidate player IDs to scan (large range for demonstration).
# A lazy ``range`` is used instead of a materialized list: it supports len(),
# indexing, slicing and iteration exactly like the list did, without
# allocating ~2 million integers up front.
PLAYER_IDS = range(1500, 2000000)

SAVE_EVERY_N_ROWS = 10           # Expansion checkpoint interval (rows)
TARGET_NEGATIVE_RECORDS = 5000   # Stop scanning once this many samples exist
CHECKPOINT_FILE = "negative_scan_checkpoint.json"

MAX_WORKERS = 10       # Thread-pool size for parallel API calls
MAX_RETRIES = 99999    # Attempts per request before giving up
REQUEST_TIMEOUT = 10   # Socket timeout (seconds) for each HTTPS connection
CACHE_DIR = "api_cache"

# We pick how many random fixtures to pick per no-injury player
NEG_SAMPLES_PER_PLAYER = 5

os.makedirs(CACHE_DIR, exist_ok=True)

#####################
# Cache & Connection Setup
#####################
# In-memory cache of API responses, keyed by endpoint string.
cache = {}

def get_cache_key(endpoint: str) -> str:
    """Map an endpoint URL to a key that can be used as a file name.

    Each "/", "?", "&" and "=" is replaced by an underscore; every other
    character is kept as-is.
    """
    to_underscore = str.maketrans({symbol: "_" for symbol in "/?&="})
    return endpoint.translate(to_underscore)

def cache_api_response(endpoint: str, response: dict):
    """Remember *response* for *endpoint* in the in-memory cache and as a JSON file.

    The file lives in CACHE_DIR and is named after get_cache_key(endpoint).
    A failure while writing the file is printed and otherwise ignored; the
    in-memory entry is stored regardless.
    """
    cache[endpoint] = response
    file_name = f"{get_cache_key(endpoint)}.json"
    target_path = os.path.join(CACHE_DIR, file_name)
    try:
        with open(target_path, "w") as handle:
            json.dump(response, handle)
    except Exception as e:
        # Disk caching is best-effort only.
        print(f"Error caching to disk: {e}")

def get_cached_response(endpoint: str) -> dict:
    """Look up a previously stored response for *endpoint*.

    The in-memory cache is consulted first, then the JSON file in CACHE_DIR.
    A response found on disk is copied into the in-memory cache. Returns an
    empty dict when nothing is cached or the file cannot be read.
    """
    try:
        return cache[endpoint]
    except KeyError:
        pass  # not in memory; fall through to the disk cache

    disk_path = os.path.join(CACHE_DIR, f"{get_cache_key(endpoint)}.json")
    if not os.path.exists(disk_path):
        return {}

    try:
        with open(disk_path, "r") as handle:
            payload = json.load(handle)
    except Exception as e:
        print(f"Error reading cache: {e}")
        return {}

    cache[endpoint] = payload
    return payload

def create_connection():
    """Build an HTTPSConnection object for the API-Football v3 host.

    The connection uses REQUEST_TIMEOUT as its timeout.
    """
    api_host = "v3.football.api-sports.io"
    connection = http.client.HTTPSConnection(api_host, timeout=REQUEST_TIMEOUT)
    return connection

def make_api_request(endpoint: str, headers: dict = None) -> dict:
    """
    Make an API request with caching, handling rate limits via exponential backoff.

    Args:
        endpoint: Path plus query string, e.g. "/players?id=1&season=2024".
        headers: Optional request headers; defaults to the API host/key headers.

    Returns:
        The decoded JSON response as a dict, or an empty dict once
        MAX_RETRIES attempts have failed.

    Notes:
        * A cached response (memory or disk) is returned without any request.
        * A response whose "errors" field is non-empty is returned to the
          caller but NOT cached, so a transient API-side error (bad key,
          exhausted quota, ...) is not frozen into the on-disk cache.
        * Rate limiting (HTTP 429 or a "rateLimit" entry in "errors") and
          exceptions trigger a retry after min(5 * 2**attempt, 120) seconds.
    """
    if headers is None:
        headers = {
            "x-rapidapi-host": "v3.football.api-sports.io",
            "x-rapidapi-key": API_KEY
        }
    cached = get_cached_response(endpoint)
    if cached:
        return cached

    base_wait = 5
    max_wait = 120
    # The wait is capped at max_wait anyway, so the exponent is clamped too;
    # otherwise 2**attempt becomes an enormous integer for large attempt counts.
    max_exponent = 10

    for attempt in range(MAX_RETRIES):
        wait_time = min(base_wait * (2 ** min(attempt, max_exponent)), max_wait)
        conn = None
        try:
            conn = create_connection()
            conn.request("GET", endpoint, headers=headers)
            res = conn.getresponse()
            response = json.loads(res.read().decode("utf-8"))

            # "errors" is empty on success; tolerate a missing/null value.
            errors = response.get("errors") or {}
            rate_limited = (res.status == 429) or ("rateLimit" in errors)
            if not rate_limited:
                if errors:
                    print(f"API reported errors for {endpoint}: {errors} (response not cached)")
                else:
                    cache_api_response(endpoint, response)
                return response

            print(f"Rate limited. Waiting {wait_time}s before retry {attempt+1} for {endpoint}")
        except Exception as e:
            print(f"Error on attempt {attempt+1} for {endpoint}: {e}")
            if attempt >= MAX_RETRIES - 1:
                print(f"Max retries reached for {endpoint}. Giving up.")
                return {}
        finally:
            # Always release the socket before sleeping / retrying.
            if conn is not None:
                conn.close()

        time.sleep(wait_time)
    return {}

#####################
# Flatten Player Stats
#####################
# (section name, fields) pairs copied from each entry of "statistics".
# Each value is stored under the key "<section>_<field>", in this order.
_PLAYER_STAT_FIELDS = (
    ("team", ("id", "name", "logo")),
    ("league", ("id", "name", "country", "logo", "flag", "season")),
    ("games", ("appearences", "lineups", "minutes", "number", "position", "rating", "captain")),
    ("substitutes", ("in", "out", "bench")),
    ("shots", ("total", "on")),
    ("goals", ("total", "conceded", "assists", "saves")),
    ("passes", ("total", "key", "accuracy")),
    ("tackles", ("total", "blocks", "interceptions")),
    ("duels", ("total", "won")),
    ("dribbles", ("attempts", "success", "past")),
    ("fouls", ("drawn", "committed")),
    ("cards", ("yellow", "yellowred", "red")),
    ("penalty", ("won", "commited", "scored", "missed", "saved")),
)


def flatten_player_stats(player_response: dict) -> list:
    """Flatten one /players response item into a list of flat dictionaries.

    Args:
        player_response: Dict with a "player" object and a "statistics" list.

    Returns:
        One dict per entry in "statistics", each holding the "player_*" fields
        plus "<section>_<field>" keys for that entry. When "statistics" is
        missing or empty, a single dict with only the "player_*" fields.

    Missing or null nested objects (e.g. "birth": null, "league": null) are
    treated as empty, so the corresponding fields are None instead of raising.
    """
    player_info = player_response.get("player") or {}
    birth = player_info.get("birth") or {}
    player_dict = {
        "player_id": player_info.get("id"),
        "player_name": player_info.get("name"),
        "player_firstname": player_info.get("firstname"),
        "player_lastname": player_info.get("lastname"),
        "player_age": player_info.get("age"),
        "player_birth_date": birth.get("date"),
        "player_birth_place": birth.get("place"),
        "player_birth_country": birth.get("country"),
        "player_nationality": player_info.get("nationality"),
        "player_height": player_info.get("height"),
        "player_weight": player_info.get("weight"),
        "player_injured": player_info.get("injured"),
        "player_photo": player_info.get("photo")
    }

    statistics = player_response.get("statistics") or []
    if not statistics:
        return [player_dict]

    flattened_list = []
    for stat in statistics:
        stat = stat or {}
        record = player_dict.copy()
        for section_name, field_names in _PLAYER_STAT_FIELDS:
            section = stat.get(section_name) or {}
            for field_name in field_names:
                record[f"{section_name}_{field_name}"] = section.get(field_name)
        flattened_list.append(record)
    return flattened_list

#######################
# Negative Sample Logic
#######################
def gather_negative_matches_for_no_injury_player(player_id: int) -> list:
    """
    Build negative ("not injured") samples for a player with zero injuries.

    Steps:
      1) /players?id=xxx&season={SEASON} to find team_id (and league_id)
         from the first flattened statistics record.
      2) /fixtures?team=xxx to gather that team's fixtures this season.
      3) Keep fixtures that have an id and a date and whose short status is
         FT, AET or PEN.
      4) Randomly sample up to NEG_SAMPLES_PER_PLAYER of them.
      5) Build one negative sample dict per sampled fixture (mirroring the
         positive sample structure).

    Returns:
        A list of sample dicts; empty when any step yields no data, including
        when the player has no team for the season.
    """
    # 1) retrieve player's aggregated stats for this season
    json_stats = make_api_request(f"/players?id={player_id}&season={SEASON}")
    if not json_stats or not json_stats.get("response"):
        return []

    try:
        flist = flatten_player_stats(json_stats["response"][0])
        if not flist:
            return []
        team_id = flist[0].get("team_id")
        league_id = flist[0].get("league_id")
    except Exception as e:
        print(f"Error extracting team info for player {player_id}: {e}")
        return []

    # Without a team there is nothing to query; avoids a "/fixtures?team=None" call.
    if team_id is None:
        return []

    # 2) gather that team's fixtures
    endpoint_fixtures = f"/fixtures?team={team_id}&season={SEASON}&from={SEASON_START}&to={FIXTURE_END}"
    json_fixtures = make_api_request(endpoint_fixtures)
    if not json_fixtures or not json_fixtures.get("response"):
        return []

    # 3) filter only valid, completed matches
    completed_statuses = ("FT", "AET", "PEN")
    valid_fixtures = []
    for fix in json_fixtures.get("response", []):
        fdata = fix.get("fixture") or {}
        if not fdata.get("id") or not fdata.get("date"):
            continue
        status_short = (fdata.get("status") or {}).get("short")
        if status_short in completed_statuses:
            valid_fixtures.append(fix)

    if not valid_fixtures:
        return []

    # 4) random sample
    sample_size = min(NEG_SAMPLES_PER_PLAYER, len(valid_fixtures))
    sampled_fixtures = random.sample(valid_fixtures, sample_size)

    # 5) build negative sample records
    negative_samples = []
    for fix in sampled_fixtures:
        fixture = fix["fixture"]
        try:
            fid = int(fixture["id"])
        except (TypeError, ValueError):
            fid = None
        negative_samples.append({
            "injury_player_id": player_id,
            "fixture_id": fid,
            "injury_date": fixture["date"],
            "injury_team_id": team_id,
            "injury_league_id": league_id,
            "injury_season": SEASON,
            "reason": "no_injury_in_season",
            "injuried": 0
        })
    return negative_samples

######################################
# Opponent + Previous 5 Fixtures Logic
######################################
def _pick_opponent(own_team_id, first_team_id, second_team_id):
    """Return whichever of the two team IDs is not own_team_id, or None if neither side matches."""
    if own_team_id == first_team_id:
        return second_team_id
    if own_team_id == second_team_id:
        return first_team_id
    return None


def get_opponent_team_id(fixture_id, injury_team_id):
    """Find the opponent team ID by calling /fixtures?fixture=xxx or stats fallback.

    Args:
        fixture_id: Fixture to inspect; a falsy value returns None immediately.
        injury_team_id: The team whose opponent is wanted.

    Returns:
        The other team's ID as an int, or None if it cannot be determined.

    The home/away IDs from /fixtures are tried first. If that gives no
    opponent, /fixtures/statistics is queried and used only when it contains
    exactly two team entries.
    """
    if not fixture_id:
        return None

    resp = make_api_request(f"/fixtures?fixture={fixture_id}")
    fixtures = resp.get("response", []) if resp else []

    opp = None
    if fixtures:
        teams = fixtures[0].get("teams") or {}
        home = (teams.get("home") or {}).get("id")
        away = (teams.get("away") or {}).get("id")
        try:
            home = int(home) if home is not None else None
            away = int(away) if away is not None else None
        except (TypeError, ValueError):
            home, away = None, None
        opp = _pick_opponent(injury_team_id, home, away)

    if opp is None:
        # fallback: /fixtures/statistics
        stats_resp = make_api_request(f"/fixtures/statistics?fixture={fixture_id}")
        responses = stats_resp.get("response", []) if stats_resp else []
        if len(responses) == 2:
            try:
                t1 = int((responses[0].get("team") or {}).get("id"))
                t2 = int((responses[1].get("team") or {}).get("id"))
            except (AttributeError, TypeError, ValueError):
                # Malformed entry or missing/non-numeric ID.
                t1, t2 = None, None
            opp = _pick_opponent(injury_team_id, t1, t2)
    return opp

def get_previous_fixture_ids(team_id, season, current_fixture_date, current_fixture_id,
                             k=NUM_PREV_FIXTURES, season_start=SEASON_START):
    """Get IDs of the previous k fixtures for a given team prior to current_fixture_date.

    Args:
        team_id: Team whose fixtures are requested.
        season: Season passed to the /fixtures endpoint.
        current_fixture_date: Date string of the reference fixture; only the
            part before "T" is sent as the "to" bound.
        current_fixture_id: ID of the reference fixture, excluded from the result.
        k: Number of IDs to return.
        season_start: Lower date bound ("from") of the query.

    Returns:
        A list of exactly k entries: fixture IDs ordered from most recent to
        oldest (by their date string), padded with None when fewer exist.

    Fixtures whose ID is not an integer, or whose date is missing, are skipped
    rather than aborting the lookup.
    """
    if not team_id or not current_fixture_date:
        return [None]*k

    date_only = current_fixture_date.split("T")[0]
    endpoint = f"/fixtures?team={team_id}&season={season}&from={season_start}&to={date_only}"
    resp = make_api_request(endpoint)
    fixtures = resp.get("response", []) if resp else []

    dated_ids = []
    for fix in fixtures:
        try:
            fixture = fix["fixture"]
            fid = int(fixture["id"])
            played_on = fixture["date"]
        except (KeyError, TypeError, ValueError):
            continue
        # A missing date cannot be ordered, so such a fixture is ignored.
        if not fid or fid == current_fixture_id or not played_on:
            continue
        dated_ids.append((played_on, fid))

    # Sort descending by date string
    dated_ids.sort(key=lambda pair: pair[0], reverse=True)
    prev_ids = [fid for _, fid in dated_ids[:k]]
    return prev_ids + [None] * (k - len(prev_ids))

def _row_int(row_data, key, falsy_is_none=True):
    """Read row_data[key] as an int; return None when missing or not convertible.

    With falsy_is_none=True, a falsy value (0, "", None, ...) also yields None.
    """
    try:
        value = row_data[key]
        if falsy_is_none and not value:
            return None
        return int(value)
    except (KeyError, TypeError, ValueError, OverflowError):
        return None


def process_fixture_info_for_neg(row_data: dict) -> dict:
    """Replicate expansions for negative samples: opponent ID + 5 previous fixtures each side.

    Args:
        row_data: Mapping with "fixture_id", "injury_team_id", "injury_date"
            and "injury_season" (a dict or a DataFrame row both work).

    Returns:
        A dict with "opponent_team_id" and, for j in 1..NUM_PREV_FIXTURES,
        "prev_inj_team_fixture_{j}" and "prev_opp_team_fixture_{j}".
        Opponent fixture slots are None when the opponent is unknown.
    """
    fix_id = _row_int(row_data, "fixture_id")
    team_id = _row_int(row_data, "injury_team_id")
    seas = _row_int(row_data, "injury_season", falsy_is_none=False)
    dt = row_data.get("injury_date")

    opp_id = get_opponent_team_id(fix_id, team_id)
    inj_team_fixtures = get_previous_fixture_ids(team_id, seas, dt, fix_id)
    if opp_id:
        opp_team_fixtures = get_previous_fixture_ids(opp_id, seas, dt, fix_id)
    else:
        opp_team_fixtures = [None]*NUM_PREV_FIXTURES

    result = {"opponent_team_id": opp_id}
    for j in range(NUM_PREV_FIXTURES):
        result[f"prev_inj_team_fixture_{j+1}"] = inj_team_fixtures[j]
        result[f"prev_opp_team_fixture_{j+1}"] = opp_team_fixtures[j]
    return result

####################################
# Batching / Negative Sample Logic
####################################
def process_player_batch(player_ids: list) -> tuple:
    """
    Process a batch of players in parallel (MAX_WORKERS threads).

    For each player:
      1) Check if zero injuries for current season
      2) If zero, gather negative fixtures from that player's team
      3) Also gather previous-season stats

    A player whose injury lookup fails (empty result or an "errors" payload)
    is skipped instead of being counted as injury-free, so an API failure
    cannot create a false negative sample.

    Returns:
        (negative_samples, prev_stats_by_player): a list of negative sample
        dicts and a dict mapping player_id to that player's "prev_"-prefixed
        previous-season stats.
    """
    def process_single_player(player_id: int):
        skipped = {"player_id": player_id, "neg_samples": [], "prev_stats": {}, "is_negative": False}

        # Check injuries
        resp_inj = make_api_request(f"/injuries?player={player_id}&season={SEASON}")
        if not resp_inj or resp_inj.get("errors"):
            # Unknown injury status: do not treat the player as a negative.
            print(f"Injury lookup failed for player {player_id}; skipping.")
            return skipped
        if resp_inj.get("response"):
            # This player has injuries => skip as negative
            return skipped

        # Gather negative samples
        neg_samples = gather_negative_matches_for_no_injury_player(player_id)

        # Gather previous season stats
        resp_prev = make_api_request(f"/players?id={player_id}&season={PREV_SEASON}")
        prev_stats = {}
        if resp_prev and resp_prev.get("response"):
            try:
                flist = flatten_player_stats(resp_prev["response"][0])
                if flist:
                    prev_stats = {"prev_" + k: v for k, v in flist[0].items()}
            except Exception as e:
                print(f"Error flattening stats for {player_id}: {e}")

        return {"player_id": player_id, "neg_samples": neg_samples, "prev_stats": prev_stats, "is_negative": True}

    results = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        future_map = {executor.submit(process_single_player, pid): pid for pid in player_ids}
        for fut in concurrent.futures.as_completed(future_map):
            pid = future_map[fut]
            try:
                results.append(fut.result())
            except Exception as e:
                print(f"Error processing player {pid}: {e}")

    local_negative_list = []
    local_flattened_stats_prev = {}
    for r in results:
        if not r["is_negative"]:
            continue
        local_negative_list.extend(r["neg_samples"])
        if r["prev_stats"]:
            local_flattened_stats_prev[r["player_id"]] = r["prev_stats"]
    return local_negative_list, local_flattened_stats_prev

#####################
# Main Execution
#####################
def main():
    """Collect negative samples, expand them with fixture context and write CSV files.

    Stages:
      1) Resume from CHECKPOINT_FILE if present, then scan PLAYER_IDS in
         batches until TARGET_NEGATIVE_RECORDS samples exist.
      2) Write the raw samples to "negative_players.csv".
      3) For every sample, look up the opponent and previous fixtures in
         parallel; progress is written to "checkpoint_negative_expansions.csv"
         every SAVE_EVERY_N_ROWS completed rows.
      4) Attach previous-season player stats and write "final_negative_df.csv".
    """
    negative_list = []
    flattened_stats_prev = {}

    start_idx = 0
    if os.path.exists(CHECKPOINT_FILE):
        try:
            with open(CHECKPOINT_FILE, 'r') as f:
                checkpoint = json.load(f)
            loaded_idx = checkpoint.get("last_processed_idx", 0)
            loaded_negatives = checkpoint.get("negative_list", [])
            # JSON object keys are always strings; restore the int player IDs
            # so the later lookup by "injury_player_id" finds them again.
            loaded_prev_stats = {
                int(pid): stats
                for pid, stats in checkpoint.get("flattened_stats_prev", {}).items()
            }
        except Exception as e:
            print(f"Error loading checkpoint: {e}")
        else:
            # Only adopt the checkpoint once every part of it was read successfully.
            start_idx = loaded_idx
            negative_list = loaded_negatives
            flattened_stats_prev = loaded_prev_stats
            print(f"Resuming from checkpoint at index {start_idx}, found {len(negative_list)} negative samples so far.")

    remaining_pids = PLAYER_IDS[start_idx:]
    if len(remaining_pids) > 0:
        print(f"Scanning {len(remaining_pids)} players from {remaining_pids[0]} to {remaining_pids[-1]}")
    else:
        print("No players left to scan.")

    BATCH_SIZE = 100
    neg_count = len(negative_list)

    for batch_start in tqdm(range(0, len(remaining_pids), BATCH_SIZE), desc="Processing player batches"):
        if neg_count >= TARGET_NEGATIVE_RECORDS:
            print(f"Reached {TARGET_NEGATIVE_RECORDS} negative samples. Stopping early.")
            break

        batch_end = min(batch_start + BATCH_SIZE, len(remaining_pids))
        batch_ids = remaining_pids[batch_start:batch_end]

        batch_negatives, batch_prev_stats = process_player_batch(batch_ids)
        negative_list.extend(batch_negatives)
        neg_count += len(batch_negatives)
        flattened_stats_prev.update(batch_prev_stats)

        global_idx = start_idx + batch_end
        # Save checkpoint
        if batch_end % (BATCH_SIZE * 5) == 0 or neg_count >= TARGET_NEGATIVE_RECORDS:
            checkpoint = {
                "last_processed_idx": global_idx,
                "negative_list": negative_list,
                "flattened_stats_prev": flattened_stats_prev
            }
            with open(CHECKPOINT_FILE, 'w') as f:
                json.dump(checkpoint, f)
            print(f"Checkpoint saved at {global_idx}. Found {len(negative_list)} negative samples so far.")

    print(f"\nCollected {len(negative_list)} total negative samples.")
    if not negative_list:
        print("No negative samples found. Exiting.")
        return

    # Build negative DataFrame
    df_neg = pd.DataFrame(negative_list)
    df_neg.to_csv("negative_players.csv", index=False)
    print("Initial negative samples DataFrame (no expansions yet):")
    print(df_neg.head())

    # Expand with fixture info in parallel
    print("Expanding fixture info for negative samples (opponent + prev 5 fixtures)...")
    with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        future_map = {executor.submit(process_fixture_info_for_neg, row): i
                      for i, row in df_neg.iterrows()}
        expansions = []
        for fut in tqdm(concurrent.futures.as_completed(future_map),
                        total=len(future_map),
                        desc="Expanding"):
            idx = future_map[fut]
            try:
                ex_result = fut.result()
                expansions.append((idx, ex_result))
            except Exception as e:
                print(f"Error expanding fixture for row {idx}: {e}")
                expansions.append((idx, {}))

            # Periodic checkpoint: df_neg joined with the expansions finished
            # so far (rows still pending keep empty expansion columns).
            if len(expansions) % SAVE_EVERY_N_ROWS == 0:
                partial = pd.DataFrame(
                    [ex for _, ex in expansions],
                    index=[row_idx for row_idx, _ in expansions],
                )
                df_neg.join(partial).to_csv("checkpoint_negative_expansions.csv", index=False)
                print(f"Expansion checkpoint at {len(expansions)} processed rows.")

    # Sort expansions by row index so they line up with df_neg's rows
    expansions.sort(key=lambda x: x[0])
    # Prepare columns
    opponent_team_ids = []
    prev_inj_team_fixtures = {f"prev_inj_team_fixture_{i+1}": [] for i in range(NUM_PREV_FIXTURES)}
    prev_opp_team_fixtures = {f"prev_opp_team_fixture_{i+1}": [] for i in range(NUM_PREV_FIXTURES)}

    for _, expansion in expansions:
        opponent_team_ids.append(expansion.get("opponent_team_id"))
        for j in range(NUM_PREV_FIXTURES):
            col_inj = f"prev_inj_team_fixture_{j+1}"
            col_opp = f"prev_opp_team_fixture_{j+1}"
            prev_inj_team_fixtures[col_inj].append(expansion.get(col_inj))
            prev_opp_team_fixtures[col_opp].append(expansion.get(col_opp))

    df_neg["opponent_team_id"] = opponent_team_ids
    for c, arr in prev_inj_team_fixtures.items():
        df_neg[c] = arr
    for c, arr in prev_opp_team_fixtures.items():
        df_neg[c] = arr

    # Add previous-season info from flattened_stats_prev
    # (the dictionary with "prev_player_id", "prev_team_id", etc.)
    prev_info_list = [
        flattened_stats_prev.get(int(pid), {})
        for pid in df_neg["injury_player_id"]
    ]
    if prev_info_list:
        prev_df = pd.DataFrame(prev_info_list)
        df_neg = pd.concat([df_neg.reset_index(drop=True), prev_df.reset_index(drop=True)], axis=1)

    # Convert fixture IDs to object for consistency
    fixture_cols = ["fixture_id"] + [
        f"prev_inj_team_fixture_{i+1}" for i in range(NUM_PREV_FIXTURES)
    ] + [
        f"prev_opp_team_fixture_{i+1}" for i in range(NUM_PREV_FIXTURES)
    ]
    for col in fixture_cols:
        df_neg[col] = df_neg[col].astype("object")

    # Final output
    df_neg.to_csv("final_negative_df.csv", index=False)
    print("\nFinal Negative Samples DataFrame with expansions & previous-season stats:")
    print(df_neg.head(10))

    print("Done! Negative samples now include expansions + previous-season data, just like positives.")

if __name__ == "__main__":
    main()


In [None]:
# Keep only samples that have all ten "previous fixture" IDs and overwrite the
# CSV in place. index=False keeps the row index out of the file; without it
# the next read_csv would pick up a stray "Unnamed: 0" column.
pd.read_csv("final_negative_df.csv").dropna(subset=[
    'prev_inj_team_fixture_1', 'prev_inj_team_fixture_2',
    'prev_inj_team_fixture_3', 'prev_inj_team_fixture_4',
    'prev_inj_team_fixture_5',
    'prev_opp_team_fixture_1', 'prev_opp_team_fixture_2',
    'prev_opp_team_fixture_3', 'prev_opp_team_fixture_4',
    'prev_opp_team_fixture_5',
]).reset_index(drop=True).to_csv("final_negative_df.csv", index=False)

In [None]:
import pandas as pd
# Load the negative-sample table written by the previous cells; the cells
# below read and modify this module-level `df`.
df = pd.read_csv("final_negative_df.csv")

In [None]:
# Identifier columns stored as plain integers; a missing value becomes -1.
_INT_ID_COLUMNS = [
    'injury_player_id', 'fixture_id', 'injury_team_id',
    'injury_league_id', 'injury_season', 'opponent_team_id',
    'prev_inj_team_fixture_1', 'prev_inj_team_fixture_2',
    'prev_inj_team_fixture_3', 'prev_inj_team_fixture_4',
    'prev_inj_team_fixture_5',
    'prev_opp_team_fixture_1', 'prev_opp_team_fixture_2',
    'prev_opp_team_fixture_3', 'prev_opp_team_fixture_4',
    'prev_opp_team_fixture_5',
]
df[_INT_ID_COLUMNS] = df[_INT_ID_COLUMNS].fillna(-1).astype(int)


# Get Raw Player, Team and Opponent Statistics for the Previous 5 Matches

In [None]:
import http.client
import json
import pandas as pd
import time
from tqdm import tqdm
import concurrent.futures
from functools import lru_cache

#####################
# 0. CONFIG / SETUP
#####################
# Settings for the stage-1 raw-statistics download.
API_KEY = "xxx"   # Replace with your API key
# NOTE(review): TIME_BETWEEN_CALLS is not referenced by any code visible in this cell.
TIME_BETWEEN_CALLS = 0.05  # Small delay between parallel batches
MAX_RETRIES = 9999999            # How many attempts do_api_call makes before giving up
RETRY_WAIT_SECONDS = 2.0   # Base delay for do_api_call's exponential backoff
MAX_WORKERS = 10           # Number of parallel workers (adjust based on API limits)

# Create a new HTTPS connection (one per call; this is not a pool)
def create_connection(timeout=10):
    """Return a new HTTPSConnection object for the API-Football v3 host.

    Args:
        timeout: Socket timeout in seconds. Without one, a stalled connection
            would block the calling worker thread indefinitely.
    """
    conn = http.client.HTTPSConnection("v3.football.api-sports.io", timeout=timeout)
    return conn

# Cache for API responses to avoid duplicate calls
CACHE = {}

#####################
# 1. HELPER FUNCTIONS
#####################

@lru_cache(maxsize=1000)
def do_api_call(endpoint):
    """
    Makes an HTTPS request to `endpoint`, parses JSON, and returns it.
    Uses LRU cache to avoid duplicate calls and implements retry logic.

    Args:
        endpoint: Path plus query string, e.g. "/fixtures/players?fixture=1".

    Returns:
        The decoded JSON, or None once MAX_RETRIES attempts have failed.

    Retry behavior:
        * Each attempt uses a fresh connection that is always closed
          afterwards, so a failed request cannot leave a broken connection
          behind for the next attempt.
        * HTTP 429, a "rateLimit" entry in the response's "errors" field, and
          any exception cause a retry after an exponentially growing wait of
          RETRY_WAIT_SECONDS * 2**attempt, capped at 120 seconds.

    NOTE: because of lru_cache, a None result (all retries failed) is also
    memoized for that endpoint.
    """
    # Check if in cache
    if endpoint in CACHE:
        return CACHE[endpoint]

    headers = {
        "x-rapidapi-host": "v3.football.api-sports.io",
        "x-rapidapi-key": API_KEY
    }
    max_wait_seconds = 120.0
    # Clamp the exponent: the wait is capped anyway, and an unclamped
    # float * 2**attempt raises OverflowError for very large attempt counts.
    max_exponent = 16

    for attempt in range(MAX_RETRIES):
        wait_time = min(RETRY_WAIT_SECONDS * (2 ** min(attempt, max_exponent)), max_wait_seconds)
        conn = create_connection()
        try:
            conn.request("GET", endpoint, headers=headers)
            res = conn.getresponse()
            data = res.read()

            rate_limited = (res.status == 429)
            json_data = None
            if not rate_limited:
                # Attempt to parse JSON
                json_data = json.loads(data.decode("utf-8"))
                # The API can also report rate limiting inside the body.
                if isinstance(json_data, dict):
                    rate_limited = "rateLimit" in (json_data.get("errors") or {})

            if not rate_limited:
                # Cache the result
                CACHE[endpoint] = json_data
                return json_data

            print(f"Rate limited. Waiting {wait_time}s before retry {attempt+1}")
        except Exception as e:
            print(f"Error on attempt {attempt+1} for endpoint {endpoint}: {e}")
            if attempt >= MAX_RETRIES - 1:
                print(f"Max retries reached for {endpoint}. Giving up.")
                return None
        finally:
            # Release the socket before sleeping / retrying.
            conn.close()

        time.sleep(wait_time)

    return None

def get_player_stats_for_fixture(fixture_id, player_id):
    """Return one player's statistics for a fixture.

    Queries /fixtures/players for the fixture and scans every team's
    "players" list for the first entry whose player id equals player_id.
    Returns the first item of that entry's "statistics" list, or None when
    fixture_id is None, the call yields nothing, the player is not listed, or
    the statistics list is empty.
    """
    if fixture_id is None:
        return None

    payload = do_api_call(f"/fixtures/players?fixture={fixture_id}")
    if not payload:
        return None

    # Lazily walk all player entries across both teams, in response order.
    candidates = (
        entry
        for team_block in payload.get("response", [])
        for entry in team_block.get("players", [])
        if entry["player"]["id"] == player_id
    )
    match = next(candidates, None)
    if match is None:
        return None
    if match["statistics"]:
        return match["statistics"][0]
    return None

def get_team_stats_for_fixture(fixture_id, team_id):
    """Return a team's statistics for a fixture as a flat dict.

    Queries /fixtures/statistics for the fixture and team, takes the first
    response entry and maps each statistic's "type" (lower-cased, spaces
    replaced by underscores) to its "value". Returns None when either ID is
    None or the call yields no response entries.
    """
    if fixture_id is None or team_id is None:
        return None

    payload = do_api_call(f"/fixtures/statistics?fixture={fixture_id}&team={team_id}")
    if not payload or not payload.get("response"):
        return None

    first_entry = payload["response"][0]
    type_value_pairs = (
        (item["type"], item["value"])
        for item in first_entry.get("statistics", [])
    )
    return {
        stat_type.lower().replace(" ", "_"): stat_value
        for stat_type, stat_value in type_value_pairs
    }

#####################
# 2. PARALLEL PROCESSING
#####################

def _is_usable_id(value):
    """Return True for a real (positive) ID.

    None, NaN, 0 and negative sentinels such as -1 are not usable. Values that
    cannot be compared with 0 fall back to plain truthiness.
    """
    if value is None:
        return False
    try:
        return bool(value > 0)
    except TypeError:
        return bool(value)


def process_row(row):
    """Fetch raw statistics for the 5 previous fixtures of one sample row.

    Args:
        row: Mapping with "injury_player_id", "injury_team_id",
            "opponent_team_id" and the "prev_inj_team_fixture_{1..5}" /
            "prev_opp_team_fixture_{1..5}" columns.

    Returns:
        A dict with three lists of length 5 (slot i belongs to fixture i+1):
          'player_team_fixture_stats': the player's stats in each of the
              team's previous fixtures,
          'team_fixture_agg_stats': the team's stats in those fixtures,
          'opp_fixture_agg_stats': the opponent's stats in its own previous
              fixtures.
        A slot is None when the fixture ID is missing or a sentinel (e.g. the
        -1 written by fillna(-1)), or when no data came back.
    """
    num_prev_fixtures = 5
    player_id = row["injury_player_id"]

    player_team_stats_list = [None] * num_prev_fixtures
    team_agg_stats_list = [None] * num_prev_fixtures
    opp_team_stats_list = [None] * num_prev_fixtures

    # Player's own team last 5 fixtures
    for slot in range(num_prev_fixtures):
        fix_id = row[f"prev_inj_team_fixture_{slot+1}"]
        if not _is_usable_id(fix_id):
            continue
        player_team_stats_list[slot] = get_player_stats_for_fixture(fix_id, player_id)
        team_agg_stats_list[slot] = get_team_stats_for_fixture(fix_id, row["injury_team_id"])

    # Opponent's last 5 fixtures
    for slot in range(num_prev_fixtures):
        fix_id = row[f"prev_opp_team_fixture_{slot+1}"]
        if not _is_usable_id(fix_id):
            continue
        opponent_id = row["opponent_team_id"]
        if not _is_usable_id(opponent_id):
            # Unknown opponent (e.g. -1 sentinel): nothing meaningful to request.
            break
        opp_team_stats_list[slot] = get_team_stats_for_fixture(fix_id, opponent_id)

    return {
        'player_team_fixture_stats': player_team_stats_list,
        'team_fixture_agg_stats': team_agg_stats_list,
        'opp_fixture_agg_stats': opp_team_stats_list
    }

def batch_process_rows(df):
    """Run process_row for every row of df on a thread pool.

    Returns three lists aligned with df's rows (ordered by row index):
    player stats, team stats and opponent stats. Each element is the list
    produced by process_row; a row whose processing raised (or returned
    nothing) contributes [None] * 5 to each list.
    """
    outcomes = []

    with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as pool:
        # Remember which row index each submitted future belongs to.
        pending = {}
        for row_idx, row in df.iterrows():
            pending[pool.submit(process_row, row)] = row_idx

        progress = tqdm(concurrent.futures.as_completed(pending),
                        total=len(pending),
                        desc="Processing rows")
        for finished in progress:
            row_idx = pending[finished]
            try:
                row_result = finished.result()
            except Exception as e:
                print(f"Error processing row {row_idx}: {e}")
                row_result = None
            outcomes.append((row_idx, row_result))

    # Completion order is arbitrary; restore the original row order.
    outcomes.sort(key=lambda pair: pair[0])

    player_team_fixture_stats = []
    team_fixture_agg_stats = []
    opp_fixture_agg_stats = []

    for _, row_result in outcomes:
        if not row_result:
            # Failed row: fresh placeholder lists for each column.
            row_result = {
                'player_team_fixture_stats': [None] * 5,
                'team_fixture_agg_stats': [None] * 5,
                'opp_fixture_agg_stats': [None] * 5,
            }
        player_team_fixture_stats.append(row_result['player_team_fixture_stats'])
        team_fixture_agg_stats.append(row_result['team_fixture_agg_stats'])
        opp_fixture_agg_stats.append(row_result['opp_fixture_agg_stats'])

    return player_team_fixture_stats, team_fixture_agg_stats, opp_fixture_agg_stats

#####################
# 3. MAIN EXECUTION
#####################

def main():
    """Stage 1 driver: fetch raw stats for every row of the global ``df``.

    Calls ``batch_process_rows`` once, stores its three result lists as new
    ``raw_*`` columns on ``df``, prints a preview and then cache statistics.
    """
    print("Existing columns in df:", df.columns)

    # Process all rows in parallel
    print("Starting parallel processing of API calls...")
    raw_column_names = (
        "raw_player_team_fixture_stats",
        "raw_team_fixture_agg_stats",
        "raw_opp_fixture_agg_stats",
    )
    raw_columns = dict(zip(raw_column_names, batch_process_rows(df)))

    # Attach each result list to df under its raw column name
    for column_name, values in raw_columns.items():
        df[column_name] = values

    print("\nStage 1 complete! df now has raw data columns:")
    print(df[list(raw_column_names)].head())

    # NOTE(review): `do_api_call.cache_info()` and `CACHE` are defined outside
    # this view; the setup cell defines a lowercase `cache` dict - confirm
    # both names exist in this cell.
    cache_stats = do_api_call.cache_info()
    print(f"Cache hits: {cache_stats.hits}")
    print(f"Cache misses: {cache_stats.misses}")
    print(f"Cache size: {len(CACHE)} entries")


if __name__ == "__main__":
    main()

In [None]:
# Persist the stage-1 frame (including the raw_* stats columns) without the index.
df.to_csv("stage1.csv",index=False)

# Aggregate the Raw Features and Derive More Features

In [None]:
#########################
# STAGE 2: FEATURE ENGINEERING
#########################

import numpy as np
import pandas as pd

########################################################
# 0) Rename Columns if Needed
########################################################
# Suppose your main DataFrame is called df and currently has columns like:
#  ["injury_player_id", "injury_date", "fixture_id", "injury_team_id", ...]
# But we want to rename them to simpler names: "player_id", "date", "team_id", ...
# so the code below is consistent.

# Map the injury_* column names onto the shorter names used by the rest of
# this stage.
stage2_column_names = {
    "injury_player_id": "player_id",
    "injury_date": "date",
    "injury_team_id": "team_id",
    "injury_league_id": "league_id",
}
df = df.rename(columns=stage2_column_names)

# Injury-history table used by the history helpers below.
# NOTE(review): this takes every row of df with no filter on "injuried"; if df
# also holds non-injury samples they will be treated as injury events - confirm.
df_injuries = df.loc[:, ["player_id","date","reason","injuried"]].copy()

########################################################
# 1) Helpers for Parsing and Safe Operations
########################################################

def safe_float(x):
    """Convert ``x`` to ``float``, or return ``np.nan`` if it cannot be converted.

    Only conversion failures (``TypeError``, ``ValueError``, ``OverflowError``)
    are mapped to NaN; ``KeyboardInterrupt`` / ``SystemExit`` still propagate.
    """
    try:
        return float(x)
    except (TypeError, ValueError, OverflowError):
        return np.nan

def safe_int(x):
    """Convert ``x`` to ``int``, or return ``np.nan`` if it cannot be converted.

    Only conversion failures are mapped to NaN: ``TypeError`` (e.g. ``None``),
    ``ValueError`` (e.g. ``"abc"`` or a NaN float) and ``OverflowError``
    (an infinite float). ``KeyboardInterrupt`` / ``SystemExit`` still propagate.
    """
    try:
        return int(x)
    except (TypeError, ValueError, OverflowError):
        return np.nan

########################################################
# 2) Aggregators for Player Stats
########################################################

# (internal name, section of the per-fixture stats dict, field inside that section)
_PLAYER_STAT_FIELDS = (
    ("minutes", "games", "minutes"),
    ("rating", "games", "rating"),  # often a string, e.g. "7.5"
    ("shots_total", "shots", "total"),
    ("shots_on", "shots", "on"),
    ("goals", "goals", "total"),
    ("assists", "goals", "assists"),
    ("fouls_committed", "fouls", "committed"),
    ("fouls_drawn", "fouls", "drawn"),
    ("yellow_cards", "cards", "yellow"),
    ("red_cards", "cards", "red"),
    ("duels_total", "duels", "total"),
    ("duels_won", "duels", "won"),
    ("passes_total", "passes", "total"),
    ("passes_key", "passes", "key"),
    ("passes_accuracy", "passes", "accuracy"),
    ("tackles_total", "tackles", "total"),
    ("tackles_blocks", "tackles", "blocks"),
    ("tackles_interceptions", "tackles", "interceptions"),
)


def _player_nanmean(values):
    """Mean of the non-NaN entries of ``values``; NaN (no warning) if there are none."""
    arr = np.asarray(values, dtype=float)
    arr = arr[~np.isnan(arr)]
    return arr.mean() if arr.size else np.nan


def aggregate_player_stats(stats_list):
    """Aggregate a player's per-fixture stats into one flat feature dict.

    Parameters
    ----------
    stats_list : iterable
        One entry per previous fixture (from ``raw_player_team_fixture_stats``).
        Each entry is either a dict of sections (``"games"``, ``"shots"``,
        ``"goals"``, ``"fouls"``, ``"cards"``, ``"duels"``, ``"passes"``,
        ``"tackles"``) or a falsy value when no stats are available.

    Returns
    -------
    dict
        ``player_*_5`` features: means for minutes, rating and pass accuracy,
        NaN-ignoring sums for the counting stats, and the duels win ratio.
        Sums are ``0.0`` when every fixture is missing; means and the ratio
        are NaN in that case. The ``_5`` suffix is fixed and does not depend
        on ``len(stats_list)``.
    """
    per_fixture = {name: [] for name, _, _ in _PLAYER_STAT_FIELDS}

    for st in stats_list:
        for name, section_key, field_key in _PLAYER_STAT_FIELDS:
            if st:
                # `or {}` also covers a section that is present but null.
                section = st.get(section_key) or {}
                value = safe_float(section.get(field_key))
            else:
                # No stats for this fixture => NaN placeholder
                value = np.nan
            per_fixture[name].append(value)

    totals = {name: np.nansum(values) for name, values in per_fixture.items()}

    # Ratio is undefined when no duels were recorded at all.
    duels_total = totals["duels_total"]
    duels_win_ratio = totals["duels_won"] / duels_total if duels_total > 0 else np.nan

    features = {
        # Averages
        "player_minutes_avg_5":           _player_nanmean(per_fixture["minutes"]),
        "player_rating_avg_5":            _player_nanmean(per_fixture["rating"]),

        # Sums
        "player_shots_total_5":           totals["shots_total"],
        "player_shots_on_5":              totals["shots_on"],
        "player_goals_5":                 totals["goals"],
        "player_assists_5":               totals["assists"],
        "player_fouls_committed_5":       totals["fouls_committed"],
        "player_fouls_drawn_5":           totals["fouls_drawn"],
        "player_yellow_cards_5":          totals["yellow_cards"],
        "player_red_cards_5":             totals["red_cards"],
        "player_duels_total_5":           duels_total,
        "player_duels_won_5":             totals["duels_won"],
        "player_passes_total_5":          totals["passes_total"],
        "player_passes_key_5":            totals["passes_key"],
        "player_tackles_total_5":         totals["tackles_total"],
        "player_tackles_blocks_5":        totals["tackles_blocks"],
        "player_tackles_interceptions_5": totals["tackles_interceptions"],

        # Ratios
        "player_duels_win_ratio_5":       duels_win_ratio,
        "player_pass_acc_mean_5":         _player_nanmean(per_fixture["passes_accuracy"]),
    }

    return features

########################################################
# 3) Aggregators for Team & Opponent Stats
########################################################

# Keys of a per-fixture team statistics dict that are summed across fixtures.
_SIDE_SUMMED_KEYS = (
    "shots_on_goal",
    "shots_off_goal",
    "total_shots",
    "fouls",
    "corner_kicks",
    "offsides",
    "yellow_cards",
    "red_cards",
    "total_passes",
    "passes_accurate",
)


def _aggregate_side_stats(stats_list, prefix):
    """Aggregate per-fixture team statistics into ``{prefix}_*_5`` features.

    Shared implementation behind ``aggregate_team_stats`` (prefix ``"team"``)
    and ``aggregate_opponent_stats`` (prefix ``"opp"``). Falsy entries in
    ``stats_list`` count as missing fixtures.
    """
    # NaN-ignoring sum of every counting stat (0.0 when all fixtures are missing).
    totals = {
        key: np.nansum([safe_float(st.get(key)) if st else np.nan for st in stats_list])
        for key in _SIDE_SUMMED_KEYS
    }

    # Possession is only parsed from strings ending in "%"; anything else is
    # treated as missing.
    possession = []
    for st in stats_list:
        raw = st.get("ball_possession") if st else None
        if isinstance(raw, str) and raw.endswith("%"):
            possession.append(safe_float(raw.replace("%", "")))
        else:
            possession.append(np.nan)

    # Mean over the available values; NaN (without a RuntimeWarning) if none.
    poss = np.asarray(possession, dtype=float)
    poss = poss[~np.isnan(poss)]
    poss_avg = poss.mean() if poss.size else np.nan

    # Accuracy ratio is undefined when no passes were recorded at all.
    passes_sum = totals["total_passes"]
    passes_acc_sum = totals["passes_accurate"]
    pass_accuracy = passes_acc_sum / passes_sum if passes_sum > 0 else np.nan

    return {
        f"{prefix}_shots_on_goal_5":  totals["shots_on_goal"],
        f"{prefix}_shots_off_goal_5": totals["shots_off_goal"],
        f"{prefix}_total_shots_5":    totals["total_shots"],
        f"{prefix}_fouls_5":          totals["fouls"],
        f"{prefix}_corners_5":        totals["corner_kicks"],
        f"{prefix}_offsides_5":       totals["offsides"],
        f"{prefix}_ball_poss_avg_5":  poss_avg,
        f"{prefix}_yellow_cards_5":   totals["yellow_cards"],
        f"{prefix}_red_cards_5":      totals["red_cards"],
        f"{prefix}_passes_5":         passes_sum,
        f"{prefix}_passes_acc_5":     passes_acc_sum,
        f"{prefix}_pass_acc_ratio_5": pass_accuracy,
    }


def aggregate_team_stats(stats_list):
    """Aggregate the team's per-fixture statistics into ``team_*_5`` features.

    ``stats_list`` holds one statistics dict (or a falsy placeholder) per
    previous fixture. Returns a dict of NaN-ignoring sums, the mean ball
    possession and the accurate-passes / total-passes ratio.
    """
    return _aggregate_side_stats(stats_list, "team")


def aggregate_opponent_stats(stats_list):
    """Aggregate the opponent's per-fixture statistics into ``opp_*_5`` features.

    Same aggregation as ``aggregate_team_stats``; only the key prefix differs.
    """
    return _aggregate_side_stats(stats_list, "opp")

########################################################
# 4) Days Since Last Injury (Optional)
########################################################


import pandas as pd
import numpy as np

def compute_days_since_last_injury(df_main, df_injuries):
    """
    Compute days since the most recent earlier injury for each row of df_main.

    Rows are addressed by position, so the result is correct even when
    df_main's index contains duplicate labels. Timestamps are compared as
    UTC instants.

    Parameters:
    -----------
    df_main : pandas DataFrame
        Main DataFrame containing player_id and date columns
    df_injuries : pandas DataFrame
        DataFrame containing injury records with player_id and date columns

    Returns:
    --------
    pandas DataFrame
        Copy of df_main with 'date' parsed to datetime (unparseable -> NaT),
        'player_id' cast to str, and an added float column
        'days_since_last_injury'. The value is NaN when the row's date is NaT
        or the player has no injury strictly before that date.
    """
    # Work on a copy to avoid modifying the caller's frame
    result_df = df_main.copy()
    result_df['date'] = pd.to_datetime(result_df['date'], errors='coerce')
    # Cast ids to str on both sides so int/str ids still match
    result_df['player_id'] = result_df['player_id'].astype(str)

    # Undated injuries are dropped; sorting here keeps each player's dates
    # sorted inside the groupby below.
    injuries = pd.DataFrame({
        'player_id': df_injuries['player_id'].astype(str).to_numpy(),
        'date': pd.to_datetime(df_injuries['date'], errors='coerce')
                  .to_numpy(dtype='datetime64[ns]'),
    }).dropna(subset=['date']).sort_values('date')
    injuries_by_player = {
        player_id: dates.to_numpy()
        for player_id, dates in injuries.groupby('player_id', sort=False)['date']
    }

    row_dates = result_df['date'].to_numpy(dtype='datetime64[ns]')
    days_since = np.full(len(result_df), np.nan)

    # `.indices` maps each player to integer row positions (not index labels)
    for player_id, positions in result_df.groupby('player_id', sort=False).indices.items():
        player_injuries = injuries_by_player.get(player_id)
        if player_injuries is None:
            continue

        dates = row_dates[positions]
        # side='left' counts injuries strictly before each date
        n_prior = np.searchsorted(player_injuries, dates, side='left')
        has_prior = (n_prior > 0) & ~np.isnat(dates)

        last_injury = player_injuries[n_prior[has_prior] - 1]
        elapsed = dates[has_prior] - last_injury
        # Whole days, matching Timedelta.days for positive gaps
        days_since[positions[has_prior]] = elapsed // np.timedelta64(1, 'D')

    result_df['days_since_last_injury'] = days_since
    return result_df

########################################################
# 5) Injuries in Last X Months (Optional)
########################################################


def compute_injuries_in_last_x_months(df_main, df_injuries, months=6):
    """
    Count each player's injuries in the X calendar months before each row's date.

    The window is [date - X months, date): the start is inclusive and the
    row's own date is excluded. Rows are addressed by position, so duplicate
    index labels in df_main are handled correctly. Timestamps are compared as
    UTC instants.

    Parameters:
    -----------
    df_main : pandas DataFrame
        Main DataFrame containing player_id and date columns
    df_injuries : pandas DataFrame
        DataFrame containing injury records with player_id and date columns
    months : int, default=6
        Number of months to look back for injuries

    Returns:
    --------
    pandas DataFrame
        Copy of df_main with 'date' parsed to datetime (unparseable -> NaT),
        'player_id' cast to str, and an added 'inj_count_last_{months}m'
        column. The column is int64 when every row has a valid date;
        otherwise float64 with NaN for the rows whose date is NaT.
    """
    # Work on a copy to avoid modifying the caller's frame
    result_df = df_main.copy()
    result_df['date'] = pd.to_datetime(result_df['date'], errors='coerce')
    # Cast ids to str on both sides so int/str ids still match
    result_df['player_id'] = result_df['player_id'].astype(str)

    count_col = f'inj_count_last_{months}m'

    # Undated injuries are dropped; sorting here keeps each player's dates
    # sorted inside the groupby below.
    injuries = pd.DataFrame({
        'player_id': df_injuries['player_id'].astype(str).to_numpy(),
        'date': pd.to_datetime(df_injuries['date'], errors='coerce')
                  .to_numpy(dtype='datetime64[ns]'),
    }).dropna(subset=['date']).sort_values('date')
    injuries_by_player = {
        player_id: dates.to_numpy()
        for player_id, dates in injuries.groupby('player_id', sort=False)['date']
    }

    # Calendar-aware window start, computed for all rows at once
    window_end = result_df['date']
    window_start = window_end - pd.DateOffset(months=months)
    end_np = window_end.to_numpy(dtype='datetime64[ns]')
    start_np = window_start.to_numpy(dtype='datetime64[ns]')

    has_date = ~np.isnat(end_np)
    # 0 for dated rows (players without injuries keep it), NaN for undated rows
    counts = np.where(has_date, 0.0, np.nan)

    # `.indices` maps each player to integer row positions (not index labels)
    for player_id, positions in result_df.groupby('player_id', sort=False).indices.items():
        player_injuries = injuries_by_player.get(player_id)
        if player_injuries is None:
            continue

        dated = positions[has_date[positions]]
        # side='left' counts injuries strictly before the given instant
        before_end = np.searchsorted(player_injuries, end_np[dated], side='left')
        before_start = np.searchsorted(player_injuries, start_np[dated], side='left')
        # Clamp so a negative `months` (start after end) yields 0, not a negative count
        counts[dated] = np.maximum(before_end - before_start, 0)

    # Keep integer counts when no row needs NaN
    result_df[count_col] = counts.astype('int64') if has_date.all() else counts
    return result_df

########################################################
# 6) Build All New Features from raw stats
########################################################

# Walk the three raw-stats columns in lockstep and merge the player, team and
# opponent feature dicts of each row into one dict.
raw_stats_per_row = zip(
    df["raw_player_team_fixture_stats"],
    df["raw_team_fixture_agg_stats"],
    df["raw_opp_fixture_agg_stats"],
)
all_features_list = [
    {
        **aggregate_player_stats(player_raw),
        **aggregate_team_stats(team_raw),
        **aggregate_opponent_stats(opp_raw),
    }
    for player_raw, team_raw, opp_raw in raw_stats_per_row
]

# One row of features per row of df
features_df = pd.DataFrame(all_features_list)

# Append the feature columns side by side; both frames get a fresh 0..n-1
# index first so rows pair up by position.
frames_to_join = [df.reset_index(drop=True), features_df.reset_index(drop=True)]
df = pd.concat(frames_to_join, axis=1)

########################################################
# 7) Optionally, Add "days_since_last_injury" or "injuries in last X months"
########################################################

# If you have a separate injuries DataFrame called df_injuries with columns:
#   [player_id, date, reason, etc. for each injury event]
# Then do:

# Add 'days_since_last_injury', then 'inj_count_last_6m', in one chain.
# NOTE(review): df_injuries was sliced from df itself without filtering on
# "injuried"; verify that every row in it is a real injury event.
df = (
    df.pipe(compute_days_since_last_injury, df_injuries)
      .pipe(compute_injuries_in_last_x_months, df_injuries, months=6)
)

########################################################
# 8) Additional Interactions (Team vs Opponent)
########################################################
# 1) Simple Differences
# Each spec is (new column, team column, opponent column).
# Possession is an average percentage and pass accuracy is already a ratio,
# so their differences compare the two sides' recent levels directly.
diff_specs = [
    ("team_vs_opp_shots_diff_5", "team_total_shots_5", "opp_total_shots_5"),
    ("team_vs_opp_sog_diff_5", "team_shots_on_goal_5", "opp_shots_on_goal_5"),
    ("team_vs_opp_fouls_diff_5", "team_fouls_5", "opp_fouls_5"),
    ("team_vs_opp_corners_diff_5", "team_corners_5", "opp_corners_5"),
    ("team_vs_opp_offsides_diff_5", "team_offsides_5", "opp_offsides_5"),
    ("team_vs_opp_poss_diff_5", "team_ball_poss_avg_5", "opp_ball_poss_avg_5"),
    ("team_vs_opp_pass_acc_diff_5", "team_pass_acc_ratio_5", "opp_pass_acc_ratio_5"),
]
for new_col, team_col, opp_col in diff_specs:
    df[new_col] = df[team_col] - df[opp_col]

# 2) Ratios
# The ratio is NaN wherever the opponent value is missing or zero.
ratio_specs = [
    ("team_vs_opp_shots_ratio_5", "team_total_shots_5", "opp_total_shots_5"),
    ("team_vs_opp_sog_ratio_5", "team_shots_on_goal_5", "opp_shots_on_goal_5"),
    ("team_vs_opp_fouls_ratio_5", "team_fouls_5", "opp_fouls_5"),
    ("team_vs_opp_corners_ratio_5", "team_corners_5", "opp_corners_5"),
    ("team_vs_opp_offsides_ratio_5", "team_offsides_5", "opp_offsides_5"),
    ("team_vs_opp_poss_ratio_5", "team_ball_poss_avg_5", "opp_ball_poss_avg_5"),
    ("team_vs_opp_pass_acc_ratio_5", "team_pass_acc_ratio_5", "opp_pass_acc_ratio_5"),
]
for new_col, team_col, opp_col in ratio_specs:
    denominator = df[opp_col]
    usable = denominator.notna() & (denominator != 0)
    df[new_col] = np.where(usable, df[team_col] / denominator, np.nan)

########################################################
# 9) Drop/Exclude ID Columns, Keep Features + Target
########################################################
# We'll assume your target is "injuried"
# We'll exclude typical ID columns, plus raw stats columns if you want.

# Columns that are never kept, even if their name matches a feature pattern:
# identifiers, previous-fixture ids, raw stats payloads, the date, and
# descriptive prev_* player/team/league metadata.
exclude_cols = {
    "player_id", "fixture_id", "team_id", "league_id", "opponent_team_id",
    "prev_inj_team_fixture_1", "prev_inj_team_fixture_2", "prev_inj_team_fixture_3",
    "prev_inj_team_fixture_4", "prev_inj_team_fixture_5",
    "prev_opp_team_fixture_1", "prev_opp_team_fixture_2", "prev_opp_team_fixture_3",
    "prev_opp_team_fixture_4", "prev_opp_team_fixture_5",
    "raw_player_team_fixture_stats", "raw_team_fixture_agg_stats", "raw_opp_fixture_agg_stats",
    "date",
    'prev_player_id', 'prev_player_name', 'prev_player_firstname', 'prev_player_lastname',
    'prev_player_birth_date', 'prev_player_birth_place', 'prev_player_birth_country',
    'prev_player_nationality', "prev_player_injured", 'prev_player_photo',
    'prev_team_id', 'prev_team_name', 'prev_team_logo',
    'prev_league_id', 'prev_league_name', 'prev_league_country', 'prev_league_logo',
    'prev_league_flag', 'prev_league_season',
    # "inj_count_last_6m",  # add here to drop the injury-count feature
}

# A column is kept when it is the target ("injuried"), is
# "days_since_last_injury", or matches one of these name patterns.
kept_exact = ("injuried", "days_since_last_injury")
kept_prefixes = ("player_", "team_", "opp_", "prev_", "inj_")
kept_suffixes = ("_diff_5", "_ratio_5")

all_cols = df.columns.tolist()
final_cols = [
    c for c in all_cols
    if c not in exclude_cols
    and (c in kept_exact or c.startswith(kept_prefixes) or c.endswith(kept_suffixes))
]

final_features_df = df[final_cols].copy()

print("Final feature columns for modeling:")
print(final_features_df.columns.tolist())

print("Sample of final features:")
print(final_features_df.head(10))


In [None]:
# Show the 50 columns with the most missing values (counts sorted ascending).
df.isna().sum().sort_values().tail(50)

In [None]:
# Write the full frame, without the index, to disk.
# NOTE(review): this saves `df` (all columns), not `final_features_df` - confirm that is intended.
df.to_csv("final_df2.csv",index=False)