In [None]:
from pathlib import Path
from typing import Iterable, Dict, Tuple
import pandas as pd
import re

# Read the two input tables: the season-level team table and the game table.
season_data = pd.read_csv("cfb23.csv")
game_data = pd.read_csv("games2024.csv")

# When a combined "Win-Loss" record column is present, break it into integer
# "wins" and "losses" columns and then discard the combined column.
if "Win-Loss" in season_data.columns:
    season_data["Win-Loss"] = season_data["Win-Loss"].astype(str).str.strip()
    record_parts = season_data["Win-Loss"].str.split("-", expand=True)
    for part_idx, target_col in enumerate(("wins", "losses")):
        counts = pd.to_numeric(record_parts[part_idx], errors="coerce")
        # Pieces that could not be parsed became NaN above; count them as 0.
        season_data[target_col] = counts.fillna(0).astype(int)
    season_data = season_data.drop(columns=["Win-Loss"])

# Quick visual check of the parsed records.
print(season_data[["Team", "wins", "losses"]].head())

# Drop postseason games. The comparison ignores case and surrounding
# whitespace so the filter works whether the file spells the value
# "Postseason" or "postseason" (NOTE(review): confirm the spelling used in
# games2024.csv). Rows with a missing season_type are kept, as before.
# .copy() makes the result an independent frame so that later column
# assignments on game_data do not raise SettingWithCopyWarning.
is_postseason = (
    game_data["season_type"].astype(str).str.strip().str.lower() == "postseason"
)
game_data = game_data[~is_postseason].copy()

# Teams NOT present in the game data:
# Central Arkansas, FIU, Hawaii, Massachusetts, New Mexico St., Old Dominion,
# Southern California, UConn, UTSA

# Clean Team Names
# Maps a short team name (key) to a "<team> (<conference>)" label (value).
# The mapping is applied to the team columns of both tables further down, and
# its values double as the set of teams that games are allowed to involve.
# Replacement is by exact string match, so keys must match the source spelling
# character for character (including the accent in "San José State").
name_mapping = {
    "Air Force": "Air Force (Mountain West)",
    "Akron": "Akron (MAC)",
    "Alabama": "Alabama (SEC)",
    "Appalachian State": "App State (Sun Belt)",
    "Arizona": "Arizona (Pac-12)",
    "Arizona State": "Arizona St. (Pac-12)",
    "Arkansas": "Arkansas (SEC)",
    "Arkansas State": "Arkansas St. (Sun Belt)",
    "Army": "Army West Point (FBS Independent)",
    "Auburn": "Auburn (SEC)",
    "Ball State": "Ball St. (MAC)",
    "Baylor": "Baylor (Big 12)",
    "Boise State": "Boise St. (Mountain West)",
    "Boston College": "Boston College (ACC)",
    "Bowling Green": "Bowling Green (MAC)",
    "Buffalo": "Buffalo (MAC)",
    "BYU": "BYU (Big 12)",
    "California": "California (Pac-12)",
    "Central Michigan": "Central Mich. (MAC)",
    "Charlotte": "Charlotte (AAC)",
    "Cincinnati": "Cincinnati (Big 12)",
    "Clemson": "Clemson (ACC)",
    "Coastal Carolina": "Coastal Carolina (Sun Belt)",
    "Colorado": "Colorado (Pac-12)",
    "Colorado State": "Colorado St. (Mountain West)",
    "Duke": "Duke (ACC)",
    "East Carolina": "East Carolina (AAC)",
    "Eastern Michigan": "Eastern Mich. (MAC)",
    "FIU": "FIU (CUSA)",
    "Florida Atlantic": "Fla. Atlantic (AAC)",
    "Florida": "Florida (SEC)",
    "Florida State": "Florida St. (ACC)",
    "Fresno State": "Fresno St. (Mountain West)",
    "Georgia Southern": "Ga. Southern (Sun Belt)",
    "Georgia": "Georgia (SEC)",
    "Georgia State": "Georgia St. (Sun Belt)",
    "Georgia Tech": "Georgia Tech (ACC)",
    "Houston": "Houston (Big 12)",
    "Illinois": "Illinois (Big Ten)",
    "Indiana": "Indiana (Big Ten)",
    "Iowa": "Iowa (Big Ten)",
    "Iowa State": "Iowa St. (Big 12)",
    "Kansas": "Kansas (Big 12)",
    "Kansas State": "Kansas St. (Big 12)",
    "Kent State": "Kent St. (MAC)",
    "Kentucky": "Kentucky (SEC)",
    "Louisiana Monroe": "La.-Monroe (Sun Belt)",
    "Liberty": "Liberty (CUSA)",
    "Louisiana": "Louisiana (Sun Belt)",
    "Louisiana Tech": "Louisiana Tech (CUSA)",
    "Louisville": "Louisville (ACC)",
    "LSU": "LSU (SEC)",
    "Massachusetts": "Massachusetts (FBS Independent)",
    "Marshall": "Marshall (Sun Belt)",
    "Maryland": "Maryland (Big Ten)",
    "Memphis": "Memphis (AAC)",
    "Miami": "Miami (FL) (ACC)",
    "Miami (OH)": "Miami (OH) (MAC)",
    "Michigan": "Michigan (Big Ten)",
    "Michigan State": "Michigan St. (Big Ten)",
    "Middle Tennessee": "Middle Tenn. (CUSA)",
    "Minnesota": "Minnesota (Big Ten)",
    "Mississippi State": "Mississippi St. (SEC)",
    "Missouri": "Missouri (SEC)",
    "Navy": "Navy (AAC)",
    "NC State": "NC State (ACC)",
    "Nebraska": "Nebraska (Big Ten)",
    "Nevada": "Nevada (Mountain West)",
    "New Mexico": "New Mexico (Mountain West)",
    "New Mexico State": "New Mexico St. (CUSA)",
    "North Carolina": "North Carolina (ACC)",
    "North Texas": "North Texas (AAC)",
    "Northwestern": "Northwestern (Big Ten)",
    "Notre Dame": "Notre Dame (FBS Independent)",
    "Ohio": "Ohio (MAC)",
    "Ohio State": "Ohio St. (Big Ten)",
    "Oklahoma": "Oklahoma (Big 12)",
    "Oklahoma State": "Oklahoma St. (Big 12)",
    "Ole Miss": "Ole Miss (SEC)",
    "Oregon": "Oregon (Pac-12)",
    "Oregon State": "Oregon St. (Pac-12)",
    "Penn State": "Penn St. (Big Ten)",
    "Pittsburgh": "Pittsburgh (ACC)",
    "Purdue": "Purdue (Big Ten)",
    "Rice": "Rice (AAC)",
    "Rutgers": "Rutgers (Big Ten)",
    "San Diego State": "San Diego St. (Mountain West)",
    "San José State": "San Jose St. (Mountain West)",
    "SMU": "SMU (AAC)",
    "South Alabama": "South Alabama (Sun Belt)",
    "Southern California": "Southern California (Pac-12)",
    "South Carolina": "South Carolina (SEC)",
    "South Florida": "South Fla. (AAC)",
    "Southern Mississippi": "Southern Miss. (Sun Belt)",
    "Stanford": "Stanford (Pac-12)",
    "Syracuse": "Syracuse (ACC)",
    "TCU": "TCU (Big 12)",
    "Temple": "Temple (AAC)",
    "Tennessee": "Tennessee (SEC)",
    "Texas": "Texas (Big 12)",
    "Texas A&M": "Texas A&M (SEC)",
    "Texas State": "Texas St. (Sun Belt)",
    "Texas Tech": "Texas Tech (Big 12)",
    "Toledo": "Toledo (MAC)",
    "Troy": "Troy (Sun Belt)",
    "Tulane": "Tulane (AAC)",
    "Tulsa": "Tulsa (AAC)",
    "UAB": "UAB (AAC)",
    "UCF": "UCF (Big 12)",
    "UCLA": "UCLA (Pac-12)",
    "UNLV": "UNLV (Mountain West)",
    "Utah": "Utah (Pac-12)",
    "Utah State": "Utah St. (Mountain West)",
    "UTEP": "UTEP (CUSA)",
    "UTSA": "UTSA (AAC)",
    "Vanderbilt": "Vanderbilt (SEC)",
    "Virginia": "Virginia (ACC)",
    "Virginia Tech": "Virginia Tech (ACC)",
    "Wake Forest": "Wake Forest (ACC)",
    "Washington": "Washington (Pac-12)",
    "Washington State": "Washington St. (Pac-12)",
    "West Virginia": "West Virginia (Big 12)",
    "Western Kentucky": "Western Ky. (CUSA)",
    "Western Michigan": "Western Mich. (MAC)",
    "Wisconsin": "Wisconsin (Big Ten)",
    "Wyoming": "Wyoming (Mountain West)"
}


# Rewrite team names in both tables through name_mapping (exact matches only;
# names that are not keys of the mapping are left untouched).
season_data["Team"] = season_data["Team"].replace(name_mapping)
for side_col in ("home_team", "away_team"):
    game_data[side_col] = game_data[side_col].replace(name_mapping)

# Keep a game only when BOTH participants carry a label found among the
# mapping's values; games involving any other team are discarded.
valid_teams = set(name_mapping.values())
home_is_known = game_data["home_team"].isin(valid_teams)
away_is_known = game_data["away_team"].isin(valid_teams)
game_data = game_data[home_is_known & away_is_known]



# Standardise the season table's column names: "Team" -> "team", then
# lower-case every column name.
season_data = season_data.rename(columns={'Team': 'team'})
season_data.columns = [col.lower() for col in season_data.columns]

# Merge Season Data with Game Data
# The season table is attached twice: once keyed on the home team and once on
# the away team. ``suffixes`` only renames columns whose names collide, so the
# home-side season columns usually keep their plain names and cannot be found
# again by looking for a "_home" suffix. The columns each merge contributes
# are therefore recorded explicitly, right after the merge that adds them.
# Output column names are unchanged by this bookkeeping.
columns_before_home = set(game_data.columns)
merged = game_data.merge(
    season_data.rename(columns={"wins": "home_wins", "losses": "home_losses"}),
    how="left",
    left_on="home_team",
    right_on="team",
    suffixes=("", "_home")
).drop(columns=["team"])
home_cols = [col for col in merged.columns if col not in columns_before_home]

columns_before_away = set(merged.columns)
merged = merged.merge(
    season_data.rename(columns={"wins": "away_wins", "losses": "away_losses"}),
    how="left",
    left_on="away_team",
    right_on="team",
    suffixes=("", "_away")
).drop(columns=["team"])
away_cols = [col for col in merged.columns if col not in columns_before_away]

# Add indicators: 1 when any season stat for that side is missing, else 0.
# These must be computed BEFORE the fill below, which erases the evidence.
merged['home_stats_missing'] = merged[home_cols].isnull().any(axis=1).astype(int)
merged['away_stats_missing'] = merged[away_cols].isnull().any(axis=1).astype(int)

# Fill missing season stats with 0 (both sides, including wins/losses).
merged[home_cols] = merged[home_cols].fillna(0)
merged[away_cols] = merged[away_cols].fillna(0)

# Target label: 1 when the home team scored strictly more points, else 0.
# Ties and rows with missing points compare False and therefore also yield 0.
merged["winner"] = (merged["home_points"] > merged["away_points"]).astype(int)

# Create Rolling Stats
# Reshape the game-level table into a "long" table with one row per team per
# game: each game contributes one row seen from the home side and one row
# seen from the away side, with points expressed as scored / allowed.
per_side_frames = []
for side, opponent in (("home", "away"), ("away", "home")):
    side_frame = merged[[
        "id", "season", "week",
        f"{side}_team", f"{side}_points", f"{opponent}_points",
    ]].copy()
    side_frame = side_frame.rename(columns={
        f"{side}_team": "team",
        f"{side}_points": "points_scored",
        f"{opponent}_points": "points_allowed",
    })
    # Remember which side this row came from so it can be split back later.
    side_frame["home_away"] = side
    per_side_frames.append(side_frame)

home_df, away_df = per_side_frames

# Home rows first, then away rows, with a fresh 0..n-1 index.
long_df = pd.concat([home_df, away_df], ignore_index=True)

# Sort and Calculate Rolling Stats
# Order each team's games chronologically so "previous games" is meaningful.
long_df = long_df.sort_values(by=["team", "season", "week"])


def _prior_games_rolling_mean(values, window=3):
    """Return, for each row, the mean over the up-to-``window`` preceding rows.

    ``shift(1)`` excludes the current game so the feature only uses results
    known before kick-off. ``min_periods=1`` produces a value as soon as one
    earlier game exists; the first row has no history and is NaN.
    """
    return values.shift(1).rolling(window=window, min_periods=1).mean()


# The shift AND the rolling window must both run inside each team's group.
# ``groupby(...).shift(1)`` alone returns an ordinary Series, so chaining
# ``.rolling(...)`` onto it would slide the window across team boundaries and
# blend in the previous team's games. ``transform`` keeps the whole
# computation per team and returns values aligned to long_df's index.

# Rolling offense points scored
long_df["rolling_avg_points_scored"] = (
    long_df.groupby("team")["points_scored"].transform(_prior_games_rolling_mean)
)

# Rolling defense points allowed
long_df["rolling_avg_points_allowed"] = (
    long_df.groupby("team")["points_allowed"].transform(_prior_games_rolling_mean)
)

# Rolling win percentage (win = scored strictly more than allowed)
long_df["win"] = (long_df["points_scored"] > long_df["points_allowed"]).astype(int)
long_df["rolling_win_pct"] = (
    long_df.groupby("team")["win"].transform(_prior_games_rolling_mean)
)

# Merge Rolling Stats to Game-Level Data
# Split the long table back into its home and away halves, prefix the rolling
# columns with the side they describe, and join both halves onto the
# game-level table by game id.
rolling_cols = [
    "rolling_avg_points_scored",
    "rolling_avg_points_allowed",
    "rolling_win_pct",
]

# Home team rolling stats
home_rolling = long_df.loc[long_df["home_away"] == "home", ["id"] + rolling_cols]
home_rolling = home_rolling.rename(
    columns={name: f"home_{name}" for name in rolling_cols}
)

# Away team rolling stats
away_rolling = long_df.loc[long_df["home_away"] == "away", ["id"] + rolling_cols]
away_rolling = away_rolling.rename(
    columns={name: f"away_{name}" for name in rolling_cols}
)

# Left joins keep every game row even if a rolling row were absent.
final_merged = (
    merged
    .merge(home_rolling, on="id", how="left")
    .merge(away_rolling, on="id", how="left")
)

# Write the finished dataset to disk without the DataFrame index.
final_merged.to_csv("final_merged_dataset_test.csv", index=False)

                Team  wins  losses
0          LSU (SEC)    10       3
1    Oregon (Pac-12)    12       2
2  Oklahoma (Big 12)    10       3
3     Liberty (CUSA)    13       1
4      Georgia (SEC)    13       1


  merged["winner"] = (merged["home_points"] > merged["away_points"]).astype(int)
