**Player Per Game database**

In [None]:
import pandas as pd
import re

# Load the raw QB game logs.
df = pd.read_csv("/Users/sethfried/Fantasy Football/data/player_game_logs/pfr_gamelogs_QB.csv")

# Keep only rows whose 'Gtm' value parses to a positive number; anything
# non-numeric is coerced to NaN and therefore filtered out.
team_game_no = pd.to_numeric(df['Gtm'], errors='coerce')
df = df.loc[team_game_no > 0].copy()

# Discard the bookkeeping columns, tolerating either one being absent.
unwanted = [name for name in ("Rk", "Gtm") if name in df.columns]
df = df.drop(columns=unwanted)

# 'home' is 0 when the (stripped) cell is "@" and 1 for everything else,
# including blanks and missing values.
df = df.rename(columns={"Unnamed: 6_level_0": "home"})
df["home"] = df["home"].fillna("").astype(str).str.strip().ne("@").astype(int)

# Truncate the frame after 'Fumbles FRTD' when that column is present.
column_names = df.columns.tolist()
if "Fumbles FRTD" in column_names:
    last_kept = column_names.index("Fumbles FRTD")
    df = df[column_names[:last_kept + 1]]

# One 0/1 indicator per status marker: 1 when any cell in the row equals
# the marker text exactly.
for marker in ("Did Not Play", "Inactive"):
    df[marker] = (df == marker).any(axis=1).astype(int)

if "GS" in df.columns:
    # Normalise GS to a 0/1 flag: 1 when the cell contains "*", 0 otherwise.
    # A GS cell may instead carry a status string ("Did Not Play" /
    # "Inactive"); those rows get GS = 0 and the matching indicator column
    # is switched on.
    #
    # This is done with column-wise masks rather than by assigning into the
    # row passed to DataFrame.apply: such assignments are not written back
    # to the frame, so the indicator columns would never actually change.
    gs_text = df["GS"].astype(str)
    gs_dnp = gs_text.str.contains("Did Not Play", regex=False)
    gs_inactive = gs_text.str.contains("Inactive", regex=False)

    # "Did Not Play" takes precedence when a cell somehow mentions both.
    df["Did Not Play"] = (df["Did Not Play"].astype(bool) | gs_dnp).astype(int)
    df["Inactive"] = (df["Inactive"].astype(bool) | (gs_inactive & ~gs_dnp)).astype(int)

    # regex=False so "*" is matched literally instead of as a quantifier.
    gs_started = gs_text.str.contains("*", regex=False)
    df["GS"] = (gs_started & ~(gs_dnp | gs_inactive)).astype(int)

# Swap the textual status markers and blank strings for 0 in every column.
df = df.replace(["Did Not Play", "Inactive", ""], 0)

# Show every column when a frame is printed.
pd.set_option("display.max_columns", None)




  df = pd.read_csv("/Users/sethfried/Fantasy Football/data/player_game_logs/pfr_gamelogs_QB.csv")


In [3]:
# Sanity check: show the current column labels of df.
print(df.columns)

Index(['Gcar', 'Week', 'Date', 'Team', 'home', 'Opp', 'Result', 'GS',
       'Passing Cmp', 'Passing Att', 'Passing Cmp%', 'Passing Yds',
       'Passing TD', 'Passing Int', 'Passing Y/A', 'Passing AY/A',
       'Passing Rate', 'Passing Sk', 'Passing Yds.1', 'Rushing Att',
       'Rushing Yds', 'Rushing TD', 'Rushing Y/A', 'Snap Counts OffSnp',
       'Snap Counts Off%', 'Snap Counts DefSnp', 'Snap Counts Def%',
       'Snap Counts STSnp', 'Snap Counts ST%', 'Player', 'Position', 'Year',
       'Receiving Tgt', 'Receiving Rec', 'Receiving Yds', 'Receiving TD',
       'Receiving Ctch%', 'Receiving Y/Tgt', 'Fumbles Fmb', 'Fumbles FL',
       'Fumbles FF', 'Fumbles FR', 'Fumbles Yds', 'Fumbles FRTD',
       'Did Not Play', 'Inactive'],
      dtype='object')


In [4]:
import re
import pandas as pd

# --- assume df is already loaded and pre-cleaned up to this point ---



# 1. Split Result into win & score
def split_result(res):
    """Break a result string into a win flag and a score string.

    Returns a two-element ``pd.Series`` ``[win, score]``:

    * ``win`` is 1 when the stripped text starts with "W", otherwise 0.
    * ``score`` is the first "digits-digits" group that follows a comma
      (trailing text such as "(OT)" is ignored); "0-0" when no such group
      is found.

    Any non-string input yields ``[0, "0-0"]``.
    """
    fallback_score = "0-0"
    if not isinstance(res, str):
        return pd.Series([0, fallback_score])

    text = res.strip()
    found = re.search(r",\s*([0-9]+-[0-9]+)", text)
    return pd.Series([
        int(text.startswith("W")),
        found.group(1) if found else fallback_score,
    ])

# 1. Derive the win flag and the score string from every Result value.
outcome = df["Result"].apply(split_result)
df[["win", "score"]] = outcome

# 2./3. Result is now redundant and "Passing Yds.1" is not kept (either may
# already be absent); afterwards every remaining NaN becomes 0.
df = df.drop(columns=["Result", "Passing Yds.1"], errors="ignore").fillna(0)

# Inspect the first rows with all columns visible.
pd.set_option("display.max_columns", None)
print(df.head())


   Gcar Week        Date Team  home  Opp  GS Passing Cmp Passing Att  \
0   0.0  1.0  2019-09-09  DEN     0  OAK   0           0           0   
1   0.0  2.0  2019-09-15  DEN     1  CHI   0           0           0   
2   0.0  3.0  2019-09-22  DEN     0  GNB   0           0           0   
3   0.0  4.0  2019-09-29  DEN     1  JAX   0           0           0   
4   0.0  5.0  2019-10-06  DEN     0  LAC   0           0           0   

  Passing Cmp% Passing Yds Passing TD Passing Int Passing Y/A Passing AY/A  \
0            0           0          0           0           0            0   
1            0           0          0           0           0            0   
2            0           0          0           0           0            0   
3            0           0          0           0           0            0   
4            0           0          0           0           0            0   

  Passing Rate Passing Sk Rushing Att Rushing Yds Rushing TD Rushing Y/A  \
0            0        

In [5]:
import pandas as pd

# 1) Dates → datetime
# The earlier cleaning cells write a literal 0 into cells that were missing
# or held a status marker. Mask that placeholder first so it is treated as
# missing (NaT) instead of being handed to the date parser as a number.
df['Date'] = pd.to_datetime(df['Date'].where(df['Date'].ne(0)), errors='coerce')

# 2) Strings
for col in ['Team','Opp','Player','Position','score']:
    df[col] = df[col].astype('string')

# 3) Integer columns (nullable Int64); unparseable values become 0.
#    'Year' is converted here together with the other counters.
int_cols = [
    'Gcar','Week','home','GS',
    'Passing Cmp','Passing Att','Passing TD','Passing Int','Passing Sk',
    'Rushing Att','Rushing TD',
    'Snap Counts OffSnp','Snap Counts DefSnp','Snap Counts STSnp',
    'Receiving Tgt','Receiving Rec','Receiving TD',
    'Fumbles Fmb','Fumbles FL','Fumbles FF','Fumbles FR','Fumbles FRTD',
    'Did Not Play','Inactive','win',
    'Year'
]
for c in int_cols:
    df[c] = pd.to_numeric(df[c], errors='coerce').fillna(0).astype('Int64')

# 4) Float columns; unparseable values become 0.0.
#    'Fumbles Yds' is included so that it does not keep its un-coerced
#    dtype; it is typed like the other yardage columns.
float_cols = [
    'Passing Cmp%','Passing Yds','Passing Y/A','Passing AY/A','Passing Rate',
    'Rushing Yds','Rushing Y/A',
    'Snap Counts Off%','Snap Counts Def%','Snap Counts ST%',
    'Receiving Yds','Receiving Ctch%','Receiving Y/Tgt',
    'Fumbles Yds'
]
for c in float_cols:
    df[c] = pd.to_numeric(df[c], errors='coerce').fillna(0.0).astype('float')

# 5) Quick check
print(df.dtypes)


Gcar                           Int64
Week                           Int64
Date                  datetime64[ns]
Team                  string[python]
home                           Int64
Opp                   string[python]
GS                             Int64
Passing Cmp                    Int64
Passing Att                    Int64
Passing Cmp%                 float64
Passing Yds                  float64
Passing TD                     Int64
Passing Int                    Int64
Passing Y/A                  float64
Passing AY/A                 float64
Passing Rate                 float64
Passing Sk                     Int64
Rushing Att                    Int64
Rushing Yds                  float64
Rushing TD                     Int64
Rushing Y/A                  float64
Snap Counts OffSnp             Int64
Snap Counts Off%             float64
Snap Counts DefSnp             Int64
Snap Counts Def%             float64
Snap Counts STSnp              Int64
Snap Counts ST%              float64
P

In [6]:
import numpy as np
import pandas as pd

# … assume df['Date'] has already been coerced to datetime …

# Season label: a game dated September-December belongs to that calendar
# year's season, a game dated January-August to the previous year's.
# Rows whose Date is NaT end up as <NA> in the nullable Int64 column.
game_year = df['Date'].dt.year
df['season'] = game_year.where(df['Date'].dt.month >= 9, game_year - 1).astype('Int64')

# For each player, add one placeholder row (Inactive = 1, stats = 0) per week
# for every season missing between their first and last observed seasons.
new_rows = []
for player, grp in df.groupby('Player'):
    # Observed seasons as plain Python ints, ignoring <NA>.
    seasons = sorted(int(s) for s in grp['season'].dropna().unique())
    if len(seasons) < 2:
        continue

    observed = set(seasons)
    for s in range(seasons[0], seasons[-1] + 1):
        if s in observed:
            continue

        # Template on the most recent season that has real rows. Looking
        # only at s - 1 would skip every season after the first one in a
        # multi-season gap, because placeholder rows are not part of `grp`.
        template_season = max(x for x in seasons if x < s)
        prev = grp[grp['season'] == template_season]
        history = grp[grp['season'] < s]

        last_week = int(prev['Week'].max())
        # Rows for weeks a player did not appear can carry Gcar 0 in this
        # data, so take the maximum seen so far rather than the value on the
        # final-week row.
        # NOTE(review): assumes Gcar is a running career-games counter - confirm.
        last_gcar = int(history['Gcar'].max())
        pos = prev.iloc[0]['Position']

        for w in range(1, last_week + 1):
            row = {col: 0 for col in df.columns}
            row.update({
                'Player': player,
                'Position': pos,
                'season': s,
                'Week': w,
                'Gcar': last_gcar,
                'Inactive': 1
            })
            # Game-specific fields are unknown for a placeholder week.
            for c in ['Date','Team','home','Opp','win','score']:
                row[c] = np.nan
            new_rows.append(row)

# Append, sort, reset index.
# The placeholder frame is cast to df's dtypes before concatenating so the
# all-missing columns do not change the result dtypes; nothing is appended
# when no placeholder rows were generated.
if new_rows:
    filler = pd.DataFrame(new_rows, columns=df.columns, dtype=object).astype(df.dtypes.to_dict())
    df = pd.concat([df, filler], ignore_index=True, sort=False)
df = df.sort_values(['Player','season','Week']).reset_index(drop=True)

pd.set_option('display.max_columns', None)
print(df.head(20))


    Gcar  Week       Date Team  home  Opp  GS  Passing Cmp  Passing Att  \
0      0     1 2006-09-10  PHI     0  HOU   0            0            0   
1      0     2 2006-09-17  PHI     1  NYG   0            0            0   
2      0     3 2006-09-24  PHI     0  SFO   0            0            0   
3      0     4 2006-10-02  PHI     1  GNB   0            0            0   
4      0     5 2006-10-08  PHI     1  DAL   0            0            0   
5      0     6 2006-10-15  PHI     0  NOR   0            0            0   
6      0     7 2006-10-22  PHI     0  TAM   0            0            0   
7      0     8 2006-10-29  PHI     1  JAX   0            0            0   
8      0    10 2006-11-12  PHI     1  WAS   0            0            0   
9      0    11 2006-11-19  PHI     1  TEN   0            0            0   
10    19    12 2006-11-26  PHI     0  IND   0            4            5   
11     0    13 2006-12-04  PHI     1  CAR   0            0            0   
12     0    14 2006-12-10

  df = pd.concat([df, pd.DataFrame(new_rows)], ignore_index=True, sort=False)


In [7]:
# Break the "for-against" score string on the dash, e.g. "20-18" → "20", "18",
# and store both halves as nullable integers.
score_parts = df['score'].str.split('-', expand=True)
df[['score_for','score_against']] = score_parts.astype('Int64')

# The string column is redundant once the numeric pair exists.
df = df.drop(columns='score')

# Confirm the dtypes and peek at the first values.
score_cols = ['score_for','score_against']
print(df[score_cols].dtypes)
print(df[score_cols].head())


score_for        Int64
score_against    Int64
dtype: object
   score_for  score_against
0         24             10
1         24             30
2         38             24
3         31              9
4         38             24


In [8]:
# Export the cleaned DataFrame to CSV for manual inspection.
# The file name is relative, so it is written to the current working
# directory; index=False leaves the row index out of the file.
df.to_csv("cleaned_qb_logs.csv", index=False)
