# Engineering Features for Analysis

Create a strength-of-schedule feature for each player's upcoming season.

In [1]:
import pandas as pd
from pathlib import Path

In [2]:
# Load all the required data.
schedule = pd.read_csv("other features/schedule.csv")

# Drop 2019 because no SoS data exists.
# .copy() makes the filtered frame independent of the frame read from disk, so the
# column assignments below (and in later cells) do not trigger
# SettingWithCopyWarning or write into a temporary view.
schedule = schedule[schedule["Year"] != 2019].copy()


# clean_stats.csv columns: Name,Year,Team,Position,...
players = pd.read_csv("clean_data/clean_stats.csv")

# SoS files: one CSV per year with columns: Year,Team,Position,SoS_Color,SoS_Score
# Adjust the glob pattern or list them explicitly if needed.
sos_dir = Path("other features")
sos_files = sorted(sos_dir.glob("sos_*.csv"))
if not sos_files:
    # pd.concat([]) would otherwise fail with an opaque "No objects to concatenate".
    raise FileNotFoundError(f"No sos_*.csv files found in '{sos_dir}'")

sos_list = []
for f in sos_files:
    df = pd.read_csv(f)
    # Keep only the columns used downstream; a missing column raises KeyError here.
    df = df[['Year','Team','Position','SoS_Score']].copy()
    sos_list.append(df)

sos_all = pd.concat(sos_list, ignore_index=True)

# Normalise the join keys (upper-case, no surrounding whitespace) so that the
# same team/position spelled differently across files still matches.
schedule['Team'] = schedule['Team'].str.upper().str.strip()
players['Team'] = players['Team'].str.upper().str.strip()
players['Position'] = players['Position'].str.upper().str.strip()
sos_all['Team'] = sos_all['Team'].str.upper().str.strip()
sos_all['Position'] = sos_all['Position'].str.upper().str.strip()

Now that all the required data is loaded, we can build a custom variable for schedule strength.

In [3]:
# Join keys shared by every schedule row.
key_cols = ["Team", "Year"]
# Every schedule column whose name starts with "opp" (case-insensitive) is
# treated as an opponent slot.
opp_cols = [col for col in schedule.columns if col.lower().startswith("opp")]

# Placeholder strings that stand for "no opponent" and are mapped to missing.
no_opponent = {"": pd.NA, "NONE": pd.NA, "BYE": pd.NA, "TBD": pd.NA}

for oc in opp_cols:
    # Same normalisation as the Team column: string, upper-case, trimmed.
    # NOTE(review): astype(str) renders a missing cell as text (typically "nan",
    # hence "NAN" after upper-casing), which is not in the placeholder map above
    # - confirm whether truly empty cells should also become NA.
    normalized = schedule[oc].astype(str).str.upper().str.strip()
    schedule[oc] = normalized.replace(no_opponent)

In [4]:
final_frames = []  # one output frame per position, concatenated in a later cell

for pos in ["QB", "RB", "WR", "TE"]:
    # 1) Filter SoS to this position and keep only what's needed
    pos_sos = (
        sos_all.loc[sos_all["Position"].str.upper() == pos, ["Year", "Team", "SoS_Score"]]
        .copy()
    )

    # 2) Start from a fresh copy of the schedule keys + opponent slots
    base = schedule[key_cols + opp_cols].copy()

    # 3) Join SoS onto EVERY opponent column (Year + OppK)
    #    We rename SoS_Score -> f"{OppK}_SoS" so each week's join has its own column.
    #    Only the columns that are actually merged are recorded in sos_cols:
    #    an opponent slot with no values is skipped and therefore has no
    #    "<Opp>_SoS" column, so referencing it later would raise KeyError.
    sos_cols = []
    for oc in opp_cols:
        if base[oc].notna().sum() == 0:
            continue
        sos_col = f"{oc}_SoS"
        base = base.merge(
            pos_sos.rename(columns={"Team": oc, "SoS_Score": sos_col}),
            how="left",
            on=["Year", oc]
        )
        sos_cols.append(sos_col)

    # 4) Row-wise mean across all opponent SoS columns.
    #    Non-numeric scores are coerced to NaN; mean() skips NaN and yields NaN
    #    for a row with no usable opponent score.
    opp_scores = base[sos_cols].apply(pd.to_numeric, errors="coerce")
    base["Avg_Strength_Against"] = opp_scores.mean(axis=1)

    # 5) Keep only requested columns and label the position
    out = base[["Team", "Year", "Avg_Strength_Against"]].copy()
    out.insert(2, "Pos", pos)  # Cols: Team, Year, Pos, Avg_Strength_Against

    final_frames.append(out)

In [5]:
# Stack the per-position frames into one table, ordered by season, team and
# position, with a clean 0..n-1 index.
asa_all = (
    pd.concat(final_frames, ignore_index=True)
      .sort_values(["Year", "Team", "Pos"])
      .reset_index(drop=True)
)

In [6]:
# Preview the first five rows of the combined table.
asa_all.head(5)

Unnamed: 0,Team,Year,Pos,Avg_Strength_Against
0,ARI,2020,QB,2.125
1,ARI,2020,RB,2.1875
2,ARI,2020,TE,2.0
3,ARI,2020,WR,2.4375
4,ATL,2020,QB,2.5625


Now we can normalize the strength of schedule to a z-score for use in the model. A higher score means an easier schedule, and a lower score means a harder schedule.

In [7]:
def _zscore(values):
    """Centre a Series on its mean and scale by its standard deviation."""
    return (values - values.mean()) / values.std()


# Standardise within each season. Rows of every position for a given Year are
# pooled into one group.
# NOTE(review): confirm that pooling positions (rather than grouping by Year
# and Pos) is intended.
asa_all["ASA_zscore"] = asa_all.groupby("Year")["Avg_Strength_Against"].transform(_zscore)

In [8]:
# Keep only the standardised score and use the same position column name as
# the player table ("Position").
asa_all = (
    asa_all
      .drop(columns="Avg_Strength_Against")
      .rename(columns={"Pos": "Position"})
)
asa_all.head()

Unnamed: 0,Team,Year,Position,ASA_zscore
0,ARI,2020,QB,-1.180997
1,ARI,2020,RB,-0.984164
2,ARI,2020,TE,-1.574662
3,ARI,2020,WR,-0.196833
4,ATL,2020,QB,0.196833


Now we can join the ASA to the player data

In [9]:
# The schedule feature describes the season AFTER the stat line, so key each
# player row by the following year.
players["Next_Season"] = players["Year"].add(1)

# Re-key the ASA table so it lines up with the "next season" columns on players.
next_season_keys = {
    "Team": "Team_NextYear",
    "Year": "Next_Season",
    "Pos": "Position",
}
asa_next = asa_all.rename(columns=next_season_keys).loc[
    :, ["Team_NextYear", "Next_Season", "Position", "ASA_zscore"]
]

# The merge expects players to carry Team_NextYear plus the Next_Season column
# computed above. validate="m:1" makes pandas raise if the ASA side has more
# than one row per (team, season, position).
players_with_asa_next = pd.merge(
    players,
    asa_next,
    how="left",
    on=["Team_NextYear", "Next_Season", "Position"],
    validate="m:1",
)

In [10]:
# Preview the first five player rows with the schedule feature attached.
players_with_asa_next.head(5)

Unnamed: 0,Name,Year,Team,Position,Age,Games,Passing_Cmp,Passing_Att,Passing_Yds,Passing_TD,...,Receiving_Yds,Receiving_TD,Fumbles_Lost,2PM,2PP,Fantasy_PPR,Fantasy_PPR_NextYear,Team_NextYear,Next_Season,ASA_zscore
0,Christian McCaffrey,2019,CAR,RB,23,16,0.0,2.0,0.0,0.0,...,1005.0,4.0,0.0,1.0,0.0,471.2,90.4,CAR,2020,-0.393666
1,Lamar Jackson,2019,BAL,QB,22,15,265.0,401.0,3127.0,36.0,...,0.0,0.0,2.0,0.0,0.0,415.7,332.8,BAL,2020,-0.787331
2,Derrick Henry,2019,TEN,RB,25,15,0.0,0.0,0.0,0.0,...,206.0,2.0,3.0,0.0,0.0,294.6,333.1,TEN,2020,1.180997
3,Aaron Jones,2019,GNB,RB,25,16,0.0,0.0,0.0,0.0,...,474.0,3.0,2.0,0.0,0.0,314.8,258.9,GNB,2020,0.393666
4,Ezekiel Elliott,2019,DAL,RB,24,16,0.0,0.0,0.0,0.0,...,420.0,2.0,2.0,0.0,0.0,311.7,223.7,DAL,2020,-1.771495


## Now that we added the Average schedule strength, we can now add preseason team rankings. 

The rankings indicate how good ESPN expects each team to be going into the next season: 1 is the best team and 32 is the worst.

In [11]:
# Preseason power rankings; the preview below shows Team, Year and Ranking columns.
preseason_ranking = pd.read_csv(filepath_or_buffer="other features/pre_power_rankings.csv")
preseason_ranking.head(5)

Unnamed: 0,Team,Year,Ranking
0,KAN,2020,1
1,BAL,2020,2
2,SFO,2020,3
3,NOR,2020,4
4,SEA,2020,5


In [12]:
# Attach the next-season preseason rank to every player row.
# validate="m:1" matches the ASA merge: if the ranking table ever holds more than
# one row per (team, season), pandas raises instead of silently duplicating
# player rows in the training set.
players_final = players_with_asa_next.merge(
    preseason_ranking.rename(columns={"Team":"Team_NextYear","Year":"Next_Season","Ranking":"Preseason_Rank"}),
    how="left",
    on=["Team_NextYear","Next_Season"],
    validate="m:1"
)

players_final.head()


Unnamed: 0,Name,Year,Team,Position,Age,Games,Passing_Cmp,Passing_Att,Passing_Yds,Passing_TD,...,Receiving_TD,Fumbles_Lost,2PM,2PP,Fantasy_PPR,Fantasy_PPR_NextYear,Team_NextYear,Next_Season,ASA_zscore,Preseason_Rank
0,Christian McCaffrey,2019,CAR,RB,23,16,0.0,2.0,0.0,0.0,...,4.0,0.0,1.0,0.0,471.2,90.4,CAR,2020,-0.393666,28.0
1,Lamar Jackson,2019,BAL,QB,22,15,265.0,401.0,3127.0,36.0,...,0.0,2.0,0.0,0.0,415.7,332.8,BAL,2020,-0.787331,2.0
2,Derrick Henry,2019,TEN,RB,25,15,0.0,0.0,0.0,0.0,...,2.0,3.0,0.0,0.0,294.6,333.1,TEN,2020,1.180997,9.0
3,Aaron Jones,2019,GNB,RB,25,16,0.0,0.0,0.0,0.0,...,3.0,2.0,0.0,0.0,314.8,258.9,GNB,2020,0.393666,7.0
4,Ezekiel Elliott,2019,DAL,RB,24,16,0.0,0.0,0.0,0.0,...,2.0,2.0,0.0,0.0,311.7,223.7,DAL,2020,-1.771495,8.0


In [13]:
# Persist the engineered player table. Create the output folder first so a fresh
# checkout (no "training" directory yet) does not fail on the write.
players_out_path = Path("training/players.csv")
players_out_path.parent.mkdir(parents=True, exist_ok=True)
players_final.to_csv(players_out_path)

# College Players
We will now add the extra features to the college players

In [14]:
# Rookie / college player table, keyed later by Team, Draft_Year and Pos.
college_players = pd.read_csv(filepath_or_buffer="clean_data/final_rookies.csv")
college_players.head(5)

Unnamed: 0,Player,Draft_Year,Pos,Age,Team,Round,Pick,College,Conf,G,...,Pass_Yds,Pass_TD,Int,Rushing_Att,Rushing_Yds,Rushing_TD,Rec,Receiving_Yds,Receiving_TD,Target
0,Joe Burrow,2020,QB,23.0,CIN,1,1,LSU,SEC,15.0,...,5671.0,60.0,6.0,115.0,368.0,5.0,1.0,16.0,0.0,173.7
1,Tua Tagovailoa,2020,QB,22.0,MIA,1,5,Alabama,SEC,9.0,...,2840.0,33.0,3.0,23.0,17.0,2.0,0.0,0.0,0.0,135.5
2,Justin Herbert,2020,QB,22.0,LAC,1,6,Oregon,Pac-12,14.0,...,3471.0,32.0,6.0,58.0,50.0,4.0,0.0,0.0,0.0,332.8
3,Henry Ruggs III,2020,WR,21.0,LVR,1,12,Alabama,SEC,12.0,...,0.0,0.0,0.0,2.0,75.0,1.0,40.0,746.0,7.0,84.1
4,Jerry Jeudy,2020,WR,21.0,DEN,1,15,Alabama,SEC,13.0,...,0.0,0.0,0.0,1.0,1.0,0.0,77.0,1163.0,10.0,157.6


In [17]:
# Re-key the next-season ASA table to the rookie table's column names:
# the drafting team and draft year identify the rookie's first NFL season.
college_keys = {
    "Team_NextYear": "Team",
    "Next_Season": "Draft_Year",
    "Position": "Pos",
}
asa_college = asa_next.rename(columns=college_keys).loc[
    :, ["Team", "Draft_Year", "Pos", "ASA_zscore"]
]

# validate="m:1" makes pandas raise if the ASA side has more than one row per
# (team, draft year, position).
college_players_with_asa = pd.merge(
    college_players,
    asa_college,
    how="left",
    on=["Team", "Draft_Year", "Pos"],
    validate="m:1",
)

college_players_with_asa.head()

Unnamed: 0,Player,Draft_Year,Pos,Age,Team,Round,Pick,College,Conf,G,...,Pass_TD,Int,Rushing_Att,Rushing_Yds,Rushing_TD,Rec,Receiving_Yds,Receiving_TD,Target,ASA_zscore
0,Joe Burrow,2020,QB,23.0,CIN,1,1,LSU,SEC,15.0,...,60.0,6.0,115.0,368.0,5.0,1.0,16.0,0.0,173.7,-0.590498
1,Tua Tagovailoa,2020,QB,22.0,MIA,1,5,Alabama,SEC,9.0,...,33.0,3.0,23.0,17.0,2.0,0.0,0.0,0.0,135.5,1.180997
2,Justin Herbert,2020,QB,22.0,LAC,1,6,Oregon,Pac-12,14.0,...,32.0,6.0,58.0,50.0,4.0,0.0,0.0,0.0,332.8,1.377829
3,Henry Ruggs III,2020,WR,21.0,LVR,1,12,Alabama,SEC,12.0,...,0.0,0.0,2.0,75.0,1.0,40.0,746.0,7.0,84.1,-0.590498
4,Jerry Jeudy,2020,WR,21.0,DEN,1,15,Alabama,SEC,13.0,...,0.0,0.0,1.0,1.0,0.0,77.0,1163.0,10.0,157.6,-0.393666


### Now we can add the preseason team rankings as well

In [19]:
# Attach the drafting team's preseason rank for the rookie's draft year.
# validate="m:1" matches the ASA merge: if the ranking table ever holds more than
# one row per (team, year), pandas raises instead of silently duplicating rookies.
college_with_rank = college_players_with_asa.merge(
    preseason_ranking.rename(columns={"Year":"Draft_Year","Ranking":"Preseason_Rank"}),
    how="left",
    on=["Team","Draft_Year"],
    validate="m:1"
)

college_with_rank.head()

Unnamed: 0,Player,Draft_Year,Pos,Age,Team,Round,Pick,College,Conf,G,...,Int,Rushing_Att,Rushing_Yds,Rushing_TD,Rec,Receiving_Yds,Receiving_TD,Target,ASA_zscore,Preseason_Rank
0,Joe Burrow,2020,QB,23.0,CIN,1,1,LSU,SEC,15.0,...,6.0,115.0,368.0,5.0,1.0,16.0,0.0,173.7,-0.590498,30
1,Tua Tagovailoa,2020,QB,22.0,MIA,1,5,Alabama,SEC,9.0,...,3.0,23.0,17.0,2.0,0.0,0.0,0.0,135.5,1.180997,26
2,Justin Herbert,2020,QB,22.0,LAC,1,6,Oregon,Pac-12,14.0,...,6.0,58.0,50.0,4.0,0.0,0.0,0.0,332.8,1.377829,24
3,Henry Ruggs III,2020,WR,21.0,LVR,1,12,Alabama,SEC,12.0,...,0.0,2.0,75.0,1.0,40.0,746.0,7.0,84.1,-0.590498,21
4,Jerry Jeudy,2020,WR,21.0,DEN,1,15,Alabama,SEC,13.0,...,0.0,1.0,1.0,0.0,77.0,1163.0,10.0,157.6,-0.393666,23


### Now we can add special features just for college to add more predictive power.

These features will include each player's college ranking and their conference's ranking.

In [23]:
# Load the conference rankings and derive the draft year they apply to:
# a ranking for college season Year is matched to players drafted in Year + 1.
conf_rks = (
    pd.read_csv("other features/conference_rankings.csv")
      .assign(Draft_Year=lambda rankings: rankings["Year"] + 1)
)
conf_rks.head()

Unnamed: 0,Rk,Conf,Year,Draft_Year
0,1,SEC,2019,2020
1,2,Big Ten,2019,2020
2,3,Big 12,2019,2020
3,4,Pac-12,2019,2020
4,5,American,2019,2020


In [24]:


# Attach each rookie's conference rank for their draft year.
# validate="m:1" matches the ASA merge: if a conference appears more than once for
# a draft year, pandas raises instead of silently duplicating rookie rows.
college_confs = college_with_rank.merge(
    conf_rks.rename(columns={"Rk":"Conf_rank"})[["Conf_rank","Conf","Draft_Year"]],
    how = "left",
    on=["Conf","Draft_Year"],
    validate="m:1"
)

college_confs.head()

Unnamed: 0,Player,Draft_Year,Pos,Age,Team,Round,Pick,College,Conf,G,...,Rushing_Att,Rushing_Yds,Rushing_TD,Rec,Receiving_Yds,Receiving_TD,Target,ASA_zscore,Preseason_Rank,Conf_rank
0,Joe Burrow,2020,QB,23.0,CIN,1,1,LSU,SEC,15.0,...,115.0,368.0,5.0,1.0,16.0,0.0,173.7,-0.590498,30,1
1,Tua Tagovailoa,2020,QB,22.0,MIA,1,5,Alabama,SEC,9.0,...,23.0,17.0,2.0,0.0,0.0,0.0,135.5,1.180997,26,1
2,Justin Herbert,2020,QB,22.0,LAC,1,6,Oregon,Pac-12,14.0,...,58.0,50.0,4.0,0.0,0.0,0.0,332.8,1.377829,24,4
3,Henry Ruggs III,2020,WR,21.0,LVR,1,12,Alabama,SEC,12.0,...,2.0,75.0,1.0,40.0,746.0,7.0,84.1,-0.590498,21,1
4,Jerry Jeudy,2020,WR,21.0,DEN,1,15,Alabama,SEC,13.0,...,1.0,1.0,0.0,77.0,1163.0,10.0,157.6,-0.393666,23,1
