In [29]:
import pandas as pd

# Read the cleaned IPL deliveries CSV into `ipl`.
ipl = pd.read_csv(filepath_or_buffer="../data/cleaned/ipl_cleaned.csv")

# Show the first five rows as this cell's output.
ipl.head(n=5)

Unnamed: 0,id,innings,overs,ball_number,batter,bowler,non_striker,extra_type,batsman_run,extras_run,...,non_boundary,iswicket_delivery,player_out,dismisal_kind,fielders_involved,batting_team,date,venue,team1,team2
0,1312200,1,0,1,YBK Jaiswal,Mohammed Shami,JC Buttler,,0,0,...,0,0,,,,rajasthan royals,2022-05-29,"Narendra Modi Stadium, Ahmedabad",rajasthan royals,gujarat titans
1,1312200,1,0,2,YBK Jaiswal,Mohammed Shami,JC Buttler,legbyes,0,1,...,0,0,,,,rajasthan royals,2022-05-29,"Narendra Modi Stadium, Ahmedabad",rajasthan royals,gujarat titans
2,1312200,1,0,3,JC Buttler,Mohammed Shami,YBK Jaiswal,,1,0,...,0,0,,,,rajasthan royals,2022-05-29,"Narendra Modi Stadium, Ahmedabad",rajasthan royals,gujarat titans
3,1312200,1,0,4,YBK Jaiswal,Mohammed Shami,JC Buttler,,0,0,...,0,0,,,,rajasthan royals,2022-05-29,"Narendra Modi Stadium, Ahmedabad",rajasthan royals,gujarat titans
4,1312200,1,0,5,YBK Jaiswal,Mohammed Shami,JC Buttler,,0,0,...,0,0,,,,rajasthan royals,2022-05-29,"Narendra Modi Stadium, Ahmedabad",rajasthan royals,gujarat titans


In [30]:
# Per-match batting summary: one row per (match id, date, venue, batter).
#   runs_scored - sum of `batsman_run` over the batter's deliveries in the match.
#   balls_faced - number of those deliveries that count as a ball faced. Wides
#                 are excluded: a wide is not charged to the batter, so counting
#                 every row overstated balls faced.
# NOTE(review): assumes wides are labelled "wides" in `extra_type` (the preview
# shows the lowercase plural "legbyes") - confirm against the cleaning step. If
# the label never matches, balls_faced simply counts every delivery as before.
batsman_match = (
    ipl.assign(counts_as_ball_faced=~ipl["extra_type"].isin(["wides"]))
    .groupby(["id", "date", "venue", "batter"])
    .agg(
        runs_scored=("batsman_run", "sum"),
        # Summing the boolean flag counts the True rows.
        balls_faced=("counts_as_ball_faced", "sum"),
    )
    .reset_index()
)

In [31]:
# Per-match bowling summary: one row per (match id, date, venue, bowler).
#   wickets_taken - wicket deliveries credited to the bowler. Dismissals that
#                   are not credited to a bowler (run outs, retirements,
#                   obstructing the field) are zeroed out before summing;
#                   summing `iswicket_delivery` directly counted them too.
#   balls_bowled  - legal deliveries only; wides and no-balls are excluded
#                   because they do not count towards the over.
# NOTE(review): the `dismisal_kind` and `extra_type` labels below are assumed
# spellings - confirm against the cleaned data. A label that never matches
# leaves the corresponding count exactly as it was before this change.
bowler_match = (
    ipl.assign(
        bowler_wicket=ipl["iswicket_delivery"].where(
            ~ipl["dismisal_kind"].isin(
                ["run out", "retired hurt", "retired out", "obstructing the field"]
            ),
            0,
        ),
        legal_delivery=~ipl["extra_type"].isin(["wides", "noballs"]),
    )
    .groupby(["id", "date", "venue", "bowler"])
    .agg(
        wickets_taken=("bowler_wicket", "sum"),
        # Summing the boolean flag counts the True rows.
        balls_bowled=("legal_delivery", "sum"),
    )
    .reset_index()
)

In [32]:
# Order both per-match tables by the `date` column. The rolling-window and
# shift-based cells further down depend on row order within each player.
# (The preview shows `date` as ISO YYYY-MM-DD text, which sorts chronologically.)
batsman_match, bowler_match = (
    batsman_match.sort_values(by="date"),
    bowler_match.sort_values(by="date"),
)


In [33]:
# Recent form: mean of `runs_scored` over each batter's current row and up to
# four preceding rows (min_periods=1 lets early rows use a shorter window).
batsman_match["runs_last_5"] = (
    batsman_match.groupby("batter")["runs_scored"]
    .rolling(window=5, min_periods=1)
    .mean()
    # Drop the group-key level so the values align on the frame's row index.
    .reset_index(level="batter", drop=True)
)

In [34]:
# Recent form: mean of `wickets_taken` over each bowler's current row and up to
# four preceding rows (min_periods=1 lets early rows use a shorter window).
bowler_match["wickets_last_5"] = (
    bowler_match.groupby("bowler")["wickets_taken"]
    .rolling(window=5, min_periods=1)
    .mean()
    # Drop the group-key level so the values align on the frame's row index.
    .reset_index(level="bowler", drop=True)
)


In [35]:
# Running mean of the batter's `runs_scored` at this row's venue, over that
# batter's matches at the venue up to and including the current row.
# This replaces a whole-history mean, which also averaged in matches played
# after the row's date (possibly the very match `target_runs` is taken from)
# and so leaked future results into the feature.
# The frame is re-sorted by date here so the expanding window is chronological
# even when this cell is run on its own; assignment aligns on the row index.
# NOTE(review): the venue is the current match's venue, not the venue of the
# match being predicted - confirm that is the intended feature.
batsman_match["venue_avg_runs"] = (
    batsman_match.sort_values("date")
    .groupby(["batter", "venue"])["runs_scored"]
    .expanding()
    .mean()
    .reset_index(level=["batter", "venue"], drop=True)
)


In [36]:
# Running mean of the bowler's `wickets_taken` at this row's venue, over that
# bowler's matches at the venue up to and including the current row.
# This replaces a whole-history mean, which also averaged in matches played
# after the row's date (possibly the very match `target_wickets` is taken
# from) and so leaked future results into the feature.
# The frame is re-sorted by date here so the expanding window is chronological
# even when this cell is run on its own; assignment aligns on the row index.
# NOTE(review): the venue is the current match's venue, not the venue of the
# match being predicted - confirm that is the intended feature.
bowler_match["venue_avg_wickets"] = (
    bowler_match.sort_values("date")
    .groupby(["bowler", "venue"])["wickets_taken"]
    .expanding()
    .mean()
    .reset_index(level=["bowler", "venue"], drop=True)
)

In [37]:
# Career-to-date batting average: running mean of the batter's `runs_scored`
# over all of that batter's matches up to and including the current row.
# This replaces a mean over the batter's entire history, which included every
# later match - among them the one `target_runs` is taken from - and therefore
# leaked the value being predicted into the feature.
# The frame is re-sorted by date here so the expanding window is chronological
# even when this cell is run on its own; assignment aligns on the row index.
batsman_match["career_avg_runs"] = (
    batsman_match.sort_values("date")
    .groupby("batter")["runs_scored"]
    .expanding()
    .mean()
    .reset_index(level="batter", drop=True)
)

In [38]:
# Career-to-date bowling average: running mean of the bowler's `wickets_taken`
# over all of that bowler's matches up to and including the current row.
# This replaces a mean over the bowler's entire history, which included every
# later match - among them the one `target_wickets` is taken from - and
# therefore leaked the value being predicted into the feature.
# The frame is re-sorted by date here so the expanding window is chronological
# even when this cell is run on its own; assignment aligns on the row index.
bowler_match["career_avg_wickets"] = (
    bowler_match.sort_values("date")
    .groupby("bowler")["wickets_taken"]
    .expanding()
    .mean()
    .reset_index(level="bowler", drop=True)
)

In [39]:
# Label column: for each row, the same batter's `runs_scored` from the next row
# in that batter's group (NaN on the batter's last row). Which row is "next"
# depends on the frame's current row order.
batsman_match["target_runs"] = (
    batsman_match.groupby("batter")["runs_scored"].shift(periods=-1)
)

In [40]:
# Label column: for each row, the same bowler's `wickets_taken` from the next
# row in that bowler's group (NaN on the bowler's last row). Which row is
# "next" depends on the frame's current row order.
bowler_match["target_wickets"] = (
    bowler_match.groupby("bowler")["wickets_taken"].shift(periods=-1)
)

In [41]:
# Discard rows that have no label (NaN in the target column), rebinding each
# name to the filtered frame.
batsman_match = batsman_match.dropna(subset=["target_runs"])
bowler_match = bowler_match.dropna(subset=["target_wickets"])

In [42]:
# Select the three engineered batting columns into `batsman_features`.
batsman_features = batsman_match.loc[
    :, ["runs_last_5", "venue_avg_runs", "career_avg_runs"]
]

# Keep the label column separately as `batsman_target`.
batsman_target = batsman_match.loc[:, "target_runs"]

In [43]:
# Select the three engineered bowling columns into `bowler_features`.
bowler_features = bowler_match.loc[
    :, ["wickets_last_5", "venue_avg_wickets", "career_avg_wickets"]
]

# Keep the label column separately as `bowler_target`.
bowler_target = bowler_match.loc[:, "target_wickets"]

In [46]:
# Write both per-match tables (all columns, without the row index) to the
# processed-data directory.
batsman_match.to_csv(
    path_or_buf="../data/processed/batsman_features.csv",
    index=False,
)
bowler_match.to_csv(
    path_or_buf="../data/processed/bowler_features.csv",
    index=False,
)