In [1]:
import pandas as pd
import joblib 
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [2]:
# Load the cleaned IPL data that the later cells aggregate
cleaned_csv_path = "../data/cleaned/ipl_cleaned.csv"
ipl = pd.read_csv(cleaned_csv_path)

  ipl = pd.read_csv("../data/cleaned/ipl_cleaned.csv")


In [3]:
# Match-level aggregation: collapse `ipl` to one row per
# (batter, match_id, date, venue, bowling_team) combination
match_keys = ['batter', 'match_id', 'date', 'venue', 'bowling_team']
per_match = ipl.groupby(match_keys).agg(
    match_runs=('batsman_runs', 'sum'),
    balls_faced=('ball', 'count'),
    wickets=('is_wicket', 'sum'),
)
df = per_match.reset_index()

# Derived batting metrics
df['strike_rate'] = df['match_runs'].div(df['balls_faced']).mul(100)  # runs per 100 balls counted
df['not_out'] = df['wickets'].eq(0).astype(int)  # 1 when the summed is_wicket for the row is 0
df['date'] = pd.to_datetime(df['date'])  # real datetimes so later date sorting/quantiles work

In [4]:
# List the column names of the aggregated frame
print(list(df.columns))

['batter', 'match_id', 'date', 'venue', 'bowling_team', 'match_runs', 'balls_faced', 'wickets', 'strike_rate', 'not_out']


In [5]:
# Inspect the match-level frame after aggregation and the derived metrics
print(df)

               batter  match_id       date  \
0      A Ashish Reddy    548346 2012-04-29   
1      A Ashish Reddy    548352 2012-05-04   
2      A Ashish Reddy    548359 2012-05-08   
3      A Ashish Reddy    548373 2012-05-18   
4      A Ashish Reddy    548376 2012-05-20   
...               ...       ...        ...   
16510          Z Khan    980903 2016-04-10   
16511          Z Khan    980993 2016-05-15   
16512          Z Khan   1082595 2017-04-08   
16513          Z Khan   1082635 2017-05-06   
16514          Z Khan   1082646 2017-05-14   

                                                   venue  \
0                                       Wankhede Stadium   
1                        MA Chidambaram Stadium, Chepauk   
2              Rajiv Gandhi International Stadium, Uppal   
3              Rajiv Gandhi International Stadium, Uppal   
4              Rajiv Gandhi International Stadium, Uppal   
...                                                  ...   
16510                      

In [6]:
# Order rows chronologically within each batter (match_id breaks same-date ties),
# then renumber the index from 0
df = df.sort_values(by=["batter", "date", "match_id"]).reset_index(drop=True)


In [7]:
# Recent form: mean of match_runs over a 5-row rolling window within each batter.
# Rows that do not yet have 5 matches in the window come out as NaN
# (pandas' default min_periods equals the window size).
recent_form = df.groupby("batter")["match_runs"].rolling(window=5).mean()

# Drop the "batter" level that groupby added so the values align with df's own index
df["recent_form"] = recent_form.droplevel(0)


In [8]:
# Inspect the frame after the recent_form column was added
print(df)

               batter  match_id       date  \
0      A Ashish Reddy    548346 2012-04-29   
1      A Ashish Reddy    548352 2012-05-04   
2      A Ashish Reddy    548359 2012-05-08   
3      A Ashish Reddy    548373 2012-05-18   
4      A Ashish Reddy    548376 2012-05-20   
...               ...       ...        ...   
16510          Z Khan    980903 2016-04-10   
16511          Z Khan    980993 2016-05-15   
16512          Z Khan   1082595 2017-04-08   
16513          Z Khan   1082635 2017-05-06   
16514          Z Khan   1082646 2017-05-14   

                                                   venue  \
0                                       Wankhede Stadium   
1                        MA Chidambaram Stadium, Chepauk   
2              Rajiv Gandhi International Stadium, Uppal   
3              Rajiv Gandhi International Stadium, Uppal   
4              Rajiv Gandhi International Stadium, Uppal   
...                                                  ...   
16510                      

In [9]:
#venue Performance(PvP)
# Point-in-time mean of match_runs per (batter, venue): each row averages only
# the batter's matches at that venue up to and including the current one.
# A plain groupby mean over the whole frame would fold later matches -- including
# the next match, whose runs become the prediction target -- into the feature
# (look-ahead leakage).
venue_history = (
    df.sort_values(["batter", "date", "match_id"])  # chronological order within each batter
      .groupby(["batter", "venue"])["match_runs"]
)
# running sum / running count = expanding mean. Both pieces keep the original row
# index, so the assignment aligns back onto df whatever df's current row order is.
df["venue_performance"] = venue_history.cumsum() / (venue_history.cumcount() + 1)



In [10]:
# Inspect the frame after the venue_performance column was added
print(df)

               batter  match_id       date  \
0      A Ashish Reddy    548346 2012-04-29   
1      A Ashish Reddy    548352 2012-05-04   
2      A Ashish Reddy    548359 2012-05-08   
3      A Ashish Reddy    548373 2012-05-18   
4      A Ashish Reddy    548376 2012-05-20   
...               ...       ...        ...   
16510          Z Khan    980903 2016-04-10   
16511          Z Khan    980993 2016-05-15   
16512          Z Khan   1082595 2017-04-08   
16513          Z Khan   1082635 2017-05-06   
16514          Z Khan   1082646 2017-05-14   

                                                   venue  \
0                                       Wankhede Stadium   
1                        MA Chidambaram Stadium, Chepauk   
2              Rajiv Gandhi International Stadium, Uppal   
3              Rajiv Gandhi International Stadium, Uppal   
4              Rajiv Gandhi International Stadium, Uppal   
...                                                  ...   
16510                      

In [11]:
#Opponent Performance(PvT)
# Point-in-time mean of match_runs per (batter, bowling_team): each row averages
# only the batter's matches against that team up to and including the current one.
# Averaging over the whole frame would leak runs from later matches (including the
# next match, which supplies the prediction target) into the feature.
opponent_history = (
    df.sort_values(["batter", "date", "match_id"])  # chronological order within each batter
      .groupby(["batter", "bowling_team"])["match_runs"]
)
# running sum / running count = expanding mean; results carry the original row
# index, so the assignment aligns back onto df.
df["opponent_performance"] = (
    opponent_history.cumsum() / (opponent_history.cumcount() + 1)
)

In [12]:
# Inspect the frame after the opponent_performance column was added
print(df)

               batter  match_id       date  \
0      A Ashish Reddy    548346 2012-04-29   
1      A Ashish Reddy    548352 2012-05-04   
2      A Ashish Reddy    548359 2012-05-08   
3      A Ashish Reddy    548373 2012-05-18   
4      A Ashish Reddy    548376 2012-05-20   
...               ...       ...        ...   
16510          Z Khan    980903 2016-04-10   
16511          Z Khan    980993 2016-05-15   
16512          Z Khan   1082595 2017-04-08   
16513          Z Khan   1082635 2017-05-06   
16514          Z Khan   1082646 2017-05-14   

                                                   venue  \
0                                       Wankhede Stadium   
1                        MA Chidambaram Stadium, Chepauk   
2              Rajiv Gandhi International Stadium, Uppal   
3              Rajiv Gandhi International Stadium, Uppal   
4              Rajiv Gandhi International Stadium, Uppal   
...                                                  ...   
16510                      

In [13]:
#Overall Career Consistency
# Career average as known at each match: running mean of match_runs per batter
# up to and including the current row. A whole-career mean would include the
# batter's later matches -- in particular the next match, whose runs are the
# prediction target -- so every row would carry information from the future.
career_history = (
    df.sort_values(["batter", "date", "match_id"])  # chronological order within each batter
      .groupby("batter")["match_runs"]
)
# running sum / running count = expanding mean; results carry the original row
# index, so the assignment aligns back onto df.
df["overall_average"] = career_history.cumsum() / (career_history.cumcount() + 1)

In [14]:
# Inspect the frame after the overall_average column was added
print(df)

               batter  match_id       date  \
0      A Ashish Reddy    548346 2012-04-29   
1      A Ashish Reddy    548352 2012-05-04   
2      A Ashish Reddy    548359 2012-05-08   
3      A Ashish Reddy    548373 2012-05-18   
4      A Ashish Reddy    548376 2012-05-20   
...               ...       ...        ...   
16510          Z Khan    980903 2016-04-10   
16511          Z Khan    980993 2016-05-15   
16512          Z Khan   1082595 2017-04-08   
16513          Z Khan   1082635 2017-05-06   
16514          Z Khan   1082646 2017-05-14   

                                                   venue  \
0                                       Wankhede Stadium   
1                        MA Chidambaram Stadium, Chepauk   
2              Rajiv Gandhi International Stadium, Uppal   
3              Rajiv Gandhi International Stadium, Uppal   
4              Rajiv Gandhi International Stadium, Uppal   
...                                                  ...   
16510                      

In [15]:
# Handle missing recent_form values: where the rolling mean is NaN,
# fall back to the overall_average value from the same row
has_recent_form = df["recent_form"].notna()
df["recent_form"] = df["recent_form"].where(has_recent_form, df["overall_average"])

In [16]:
# Prediction target: match_runs from the following row within the same batter group
next_match_runs = df.groupby("batter")["match_runs"].shift(-1)

# The last row of each batter has no following row (NaN target). dropna() removes
# those rows, along with any other row containing a missing value, then the index
# is renumbered from 0.
df = df.assign(future_runs=next_match_runs).dropna().reset_index(drop=True)



In [17]:
# Inspect the frame after future_runs was added and rows with missing values were dropped
print(df)

               batter  match_id       date  \
0      A Ashish Reddy    548346 2012-04-29   
1      A Ashish Reddy    548352 2012-05-04   
2      A Ashish Reddy    548359 2012-05-08   
3      A Ashish Reddy    548373 2012-05-18   
4      A Ashish Reddy    548376 2012-05-20   
...               ...       ...        ...   
15837          Z Khan    729317 2014-04-30   
15838          Z Khan    980903 2016-04-10   
15839          Z Khan    980993 2016-05-15   
15840          Z Khan   1082595 2017-04-08   
15841          Z Khan   1082635 2017-05-06   

                                                   venue  \
0                                       Wankhede Stadium   
1                        MA Chidambaram Stadium, Chepauk   
2              Rajiv Gandhi International Stadium, Uppal   
3              Rajiv Gandhi International Stadium, Uppal   
4              Rajiv Gandhi International Stadium, Uppal   
...                                                  ...   
15837                Dubai 

In [18]:
# Feature selection: the four engineered columns used as model inputs
feature_columns = [
    "recent_form",
    "venue_performance",
    "opponent_performance",
    "overall_average",
]
input_features = df[feature_columns]

In [19]:
# Label selection: the target, kept as a single-column DataFrame
target_columns = ["future_runs"]
output_label = df[target_columns]

In [20]:
# Time-aware train/test split
# Put the rows in date order first
df = df.sort_values(by='date').reset_index(drop=True)
df['date'] = pd.to_datetime(df['date'])

# Cut at the 80% quantile of the date column:
# rows dated on or before the cut go to train, later rows go to test
split_date = df['date'].quantile(0.8)
is_train_row = df['date'] <= split_date
is_test_row = df['date'] > split_date
train_df = df.loc[is_train_row]
test_df = df.loc[is_test_row]

# Features use the same columns as input_features; labels stay single-column frames
feature_names = input_features.columns
X_train, y_train = train_df[feature_names], train_df[['future_runs']]
X_test, y_test = test_df[feature_names], test_df[['future_runs']]

In [21]:
# Scaling pipeline: a single StandardScaler step, fitted on the training features only
scale_pipe = Pipeline(steps=[("scale", StandardScaler())])

scale_pipe.fit(X_train)

In [26]:
# Persist the fitted scaling pipeline to disk
pipeline_path = "../scripts/player_feature_pipeline.pkl"
joblib.dump(scale_pipe, pipeline_path)


['../scripts/player_feature_pipeline.pkl']

In [23]:
# Final dataset: identifying columns, the engineered features, and the target
final_columns = [
    "batter",
    "date",
    "venue",
    "bowling_team",
    "recent_form",
    "venue_performance",
    "opponent_performance",
    "overall_average",
    "future_runs",
]
final_data = df[final_columns]

In [25]:
# Write the final dataset to CSV without the index column
output_csv_path = "../data/cleaned/player_dataset.csv"
final_data.to_csv(output_csv_path, index=False)