In [None]:
import pandas as pd
import joblib
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [None]:
!pip install scikit-learn

In [None]:
# Load the cleaned IPL master table from the project-relative data folder.
# low_memory=False makes pandas infer column dtypes from the whole file
# instead of chunk by chunk.
ipl = pd.read_csv(
    "data/cleaned/ipl_master.csv",
    low_memory=False,
)

In [None]:
#----------------Player-Match Aggregation---------------------
# Collapse `ipl` to one row per (batter, match, date, venue, bowler,
# bowling_team), summing the batter's runs and the wicket flags in each group.
# NOTE(review): because `bowler` is one of the keys, a batter has one row per
# bowler faced in a match, not one row per match -- every later rolling /
# shift step therefore works on batter-vs-bowler rows.
df = (
    ipl.groupby(['batter', 'match_id', 'date', 'venue', 'bowler', 'bowling_team'])
       .agg(
           batsman_runs=('batsman_runs', 'sum'),
           is_wicket=('is_wicket', 'sum'),
       )
       .reset_index()
)

In [None]:
# Order rows by batter, then by date (ascending); the rolling / expanding /
# shift features computed afterwards depend on this row order.
# NOTE(review): `date` is not parsed anywhere in the visible code, so this is
# presumably a string sort -- confirm the stored format sorts chronologically.
df = df.sort_values(by=['batter', 'date'], ascending=True)

In [None]:
# Preview the first rows of the sorted frame; as the cell's last expression it
# gets Jupyter's rich table display instead of a plain-text dump.
df.head()

In [None]:
#---------------Rolling Form Feature----------------------
# Mean of the batter's runs over the current row and the four rows before it.
# The first four rows of every batter are NaN because the window is not full.
# Dropping index level 0 (the batter key) restores the original row labels so
# the assignment lines up with `df`.
df['rolling_avg_5'] = (
    df.groupby('batter')['batsman_runs']
      .rolling(window=5)
      .mean()
      .reset_index(level=0, drop=True)
)

In [None]:
# Preview the first rows with the new rolling_avg_5 column (rich display,
# bounded output) rather than printing the whole frame.
df.head()

In [None]:
#---------------Venue Average Feature---------------------
# Batter's average runs at this venue, using only rows that come *before* the
# current one in the (batter, date)-sorted frame.
# transform() keeps one value per input row (no row reduction).  A plain
# transform('mean') would average over every row of the group, including
# future ones, and leak information into the features; the expanding mean
# shifted by one row inside each (batter, venue) group avoids that.
# The first row of each group has no history and is NaN.
df['venue_avg'] = df.groupby(['batter','venue'])['batsman_runs']\
                    .transform(lambda runs: runs.expanding().mean().shift(1))

In [None]:
# Preview the first rows with the new venue_avg column (rich display, bounded
# output) rather than printing the whole frame.
df.head()

In [None]:
#----------------- Player vs Team (Bowling) Feature------------------------------
# Batter's average runs against this bowling team over *earlier* rows only.
# A plain transform('mean') would also look at future rows, so an expanding
# mean shifted by one row is used instead.
# The shift must run inside each (batter, bowling_team) group: shifting the
# concatenated groupby result would give the first row of every group the last
# value of the previous group.  transform() applies the lambda per group and
# returns a Series on df's own index, so no reset_index is needed.
# The first row of each group has no history and is NaN.
df['pvt_avg'] = df.groupby(['batter','bowling_team'])['batsman_runs'] \
                  .transform(lambda runs: runs.expanding().mean().shift(1))

In [None]:
# Preview the first rows with the new pvt_avg column (rich display, bounded
# output) rather than printing the whole frame.
df.head()

In [None]:
#---------------------------Player vs Player -------------------------
# Simple variant kept for exploration only: the mean over ALL rows of each
# (batter, bowler) pair, future rows included.  The later pvp_avg assignment
# further down overwrites this column, so this value is not used.
df['pvp_avg'] = (
    df.groupby(['batter', 'bowler'])['batsman_runs']
      .transform('mean')
)

In [None]:
# Preview the first rows with the exploratory pvp_avg column (rich display,
# bounded output) rather than printing the whole frame.
df.head()

In [None]:
# Leakage-free player-vs-player average (this is the version that is used):
# mean of the batter's runs against this bowler over earlier rows only.
# The shift is done inside each (batter, bowler) group; shifting after the
# groups are concatenated would carry the previous pair's last value into the
# first row of the next pair.  transform() returns a Series on df's own index.
# The first row of each pair has no history and is NaN.
df['pvp_avg']=df.groupby(['batter','bowler'])['batsman_runs']\
                 .transform(lambda runs: runs.expanding().mean().shift(1))

In [None]:
# Preview the first rows with the leakage-free pvp_avg column (rich display,
# bounded output) rather than printing the whole frame.
df.head()

In [None]:
#--------------------------- Career Average ------------------------------
# Batter's average runs over all earlier rows (expanding mean, shifted by one
# so the current row is excluded).  transform('mean') is not used because it
# would include future rows.
# Two problems of the groupby(...).expanding().mean().shift(1).reset_index()
# form are avoided by doing the work inside transform():
#   * the shift stays within each batter instead of crossing group boundaries;
#   * with a single group key the expanding result has only two index levels
#     (batter, original row label), so reset_index(level=[0,1], drop=True)
#     discarded the row labels as well and the assignment aligned values to the
#     wrong rows.  transform() returns a Series on df's own index.
# The first row of each batter has no history and is NaN.
df['career_avg'] = df.groupby('batter')['batsman_runs']\
                  .transform(lambda runs: runs.expanding().mean().shift(1))

In [None]:
# Preview the first rows with the new career_avg column (rich display, bounded
# output) rather than printing the whole frame.
df.head()

In [None]:
#--------------------------- Create Target Table -------------------------
# Target: the batsman_runs value of the batter's following row (shift by -1
# within each batter).  Afterwards every row containing any NaN is dropped,
# which removes each batter's last row as well as rows whose features are NaN.
# NOTE(review): rows are per batter/bowler/match, so the "next" row can belong
# to the same match against a different bowler -- confirm this is intended.
df = df.assign(
    next_match_runs=df.groupby('batter')['batsman_runs'].shift(periods=-1)
).dropna(axis=0, how='any')

In [None]:
# Preview the first rows with the next_match_runs target (rich display,
# bounded output) rather than printing the whole frame.
df.head()

In [None]:
# Target: the is_wicket value of the bowler's following row.
# `df` is ordered by (batter, date), so inside a bowler group the rows are NOT
# in date order; shifting there would pick an arbitrary neighbour.  Sort a
# temporary view by (bowler, date) first -- the result keeps df's row labels,
# so the assignment aligns back to the right rows.  kind='stable' keeps ties
# on the same date in their existing order.
# NOTE(review): is_wicket was summed per batter/bowler/match row, so this is
# the wicket count of the bowler's next row, not of the next whole match.
df['next_match_wickets'] = (
    df.sort_values(['bowler', 'date'], kind='stable')
      .groupby('bowler')['is_wicket']
      .shift(-1)
)
# Drop every row that still contains a NaN (e.g. each bowler's last row).
df = df.dropna()

In [None]:
# Preview the first rows with the next_match_wickets target (rich display,
# bounded output) rather than printing the whole frame.
df.head()

In [None]:
#--------------------------- Feature Selection -----------------------
# Model inputs: the five engineered averages.
features = df.loc[:, ['rolling_avg_5', 'venue_avg', 'pvt_avg', 'pvp_avg', 'career_avg']]
# Model outputs: the two shifted "next row" targets.
labels = df.loc[:, ['next_match_runs', 'next_match_wickets']]

In [None]:
# Copy of df ordered by match_id with a fresh 0..n-1 index.
train_df = df.sort_values(by='match_id', ignore_index=True)

In [None]:
#--------------------------- Time Series Split -----------------------
# 80/20 hold-out on the match_id-ordered frame: the first 80% of rows train,
# the last 20% test.
# X/y are taken from `train_df` (ordered by match_id), not from `features` /
# `labels`, which were cut from the batter-sorted `df` -- slicing those would
# split by batter name rather than by match order.  `features.columns` and
# `labels.columns` are reused only for the column lists.
# NOTE(review): assumes match_id increases over time -- confirm.
train_df = train_df.sort_values('match_id')
split = int(len(train_df)*0.8)
X_train, X_test = train_df[features.columns].iloc[:split], train_df[features.columns].iloc[split:]
y_train, y_test = train_df[labels.columns].iloc[:split], train_df[labels.columns].iloc[split:]
# The original cell bound the test labels to the misspelled name `t_test`;
# keep it as an alias so any later code using that name still works.
t_test = y_test

In [None]:
# Positional 80/20 cut of train_df: leading 80% of rows -> train, rest -> test.
split_idx = int(0.8 * len(train_df))
train, test = train_df.iloc[:split_idx], train_df.iloc[split_idx:]

In [None]:
# Show the column names available in train_df.
print(list(train_df.columns))

In [None]:
# Derive the calendar year of each row from its date and use it as the season.
train_df['season'] = pd.to_datetime(train_df['date']).dt.year

# Season-based hold-out: everything before 2023 trains, 2023 is the test set.
train = train_df.loc[train_df['season'].lt(2023)]
test = train_df.loc[train_df['season'].eq(2023)]

In [None]:
from sklearn.model_selection import TimeSeriesSplit
# Expanding-window cross-validation over train_df's current row order.
# NOTE(review): TimeSeriesSplit splits by row position, so train_df is assumed
# to be in chronological order here -- confirm.
tscv = TimeSeriesSplit(n_splits=5)
# Keep every (train, test) pair; the loop used to overwrite `train`/`test` on
# each pass, so only the final fold survived and the earlier ones were lost.
cv_folds = []
for train_idx, test_idx in tscv.split(train_df):
    train = train_df.iloc[train_idx]
    test = train_df.iloc[test_idx]
    cv_folds.append((train, test))
# After the loop `train` / `test` still hold the last fold, as before.

In [None]:
#-------------------------  Feature Pipeline -------------------------
# Single-step pipeline that standardises the feature columns; it is fitted on
# the training features only and then persisted with joblib.
pipeline = Pipeline([('scaler', StandardScaler())])

pipeline.fit(X_train)
# Project-relative path, matching how the input CSV is read, instead of an
# absolute path under one user's home directory.  Run from the project root.
joblib.dump(pipeline, "scripts/feature_pipeline.pkl")

In [None]:
#------------------------  Save Final dataset -------------------------
# Keep the identifying columns, the five engineered features and both targets.
final_df = df[['batter','date','venue','bowling_team',
                  'rolling_avg_5','venue_avg','pvt_avg','pvp_avg','career_avg','next_match_runs','next_match_wickets']]
# Write without the index.  The previous literal ended in "dataset.csv " (with
# a trailing space inside the quotes), which produces a differently named file
# on platforms that keep trailing spaces; it was also an absolute path under
# one user's home directory.  Use a project-relative path like the input CSV.
final_df.to_csv("data/processed/dataset.csv", index=False)

In [None]:
# Sanity check of the saved table: first five rows, then (rows, columns).
print(final_df.head(n=5))
print(final_df.shape)