In [1]:
pip install scikit-learn


Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import joblib
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline


In [3]:
ipl = pd.read_csv(
    r"C:\Users\ADMIN\Desktop\ipl_score\data\cleaned\ipl_clean.csv",
    low_memory=False
)


In [4]:
print(ipl.columns.tolist())


['match_id', 'inning', 'batting_team', 'bowling_team', 'over', 'ball', 'batter', 'bowler', 'non_striker', 'batsman_runs', 'extra_runs', 'total_runs', 'extras_type', 'is_wicket', 'player_dismissed', 'dismissal_kind', 'fielder', 'id', 'season', 'venue', 'date', 'team1', 'team2']


In [5]:
#player-match aggregation

df = (ipl.groupby([
        'match_id', 'date', 'venue',
        'batting_team', 'bowling_team',
        'batter', 'bowler'
    ])
    .agg(
        runs_scored=('batsman_runs', 'sum'),
        wickets=('is_wicket', 'sum'),
        balls=('ball', 'count')
    )
    .reset_index()
     )

print(df)

       match_id        date                                     venue  \
0        335982  2008-04-18                     M Chinnaswamy Stadium   
1        335982  2008-04-18                     M Chinnaswamy Stadium   
2        335982  2008-04-18                     M Chinnaswamy Stadium   
3        335982  2008-04-18                     M Chinnaswamy Stadium   
4        335982  2008-04-18                     M Chinnaswamy Stadium   
...         ...         ...                                       ...   
54066   1426312  2024-05-26  MA Chidambaram Stadium, Chepauk, Chennai   
54067   1426312  2024-05-26  MA Chidambaram Stadium, Chepauk, Chennai   
54068   1426312  2024-05-26  MA Chidambaram Stadium, Chepauk, Chennai   
54069   1426312  2024-05-26  MA Chidambaram Stadium, Chepauk, Chennai   
54070   1426312  2024-05-26  MA Chidambaram Stadium, Chepauk, Chennai   

                batting_team                 bowling_team         batter  \
0      Kolkata Knight Riders  Royal Challengers

In [6]:
print(df.columns.tolist())


['match_id', 'date', 'venue', 'batting_team', 'bowling_team', 'batter', 'bowler', 'runs_scored', 'wickets', 'balls']


In [7]:
df = df.sort_values(['date'])

In [8]:
print(df)

       match_id        date                                     venue  \
0        335982  2008-04-18                     M Chinnaswamy Stadium   
26       335982  2008-04-18                     M Chinnaswamy Stadium   
27       335982  2008-04-18                     M Chinnaswamy Stadium   
28       335982  2008-04-18                     M Chinnaswamy Stadium   
29       335982  2008-04-18                     M Chinnaswamy Stadium   
...         ...         ...                                       ...   
54045   1426312  2024-05-26  MA Chidambaram Stadium, Chepauk, Chennai   
54046   1426312  2024-05-26  MA Chidambaram Stadium, Chepauk, Chennai   
54047   1426312  2024-05-26  MA Chidambaram Stadium, Chepauk, Chennai   
54039   1426312  2024-05-26  MA Chidambaram Stadium, Chepauk, Chennai   
54070   1426312  2024-05-26  MA Chidambaram Stadium, Chepauk, Chennai   

                      batting_team                 bowling_team       batter  \
0            Kolkata Knight Riders  Royal C

In [9]:
# rolling form feature

df['rolling_avg_5'] = ( df.groupby('batter')['runs_scored'].rolling(5)
    .mean().reset_index(level=0, drop=True))


In [10]:
print(df)

       match_id        date                                     venue  \
0        335982  2008-04-18                     M Chinnaswamy Stadium   
26       335982  2008-04-18                     M Chinnaswamy Stadium   
27       335982  2008-04-18                     M Chinnaswamy Stadium   
28       335982  2008-04-18                     M Chinnaswamy Stadium   
29       335982  2008-04-18                     M Chinnaswamy Stadium   
...         ...         ...                                       ...   
54045   1426312  2024-05-26  MA Chidambaram Stadium, Chepauk, Chennai   
54046   1426312  2024-05-26  MA Chidambaram Stadium, Chepauk, Chennai   
54047   1426312  2024-05-26  MA Chidambaram Stadium, Chepauk, Chennai   
54039   1426312  2024-05-26  MA Chidambaram Stadium, Chepauk, Chennai   
54070   1426312  2024-05-26  MA Chidambaram Stadium, Chepauk, Chennai   

                      batting_team                 bowling_team       batter  \
0            Kolkata Knight Riders  Royal C

In [11]:
# venue average feature
df['venue_avg'] = df.groupby(['batter','venue'])['runs_scored'].transform('mean')

In [12]:
print(df)

       match_id        date                                     venue  \
0        335982  2008-04-18                     M Chinnaswamy Stadium   
26       335982  2008-04-18                     M Chinnaswamy Stadium   
27       335982  2008-04-18                     M Chinnaswamy Stadium   
28       335982  2008-04-18                     M Chinnaswamy Stadium   
29       335982  2008-04-18                     M Chinnaswamy Stadium   
...         ...         ...                                       ...   
54045   1426312  2024-05-26  MA Chidambaram Stadium, Chepauk, Chennai   
54046   1426312  2024-05-26  MA Chidambaram Stadium, Chepauk, Chennai   
54047   1426312  2024-05-26  MA Chidambaram Stadium, Chepauk, Chennai   
54039   1426312  2024-05-26  MA Chidambaram Stadium, Chepauk, Chennai   
54070   1426312  2024-05-26  MA Chidambaram Stadium, Chepauk, Chennai   

                      batting_team                 bowling_team       batter  \
0            Kolkata Knight Riders  Royal C

In [13]:
#opponent pvt feature
df['pvt_avg'] = df.groupby(['batter','bowling_team'])['runs_scored'].expanding().mean().shift(1)\
                            .reset_index(level=[0,1],drop=True)

In [14]:
print(df)

       match_id        date                                     venue  \
0        335982  2008-04-18                     M Chinnaswamy Stadium   
26       335982  2008-04-18                     M Chinnaswamy Stadium   
27       335982  2008-04-18                     M Chinnaswamy Stadium   
28       335982  2008-04-18                     M Chinnaswamy Stadium   
29       335982  2008-04-18                     M Chinnaswamy Stadium   
...         ...         ...                                       ...   
54045   1426312  2024-05-26  MA Chidambaram Stadium, Chepauk, Chennai   
54046   1426312  2024-05-26  MA Chidambaram Stadium, Chepauk, Chennai   
54047   1426312  2024-05-26  MA Chidambaram Stadium, Chepauk, Chennai   
54039   1426312  2024-05-26  MA Chidambaram Stadium, Chepauk, Chennai   
54070   1426312  2024-05-26  MA Chidambaram Stadium, Chepauk, Chennai   

                      batting_team                 bowling_team       batter  \
0            Kolkata Knight Riders  Royal C

In [15]:
#pvp feature
df['pvp_avg'] = (df.groupby(['batter', 'bowler'])['runs_scored']
    .transform('mean'))


In [16]:
print(df)

       match_id        date                                     venue  \
0        335982  2008-04-18                     M Chinnaswamy Stadium   
26       335982  2008-04-18                     M Chinnaswamy Stadium   
27       335982  2008-04-18                     M Chinnaswamy Stadium   
28       335982  2008-04-18                     M Chinnaswamy Stadium   
29       335982  2008-04-18                     M Chinnaswamy Stadium   
...         ...         ...                                       ...   
54045   1426312  2024-05-26  MA Chidambaram Stadium, Chepauk, Chennai   
54046   1426312  2024-05-26  MA Chidambaram Stadium, Chepauk, Chennai   
54047   1426312  2024-05-26  MA Chidambaram Stadium, Chepauk, Chennai   
54039   1426312  2024-05-26  MA Chidambaram Stadium, Chepauk, Chennai   
54070   1426312  2024-05-26  MA Chidambaram Stadium, Chepauk, Chennai   

                      batting_team                 bowling_team       batter  \
0            Kolkata Knight Riders  Royal C

In [17]:
#career average
df['career_avg'] = df.groupby('batter')['runs_scored'].expanding().mean().shift(1)\
                            .reset_index(level=0,drop=True)

In [18]:
print(df)

       match_id        date                                     venue  \
0        335982  2008-04-18                     M Chinnaswamy Stadium   
26       335982  2008-04-18                     M Chinnaswamy Stadium   
27       335982  2008-04-18                     M Chinnaswamy Stadium   
28       335982  2008-04-18                     M Chinnaswamy Stadium   
29       335982  2008-04-18                     M Chinnaswamy Stadium   
...         ...         ...                                       ...   
54045   1426312  2024-05-26  MA Chidambaram Stadium, Chepauk, Chennai   
54046   1426312  2024-05-26  MA Chidambaram Stadium, Chepauk, Chennai   
54047   1426312  2024-05-26  MA Chidambaram Stadium, Chepauk, Chennai   
54039   1426312  2024-05-26  MA Chidambaram Stadium, Chepauk, Chennai   
54070   1426312  2024-05-26  MA Chidambaram Stadium, Chepauk, Chennai   

                      batting_team                 bowling_team       batter  \
0            Kolkata Knight Riders  Royal C

In [19]:
# Fill missing values (NaN) in the 'rolling_avg_5' column with the corresponding 'career_avg' values

df['rolling_avg_5'] = df['rolling_avg_5'].fillna(df['career_avg'].mean())
print(df)

       match_id        date                                     venue  \
0        335982  2008-04-18                     M Chinnaswamy Stadium   
26       335982  2008-04-18                     M Chinnaswamy Stadium   
27       335982  2008-04-18                     M Chinnaswamy Stadium   
28       335982  2008-04-18                     M Chinnaswamy Stadium   
29       335982  2008-04-18                     M Chinnaswamy Stadium   
...         ...         ...                                       ...   
54045   1426312  2024-05-26  MA Chidambaram Stadium, Chepauk, Chennai   
54046   1426312  2024-05-26  MA Chidambaram Stadium, Chepauk, Chennai   
54047   1426312  2024-05-26  MA Chidambaram Stadium, Chepauk, Chennai   
54039   1426312  2024-05-26  MA Chidambaram Stadium, Chepauk, Chennai   
54070   1426312  2024-05-26  MA Chidambaram Stadium, Chepauk, Chennai   

                      batting_team                 bowling_team       batter  \
0            Kolkata Knight Riders  Royal C

In [20]:
#create target label
df['next_match_runs'] = df.groupby('batter')['runs_scored'].shift(-1)

df= df.dropna()    
    

In [21]:
print(df)

       match_id        date                                     venue  \
0        335982  2008-04-18                     M Chinnaswamy Stadium   
26       335982  2008-04-18                     M Chinnaswamy Stadium   
27       335982  2008-04-18                     M Chinnaswamy Stadium   
28       335982  2008-04-18                     M Chinnaswamy Stadium   
29       335982  2008-04-18                     M Chinnaswamy Stadium   
...         ...         ...                                       ...   
54042   1426312  2024-05-26  MA Chidambaram Stadium, Chepauk, Chennai   
54043   1426312  2024-05-26  MA Chidambaram Stadium, Chepauk, Chennai   
54044   1426312  2024-05-26  MA Chidambaram Stadium, Chepauk, Chennai   
54045   1426312  2024-05-26  MA Chidambaram Stadium, Chepauk, Chennai   
54046   1426312  2024-05-26  MA Chidambaram Stadium, Chepauk, Chennai   

                      batting_team                 bowling_team       batter  \
0            Kolkata Knight Riders  Royal C

In [22]:
#----Rolling average of wickets---
df['rolling_wkts_5'] = (
    df.groupby('bowler')['wickets']
      .rolling(5)
      .mean()
      .reset_index(level=0, drop=True)
)

df['rolling_wkts_5'] = df['rolling_wkts_5'].fillna(
    df['wickets'].mean()
)
print(df)

       match_id        date                                     venue  \
0        335982  2008-04-18                     M Chinnaswamy Stadium   
26       335982  2008-04-18                     M Chinnaswamy Stadium   
27       335982  2008-04-18                     M Chinnaswamy Stadium   
28       335982  2008-04-18                     M Chinnaswamy Stadium   
29       335982  2008-04-18                     M Chinnaswamy Stadium   
...         ...         ...                                       ...   
54042   1426312  2024-05-26  MA Chidambaram Stadium, Chepauk, Chennai   
54043   1426312  2024-05-26  MA Chidambaram Stadium, Chepauk, Chennai   
54044   1426312  2024-05-26  MA Chidambaram Stadium, Chepauk, Chennai   
54045   1426312  2024-05-26  MA Chidambaram Stadium, Chepauk, Chennai   
54046   1426312  2024-05-26  MA Chidambaram Stadium, Chepauk, Chennai   

                      batting_team                 bowling_team       batter  \
0            Kolkata Knight Riders  Royal C

In [23]:
#---------Venue wise bowling average------
df['bowler_venue_avg'] = (
    df.groupby(['bowler', 'venue'])['wickets']
      .transform('mean')
)
print(df)

       match_id        date                                     venue  \
0        335982  2008-04-18                     M Chinnaswamy Stadium   
26       335982  2008-04-18                     M Chinnaswamy Stadium   
27       335982  2008-04-18                     M Chinnaswamy Stadium   
28       335982  2008-04-18                     M Chinnaswamy Stadium   
29       335982  2008-04-18                     M Chinnaswamy Stadium   
...         ...         ...                                       ...   
54042   1426312  2024-05-26  MA Chidambaram Stadium, Chepauk, Chennai   
54043   1426312  2024-05-26  MA Chidambaram Stadium, Chepauk, Chennai   
54044   1426312  2024-05-26  MA Chidambaram Stadium, Chepauk, Chennai   
54045   1426312  2024-05-26  MA Chidambaram Stadium, Chepauk, Chennai   
54046   1426312  2024-05-26  MA Chidambaram Stadium, Chepauk, Chennai   

                      batting_team                 bowling_team       batter  \
0            Kolkata Knight Riders  Royal C

In [24]:
#----------Career average wickets------
df['bowler_career_avg'] = (
    df.groupby('bowler')['wickets']
      .expanding()
      .mean()
      .shift(1)
      .reset_index(level=0, drop=True)
)
print(df)

       match_id        date                                     venue  \
0        335982  2008-04-18                     M Chinnaswamy Stadium   
26       335982  2008-04-18                     M Chinnaswamy Stadium   
27       335982  2008-04-18                     M Chinnaswamy Stadium   
28       335982  2008-04-18                     M Chinnaswamy Stadium   
29       335982  2008-04-18                     M Chinnaswamy Stadium   
...         ...         ...                                       ...   
54042   1426312  2024-05-26  MA Chidambaram Stadium, Chepauk, Chennai   
54043   1426312  2024-05-26  MA Chidambaram Stadium, Chepauk, Chennai   
54044   1426312  2024-05-26  MA Chidambaram Stadium, Chepauk, Chennai   
54045   1426312  2024-05-26  MA Chidambaram Stadium, Chepauk, Chennai   
54046   1426312  2024-05-26  MA Chidambaram Stadium, Chepauk, Chennai   

                      batting_team                 bowling_team       batter  \
0            Kolkata Knight Riders  Royal C

In [25]:
#--------Bowler vs Batting Team (team Matchup)-------
df['bowler_vs_team_avg'] = (
    df.groupby(['bowler', 'batting_team'])['wickets']
      .expanding()
      .mean()
      .shift(1)
      .reset_index(level=[0,1], drop=True)
)
print(df)

       match_id        date                                     venue  \
0        335982  2008-04-18                     M Chinnaswamy Stadium   
26       335982  2008-04-18                     M Chinnaswamy Stadium   
27       335982  2008-04-18                     M Chinnaswamy Stadium   
28       335982  2008-04-18                     M Chinnaswamy Stadium   
29       335982  2008-04-18                     M Chinnaswamy Stadium   
...         ...         ...                                       ...   
54042   1426312  2024-05-26  MA Chidambaram Stadium, Chepauk, Chennai   
54043   1426312  2024-05-26  MA Chidambaram Stadium, Chepauk, Chennai   
54044   1426312  2024-05-26  MA Chidambaram Stadium, Chepauk, Chennai   
54045   1426312  2024-05-26  MA Chidambaram Stadium, Chepauk, Chennai   
54046   1426312  2024-05-26  MA Chidambaram Stadium, Chepauk, Chennai   

                      batting_team                 bowling_team       batter  \
0            Kolkata Knight Riders  Royal C

In [26]:
df['next_match_wicket'] = df.groupby('bowler')['wickets'].shift(-1)

df= df.dropna() 

In [27]:
print(df)

       match_id        date                                     venue  \
0        335982  2008-04-18                     M Chinnaswamy Stadium   
26       335982  2008-04-18                     M Chinnaswamy Stadium   
27       335982  2008-04-18                     M Chinnaswamy Stadium   
28       335982  2008-04-18                     M Chinnaswamy Stadium   
29       335982  2008-04-18                     M Chinnaswamy Stadium   
...         ...         ...                                       ...   
54068   1426312  2024-05-26  MA Chidambaram Stadium, Chepauk, Chennai   
54036   1426312  2024-05-26  MA Chidambaram Stadium, Chepauk, Chennai   
54031   1426312  2024-05-26  MA Chidambaram Stadium, Chepauk, Chennai   
54032   1426312  2024-05-26  MA Chidambaram Stadium, Chepauk, Chennai   
54033   1426312  2024-05-26  MA Chidambaram Stadium, Chepauk, Chennai   

                      batting_team                 bowling_team  \
0            Kolkata Knight Riders  Royal Challengers Ba

In [28]:
print(df.columns.tolist())


['match_id', 'date', 'venue', 'batting_team', 'bowling_team', 'batter', 'bowler', 'runs_scored', 'wickets', 'balls', 'rolling_avg_5', 'venue_avg', 'pvt_avg', 'pvp_avg', 'career_avg', 'next_match_runs', 'rolling_wkts_5', 'bowler_venue_avg', 'bowler_career_avg', 'bowler_vs_team_avg', 'next_match_wicket']


In [29]:
df = df.sort_values('match_id').reset_index(drop=True)


In [30]:
#feature selection
features_runs = df[['rolling_avg_5','venue_avg','pvt_avg','pvp_avg','career_avg']]
labels_runs   = df[['next_match_runs']]


In [31]:
features_wickets = df[['rolling_wkts_5','bowler_venue_avg','pvt_avg','pvp_avg','bowler_career_avg',
                      'bowler_vs_team_avg']]
labels_wickets   = df[['next_match_wicket']]


In [32]:
print("Features(Runs):")
print(features_runs.head(5))

Features(Runs):
   rolling_avg_5  venue_avg  pvt_avg    pvp_avg  career_avg
0        6.04888   8.077586      0.0  10.181818         0.0
1        6.04888   6.666667      0.0   1.000000        11.5
2        6.04888   6.666667      1.0   2.000000         1.0
3        6.04888   6.666667      1.5   3.000000         1.5
4        6.04888   6.333333      1.5   2.333333         1.5


In [33]:
print("Labels(Runs):")
print(labels_runs.head(5))

Labels(Runs):
   next_match_runs
0              2.0
1              2.0
2              3.0
3              3.0
4              6.0


In [34]:
print("Features(Wickets):")
print(features_wickets.head(5))

Features(Wickets):
   rolling_wkts_5  bowler_venue_avg  pvt_avg    pvp_avg  bowler_career_avg  \
0        0.237864          0.111111      0.0  10.181818           0.000000   
1        0.237864          0.300000      0.0   1.000000           0.333333   
2        0.400000          0.236842      1.0   2.000000           0.250000   
3        0.000000          0.111111      1.5   3.000000           0.000000   
4        0.237864          0.287037      1.5   2.333333           0.000000   

   bowler_vs_team_avg  
0            0.000000  
1            0.333333  
2            0.250000  
3            0.000000  
4            0.000000  


In [35]:
print("Labels(Wickets):")
print(labels_wickets.head(5))

Labels(Wickets):
   next_match_wicket
0                0.0
1                1.0
2                0.0
3                1.0
4                0.0


In [36]:
split = int(len(df) * 0.8)

# Runs
Xr_train, Xr_test = features_runs[:split], features_runs[split:]
yr_train, yr_test = labels_runs[:split], labels_runs[split:]

# Wickets
Xw_train, Xw_test = features_wickets[:split], features_wickets[split:]
yw_train, yw_test = labels_wickets[:split], labels_wickets[split:]


In [37]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler


In [38]:
pipeline_runs = Pipeline([
    ('scaler', StandardScaler())
])

Xr_train_scaled = pipeline_runs.fit_transform(Xr_train)
Xr_test_scaled  = pipeline_runs.transform(Xr_test)

joblib.dump(
    pipeline_runs,
    r"C:\Users\ADMIN\Desktop\ipl_score\scripts\pipeline_runs.pkl"
)


['C:\\Users\\ADMIN\\Desktop\\ipl_score\\scripts\\pipeline_runs.pkl']

In [39]:
pipeline_wickets = Pipeline([
    ('scaler', StandardScaler())
])

Xw_train_scaled = pipeline_wickets.fit_transform(Xw_train)
Xw_test_scaled  = pipeline_wickets.transform(Xw_test)

joblib.dump(
    pipeline_wickets,
    r"C:\Users\ADMIN\Desktop\ipl_score\scripts\pipeline_wickets.pkl"
)


['C:\\Users\\ADMIN\\Desktop\\ipl_score\\scripts\\pipeline_wickets.pkl']

In [42]:
final_df = df[
    ['batter','date','venue','bowling_team',
     'rolling_avg_5','venue_avg','pvt_avg','pvp_avg','career_avg',
     'next_match_runs','rolling_wkts_5','bowler_venue_avg','bowler_career_avg',
     'bowler_vs_team_avg','next_match_wicket']
]

final_df.to_csv(
    r"C:\Users\ADMIN\Desktop\ipl_score\data\processed\dataset.csv",
    index=False
)


In [41]:
print(final_df.head())
print(final_df.shape)


      batter        date                  venue                 bowling_team  \
0    V Kohli  2008-04-18  M Chinnaswamy Stadium        Kolkata Knight Riders   
1   W Jaffer  2008-04-18  M Chinnaswamy Stadium        Kolkata Knight Riders   
2   W Jaffer  2008-04-18  M Chinnaswamy Stadium        Kolkata Knight Riders   
3   W Jaffer  2008-04-18  M Chinnaswamy Stadium        Kolkata Knight Riders   
4  DJ Hussey  2008-04-18  M Chinnaswamy Stadium  Royal Challengers Bangalore   

   rolling_avg_5  venue_avg  pvt_avg    pvp_avg  career_avg  next_match_runs  \
0        6.04888   8.077586      0.0  10.181818         0.0              2.0   
1        6.04888   6.666667      0.0   1.000000        11.5              2.0   
2        6.04888   6.666667      1.0   2.000000         1.0              3.0   
3        6.04888   6.666667      1.5   3.000000         1.5              3.0   
4        6.04888   6.333333      1.5   2.333333         1.5              6.0   

   rolling_wkts_5  bowler_venue_avg  p