In [42]:
import numpy as np
import pandas as pd

In [48]:
# Reading the file
data = pd.read_csv('/content/deliveries.csv')

In [49]:
data.head()

Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,non_striker,...,extras,wides,noballs,byes,legbyes,penalty,wicket_type,player_dismissed,other_wicket_type,other_player_dismissed
0,1,2023/24,2023-10-05,"Narendra Modi Stadium, Ahmedabad",1,0.1,England,New Zealand,JM Bairstow,DJ Malan,...,0,,,,,,,,,
1,1,2023/24,2023-10-05,"Narendra Modi Stadium, Ahmedabad",1,0.2,England,New Zealand,JM Bairstow,DJ Malan,...,0,,,,,,,,,
2,1,2023/24,2023-10-05,"Narendra Modi Stadium, Ahmedabad",1,0.3,England,New Zealand,JM Bairstow,DJ Malan,...,0,,,,,,,,,
3,1,2023/24,2023-10-05,"Narendra Modi Stadium, Ahmedabad",1,0.4,England,New Zealand,DJ Malan,JM Bairstow,...,0,,,,,,,,,
4,1,2023/24,2023-10-05,"Narendra Modi Stadium, Ahmedabad",1,0.5,England,New Zealand,JM Bairstow,DJ Malan,...,0,,,,,,,,,


In [50]:
data.columns

Index(['match_id', 'season', 'start_date', 'venue', 'innings', 'ball',
       'batting_team', 'bowling_team', 'striker', 'non_striker', 'bowler',
       'runs_off_bat', 'extras', 'wides', 'noballs', 'byes', 'legbyes',
       'penalty', 'wicket_type', 'player_dismissed', 'other_wicket_type',
       'other_player_dismissed'],
      dtype='object')

In [51]:
# Taking only those columns which are important
data = data[["match_id","ball","batting_team","bowling_team","striker","bowler","runs_off_bat","wicket_type","player_dismissed"]]

In [53]:
data.shape

(17380, 9)

In [54]:
data.isna().sum()

match_id                0
ball                    0
batting_team            0
bowling_team            0
striker                 0
bowler                  0
runs_off_bat            0
wicket_type         16889
player_dismissed    16889
dtype: int64

In [55]:
data['wickets'] = 0

# Update wickets count for rows where player is dismissed and name matches striker
mask = (data['player_dismissed'].notnull()) & (data['player_dismissed'] == data['striker'])
data.loc[mask, 'wickets'] = 1

# Calculate cumulative wickets player-wise for all matches
data['cumulative_wickets'] = data.groupby('striker')['wickets'].cumsum()

# Fill NaN values with 0
data['cumulative_wickets'] = data['cumulative_wickets'].fillna(0).astype(int)

In [56]:
data

Unnamed: 0,match_id,ball,batting_team,bowling_team,striker,bowler,runs_off_bat,wicket_type,player_dismissed,wickets,cumulative_wickets
0,1,0.1,England,New Zealand,JM Bairstow,TA Boult,0,,,0,0
1,1,0.2,England,New Zealand,JM Bairstow,TA Boult,6,,,0,0
2,1,0.3,England,New Zealand,JM Bairstow,TA Boult,1,,,0,0
3,1,0.4,England,New Zealand,DJ Malan,TA Boult,1,,,0,0
4,1,0.5,England,New Zealand,JM Bairstow,TA Boult,4,,,0,0
...,...,...,...,...,...,...,...,...,...,...,...
17375,32,34.5,New Zealand,South Africa,MJ Henry,KA Maharaj,0,,,0,2
17376,32,34.6,New Zealand,South Africa,MJ Henry,KA Maharaj,0,,,0,2
17377,32,35.1,New Zealand,South Africa,GD Phillips,G Coetzee,0,,,0,4
17378,32,35.2,New Zealand,South Africa,GD Phillips,G Coetzee,6,,,0,4


In [59]:
# Finding the total runs for each bowler
data['cumulative_runs'] = data.groupby('striker')['runs_off_bat'].cumsum()

In [60]:
data.head(5)

Unnamed: 0,match_id,ball,batting_team,bowling_team,striker,bowler,runs_off_bat,wicket_type,player_dismissed,wickets,cumulative_wickets,cumulative_runs
0,1,0.1,England,New Zealand,JM Bairstow,TA Boult,0,,,0,0,0
1,1,0.2,England,New Zealand,JM Bairstow,TA Boult,6,,,0,0,6
2,1,0.3,England,New Zealand,JM Bairstow,TA Boult,1,,,0,0,7
3,1,0.4,England,New Zealand,DJ Malan,TA Boult,1,,,0,0,1
4,1,0.5,England,New Zealand,JM Bairstow,TA Boult,4,,,0,0,11


In [61]:
#data['balls_faced'] = data.groupby('striker').cumcount() + 1

In [62]:
# Getting the strike rate for each batsmen
data['batsman_strike_rate'] = (data['cumulative_runs'] / data['balls_faced']) * 100

In [63]:
# Finding cumulative wickets for each bowler
data['cumulative_wickets_bowler'] = data.groupby('bowler')['wickets'].cumsum()

# If you want to fill NaN values with 0
data['cumulative_wickets_bowler'].fillna(0, inplace=True)

In [64]:
data.head(5)

Unnamed: 0,match_id,ball,batting_team,bowling_team,striker,bowler,runs_off_bat,wicket_type,player_dismissed,wickets,cumulative_wickets,cumulative_runs,balls_faced,batsman_strike_rate,cumulative_wickets_bowler
0,1,0.1,England,New Zealand,JM Bairstow,TA Boult,0,,,0,0,0,1,0.0,0
1,1,0.2,England,New Zealand,JM Bairstow,TA Boult,6,,,0,0,6,2,300.0,0
2,1,0.3,England,New Zealand,JM Bairstow,TA Boult,1,,,0,0,7,3,233.333333,0
3,1,0.4,England,New Zealand,DJ Malan,TA Boult,1,,,0,0,1,1,100.0,0
4,1,0.5,England,New Zealand,JM Bairstow,TA Boult,4,,,0,0,11,4,275.0,0


In [65]:
#data['balls_bowled'] = data.groupby('bowler').cumcount() + 1

In [66]:
# Calculate Bowling Strike Rate
data['bowling_strike_rate'] = data['balls_bowled'] / data['cumulative_wickets_bowler'].where(data['cumulative_wickets_bowler'] != 0, 1)

# If you want to fill NaN values (for cases where a bowler hasn't taken any wickets yet)
data['bowling_strike_rate'].fillna(0, inplace=True)

In [67]:
data.head(5)

Unnamed: 0,match_id,ball,batting_team,bowling_team,striker,bowler,runs_off_bat,wicket_type,player_dismissed,wickets,cumulative_wickets,cumulative_runs,balls_faced,batsman_strike_rate,cumulative_wickets_bowler,balls_bowled,bowling_strike_rate
0,1,0.1,England,New Zealand,JM Bairstow,TA Boult,0,,,0,0,0,1,0.0,0,1,1.0
1,1,0.2,England,New Zealand,JM Bairstow,TA Boult,6,,,0,0,6,2,300.0,0,2,2.0
2,1,0.3,England,New Zealand,JM Bairstow,TA Boult,1,,,0,0,7,3,233.333333,0,3,3.0
3,1,0.4,England,New Zealand,DJ Malan,TA Boult,1,,,0,0,1,1,100.0,0,4,4.0
4,1,0.5,England,New Zealand,JM Bairstow,TA Boult,4,,,0,0,11,4,275.0,0,5,5.0


In [68]:
batsmen_data = data[['batting_team', 'striker', 'batsman_strike_rate']].copy()
bowlers_data = data[['bowling_team', 'bowler', 'bowling_strike_rate']].copy()

In [70]:
batsmen_data[batsmen_data['striker'] == 'V Kohli']

Unnamed: 0,batting_team,striker,batsman_strike_rate
2425,India,V Kohli,0.000000
2426,India,V Kohli,0.000000
2433,India,V Kohli,66.666667
2434,India,V Kohli,50.000000
2435,India,V Kohli,40.000000
...,...,...,...
15319,India,V Kohli,87.407407
15320,India,V Kohli,87.192118
15321,India,V Kohli,86.977887
15322,India,V Kohli,86.764706


In [71]:
# Calculate average batting strike rate for each batsman
batsmen_data['avg_batting_strike_rate'] = batsmen_data.groupby(['batting_team', 'striker'])['batsman_strike_rate'].transform('mean')
# Calculate average bowling strike rate for each bowler
bowlers_data['avg_bowling_strike_rate'] = bowlers_data.groupby(['bowling_team', 'bowler'])['bowling_strike_rate'].transform('mean')


In [72]:
batsmen_data = batsmen_data.drop_duplicates(subset=['batting_team', 'striker'])
bowlers_data = bowlers_data.drop_duplicates(subset=['bowling_team', 'bowler'])

In [73]:
batsmen_data[batsmen_data['striker'] == 'V Kohli']

Unnamed: 0,batting_team,striker,batsman_strike_rate,avg_batting_strike_rate
2425,India,V Kohli,0.0,75.105182


In [74]:
# Calculate rankings for batsmen and bowlers within each team
batsmen_data['batsman_rank'] = batsmen_data.groupby(['batting_team'])['avg_batting_strike_rate'].rank(ascending=False)
bowlers_data['bowler_rank'] = bowlers_data.groupby(['bowling_team'])['avg_bowling_strike_rate'].rank(ascending=True)

In [75]:
batsmen_data[batsmen_data['striker'] == 'V Kohli']

Unnamed: 0,batting_team,striker,batsman_strike_rate,avg_batting_strike_rate,batsman_rank
2425,India,V Kohli,0.0,75.105182,8.0


In [76]:
# Display the resulting DataFrames
print("Batsmen DataFrame:")
print(batsmen_data[batsmen_data['batting_team'] == 'India'])

print("\nBowlers DataFrame:")
print(bowlers_data)

Batsmen DataFrame:
      batting_team         striker  batsman_strike_rate  \
2420         India       RG Sharma                  0.0   
2424         India    Ishan Kishan                  0.0   
2425         India         V Kohli                  0.0   
2430         India         SS Iyer                  0.0   
2438         India        KL Rahul                  0.0   
2651         India       HH Pandya                100.0   
5829         India    Shubman Gill                400.0   
10830        India        SA Yadav                100.0   
10840        India       RA Jadeja                  0.0   
10926        India  Mohammed Shami                100.0   
15531        India       JJ Bumrah                  0.0   
15562        India   Kuldeep Yadav                  0.0   

       avg_batting_strike_rate  batsman_rank  
2420                126.600993           1.0  
2424                 56.864697          10.0  
2425                 75.105182           8.0  
2430                 82.2

In [77]:
batsmen_data.head(5)

Unnamed: 0,batting_team,striker,batsman_strike_rate,avg_batting_strike_rate,batsman_rank
0,England,JM Bairstow,0.0,97.038568,6.0
3,England,DJ Malan,100.0,94.382799,8.0
48,England,JE Root,0.0,90.415623,9.0
79,England,HC Brook,0.0,123.741021,2.0
105,England,MM Ali,100.0,73.814839,13.0


In [78]:
bowlers_data.head(5)

Unnamed: 0,bowling_team,bowler,bowling_strike_rate,avg_bowling_strike_rate,bowler_rank
0,New Zealand,TA Boult,1.0,49.349654,8.0
6,New Zealand,MJ Henry,1.0,23.808435,3.0
37,New Zealand,MJ Santner,1.0,25.294835,4.0
92,New Zealand,JDS Neesham,1.0,44.005618,7.0
98,New Zealand,R Ravindra,1.0,64.932359,9.0


In [79]:
batsmen_data[batsmen_data['striker'] == 'Shubman Gill']

Unnamed: 0,batting_team,striker,batsman_strike_rate,avg_batting_strike_rate,batsman_rank
5829,India,Shubman Gill,400.0,111.48871,3.0


In [80]:
batsmen_data[batsmen_data['batting_team'] == 'India']['striker'].value_counts()

RG Sharma         1
Ishan Kishan      1
V Kohli           1
SS Iyer           1
KL Rahul          1
HH Pandya         1
Shubman Gill      1
SA Yadav          1
RA Jadeja         1
Mohammed Shami    1
JJ Bumrah         1
Kuldeep Yadav     1
Name: striker, dtype: int64

In [81]:
squad_players = [
    "RG Sharma","Ishan Kishan","V Kohli","SS Iyer","KL Rahul","HH Pandya","Shubman Gill",
    "SA Yadav","RA Jadeja","Mohammed Shami","JJ Bumrah","Kuldeep Yadav"
]

# Filter batsmen_data for players in the squad
squad_batsmen = batsmen_data[batsmen_data['striker'].isin(squad_players)]

# Sort the squad batsmen by their batting rank
top_squad_batsmen = squad_batsmen.sort_values(by='batsman_rank').head(6)

# Display the list of top 6 batsmen from the squad
print("Top 6 Batsmen from the Squad:")
print(top_squad_batsmen[['striker', 'avg_batting_strike_rate', 'batsman_rank']])

Top 6 Batsmen from the Squad:
             striker  avg_batting_strike_rate  batsman_rank
2420       RG Sharma               126.600993           1.0
2651       HH Pandya               112.752976           2.0
5829    Shubman Gill               111.488710           3.0
10830       SA Yadav                89.656012           4.0
15562  Kuldeep Yadav                84.508843           5.0
2430         SS Iyer                82.255148           6.0


In [82]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Assuming you have a DataFrame named 'data' with relevant features
# (batting_team, striker, batsman_strike_rate, bowling_team, bowler, bowling_strike_rate)

# Separate data for batsmen and bowlers
batsmen_data = data[['batting_team', 'striker', 'batsman_strike_rate']].copy()
bowlers_data = data[['bowling_team', 'bowler', 'bowling_strike_rate']].copy()

# Calculate average batting strike rate for each batsman
batsmen_data['avg_batting_strike_rate'] = batsmen_data.groupby(['batting_team', 'striker'])['batsman_strike_rate'].transform('mean')

# Calculate average bowling strike rate for each bowler
bowlers_data['avg_bowling_strike_rate'] = bowlers_data.groupby(['bowling_team', 'bowler'])['bowling_strike_rate'].transform('mean')

batsmen_data = batsmen_data.drop_duplicates(subset=['batting_team', 'striker'])
bowlers_data = bowlers_data.drop_duplicates(subset=['bowling_team', 'bowler'])

# Use only relevant features for prediction
batsmen_features = ['avg_batting_strike_rate']
bowlers_features = ['avg_bowling_strike_rate']

# Separate squads for batsmen and bowlers
squad_players = [
    'RG Sharma', 'Shubman Gill', 'V Kohli',
    'SS Iyer', 'Ishan Kishan', 'KL Rahul', 'SA Yadav',
    'RA Jadeja', 'SN Thakur', 'JJ Bumrah',
    'Mohammed Shami', 'Mohammed Siraj', 'Kuldeep Yadav','R Ashwin','M Prasidh Krishna'
]

squad_batsmen = batsmen_data[batsmen_data['striker'].isin(squad_players)]
squad_bowlers = bowlers_data[bowlers_data['bowler'].isin(squad_players)]

# Train separate models for batsmen and bowlers
batsmen_model = LinearRegression()
bowlers_model = LinearRegression()

# Train batsmen model
X_batsmen = squad_batsmen[batsmen_features].values.reshape(-1, 1)
y_batsmen = squad_batsmen.groupby(['batting_team'])['avg_batting_strike_rate'].rank(ascending=False)

batsmen_model.fit(X_batsmen, y_batsmen)

# Train bowlers model
X_bowlers = squad_bowlers[bowlers_features].values.reshape(-1, 1)
y_bowlers = squad_bowlers.groupby(['bowling_team'])['avg_bowling_strike_rate'].rank(ascending=True)

bowlers_model.fit(X_bowlers, y_bowlers)

# Predict for squad players
# squad_data = pd.DataFrame({'player': squad_players})

# Predict batsmen ranks
squad_batsmen['predicted_batsmen_rank'] = batsmen_model.predict(squad_batsmen[batsmen_features].values.reshape(-1, 1))

# Predict bowlers ranks
squad_bowlers['predicted_bowlers_rank'] = bowlers_model.predict(squad_bowlers[bowlers_features].values.reshape(-1, 1))

# Rank squad players based on predicted performance
squad_batsmen['batsmen_rank'] = squad_batsmen['predicted_batsmen_rank'].rank()
squad_bowlers['bowlers_rank'] = squad_bowlers['predicted_bowlers_rank'].rank()

# Select top 6 batsmen and top 5 bowlers
top_batsmen = squad_batsmen.sort_values(by='batsmen_rank').head(6)
top_bowlers = squad_bowlers.sort_values(by='bowlers_rank').head(6)

# Display the results
print("Top 6 Batsmen from the Indian Squad:")
print(top_batsmen[['striker', 'predicted_batsmen_rank', 'batsmen_rank']])

print("Top 6 Bowlers from the Indian Squad:")
print(top_bowlers[['bowler', 'predicted_bowlers_rank', 'bowlers_rank']])

Top 6 Batsmen from the Indian Squad:
             striker  predicted_batsmen_rank  batsmen_rank
2420       RG Sharma                0.309691           1.0
5829    Shubman Gill                2.046457           2.0
10830       SA Yadav                4.555562           3.0
15562  Kuldeep Yadav                5.147096           4.0
2430         SS Iyer                5.406100           5.0
2438        KL Rahul                6.154404           6.0
Top 6 Bowlers from the Indian Squad:
               bowler  predicted_bowlers_rank  bowlers_rank
8163          V Kohli                0.331458           1.0
10377  Mohammed Shami                2.256736           2.0
2117        JJ Bumrah                3.827448           3.0
2159         R Ashwin                4.496925           4.0
2231        RA Jadeja                4.835498           5.0
2189    Kuldeep Yadav                5.638574           6.0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  squad_batsmen['predicted_batsmen_rank'] = batsmen_model.predict(squad_batsmen[batsmen_features].values.reshape(-1, 1))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  squad_bowlers['predicted_bowlers_rank'] = bowlers_model.predict(squad_bowlers[bowlers_features].values.reshape(-1, 1))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#

In [83]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Assuming you have a DataFrame named 'data' with relevant features
# (batting_team, striker, batsman_strike_rate, bowling_team, bowler, bowling_strike_rate)

# Separate data for batsmen and bowlers
batsmen_data = data[['batting_team', 'striker', 'batsman_strike_rate']].copy()
bowlers_data = data[['bowling_team', 'bowler', 'bowling_strike_rate']].copy()

# Calculate average batting strike rate for each batsman
batsmen_data['avg_batting_strike_rate'] = batsmen_data.groupby(['batting_team', 'striker'])['batsman_strike_rate'].transform('mean')

# Calculate average bowling strike rate for each bowler
bowlers_data['avg_bowling_strike_rate'] = bowlers_data.groupby(['bowling_team', 'bowler'])['bowling_strike_rate'].transform('mean')

batsmen_data = batsmen_data.drop_duplicates(subset=['batting_team', 'striker'])
bowlers_data = bowlers_data.drop_duplicates(subset=['bowling_team', 'bowler'])

# Use only relevant features for prediction
batsmen_features = ['avg_batting_strike_rate']
bowlers_features = ['avg_bowling_strike_rate']

# Separate squads for batsmen and bowlers
squad_players = [
    'T Bavuma', 'Q de Kock', 'RR Hendricks',
    'H Klaasen', 'AK Markram', 'DA Miller', 'HE van der Dussen',
    'M Jansen', 'AL Phehlukwayo', 'G Coetzee',
    'KA Maharaj', 'L Ngidi', 'K Rabada','T Shamsi','LB Williams'
]

squad_batsmen = batsmen_data[batsmen_data['striker'].isin(squad_players)]
squad_bowlers = bowlers_data[bowlers_data['bowler'].isin(squad_players)]

# Train separate models for batsmen and bowlers
batsmen_model = LinearRegression()
bowlers_model = LinearRegression()

# Train batsmen model
X_batsmen = squad_batsmen[batsmen_features].values.reshape(-1, 1)
y_batsmen = squad_batsmen.groupby(['batting_team'])['avg_batting_strike_rate'].rank(ascending=False)

batsmen_model.fit(X_batsmen, y_batsmen)

# Train bowlers model
X_bowlers = squad_bowlers[bowlers_features].values.reshape(-1, 1)
y_bowlers = squad_bowlers.groupby(['bowling_team'])['avg_bowling_strike_rate'].rank(ascending=True)

bowlers_model.fit(X_bowlers, y_bowlers)

# Predict for squad players
# squad_data = pd.DataFrame({'player': squad_players})

# Predict batsmen ranks
squad_batsmen['predicted_batsmen_rank'] = batsmen_model.predict(squad_batsmen[batsmen_features].values.reshape(-1, 1))

# Predict bowlers ranks
squad_bowlers['predicted_bowlers_rank'] = bowlers_model.predict(squad_bowlers[bowlers_features].values.reshape(-1, 1))

# Rank squad players based on predicted performance
squad_batsmen['batsmen_rank'] = squad_batsmen['predicted_batsmen_rank'].rank()
squad_bowlers['bowlers_rank'] = squad_bowlers['predicted_bowlers_rank'].rank()

# Select top 6 batsmen and top 5 bowlers
top_batsmen = squad_batsmen.sort_values(by='batsmen_rank').head(6)
top_bowlers = squad_bowlers.sort_values(by='bowlers_rank').head(8)

# Display the results
print("Top 6 Batsmen from the South African Squad:")
print(top_batsmen[['striker', 'predicted_batsmen_rank', 'batsmen_rank']])

print("Top 8 Bowlers from the South African Squad:")
print(top_bowlers[['bowler', 'predicted_bowlers_rank', 'bowlers_rank']])

Top 6 Batsmen from the South African Squad:
         striker  predicted_batsmen_rank  batsmen_rank
7529    K Rabada                2.822303           1.0
1754   H Klaasen                2.885709           2.0
1713  AK Markram                3.062619           3.0
7485   G Coetzee                4.233260           4.0
1792   DA Miller                4.330017           5.0
1523   Q de Kock                6.217146           6.0
Top 8 Bowlers from the South African Squad:
            bowler  predicted_bowlers_rank  bowlers_rank
13555   AK Markram                0.656941           1.0
1916     G Coetzee                2.704169           2.0
11981     T Shamsi                3.617579           3.0
1874      K Rabada                4.117922           4.0
1843      M Jansen                4.228259           5.0
12416  LB Williams                5.358770           6.0
1892    KA Maharaj                7.369238           7.0
1837       L Ngidi                7.947121           8.0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  squad_batsmen['predicted_batsmen_rank'] = batsmen_model.predict(squad_batsmen[batsmen_features].values.reshape(-1, 1))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  squad_bowlers['predicted_bowlers_rank'] = bowlers_model.predict(squad_bowlers[bowlers_features].values.reshape(-1, 1))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#