In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns

In [2]:
deliveries = pd.read_csv(r'C:\Users\soura\OneDrive\Desktop\Projects\IPL_Prediction\data\processed\cleaned_deliveries.csv')
matches = pd.read_csv(r'C:\Users\soura\OneDrive\Desktop\Projects\IPL_Prediction\data\processed\cleaned_matches.csv')

In [3]:
deliveries.head()

Unnamed: 0,match_id,inning,batting_team,bowling_team,over,ball,batter,bowler,non_striker,batsman_runs,extra_runs,total_runs,is_wicket
0,335982,1,kolkata knight riders,royal challengers bangalore,0,1,SC Ganguly,P Kumar,BB McCullum,0,1,1,0
1,335982,1,kolkata knight riders,royal challengers bangalore,0,2,BB McCullum,P Kumar,SC Ganguly,0,0,0,0
2,335982,1,kolkata knight riders,royal challengers bangalore,0,3,BB McCullum,P Kumar,SC Ganguly,0,1,1,0
3,335982,1,kolkata knight riders,royal challengers bangalore,0,4,BB McCullum,P Kumar,SC Ganguly,0,0,0,0
4,335982,1,kolkata knight riders,royal challengers bangalore,0,5,BB McCullum,P Kumar,SC Ganguly,0,0,0,0


In [4]:
matches.head()

Unnamed: 0,id,season,city,date,match_type,player_of_match,venue,team1,team2,toss_winner,toss_decision,winner,result,result_margin
0,335982,2007/08,Bangalore,2008-04-18,League,BB McCullum,M. Chinnaswamy Stadium,royal challengers bangalore,kolkata knight riders,Royal Challengers Bangalore,field,Kolkata Knight Riders,runs,140.0
1,335983,2007/08,Chandigarh,2008-04-19,League,MEK Hussey,Punjab Cricket Association Stadium,kings xi punjab,chennai super kings,Chennai Super Kings,bat,Chennai Super Kings,runs,33.0
2,335984,2007/08,Delhi,2008-04-19,League,MF Maharoof,Arun Jaitley Stadium,delhi daredevils,rajasthan royals,Rajasthan Royals,bat,Delhi Daredevils,wickets,9.0
3,335985,2007/08,Mumbai,2008-04-20,League,MV Boucher,Wankhede Stadium,mumbai indians,royal challengers bangalore,Mumbai Indians,bat,Royal Challengers Bangalore,wickets,5.0
4,335986,2007/08,Kolkata,2008-04-20,League,DJ Hussey,Eden Gardens,kolkata knight riders,deccan chargers,Deccan Chargers,bat,Kolkata Knight Riders,wickets,5.0


## Feature: Batting Strength

A measure of a team's batting capability based on player statistics.

Approach:

Calculate the sum of the strike rates and average runs per match of all batters in a team.

Normalize this value for consistency.

In [5]:
# Calculate individual batter strike rates
batsman_stats = deliveries.groupby("batter").agg({"batsman_runs": "sum", "ball": "count"})
batsman_stats["strike_rate"] = (batsman_stats["batsman_runs"] / batsman_stats["ball"]) * 100

# Aggregate batting strength per team
batting_strength = deliveries.groupby("batting_team")["batter"].apply(lambda x: x.map(batsman_stats["strike_rate"]).sum())
deliveries["batting_strength"] = deliveries["batting_team"].map(batting_strength)


## Feature: Bowling Strength

A measure of how strong a team’s bowling unit is.

Approach:

Calculate average economy rate for each bowler.

Sum up a team's wicket-taking ability.


In [6]:
# Calculate individual bowler economy rate and wickets
bowler_stats = deliveries.groupby("bowler").agg({"total_runs": "sum", "ball": "count", "is_wicket": "sum"})
bowler_stats["economy_rate"] = (bowler_stats["total_runs"] / (bowler_stats["ball"] / 6))

# Aggregate bowling strength per team (lower economy and more wickets are better)
bowling_strength = deliveries.groupby("bowling_team")["bowler"].apply(lambda x: x.map(bowler_stats["is_wicket"]).sum())
deliveries["bowling_strength"] = deliveries["bowling_team"].map(bowling_strength)


## Feature: Match Context

The situational importance of the match, such as powerplay, death overs, and match phase.

Approach:

Define Powerplay (1-6 overs), Middle Overs (7-15), and Death Overs (16-20).

Assign match phases based on the over.

In [7]:
# Assign match context based on overs
def assign_match_phase(over):
    if over <= 6:
        return "Powerplay"
    elif 7 <= over <= 15:
        return "Middle Overs"
    else:
        return "Death Overs"

deliveries["match_phase"] = deliveries["over"].apply(assign_match_phase)


### Cumulative Runs: Total runs scored till that ball

In [8]:
# Aggregate inning data to compute cumulative features
deliveries['cumulative_runs'] = deliveries.groupby(['match_id', 'inning'])['total_runs'].cumsum()

### Wickets fallen : Wickets lost till that ball

In [9]:
deliveries['wickets_fallen'] = deliveries.groupby(['match_id', 'inning'])['is_wicket'].cumsum()

### Remaining Overs: 20 - current over

In [10]:
# Calculate remaining overs and balls
deliveries["remaining_overs"] = 20 - deliveries["over"]
deliveries["remaining_balls"] = (20 * 6) - (deliveries["over"] * 6 + deliveries["ball"])

### Run Rate: Current run rate = total_runs / overs

In [11]:
# Current run rate (RR)
deliveries["run_rate"] = deliveries["cumulative_runs"] / (deliveries["over"] + 1)

In [12]:
# Merge deliveries with matches to get season, city, venue, etc.
combined_data = deliveries.merge(matches, left_on="match_id", right_on="id", how="left")

In [13]:
combined_data.head()

Unnamed: 0,match_id,inning,batting_team,bowling_team,over,ball,batter,bowler,non_striker,batsman_runs,...,match_type,player_of_match,venue,team1,team2,toss_winner,toss_decision,winner,result,result_margin
0,335982,1,kolkata knight riders,royal challengers bangalore,0,1,SC Ganguly,P Kumar,BB McCullum,0,...,League,BB McCullum,M. Chinnaswamy Stadium,royal challengers bangalore,kolkata knight riders,Royal Challengers Bangalore,field,Kolkata Knight Riders,runs,140.0
1,335982,1,kolkata knight riders,royal challengers bangalore,0,2,BB McCullum,P Kumar,SC Ganguly,0,...,League,BB McCullum,M. Chinnaswamy Stadium,royal challengers bangalore,kolkata knight riders,Royal Challengers Bangalore,field,Kolkata Knight Riders,runs,140.0
2,335982,1,kolkata knight riders,royal challengers bangalore,0,3,BB McCullum,P Kumar,SC Ganguly,0,...,League,BB McCullum,M. Chinnaswamy Stadium,royal challengers bangalore,kolkata knight riders,Royal Challengers Bangalore,field,Kolkata Knight Riders,runs,140.0
3,335982,1,kolkata knight riders,royal challengers bangalore,0,4,BB McCullum,P Kumar,SC Ganguly,0,...,League,BB McCullum,M. Chinnaswamy Stadium,royal challengers bangalore,kolkata knight riders,Royal Challengers Bangalore,field,Kolkata Knight Riders,runs,140.0
4,335982,1,kolkata knight riders,royal challengers bangalore,0,5,BB McCullum,P Kumar,SC Ganguly,0,...,League,BB McCullum,M. Chinnaswamy Stadium,royal challengers bangalore,kolkata knight riders,Royal Challengers Bangalore,field,Kolkata Knight Riders,runs,140.0


In [14]:
combined_data.columns

Index(['match_id', 'inning', 'batting_team', 'bowling_team', 'over', 'ball',
       'batter', 'bowler', 'non_striker', 'batsman_runs', 'extra_runs',
       'total_runs', 'is_wicket', 'batting_strength', 'bowling_strength',
       'match_phase', 'cumulative_runs', 'wickets_fallen', 'remaining_overs',
       'remaining_balls', 'run_rate', 'id', 'season', 'city', 'date',
       'match_type', 'player_of_match', 'venue', 'team1', 'team2',
       'toss_winner', 'toss_decision', 'winner', 'result', 'result_margin'],
      dtype='object')

In [17]:
combined_data.head()

Unnamed: 0,match_id,inning,batting_team,bowling_team,over,ball,batter,bowler,non_striker,batsman_runs,...,match_type,player_of_match,venue,team1,team2,toss_winner,toss_decision,winner,result,result_margin
0,335982,1,kolkata knight riders,royal challengers bangalore,0,1,SC Ganguly,P Kumar,BB McCullum,0,...,League,BB McCullum,M. Chinnaswamy Stadium,royal challengers bangalore,kolkata knight riders,Royal Challengers Bangalore,field,Kolkata Knight Riders,runs,140.0
1,335982,1,kolkata knight riders,royal challengers bangalore,0,2,BB McCullum,P Kumar,SC Ganguly,0,...,League,BB McCullum,M. Chinnaswamy Stadium,royal challengers bangalore,kolkata knight riders,Royal Challengers Bangalore,field,Kolkata Knight Riders,runs,140.0
2,335982,1,kolkata knight riders,royal challengers bangalore,0,3,BB McCullum,P Kumar,SC Ganguly,0,...,League,BB McCullum,M. Chinnaswamy Stadium,royal challengers bangalore,kolkata knight riders,Royal Challengers Bangalore,field,Kolkata Knight Riders,runs,140.0
3,335982,1,kolkata knight riders,royal challengers bangalore,0,4,BB McCullum,P Kumar,SC Ganguly,0,...,League,BB McCullum,M. Chinnaswamy Stadium,royal challengers bangalore,kolkata knight riders,Royal Challengers Bangalore,field,Kolkata Knight Riders,runs,140.0
4,335982,1,kolkata knight riders,royal challengers bangalore,0,5,BB McCullum,P Kumar,SC Ganguly,0,...,League,BB McCullum,M. Chinnaswamy Stadium,royal challengers bangalore,kolkata knight riders,Royal Challengers Bangalore,field,Kolkata Knight Riders,runs,140.0


In [18]:
combined_data.shape

(260920, 35)

In [19]:
combined_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 260920 entries, 0 to 260919
Data columns (total 35 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   match_id          260920 non-null  int64  
 1   inning            260920 non-null  int64  
 2   batting_team      260920 non-null  object 
 3   bowling_team      260920 non-null  object 
 4   over              260920 non-null  int64  
 5   ball              260920 non-null  int64  
 6   batter            260920 non-null  object 
 7   bowler            260920 non-null  object 
 8   non_striker       260920 non-null  object 
 9   batsman_runs      260920 non-null  int64  
 10  extra_runs        260920 non-null  int64  
 11  total_runs        260920 non-null  int64  
 12  is_wicket         260920 non-null  int64  
 13  batting_strength  260920 non-null  float64
 14  bowling_strength  260920 non-null  int64  
 15  match_phase       260920 non-null  object 
 16  cumulative_runs   26

In [20]:
combined_data.describe()

Unnamed: 0,match_id,inning,over,ball,batsman_runs,extra_runs,total_runs,is_wicket,batting_strength,bowling_strength,cumulative_runs,wickets_fallen,remaining_overs,remaining_balls,run_rate,id,result_margin
count,260920.0,260920.0,260920.0,260920.0,260920.0,260920.0,260920.0,260920.0,260920.0,260920.0,260920.0,260920.0,260920.0,260920.0,260920.0,260920.0,260920.0
mean,907066.5,1.483531,9.197677,3.624486,1.265001,0.067806,1.332807,0.049632,2847180.0,1929097.0,76.2142,2.457098,10.802323,61.189449,7.019484,907066.5,17.006339
std,367991.3,0.502643,5.683484,1.81492,1.639298,0.343265,1.626416,0.217184,1123483.0,872181.8,49.336267,2.097949,5.683484,34.146916,2.211554,367991.3,21.537025
min,335982.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,186331.1,77170.0,0.0,0.0,1.0,-4.0,0.0,335982.0,0.0
25%,548334.0,1.0,4.0,2.0,0.0,0.0,0.0,0.0,2354401.0,1481000.0,35.0,1.0,6.0,32.0,5.894737,548334.0,5.0
50%,980967.0,1.0,9.0,4.0,1.0,0.0,1.0,0.0,3300786.0,2040585.0,72.0,2.0,11.0,62.0,7.153846,980967.0,8.0
75%,1254066.0,2.0,14.0,5.0,1.0,0.0,1.0,0.0,3665110.0,2700823.0,112.0,4.0,16.0,91.0,8.35,1254066.0,20.0
max,1426312.0,6.0,19.0,11.0,6.0,7.0,7.0,1.0,4012460.0,3076403.0,287.0,10.0,20.0,119.0,27.0,1426312.0,146.0


In [21]:
combined_data.describe(include='object')

Unnamed: 0,batting_team,bowling_team,batter,bowler,non_striker,match_phase,season,city,date,match_type,player_of_match,venue,team1,team2,toss_winner,toss_decision,winner,result
count,260920,260920,260920,260920,260920,260920,260920,260920,260920,260920,260920,260920,260920,260920,260920,260920,260920,260920
unique,19,19,673,530,663,3,17,34,823,8,292,38,19,19,19,2,20,4
top,mumbai indians,mumbai indians,V Kohli,R Ashwin,V Kohli,Middle Overs,2013,Mumbai,2020-10-18,League,AB de Villiers,Wankhede Stadium,royal challengers bangalore,mumbai indians,Mumbai Indians,field,Mumbai Indians,wickets
freq,31437,31505,6236,4679,6067,118979,18177,41742,520,245098,6170,28506,31649,33330,34357,167471,34629,135296


In [22]:
combined_data.to_csv(r'C:\Users\soura\OneDrive\Desktop\Projects\IPL_Prediction\data\processed\combined_data.csv', index=False)