In [34]:
import pandas as pd

In [35]:
import pandas as pd

# Read the raw ball-by-ball export into a DataFrame.
# Point `input_csv` at wherever the file lives on your machine.
input_csv = 'ipl_ball_by_ball.csv'
df = pd.read_csv(filepath_or_buffer=input_csv)

In [36]:
# Columns to drop
columns_to_drop = [
    'match_id', 'batter', 'bowler', 'non_striker', 'runs_batter', 'date',
    'toss_winner', 'toss_decision', 'extras_wides', 'extras_legbyes',
    'extras_byes', 'extras_noballs', 'extras_penalty', 'player_out', 'fielder'
]

In [37]:
df = df.drop(columns=columns_to_drop)

In [38]:
df['balls_bowled'] = df['over'] * 6 + df['ball']

In [39]:
df = df.sort_values(['batting_team', 'bowling_team', 'venue', 'inning', 'balls_bowled'])

In [40]:
df.head()

Unnamed: 0,inning,batting_team,bowling_team,over,ball,runs_extras,runs_total,venue,wicket_kind,balls_bowled
145145,1,Chennai Super Kings,Deccan Chargers,0,1,0,0,Buffalo Park,,1
145146,1,Chennai Super Kings,Deccan Chargers,0,2,0,1,Buffalo Park,,2
145147,1,Chennai Super Kings,Deccan Chargers,0,3,0,0,Buffalo Park,,3
145148,1,Chennai Super Kings,Deccan Chargers,0,4,0,4,Buffalo Park,,4
145149,1,Chennai Super Kings,Deccan Chargers,0,5,0,1,Buffalo Park,,5


In [41]:
df.shape

(262148, 10)

In [42]:
# Calculate cumulative runs
df['current_runs'] = df.groupby(['batting_team', 'bowling_team', 'venue', 'inning'])['runs_total'].cumsum()

In [43]:
# Calculate cumulative wickets: Convert wicket_kind to binary (1 for wicket, 0 for no wicket) then cumsum
df['wicket_occurred'] = df['wicket_kind'].notna().astype(int)  # 1 if wicket_kind is not NaN, 0 if NaN
df['current_wickets'] = df.groupby(['batting_team', 'bowling_team', 'venue', 'inning'])['wicket_occurred'].cumsum()

In [44]:
df = df.drop(columns=['wicket_occurred'])

In [45]:
df.head()

Unnamed: 0,inning,batting_team,bowling_team,over,ball,runs_extras,runs_total,venue,wicket_kind,balls_bowled,current_runs,current_wickets
145145,1,Chennai Super Kings,Deccan Chargers,0,1,0,0,Buffalo Park,,1,0,0
145146,1,Chennai Super Kings,Deccan Chargers,0,2,0,1,Buffalo Park,,2,1,0
145147,1,Chennai Super Kings,Deccan Chargers,0,3,0,0,Buffalo Park,,3,1,0
145148,1,Chennai Super Kings,Deccan Chargers,0,4,0,4,Buffalo Park,,4,5,0
145149,1,Chennai Super Kings,Deccan Chargers,0,5,0,1,Buffalo Park,,5,6,0


In [46]:
# Calculate final score per inning
final_scores = df.groupby(['batting_team', 'bowling_team', 'venue', 'inning'])['runs_total'].sum().reset_index()
final_scores = final_scores.rename(columns={'runs_total': 'final_score'})

In [47]:
# Merge final_score back into the main dataframe
df = df.merge(final_scores, on=['batting_team', 'bowling_team', 'venue', 'inning'], how='left')


In [48]:
# Column order for the cleaned table: match context first, then the
# per-delivery fields, then the running totals and the innings total.
final_columns = [
    'batting_team',
    'bowling_team',
    'venue',
    'inning',
    'over',
    'ball',
    'balls_bowled',
    'runs_total',
    'wicket_kind',
    'current_runs',
    'current_wickets',
    'final_score',
]
# Keep only the listed columns, in the listed order; anything else is discarded.
df = df.loc[:, final_columns]

In [49]:
# Preview the first five rows of the reordered table.
df.head(5)

Unnamed: 0,batting_team,bowling_team,venue,inning,over,ball,balls_bowled,runs_total,wicket_kind,current_runs,current_wickets,final_score
0,Chennai Super Kings,Deccan Chargers,Buffalo Park,1,0,1,1,0,,0,0,178
1,Chennai Super Kings,Deccan Chargers,Buffalo Park,1,0,2,2,1,,1,0,178
2,Chennai Super Kings,Deccan Chargers,Buffalo Park,1,0,3,3,0,,1,0,178
3,Chennai Super Kings,Deccan Chargers,Buffalo Park,1,0,4,4,4,,5,0,178
4,Chennai Super Kings,Deccan Chargers,Buffalo Park,1,0,5,5,1,,6,0,178


In [50]:
# Persist the cleaned per-ball table (index omitted) and report what was written.
output_csv = 'ipl_cleaned_data.csv'
df.to_csv(path_or_buf=output_csv, index=False)
print(f"Cleaned data saved to {output_csv}")
print("Remaining columns:", list(df.columns))

Cleaned data saved to ipl_cleaned_data.csv
Remaining columns: ['batting_team', 'bowling_team', 'venue', 'inning', 'over', 'ball', 'balls_bowled', 'runs_total', 'wicket_kind', 'current_runs', 'current_wickets', 'final_score']


In [51]:
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest, f_regression

In [52]:
input_csv = 'ipl_cleaned_data.csv'  # Update this to your file path
# wicket_kind is empty on most rows and text on the rest, so its dtype is
# pinned to str; empty cells are still read as NaN.
# NOTE(review): the recorded output shows a warning raised on this read_csv
# line — presumably pandas' mixed-type DtypeWarning from per-chunk dtype
# inference. Confirm on re-run that the warning is gone.
df = pd.read_csv(input_csv, dtype={'wicket_kind': str})

# --- Preprocessing ---
# Check for missing values
print("Missing values:\n", df.isnull().sum())


Missing values:
 batting_team            0
bowling_team            0
venue                   0
inning                  0
over                    0
ball                    0
balls_bowled            0
runs_total              0
wicket_kind        249133
current_runs            0
current_wickets         0
final_score             0
dtype: int64


  df = pd.read_csv(input_csv)


In [53]:
# wicket_kind is NaN on every delivery where no wicket fell; label those
# rows 'none' so the column has no missing values.
df.loc[df['wicket_kind'].isna(), 'wicket_kind'] = 'none'

In [54]:
# Re-count nulls per column to confirm none remain after the fill.
print("Missing values:\n", df.isna().sum())

Missing values:
 batting_team       0
bowling_team       0
venue              0
inning             0
over               0
ball               0
balls_bowled       0
runs_total         0
wicket_kind        0
current_runs       0
current_wickets    0
final_score        0
dtype: int64


In [56]:
# Preview the first five rows after filling wicket_kind.
df.head(5)

Unnamed: 0,batting_team,bowling_team,venue,inning,over,ball,balls_bowled,runs_total,wicket_kind,current_runs,current_wickets,final_score
0,Chennai Super Kings,Deccan Chargers,Buffalo Park,1,0,1,1,0,none,0,0,178
1,Chennai Super Kings,Deccan Chargers,Buffalo Park,1,0,2,2,1,none,1,0,178
2,Chennai Super Kings,Deccan Chargers,Buffalo Park,1,0,3,3,0,none,1,0,178
3,Chennai Super Kings,Deccan Chargers,Buffalo Park,1,0,4,4,4,none,5,0,178
4,Chennai Super Kings,Deccan Chargers,Buffalo Park,1,0,5,5,1,none,6,0,178


In [57]:
# One independent LabelEncoder per categorical column, so each keeps its own
# class-to-integer mapping.
le_batting, le_bowling, le_venue = LabelEncoder(), LabelEncoder(), LabelEncoder()

In [58]:

# Fit each encoder on its column and store the integer codes next to the
# original text column as '<column>_encoded'.
for _column, _encoder in (
    ('batting_team', le_batting),
    ('bowling_team', le_bowling),
    ('venue', le_venue),
):
    df[f'{_column}_encoded'] = _encoder.fit_transform(df[_column])

In [59]:
# Preview the first five rows with the encoded columns appended.
df.head(5)

Unnamed: 0,batting_team,bowling_team,venue,inning,over,ball,balls_bowled,runs_total,wicket_kind,current_runs,current_wickets,final_score,batting_team_encoded,bowling_team_encoded,venue_encoded
0,Chennai Super Kings,Deccan Chargers,Buffalo Park,1,0,1,1,0,none,0,0,178,0,1,7
1,Chennai Super Kings,Deccan Chargers,Buffalo Park,1,0,2,2,1,none,1,0,178,0,1,7
2,Chennai Super Kings,Deccan Chargers,Buffalo Park,1,0,3,3,0,none,1,0,178,0,1,7
3,Chennai Super Kings,Deccan Chargers,Buffalo Park,1,0,4,4,4,none,5,0,178,0,1,7
4,Chennai Super Kings,Deccan Chargers,Buffalo Park,1,0,5,5,1,none,6,0,178,0,1,7


In [60]:
# 1. Deliveries still to come, counted down from 120
# (presumably 20 overs x 6 balls — confirm the format assumption).
df['balls_remaining'] = -df['balls_bowled'] + 120

In [61]:
# 2. Current Run Rate (runs per over up to this ball)
# With a zero denominator the division yields +/-inf when runs are non-zero
# and NaN when runs are also zero. Both are mapped to 0 so no non-finite
# value reaches the feature-scoring step.
df['current_run_rate'] = (
    (df['current_runs'] / (df['balls_bowled'] / 6))
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
)


In [62]:
# 3. Team Scoring Average (mean final_score over each batting team's rows)
# The lookup table is kept as a two-column frame for inspection.
team_avg_runs = (
    df.groupby('batting_team')['final_score']
    .mean()
    .rename('team_avg_score')
    .reset_index()
)
# Attach by mapping rather than merging: assigning the column is idempotent,
# whereas re-running a merge would add suffixed '_x'/'_y' duplicates.
df['team_avg_score'] = df['batting_team'].map(
    team_avg_runs.set_index('batting_team')['team_avg_score']
)
# NOTE(review): the mean is taken over ball rows (not one value per innings)
# and is built from final_score, the column later used as the target —
# check for row weighting and target leakage.

In [63]:

# 4. Bowling Team Restriction Ability (mean final_score over each bowling team's rows)
# The lookup table is kept as a two-column frame for inspection.
bowling_avg_runs = (
    df.groupby('bowling_team')['final_score']
    .mean()
    .rename('bowling_avg_conceded')
    .reset_index()
)
# Attach by mapping rather than merging: assigning the column is idempotent,
# whereas re-running a merge would add suffixed '_x'/'_y' duplicates.
df['bowling_avg_conceded'] = df['bowling_team'].map(
    bowling_avg_runs.set_index('bowling_team')['bowling_avg_conceded']
)
# NOTE(review): the mean is taken over ball rows (not one value per innings)
# and is built from final_score, the column later used as the target —
# check for row weighting and target leakage.

In [64]:
# 5. Venue Scoring Tendency (mean final_score over each venue's rows)
# The lookup table is kept as a two-column frame for inspection.
venue_avg_score = (
    df.groupby('venue')['final_score']
    .mean()
    .rename('venue_avg_score')
    .reset_index()
)
# Attach by mapping rather than merging: assigning the column is idempotent,
# whereas re-running a merge would add suffixed '_x'/'_y' duplicates.
df['venue_avg_score'] = df['venue'].map(
    venue_avg_score.set_index('venue')['venue_avg_score']
)
# NOTE(review): the mean is taken over ball rows (not one value per innings)
# and is built from final_score, the column later used as the target —
# check for row weighting and target leakage.

In [65]:
# Preview the first five rows with the derived feature columns.
df.head(5)

Unnamed: 0,batting_team,bowling_team,venue,inning,over,ball,balls_bowled,runs_total,wicket_kind,current_runs,current_wickets,final_score,batting_team_encoded,bowling_team_encoded,venue_encoded,balls_remaining,current_run_rate,team_avg_score,bowling_avg_conceded,venue_avg_score
0,Chennai Super Kings,Deccan Chargers,Buffalo Park,1,0,1,1,0,none,0,0,178,0,1,7,119,0.0,339.446175,248.929749,134.767832
1,Chennai Super Kings,Deccan Chargers,Buffalo Park,1,0,2,2,1,none,1,0,178,0,1,7,118,3.0,339.446175,248.929749,134.767832
2,Chennai Super Kings,Deccan Chargers,Buffalo Park,1,0,3,3,0,none,1,0,178,0,1,7,117,2.0,339.446175,248.929749,134.767832
3,Chennai Super Kings,Deccan Chargers,Buffalo Park,1,0,4,4,4,none,5,0,178,0,1,7,116,7.5,339.446175,248.929749,134.767832
4,Chennai Super Kings,Deccan Chargers,Buffalo Park,1,0,5,5,1,none,6,0,178,0,1,7,115,7.2,339.446175,248.929749,134.767832


In [66]:
# Save the feature-augmented table. index=False matches the other saves in
# this notebook and keeps the row index out of the file, so reloading it
# does not produce a stray unnamed column.
df.to_csv("ipl_cleaned.csv", index=False)

In [29]:
# --- Feature Selection ---
# Candidate predictor columns, one per line; the target is chosen separately.
features = [
    'batting_team_encoded',
    'bowling_team_encoded',
    'venue_encoded',
    'inning',
    'balls_bowled',
    'balls_remaining',
    'current_runs',
    'current_wickets',
    'current_run_rate',
    'team_avg_score',
    'bowling_avg_conceded',
    'venue_avg_score',
]
# Split the frame into the predictor matrix and the target vector.
y = df['final_score']
X = df.loc[:, features]

# Score every feature against the target with a univariate F-test;
# k='all' keeps all of them so the scores can be ranked afterwards.
# fit() returns the fitted selector itself, so it can be chained.
selector = SelectKBest(score_func=f_regression, k='all').fit(X, y)

In [30]:
  # Get feature scores
# Pair each feature name with its F-score, then rank from strongest to weakest.
_score_table = pd.DataFrame({'Feature': features, 'Score': selector.scores_})
feature_scores = _score_table.sort_values(by='Score', ascending=False)
print("Feature Importance Scores:\n", feature_scores)

# Keep the eight best-scoring feature names (adjust the cut-off after
# inspecting the scores).
top_features = feature_scores['Feature'].iloc[:8].tolist()
print("Selected Features:", top_features)


Feature Importance Scores:
                  Feature         Score
8       current_run_rate  1.893307e+06
11       venue_avg_score  2.777060e+05
6           current_runs  2.446825e+05
7        current_wickets  1.462211e+05
9         team_avg_score  2.845532e+04
10  bowling_avg_conceded  2.725451e+04
3                 inning  9.488088e+02
2          venue_encoded  6.438798e+02
0   batting_team_encoded  7.566216e+01
1   bowling_team_encoded  4.018543e+01
4           balls_bowled  3.184307e+00
5        balls_remaining  3.184307e+00
Selected Features: ['current_run_rate', 'venue_avg_score', 'current_runs', 'current_wickets', 'team_avg_score', 'bowling_avg_conceded', 'inning', 'venue_encoded']


In [31]:
# Show the first five rows of the full table before narrowing to the selected features.
df.head(5)

Unnamed: 0,batting_team,bowling_team,venue,inning,over,ball,balls_bowled,runs_total,wicket_kind,current_runs,current_wickets,final_score,batting_team_encoded,bowling_team_encoded,venue_encoded,balls_remaining,current_run_rate,team_avg_score,bowling_avg_conceded,venue_avg_score
0,Chennai Super Kings,Deccan Chargers,Buffalo Park,1,0,1,1,0,none,0,0,178,0,1,7,119,0.0,339.446175,248.929749,134.767832
1,Chennai Super Kings,Deccan Chargers,Buffalo Park,1,0,2,2,1,none,1,0,178,0,1,7,118,3.0,339.446175,248.929749,134.767832
2,Chennai Super Kings,Deccan Chargers,Buffalo Park,1,0,3,3,0,none,1,0,178,0,1,7,117,2.0,339.446175,248.929749,134.767832
3,Chennai Super Kings,Deccan Chargers,Buffalo Park,1,0,4,4,4,none,5,0,178,0,1,7,116,7.5,339.446175,248.929749,134.767832
4,Chennai Super Kings,Deccan Chargers,Buffalo Park,1,0,5,5,1,none,6,0,178,0,1,7,115,7.2,339.446175,248.929749,134.767832


In [32]:
# Final dataset with selected features and target
final_df = df[top_features + ['final_score']]

In [33]:
# Preview the first five rows of the modelling table.
final_df.head(5)

Unnamed: 0,current_run_rate,venue_avg_score,current_runs,current_wickets,team_avg_score,bowling_avg_conceded,inning,venue_encoded,final_score
0,0.0,134.767832,0,0,339.446175,248.929749,1,7,178
1,3.0,134.767832,1,0,339.446175,248.929749,1,7,178
2,2.0,134.767832,1,0,339.446175,248.929749,1,7,178
3,7.5,134.767832,5,0,339.446175,248.929749,1,7,178
4,7.2,134.767832,6,0,339.446175,248.929749,1,7,178


In [43]:
# Save the processed dataframe
output_csv = 'ipl_processed_data.csv'
final_df.to_csv(output_csv, index=False)
print(f"Processed data saved to {output_csv}")
print("Final columns:", final_df.columns.tolist())

Processed data saved to ipl_processed_data.csv
Final columns: ['current_run_rate', 'venue_avg_score', 'current_runs', 'current_wickets', 'team_avg_score', 'bowling_avg_conceded', 'inning', 'venue_encoded', 'final_score']
