In [1]:
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
import numpy as np


In [2]:
data = pd.read_csv('input_file_1_1000.csv')


In [3]:
# For batsman
# Group by batsman and match
batsman_data = data.groupby(['batter_id', 'event_id'])

# Calculate the necessary statistics
batsman_stats = batsman_data.agg(
    total_runs=('batter_runs', 'max'),
    balls_faced=('batter_balls_faced', 'count'),
    fours=('outcome', lambda x: (x == 'four').sum()),
    sixes=('outcome', lambda x: (x == 'six').sum()),
    outs=('outcome', lambda x: (x == 'out').sum())
)

# Calculate cumulative statistics
# Calculate the number of matches for each batsman
batsman_matches = data.groupby('batter_id')['event_id'].nunique()

# Correct the 'matches' column in batsman_stats
batsman_stats = batsman_stats.reset_index()
batsman_stats['matches'] = batsman_stats['batter_id'].map(batsman_matches)
batsman_stats['cumulative_runs'] = batsman_stats.groupby('batter_id')['total_runs'].cumsum()
batsman_stats['cumulative_balls_faced'] = batsman_stats.groupby('batter_id')['balls_faced'].cumsum()
batsman_stats['cumulative_outs'] = batsman_stats.groupby('batter_id')['outs'].cumsum()

# Calculate centuries, half-centuries, strike rate, and average
batsman_stats['centuries'] = (batsman_stats['cumulative_runs'] >= 100).astype(int)
batsman_stats['half_centuries'] = ((batsman_stats['cumulative_runs'] >= 50) & (batsman_stats['cumulative_runs'] < 100)).astype(int)
batsman_stats['strike_rate'] = (batsman_stats['cumulative_runs'] / batsman_stats['cumulative_balls_faced']) * 100
batsman_stats['average'] = batsman_stats['cumulative_runs'] / batsman_stats['cumulative_outs'].replace(0,1)

# Reset index
batsman_stats = batsman_stats.reset_index()

# Display the first few rows
batsman_stats.head()


Unnamed: 0,index,batter_id,event_id,total_runs,balls_faced,fours,sixes,outs,matches,cumulative_runs,cumulative_balls_faced,cumulative_outs,centuries,half_centuries,strike_rate,average
0,0,4578.0,226374,0.0,1,0,0,0,1,0.0,1,0,0,0,0.0,0.0
1,1,5390.0,238195,1.0,3,0,0,1,1,1.0,3,1,0,0,33.333333,1.0
2,2,5702.0,226374,17.0,22,2,0,1,1,17.0,22,1,0,0,77.272727,17.0
3,3,6128.0,238195,27.0,15,4,1,0,1,27.0,15,0,0,0,180.0,27.0
4,4,6513.0,226374,97.0,60,6,5,1,2,97.0,60,1,0,1,161.666667,97.0


In [4]:
batsman_stats[batsman_stats['batter_id']==6513]

Unnamed: 0,index,batter_id,event_id,total_runs,balls_faced,fours,sixes,outs,matches,cumulative_runs,cumulative_balls_faced,cumulative_outs,centuries,half_centuries,strike_rate,average
4,4,6513.0,226374,97.0,60,6,5,1,2,97.0,60,1,0,1,161.666667,97.0
5,5,6513.0,238195,13.0,7,1,0,0,2,110.0,67,1,1,0,164.179104,110.0


In [None]:
# Group by bowler, match and over
bowler_data_over = data.groupby(['bowler_id', 'event_id', 'over_runs'])

# Calculate the necessary statistics per over
bowler_over_stats = bowler_data_over.agg(
    runs_conceded=('over_runs', 'max'),
    wickets_taken=('outcome', lambda x: (x == 'out').sum())
)

# Group by bowler and match to sum runs_conceded and wickets_taken per match
bowler_stats = bowler_over_stats.groupby(['bowler_id', 'event_id']).sum().reset_index()

# Calculate the number of matches for each bowler
bowler_matches = data.groupby('bowler_id')['event_id'].nunique()

# Correct the 'matches' column in bowler_stats
bowler_stats['matches'] = bowler_stats['bowler_id'].map(bowler_matches)

# Calculate cumulative statistics
bowler_stats['cumulative_runs_conceded'] = bowler_stats.groupby('bowler_id')['runs_conceded'].transform(pd.Series.cumsum)
bowler_stats['cumulative_wickets_taken'] = bowler_stats.groupby('bowler_id')['wickets_taken'].transform(pd.Series.cumsum)

# Calculate strike rate and average
bowler_stats['strike_rate'] = bowler_stats['cumulative_wickets_taken'] / bowler_stats['matches']
bowler_stats['average'] = bowler_stats['cumulative_runs_conceded'] / bowler_stats['cumulative_wickets_taken'].replace(0,1)

# Display the first few rows
bowler_stats.head(50)


In [7]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Select input and output variables
X = batsman_stats[['batter_id', 'matches', 'cumulative_runs', 'cumulative_balls_faced', 'cumulative_outs', 'average', 'strike_rate']]
y = batsman_stats[['total_runs', 'strike_rate']]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a Random Forest Regressor model
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = model.predict(X_test)

# Calculate the Mean Squared Error of the predictions
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

# Let's say we want to predict for an existing batsman with 'batter_id' = 1000
existing_batsman_id = 6513

# Get the historical data of the existing batsman
existing_batsman = batsman_stats[batsman_stats['batter_id'] == existing_batsman_id].iloc[-1]

# Prepare the input for the model
existing_batsman_input = existing_batsman[['batter_id', 'matches', 'cumulative_runs', 'cumulative_balls_faced', 'cumulative_outs', 'average', 'strike_rate']]

# Make a prediction for the existing batsman
existing_batsman_pred = model.predict([existing_batsman_input])
print(f'Predicted runs and strike rate for next match: {existing_batsman_pred}')


Mean Squared Error: 136.3056592329852
Predicted runs and strike rate for next match: [[ 32.41      154.6823495]]




In [9]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Select input and output variables
X = bowler_stats[['bowler_id', 'matches', 'cumulative_wickets_taken', 'cumulative_runs_conceded', 'strike_rate', 'average']]
y = bowler_stats[['wickets_taken', 'runs_conceded']]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a Random Forest Regressor model
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = model.predict(X_test)

# Calculate the Mean Squared Error of the predictions
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

# Let's say we want to predict for an existing bowler with 'bowler_id' = 4185
existing_bowler_id = 4185

# Get the historical data of the existing bowler
existing_bowler = bowler_stats[bowler_stats['bowler_id'] == existing_bowler_id].iloc[-1]

# Prepare the input for the model
existing_bowler_input = existing_bowler[['bowler_id', 'matches', 'cumulative_wickets_taken', 'cumulative_runs_conceded', 'strike_rate', 'average']]

# Make a prediction for the existing bowler
existing_bowler_pred = model.predict([existing_bowler_input])
print(f'Predicted wickets_taken and runs_conceded for next match: {existing_bowler_pred}')


Mean Squared Error: 23.001507692307694
Predicted wickets_taken and runs_conceded for next match: [[ 1.29 29.61]]




BATSMAN

In [13]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import Ridge, Lasso

# Create a dictionary of different regression models
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(alpha=0.0001),
    'Lasso Regression': Lasso(alpha=0.1),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(n_estimators=50, random_state=42)
    }


# For each model, train, predict, and compute MSE
for model_name, model in models.items():
  print(model_name)
  # Select input and output variables
  X = batsman_stats[['batter_id', 'matches', 'cumulative_runs', 'cumulative_balls_faced', 'cumulative_outs', 'average', 'strike_rate']]
  y = batsman_stats[['total_runs', 'strike_rate']]

  # Split the data into training and testing sets
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
  # Train the model
  model.fit(X_train, y_train)

  # Make predictions on the testing set
  y_pred = model.predict(X_test)

  # Calculate Mean Squared Error
  mse = mean_squared_error(y_test, y_pred)

  print(f'{model_name} MSE: {mse}')


Linear Regression
Linear Regression MSE: 68.23372648456785
Ridge Regression
Ridge Regression MSE: 68.23373929217938
Lasso Regression
Lasso Regression MSE: 68.7529814084147
Decision Tree
Decision Tree MSE: 297.02338149288676
Random Forest
Random Forest MSE: 147.36397867291524
