In [7]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
from pulp import LpProblem, LpMaximize, LpVariable, lpSum
from sklearn.ensemble import GradientBoostingRegressor
import numpy as np
from sklearn.preprocessing import StandardScaler


1. Data Collection and Preprocessing

In [3]:
# Load datasets
matches = pd.read_csv('matches.csv')
balls = pd.read_csv('deliveries.csv')  # Assuming ball-by-ball data is in 'deliveries.csv'

# Preprocess matches data
matches.dropna(subset=['winner', 'toss_winner', 'toss_decision'], inplace=True)

# Encode categorical features
le = LabelEncoder()
for col in ['team1', 'team2', 'toss_winner', 'venue', 'toss_decision']:
    matches[f'{col}_encoded'] = le.fit_transform(matches[col])

# Target variable: Team1 wins (1) or not (0)
matches['team1_wins'] = (matches['winner'] == matches['team1']).astype(int)

2. Winning Probability Model

In [4]:
# Feature Engineering: Historical Win Rate
team_stats = matches.groupby('team1')['team1_wins'].mean().reset_index()
team_stats.columns = ['team', 'win_rate']

matches = matches.merge(team_stats, left_on='team1', right_on='team', suffixes=('', '_team1'))
matches = matches.merge(team_stats, left_on='team2', right_on='team', suffixes=('', '_team2'))

# Select Features
features = ['team1_encoded', 'team2_encoded', 'venue_encoded', 'toss_winner_encoded', 
            'toss_decision_encoded', 'win_rate', 'win_rate_team2']
X = matches[features]
y = matches['team1_wins']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
model = RandomForestClassifier(n_estimators=100)
model.fit(X_train, y_train)

# Evaluate
y_pred = model.predict(X_test)
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(f'ROC AUC: {roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])}')

Accuracy: 0.5045871559633027
ROC AUC: 0.5089338390223346


3. Top Player Rankings

In [8]:

# 1) Ensure correct column names:
print("Columns:", balls.columns.tolist())
if 'batter' in balls.columns:
    balls = balls.rename(columns={'batter': 'batsman'})

# 2) Batsmen stats:
batsman_stats = (
    balls
    .groupby('batsman')
    .agg(
        total_runs   = ('batsman_runs', 'sum'),
        balls_faced  = ('batsman_runs', 'count'),
    )
    .reset_index()
)
batsman_stats['average']     = batsman_stats['total_runs'] / batsman_stats['balls_faced']
batsman_stats['strike_rate'] = batsman_stats['total_runs'] / batsman_stats['balls_faced'] * 100

# 3) Bowlers stats:
bowler_stats = (
    balls
    .groupby('bowler')
    .agg(
        total_wickets   = ('is_wicket', 'sum'),
        runs_conceded   = ('total_runs', 'sum'),
        balls_bowled    = ('total_runs', 'count'),
    )
    .reset_index()
)
bowler_stats['economy'] = bowler_stats['runs_conceded'] / (bowler_stats['balls_bowled'] / 6)
# avoid divide-by-zero for average:
bowler_stats['average'] = bowler_stats['runs_conceded'] / bowler_stats['total_wickets'].replace({0: np.nan})

# 4) Normalize & score batsmen:
scaler = StandardScaler()
cols_to_scale = ['total_runs', 'average', 'strike_rate']
batsman_stats[cols_to_scale] = scaler.fit_transform(batsman_stats[cols_to_scale])
batsman_stats['score'] = batsman_stats[cols_to_scale].mean(axis=1)
top_batsmen = batsman_stats.sort_values('score', ascending=False).head(10)

# 5) (Optional) Likewise for bowlers:
cols_bowl = ['total_wickets', 'economy', 'average']
bowler_stats[cols_bowl] = scaler.fit_transform(bowler_stats[cols_bowl])
bowler_stats['score'] = bowler_stats[cols_bowl].mean(axis=1)
top_bowlers = bowler_stats.sort_values('score', ascending=False).head(10)


Columns: ['match_id', 'inning', 'batting_team', 'bowling_team', 'over', 'ball', 'batter', 'bowler', 'non_striker', 'batsman_runs', 'extra_runs', 'total_runs', 'extras_type', 'is_wicket', 'player_dismissed', 'dismissal_kind', 'fielder']


4. Best Playing XI Suggestions

In [11]:
# Sample player data (replace with actual data)
players = pd.DataFrame({
    'name': ['Player1', 'Player2', 'Player3'],
    'role': ['Batsman', 'Bowler', 'All-Rounder'],
    'score': [9.5, 8.7, 9.0],
    'overseas': [0, 1, 1]
})

# Optimization model
prob = LpProblem('BestXI', LpMaximize)
player_vars = LpVariable.dicts('Player', players.index, cat='Binary')

# Objective: Maximize total score
prob += lpSum([players.loc[i, 'score'] * player_vars[i] for i in players.index])

# Constraints
prob += lpSum([player_vars[i] for i in players.index]) == 11
prob += lpSum([player_vars[i] for i in players[players['overseas'] == 1].index]) <= 4
# Add role constraints here (e.g., min batsmen, bowlers)

prob.solve()
best_xi = [players.loc[i, 'name'] for i in players.index if player_vars[i].value() == 1]
print(best_xi)

['Player1', 'Player2', 'Player3']
