In [3]:
pip install pandas numpy scikit-learn joblib

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [31]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

import joblib

In [32]:
import pandas as pd

matches = pd.read_csv(r"C:\Users\ADMIN\Downloads\cricketprediction\matches.csv")
deliveries = pd.read_csv(r"C:\Users\ADMIN\Downloads\cricketprediction\deliveries.csv")

print(matches.shape)
print(deliveries.shape)

(756, 18)
(179078, 21)


In [33]:
matches.columns = matches.columns.str.strip().str.lower()
deliveries.columns = deliveries.columns.str.strip().str.lower()

In [34]:
match_cols = ['id', 'date', 'venue', 'team1', 'team2']
matches = matches[match_cols]

data = deliveries.merge(
    matches,
    left_on='match_id',
    right_on='id',
    how='left'
)

data['date'] = pd.to_datetime(data['date'], errors='coerce')
data = data.dropna(subset=['date'])

In [35]:
batsman_match = (
    data.groupby(['match_id', 'batsman', 'date', 'venue', 'batting_team'])
    .agg(
        runs=('batsman_runs', 'sum'),
        balls=('ball', 'count')
    )
    .reset_index()
)

In [36]:
# Sort by player and date
batsman_match = batsman_match.sort_values(['batsman', 'date'])

# Rolling average - last 5 matches
batsman_match['avg_runs_last_5'] = (
    batsman_match
    .groupby('batsman')['runs']
    .rolling(5)
    .mean()
    .reset_index(level=0, drop=True)
)

# Rolling average - last 10 matches
batsman_match['avg_runs_last_10'] = (
    batsman_match
    .groupby('batsman')['runs']
    .rolling(10)
    .mean()
    .reset_index(level=0, drop=True)
)

In [37]:
# Venue-wise average runs for each batsman
venue_avg = (
    batsman_match
    .groupby(['batsman', 'venue'])['runs']
    .mean()
    .reset_index()
    .rename(columns={'runs': 'venue_avg_runs'})
)

# Merge venue feature back
batsman_match = batsman_match.merge(
    venue_avg,
    on=['batsman', 'venue'],
    how='left'
)

In [38]:
# Player vs Team (PvT) average runs
pvt_stats = (
    data
    .groupby(['batsman', 'bowling_team'])['batsman_runs']
    .mean()
    .reset_index()
    .rename(columns={'batsman_runs': 'pvt_avg_runs'})
)

# Merge PvT feature into batsman_match
batsman_match = batsman_match.merge(
    pvt_stats,
    left_on=['batsman', 'batting_team'],
    right_on=['batsman', 'bowling_team'],
    how='left'
)

# Drop extra column
batsman_match.drop(columns=['bowling_team'], inplace=True)

In [39]:
# Player vs Player (PvP) average runs
pvp_stats = (
    data
    .groupby(['batsman', 'bowler'])['batsman_runs']
    .mean()
    .reset_index()
    .rename(columns={'batsman_runs': 'pvp_avg_runs'})
)

# Merge PvP feature
batsman_match = batsman_match.merge(
    pvp_stats,
    on='batsman',
    how='left'
)

In [40]:
# Average PvP per match (safe ML feature)
batsman_match['pvp_avg_runs'] = (
    batsman_match
    .groupby(['match_id','batsman'])['pvp_avg_runs']
    .transform('mean')
)

In [41]:
career_stats = (
    batsman_match
    .groupby('batsman')
    .agg(
        career_avg_runs=('runs', 'mean'),
        career_matches=('match_id', 'count')
    )
    .reset_index()
)

batsman_match = batsman_match.merge(
    career_stats,
    on='batsman',
    how='left'
)

In [42]:
batsman_match = batsman_match.sort_values(['batsman', 'date'])

batsman_match['runs_next_match'] = (
    batsman_match.groupby('batsman')['runs']
    .shift(-1)
)

batsman_match = batsman_match.dropna(subset=['runs_next_match'])

In [16]:
batsman_match.fillna({
    'avg_runs_last_5': batsman_match['avg_runs_last_5'].mean(),
    'avg_runs_last_10': batsman_match['avg_runs_last_10'].mean(),
    'venue_avg_runs': batsman_match['venue_avg_runs'].mean()
}, inplace=True)

In [53]:
print(data.columns.tolist())

['match_id', 'inning', 'batting_team', 'bowling_team', 'over', 'ball', 'batsman', 'non_striker', 'bowler', 'is_super_over', 'wide_runs', 'bye_runs', 'legbye_runs', 'noball_runs', 'penalty_runs', 'batsman_runs', 'extra_runs', 'total_runs', 'player_dismissed', 'dismissal_kind', 'fielder', 'id', 'date', 'venue', 'team1', 'team2']


In [54]:
bowler_match = (
    data.groupby(['match_id', 'bowler', 'date', 'venue'])
    .agg(
        wickets=('player_dismissed', lambda x: x.notna().sum()),
        balls=('ball', 'count'),
        runs_conceded=('batsman_runs', 'sum')
    )
    .reset_index()
)

bowler_match['economy'] = bowler_match['runs_conceded'] / (bowler_match['balls'] / 6)


In [55]:
bowler_match['wickets_next_match'] = (
    bowler_match.groupby('bowler')['wickets']
    .shift(-1)
)

bowler_match = bowler_match.dropna()

In [56]:
bowler_match.to_csv("../dataset_bowler.csv", index=False)

In [57]:
TARGET = 'runs'

FEATURES = [
    'avg_runs_last_5',
    'avg_runs_last_10',
    'venue_avg_runs',
    'career_avg_runs',
    'career_matches',
    'venue',
    'batting_team'
]

X = batsman_match[FEATURES]
y = batsman_match[TARGET]

In [58]:
numeric_features = [
    'avg_runs_last_5',
    'avg_runs_last_10',
    'venue_avg_runs',
    'career_avg_runs',
    'career_matches'
]

categorical_features = [
    'venue',
    'batting_team'
]

In [59]:
# Sort by date (mandatory for time-series)
batsman_match = batsman_match.sort_values('date')

# Use 80% data for training based on time
split_date = batsman_match['date'].quantile(0.8)

train_df = batsman_match[batsman_match['date'] <= split_date]
test_df  = batsman_match[batsman_match['date'] > split_date]

print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)

Train shape: (209701, 16)
Test shape: (50599, 16)


In [60]:
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean'))
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)


In [61]:
X_processed = preprocessor.fit_transform(X)


In [62]:
final_df = batsman_match[FEATURES + ['runs']]
final_df.to_csv("dataset.csv", index=False)

In [63]:
joblib.dump(preprocessor, "feature_pipeline.pkl")

['feature_pipeline.pkl']

In [50]:
import joblib

preprocessor = joblib.load("feature_pipeline.pkl")

print("Training features used:")
print(preprocessor.feature_names_in_)


Training features used:
['avg_runs_last_5' 'avg_runs_last_10' 'venue_avg_runs' 'career_avg_runs'
 'career_matches' 'venue' 'batting_team']
