### Convert raw cricket data into numerical signals that a neural network can actually learn from.

In [1]:
import pandas as pd

In [4]:
match_df = pd.read_csv("data/processed/match_summary_final.csv")
innings_df = pd.read_csv("data/processed/innings_summary_valid.csv")
match_df.shape,innings_df.shape

((2438, 9), (4876, 6))

We will compute past average runs per batting team

In [11]:
team_avg_runs = (
    innings_df
    .groupby("batting_team")["total_runs"]
    .mean()
    .reset_index()
)

team_avg_runs.columns = ["team", "avg_runs"]
team_avg_runs.head()


Unnamed: 0,team,avg_runs
0,Africa XI,242.2
1,Asia XI,237.8
2,Australia,246.498915
3,Bangladesh,214.406349
4,Bermuda,155.0


#### Team 1 batting strength

In [6]:
match_df = match_df.merge(
    team_avg_runs,
    left_on="team_1",
    right_on="team",
    how="left"
).rename(columns={"avg_runs": "team_1_avg_runs"}).drop("team", axis=1)


### Team 2 batting strength

In [7]:
match_df = match_df.merge(
    team_avg_runs,
    left_on="team_2",
    right_on="team",
    how="left"
).rename(columns={"avg_runs": "team_2_avg_runs"}).drop("team", axis=1)


In [8]:
match_df["team_1_won_toss"] = (
    match_df["toss_winner"] == match_df["team_1"]
).astype(int)


### Toss winner Feature

In [9]:
match_df["team_1_batted_first"] = (
    (match_df["team_1_won_toss"] == 1) &
    (match_df["toss_decision"] == "bat")
).astype(int)


### Batting First feature

In [13]:
match_df["team_1_batted_first"] = (
    (
        (match_df["toss_winner"] == match_df["team_1"]) &
        (match_df["toss_decision"] == "bat")
    ) |
    (
        (match_df["toss_winner"] == match_df["team_2"]) &
        (match_df["toss_decision"] == "field")
    )
).astype(int)


In [14]:
from sklearn.preprocessing import LabelEncoder

venue_encoder = LabelEncoder()
match_df["venue_encoded"] = venue_encoder.fit_transform(match_df["venue"])
match_df

Unnamed: 0,match_id,team_1,team_2,team_1_runs,team_2_runs,winner,venue,toss_winner,toss_decision,team_1_avg_runs,team_2_avg_runs,team_1_won_toss,team_1_batted_first,venue_encoded
0,64814,New Zealand,India,254,219,New Zealand,"McLean Park, Napier",India,field,234.997319,247.694000,0,1,159
1,64815,India,New Zealand,108,109,New Zealand,"Jade Stadium, Christchurch",India,bat,247.694000,234.997319,1,1,119
2,64816,India,New Zealand,122,123,New Zealand,"Davies Park, Queenstown",New Zealand,field,247.694000,234.997319,0,1,67
3,64817,New Zealand,India,168,169,India,"Westpac Stadium, Wellington",New Zealand,bat,234.997319,247.694000,1,1,299
4,64819,India,New Zealand,122,125,New Zealand,"Westpac Park, Hamilton",New Zealand,field,247.694000,234.997319,0,1,297
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2433,1507885,United Arab Emirates,United States of America,211,213,United States of America,Dubai International Cricket Stadium,United States of America,field,199.614679,221.647887,0,1,75
2434,1507886,Nepal,United Arab Emirates,233,237,United Arab Emirates,"ICC Academy, Dubai",Nepal,bat,201.026667,199.614679,1,1,111
2435,1507887,Nepal,United States of America,271,273,United States of America,"ICC Academy, Dubai",United States of America,field,201.026667,221.647887,0,1,111
2436,1507888,United States of America,United Arab Emirates,292,49,United States of America,Dubai International Cricket Stadium,United Arab Emirates,field,221.647887,199.614679,0,1,75


### win prediction target

In [None]:
match_df["team_1_won"] = (
    match_df["winner"] == match_df["team_1"]
).astype(int)

y_win = match_df["team_1_won"]

0       1
1       0
2       0
3       0
4       0
       ..
2433    0
2434    0
2435    0
2436    1
2437    0
Name: team_1_won, Length: 2438, dtype: int64

In [16]:
y_runs = match_df[["team_1_runs", "team_2_runs"]]
y_runs

Unnamed: 0,team_1_runs,team_2_runs
0,254,219
1,108,109
2,122,123
3,168,169
4,122,125
...,...,...
2433,211,213
2434,233,237
2435,271,273
2436,292,49


In [17]:
match_df.head()

Unnamed: 0,match_id,team_1,team_2,team_1_runs,team_2_runs,winner,venue,toss_winner,toss_decision,team_1_avg_runs,team_2_avg_runs,team_1_won_toss,team_1_batted_first,venue_encoded,team_1_won
0,64814,New Zealand,India,254,219,New Zealand,"McLean Park, Napier",India,field,234.997319,247.694,0,1,159,1
1,64815,India,New Zealand,108,109,New Zealand,"Jade Stadium, Christchurch",India,bat,247.694,234.997319,1,1,119,0
2,64816,India,New Zealand,122,123,New Zealand,"Davies Park, Queenstown",New Zealand,field,247.694,234.997319,0,1,67,0
3,64817,New Zealand,India,168,169,India,"Westpac Stadium, Wellington",New Zealand,bat,234.997319,247.694,1,1,299,0
4,64819,India,New Zealand,122,125,New Zealand,"Westpac Park, Hamilton",New Zealand,field,247.694,234.997319,0,1,297,0


## Final Feature Matrix

In [18]:
feature_cols =[
    "team_1_avg_runs",
    "team_2_avg_runs",
    "team_1_won_toss",
    "team_1_batted_first",
    "venue_encoded",
]
X= match_df[feature_cols]

In [19]:
X.isna().sum()

team_1_avg_runs        0
team_2_avg_runs        0
team_1_won_toss        0
team_1_batted_first    0
venue_encoded          0
dtype: int64

In [20]:
X.head()

Unnamed: 0,team_1_avg_runs,team_2_avg_runs,team_1_won_toss,team_1_batted_first,venue_encoded
0,234.997319,247.694,0,1,159
1,247.694,234.997319,1,1,119
2,247.694,234.997319,0,1,67
3,234.997319,247.694,1,1,299
4,247.694,234.997319,0,1,297


In [21]:
match_df.to_csv(
    "data/processed/match_features_step4.csv",index=False
)
print("Features csv file saved successfully")

Features csv file saved successfully



# Step 5 -> Add team recent match avg

In [22]:
import pandas as pd

match_df = pd.read_csv("data/processed/match_features_step4.csv")
innings_df = pd.read_csv("data/processed/innings_summary_valid.csv")


In [23]:
team_match_runs = innings_df[
    ["match_id", "batting_team", "total_runs"]
].copy()

team_match_runs = team_match_runs.rename(
    columns={"batting_team": "team"}
)


In [24]:
match_df["match_order"] = range(len(match_df))

team_match_runs = team_match_runs.merge(
    match_df[["match_id", "match_order"]],
    on="match_id",
    how="left"
)


In [25]:
team_match_runs = team_match_runs.sort_values(
    by=["team", "match_order"]
)


In [26]:
team_match_runs["recent_avg_runs"] = (
    team_match_runs
    .groupby("team")["total_runs"]
    .shift(1)
    .rolling(window=7, min_periods=1)
    .mean()
)


In [27]:
team_match_runs.head()

Unnamed: 0,match_id,team,total_runs,match_order,recent_avg_runs
470,216271,Africa XI,198,235,
472,216669,Africa XI,106,236,198.0
945,289107,Africa XI,283,472,152.0
947,289108,Africa XI,306,473,195.666667
949,289110,Africa XI,318,474,223.25


In [28]:
match_df = match_df.merge(
    team_match_runs[["match_id", "team", "recent_avg_runs"]],
    left_on=["match_id", "team_1"],
    right_on=["match_id", "team"],
    how="left"
).rename(columns={"recent_avg_runs": "team_1_recent_avg_runs"}) \
 .drop("team", axis=1)


In [29]:
match_df = match_df.merge(
    team_match_runs[["match_id", "team", "recent_avg_runs"]],
    left_on=["match_id", "team_2"],
    right_on=["match_id", "team"],
    how="left"
).rename(columns={"recent_avg_runs": "team_2_recent_avg_runs"}) \
 .drop("team", axis=1)


In [30]:
match_df.head()

Unnamed: 0,match_id,team_1,team_2,team_1_runs,team_2_runs,winner,venue,toss_winner,toss_decision,team_1_avg_runs,team_2_avg_runs,team_1_won_toss,team_1_batted_first,venue_encoded,team_1_won,match_order,team_1_recent_avg_runs,team_2_recent_avg_runs
0,64814,New Zealand,India,254,219,New Zealand,"McLean Park, Napier",India,field,234.997319,247.694,0,1,159,1,0,229.166667,159.2
1,64815,India,New Zealand,108,109,New Zealand,"Jade Stadium, Christchurch",India,bat,247.694,234.997319,1,1,119,0,1,184.8,224.5
2,64816,India,New Zealand,122,123,New Zealand,"Davies Park, Queenstown",New Zealand,field,247.694,234.997319,0,1,67,0,2,171.6,208.5
3,64817,New Zealand,India,168,169,India,"Westpac Stadium, Wellington",New Zealand,bat,234.997319,247.694,1,1,299,0,3,189.833333,167.6
4,64819,India,New Zealand,122,125,New Zealand,"Westpac Park, Hamilton",New Zealand,field,247.694,234.997319,0,1,297,0,4,178.2,180.333333


Fill early NaNs (first few matches)

In [31]:
match_df["team_1_recent_avg_runs"] = match_df["team_1_recent_avg_runs"].fillna(
    match_df["team_1_avg_runs"]
)

match_df["team_2_recent_avg_runs"] = match_df["team_2_recent_avg_runs"].fillna(
    match_df["team_2_avg_runs"]
)


In [34]:
match_df.head()

Unnamed: 0,match_id,team_1,team_2,team_1_runs,team_2_runs,winner,venue,toss_winner,toss_decision,team_1_avg_runs,team_2_avg_runs,team_1_won_toss,team_1_batted_first,venue_encoded,team_1_won,match_order,team_1_recent_avg_runs,team_2_recent_avg_runs
0,64814,New Zealand,India,254,219,New Zealand,"McLean Park, Napier",India,field,234.997319,247.694,0,1,159,1,0,229.166667,159.2
1,64815,India,New Zealand,108,109,New Zealand,"Jade Stadium, Christchurch",India,bat,247.694,234.997319,1,1,119,0,1,184.8,224.5
2,64816,India,New Zealand,122,123,New Zealand,"Davies Park, Queenstown",New Zealand,field,247.694,234.997319,0,1,67,0,2,171.6,208.5
3,64817,New Zealand,India,168,169,India,"Westpac Stadium, Wellington",New Zealand,bat,234.997319,247.694,1,1,299,0,3,189.833333,167.6
4,64819,India,New Zealand,122,125,New Zealand,"Westpac Park, Hamilton",New Zealand,field,247.694,234.997319,0,1,297,0,4,178.2,180.333333


In [32]:
feature_cols = [
    "team_1_avg_runs",
    "team_2_avg_runs",
    "team_1_recent_avg_runs",
    "team_2_recent_avg_runs",
    "team_1_won_toss",
    "team_1_batted_first",
    "venue_encoded"
]

X = match_df[feature_cols]


In [33]:
X.head()

Unnamed: 0,team_1_avg_runs,team_2_avg_runs,team_1_recent_avg_runs,team_2_recent_avg_runs,team_1_won_toss,team_1_batted_first,venue_encoded
0,234.997319,247.694,229.166667,159.2,0,1,159
1,247.694,234.997319,184.8,224.5,1,1,119
2,247.694,234.997319,171.6,208.5,0,1,67
3,234.997319,247.694,189.833333,167.6,1,1,299
4,247.694,234.997319,178.2,180.333333,0,1,297


In [37]:
match_df.to_csv(
    "data/processed/match_features_step5.csv",
    index=False
)

print("Step 5 completed: recent form features added")


Step 5 completed: recent form features added
