In [1]:
import pandas as pd
import numpy as np

In [2]:
match=pd.read_csv("./dataset/matches.csv")
deliveries=pd.read_csv("./dataset/deliveries.csv")
most_runs=pd.read_csv("./dataset/most_runs_average_strikerate.csv")
team=pd.read_csv("./dataset/teams.csv")
venue=pd.read_csv("./dataset/teamwise_home_and_away.csv")

In [3]:
total_score_df=deliveries.groupby(["match_id","inning"]).sum()["total_runs"].reset_index()

In [4]:
total_score_df=total_score_df[total_score_df["inning"]==1]

In [5]:
match_df=match.merge(total_score_df[["match_id","total_runs"]],left_on="id",right_on="match_id")

In [6]:
teams={
       "Chennai Super Kings", 
       "Delhi Capitals",
       "Kings XI Punjab",
       "Kolkata Knight Riders",
       "Mumbai Indians",
       "Rajasthan Royals",
       "Royal Challengers Bangalore",
       "Sunrisers Hyderabad"   
}

In [7]:
match_df=match_df[match_df["team1"].str.replace("Delhi Daredevils","Delhi Capitals").isin(teams)]
match_df=match_df[match_df["team2"].str.replace("Delhi Daredevils","Delhi Capitals").isin(teams)]
match_df=match_df[match_df["team1"].str.replace("Deccan Chargers","Sunrisers Hyderabad").isin(teams)]
match_df=match_df[match_df["team2"].str.replace("Deccan Chargers","Sunrisers Hyderabad").isin(teams)]

In [8]:
match_df=match_df[match_df["dl_applied"]==0]

In [9]:
match_df=match_df[["match_id","city","winner","total_runs"]]

In [10]:
deliveries_df=match_df.merge(deliveries,on="match_id")

In [11]:
deliveries_df=deliveries_df[deliveries_df["inning"]==2]

In [12]:

deliveries_df = deliveries_df.rename(columns={"total_runs_x": "target_runs", "total_runs_y": "runs_scored"})

# Ensure deliveries are ordered correctly before cumsum
deliveries_df = deliveries_df.sort_values(["match_id", "over", "ball"]).reset_index(drop=True)

# Compute running/current score in the chase (second innings) per match
deliveries_df["current_score"] = deliveries_df.groupby("match_id")["runs_scored"].cumsum()

deliveries_df["score_left"]=deliveries_df["target_runs"]-deliveries_df["current_score"]

In [13]:
# Calculate balls left correctly (total 120 balls minus balls completed)
deliveries_df["balls_left"] = 120 - (deliveries_df["over"]*6 + deliveries_df["ball"])

# Validate and clean up any invalid values
deliveries_df.loc[deliveries_df["balls_left"] < 0, "balls_left"] = 0

In [14]:
# More robust wickets calculation
# 1. Handle player dismissals properly
deliveries_df["player_dismissed"] = deliveries_df["player_dismissed"].fillna("")
deliveries_df["dismissal_kind"] = deliveries_df["dismissal_kind"].fillna("")

# 2. Consider a wicket only when there's both a player dismissed and a valid dismissal kind
deliveries_df["is_wicket"] = ((deliveries_df["player_dismissed"].str.len() > 0) & 
                             (deliveries_df["dismissal_kind"].str.len() > 0)).astype(int)

# 3. Calculate wickets with validation
wickets_fallen = deliveries_df.groupby("match_id")["is_wicket"].cumsum()
deliveries_df["wickets"] = 10 - wickets_fallen

# 4. Ensure wickets are within valid range [0, 10]
deliveries_df.loc[deliveries_df["wickets"] < 0, "wickets"] = 0
deliveries_df.loc[deliveries_df["wickets"] > 10, "wickets"] = 10

In [15]:
# Calculate run rates safely (avoiding division by zero)
# Current run rate: runs scored per over so far
balls_played = 120 - deliveries_df["balls_left"]
deliveries_df["crr"] = np.where(balls_played > 0,
                               (deliveries_df["current_score"] * 6) / balls_played,
                               0)  # 0 for first ball of innings

# Required run rate: runs needed per over remaining
deliveries_df["rrr"] = np.where(deliveries_df["balls_left"] > 0,
                               (deliveries_df["score_left"] * 6) / deliveries_df["balls_left"],
                               np.inf)  # inf when no balls left but runs needed

In [16]:
# Diagnostic checks
print("1. Balls left calculation:")
print("Max balls left:", deliveries_df["balls_left"].max())
print("Min balls left:", deliveries_df["balls_left"].min())

print("\n2. Wickets analysis:")
print("Max wickets:", deliveries_df["wickets"].max())
print("Min wickets:", deliveries_df["wickets"].min())
wickets_per_match = deliveries_df.groupby("match_id")["wickets"].min()
invalid_matches = wickets_per_match[wickets_per_match < 0]
if len(invalid_matches) > 0:
    print("\nMatches with invalid wickets (< 0):", len(invalid_matches))
    print(invalid_matches.head())

print("\n3. Required run rate (rrr) analysis:")
print("Infinite or NaN rrr values:", deliveries_df["rrr"].isna().sum())
print("Negative rrr values:", (deliveries_df["rrr"] < 0).sum())

print("\n4. Current run rate (crr) analysis:")
print("Infinite or NaN crr values:", deliveries_df["crr"].isna().sum())
print("Negative crr values:", (deliveries_df["crr"] < 0).sum())

# Sample problematic rows if any
problematic = deliveries_df[
    (deliveries_df["balls_left"] <= 0) |
    (deliveries_df["wickets"] < 0) |
    (deliveries_df["wickets"] > 10) |
    (deliveries_df["rrr"].isna()) |
    (deliveries_df["crr"].isna())
].head()

if len(problematic) > 0:
    print("\nSample problematic rows:")
    print(problematic[["match_id", "over", "ball", "balls_left", "wickets", "rrr", "crr"]])

1. Balls left calculation:
Max balls left: 113
Min balls left: 0

2. Wickets analysis:
Max wickets: 10
Min wickets: 0

3. Required run rate (rrr) analysis:
Infinite or NaN rrr values: 0
Negative rrr values: 149

4. Current run rate (crr) analysis:
Infinite or NaN crr values: 0
Negative crr values: 0

Sample problematic rows:
     match_id  over  ball  balls_left  wickets  rrr   crr
118         1    19     6           0        1  inf  8.25
119         1    20     1           0        1  inf  8.25
120         1    20     2           0        1  inf  8.30
121         1    20     3           0        1  inf  8.60
122         1    20     4           0        0  inf  8.60


In [17]:
deliveries_df["rrr"]=(deliveries_df["score_left"]*6)/deliveries_df["balls_left"]


In [18]:
final_df=deliveries_df[["batting_team","bowling_team","city","score_left","balls_left","wickets","target_runs","crr","rrr","winner"]]
final_df.dropna(inplace=True)
final_df=final_df[final_df["balls_left"]!=0]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df.dropna(inplace=True)


In [19]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline


In [20]:
X=final_df.iloc[:,:-1]
Y=final_df.iloc[:,-1]

X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.2,random_state=1)
trf=ColumnTransformer([('trf',OneHotEncoder(drop="first"),["batting_team","bowling_team","city"])] , remainder="passthrough")
ll=LogisticRegression(solver="liblinear")
pipeline=Pipeline(steps=[('step 1',trf),('step 2',ll)])
pipeline.fit(X_train,Y_train)
Y_pred=pipeline.predict(X_test)
print("Model Accuracy:",pipeline.score(X_test,Y_test))



Model Accuracy: 0.7972701301449582


In [21]:
import pickle
pickle.dump(pipeline,open("pipeline.pkl","wb"))