In [1]:
# Phase 1: Mental Proxy — create pressure_applied flag and summary
import pandas as pd
import numpy as np

# Load dataset (adjust path if you moved the file)
df = pd.read_excel("../data/IPL_Bowler_Detailed_Data.xls")

# Clean / standardize
df['Phase'] = df['Phase'].astype(str).str.strip().str.title()
df['Pitch_Type'] = df['Pitch_Type'].astype(str).str.strip().str.title()

# Sort properly for shifting logic
df = df.sort_values(["Match_ID", "Over", "Ball"]).reset_index(drop=True)

# Identify dot balls + death overs
df['is_dot'] = df['Runs_Conceded'] == 0
df['is_death'] = df['Phase'].str.lower() == "death"

# Shift previous row
prev = df.shift(1)

# Previous row must be from SAME MATCH and SAME OVER
same_match = df['Match_ID'] == prev['Match_ID']
same_over = df['Over'] == prev['Over']
same_match_and_over = same_match & same_over

# Pressure source: previous ball is dot AND in death AND same match+over
prev_is_pressure = same_match_and_over & (prev['is_dot']) & (prev['is_death'])

df['pressure_applied'] = prev_is_pressure.astype(int)

# --- Summary ---
total_balls = len(df)
total_pressure = int(df['pressure_applied'].sum())

print("===== PHASE 1: PRESSURE FEATURE SUMMARY =====\n")
print(f"Total balls in dataset: {total_balls:,}")
print(f"Balls with pressure applied (next-ball after death dot): {total_pressure:,}")

# Wicket probabilities
p_overall = df['Is_Wicket'].mean()
p_after_pressure = df[df['pressure_applied'] == 1]['Is_Wicket'].mean()
p_no_pressure = df[df['pressure_applied'] == 0]['Is_Wicket'].mean()

print("\nWicket Probability Summary:")
print(f"Overall wicket rate:               {p_overall:.4f}")
print(f"Wicket rate after pressure:        {p_after_pressure:.4f}")
print(f"Wicket rate without pressure:      {p_no_pressure:.4f}")

# Per-bowler summary
bowler_rows = []
for bowler, sub in df.groupby("Bowler"):
    total = len(sub)
    pressure_rows = sub[sub["pressure_applied"] == 1]
    n_pressure = len(pressure_rows)
    w_after = int(pressure_rows["Is_Wicket"].sum())
    p_rate = (w_after / n_pressure) if n_pressure > 0 else np.nan
    baseline = sub["Is_Wicket"].mean()

    bowler_rows.append({
        "Bowler": bowler,
        "Total_Balls": total,
        "Pressure_Balls": n_pressure,
        "Wickets_After_Pressure": w_after,
        "P(Wicket | Pressure)": p_rate,
        "Baseline Wicket Rate": baseline
    })

bowler_df = pd.DataFrame(bowler_rows).set_index("Bowler")

print("\n===== PER-BOWLER PRESSURE SUMMARY =====")
display(bowler_df)

# Save output for reproducibility
df.to_csv("../data/ipl_with_pressure_flag.csv", index=False)
print("\nSaved dataset with pressure flag → ../data/ipl_with_pressure_flag.csv")



===== PHASE 1: PRESSURE FEATURE SUMMARY =====

Total balls in dataset: 4,800
Balls with pressure applied (next-ball after death dot): 784

Wicket Probability Summary:
Overall wicket rate:               0.0706
Wicket rate after pressure:        0.1888
Wicket rate without pressure:      0.0476

===== PER-BOWLER PRESSURE SUMMARY =====


Unnamed: 0_level_0,Total_Balls,Pressure_Balls,Wickets_After_Pressure,P(Wicket | Pressure),Baseline Wicket Rate
Bowler,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Bowler A,2400,399,12,0.030075,0.045
Bowler B,2400,385,136,0.353247,0.09625



Saved dataset with pressure flag → ../data/ipl_with_pressure_flag.csv
