In [1]:
import pandas as pd
import numpy as np

# Load the IPL delivery-level data (one row per Over/Ball, including the
# pressure_applied flag) from the data/ directory next to this notebook's folder.
df = pd.read_csv("../data/ipl_with_pressure_flag.csv")

# Preview the first rows to sanity-check the load.
df.head()


Unnamed: 0,Match_ID,Match_Date,Pitch_Type,Phase,Over,Ball,Bowler,Batter_Avg,Batter_SR,Runs_Conceded,Is_Wicket,is_dot,is_death,pressure_applied
0,11935,2023-01-28,Batting,Powerplay,2,1,Bowler B,33.46,131.68,0,0,True,False,0
1,11935,2023-01-28,Batting,Powerplay,2,2,Bowler B,41.41,138.41,3,0,False,False,0
2,11935,2023-01-28,Batting,Powerplay,2,3,Bowler B,26.8,107.22,0,0,True,False,0
3,11935,2023-01-28,Batting,Powerplay,2,4,Bowler B,9.31,136.44,0,0,True,False,0
4,11935,2023-01-28,Batting,Powerplay,2,5,Bowler B,19.52,105.85,0,1,True,False,0


In [2]:
# Replace each categorical label with its integer pandas category code.
for text_col, code_col in (("Pitch_Type", "Pitch_Code"), ("Bowler", "Bowler_Code")):
    df[code_col] = df[text_col].astype('category').cat.codes

# Standardise Batter_Avg to a z-score (pandas .std() defaults to the sample
# standard deviation).
raw_avg = df['Batter_Avg']
df['Batter_Avg_Scaled'] = (raw_avg - raw_avg.mean()) / raw_avg.std()

# Show the raw columns side by side with their encoded / scaled counterparts.
preview_cols = [
    'Pitch_Type', 'Pitch_Code',
    'Bowler', 'Bowler_Code',
    'Batter_Avg', 'Batter_Avg_Scaled',
]
df[preview_cols].head()


Unnamed: 0,Pitch_Type,Pitch_Code,Bowler,Bowler_Code,Batter_Avg,Batter_Avg_Scaled
0,Batting,0,Bowler B,1,33.46,0.204695
1,Batting,0,Bowler B,1,41.41,1.018946
2,Batting,0,Bowler B,1,26.8,-0.477432
3,Batting,0,Bowler B,1,9.31,-2.268784
4,Batting,0,Bowler B,1,19.52,-1.22306


In [3]:
import numpy as np
import pymc as pm
import arviz as az




In [4]:
# Pull the model inputs out of the frame as plain arrays.
y, pressure, pitch, bowler = (
    df[column].values
    for column in ("Is_Wicket", "pressure_applied", "Pitch_Code", "Bowler_Code")
)

# The scaled batting average is exposed under two names; the PyMC model
# below reads `avg`.
batter_avg = df["Batter_Avg_Scaled"].values
avg = df["Batter_Avg_Scaled"].values


In [5]:
# List every column now on the frame, including the encoded / scaled ones added earlier.
df.columns


Index(['Match_ID', 'Match_Date', 'Pitch_Type', 'Phase', 'Over', 'Ball',
       'Bowler', 'Batter_Avg', 'Batter_SR', 'Runs_Conceded', 'Is_Wicket',
       'is_dot', 'is_death', 'pressure_applied', 'Pitch_Code', 'Bowler_Code',
       'Batter_Avg_Scaled'],
      dtype='object')

In [6]:
# Bayesian logistic regression for the probability that a delivery takes a wicket.
# One offset is estimated per distinct bowler code.
n_bowlers = len(np.unique(bowler))

with pm.Model() as pressure_model:

    # Standard-normal priors on the intercept and the three slopes.
    intercept = pm.Normal("intercept", mu=0, sigma=1)
    beta_pressure = pm.Normal("beta_pressure", mu=0, sigma=1)
    beta_pitch = pm.Normal("beta_pitch", mu=0, sigma=1)
    beta_avg = pm.Normal("beta_avg", mu=0, sigma=1)

    # Per-bowler offsets. Each gets an independent N(0, 1) prior; there is no
    # shared hyperprior on the spread, so no pooling across bowlers is learned.
    bowler_effect = pm.Normal("bowler_effect", mu=0, sigma=1, shape=n_bowlers)

    # Linear predictor on the logit scale; `bowler` indexes the offset vector.
    eta = (
        intercept
        + beta_pressure * pressure
        + beta_pitch * pitch
        + beta_avg * avg
        + bowler_effect[bowler]
    )

    # Wicket probability, tracked as a named deterministic quantity.
    p = pm.Deterministic("p", pm.math.sigmoid(eta))

    # Bernoulli likelihood on the observed wicket indicator.
    y_obs = pm.Bernoulli("y_obs", p=p, observed=y)


In [7]:
import statsmodels.formula.api as smf

# Maximum-likelihood logistic regression of Is_Wicket. `a * b` in the formula
# expands to both main effects plus their interaction.
# NOTE(review): Bowler_Code enters as a single numeric column, which is only a
# valid bowler contrast if it takes exactly two values (0/1) — confirm.
wicket_formula = "Is_Wicket ~ pressure_applied * Bowler_Code + Pitch_Code + Batter_Avg_Scaled"

model = smf.logit(formula=wicket_formula, data=df).fit()

model.summary()


Optimization terminated successfully.
         Current function value: 0.222945
         Iterations 7


0,1,2,3
Dep. Variable:,Is_Wicket,No. Observations:,4800.0
Model:,Logit,Df Residuals:,4794.0
Method:,MLE,Df Model:,5.0
Date:,"Wed, 03 Dec 2025",Pseudo R-squ.:,0.1266
Time:,11:06:52,Log-Likelihood:,-1070.1
converged:,True,LL-Null:,-1225.2
Covariance Type:,nonrobust,LLR p-value:,6.574e-65

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-3.0953,0.125,-24.823,0.000,-3.340,-2.851
pressure_applied,-0.4636,0.312,-1.485,0.137,-1.075,0.148
Bowler_Code,-0.0334,0.149,-0.224,0.823,-0.325,0.258
pressure_applied:Bowler_Code,2.8953,0.346,8.378,0.000,2.218,3.573
Pitch_Code,0.1088,0.071,1.536,0.124,-0.030,0.248
Batter_Avg_Scaled,-0.0695,0.060,-1.165,0.244,-0.187,0.047


In [8]:
# Re-display the summary of the logit `model` fitted in the previous cell.
model.summary()


0,1,2,3
Dep. Variable:,Is_Wicket,No. Observations:,4800.0
Model:,Logit,Df Residuals:,4794.0
Method:,MLE,Df Model:,5.0
Date:,"Wed, 03 Dec 2025",Pseudo R-squ.:,0.1266
Time:,11:06:53,Log-Likelihood:,-1070.1
converged:,True,LL-Null:,-1225.2
Covariance Type:,nonrobust,LLR p-value:,6.574e-65

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-3.0953,0.125,-24.823,0.000,-3.340,-2.851
pressure_applied,-0.4636,0.312,-1.485,0.137,-1.075,0.148
Bowler_Code,-0.0334,0.149,-0.224,0.823,-0.325,0.258
pressure_applied:Bowler_Code,2.8953,0.346,8.378,0.000,2.218,3.573
Pitch_Code,0.1088,0.071,1.536,0.124,-0.030,0.248
Batter_Avg_Scaled,-0.0695,0.060,-1.165,0.244,-0.187,0.047


In [10]:
# === PHASE 3: PRESSURE EFFECT ANALYSIS (SAFE VERSION) ===
#
# Reads the fitted statsmodels logit `model` and derives, on the log-odds
# scale, the pressure effect for each bowler, the B - A contrast, and a buy
# recommendation. All intervals are 94% Wald (normal-approximation)
# confidence intervals built from the MLE standard errors; they are not
# Bayesian HDIs, so they are labelled "CI" throughout.
#
# NOTE(review): the A/B interpretation assumes Bowler_Code only takes the
# values 0 (Bowler A) and 1 (Bowler B) — confirm against the data.

import numpy as np
import pandas as pd
import scipy.stats as st
import matplotlib.pyplot as plt

res = model  # logistic regression results

# Confirmed parameter names
p_name = "pressure_applied"
int_name = "pressure_applied:Bowler_Code"

params = res.params
bse = res.bse
cov = res.cov_params()

z94 = st.norm.ppf(0.97)  # two-sided 94% interval -> 3% in each tail

# Build the coefficient table in one vectorised step; params and bse share
# the same term index, so the arithmetic aligns row by row.
coef_table = pd.DataFrame({
    "coef": params,
    "se": bse,
    "ci94_lo": params - z94 * bse,
    "ci94_hi": params + z94 * bse,
    "odds_ratio": np.exp(params),
}).rename_axis("term")
print("=== Coefficients (94% CI) ===")
display(coef_table)

# PRESSURE EFFECT FOR A AND B
beta_p = params[p_name]                         # pressure main effect
beta_int = params[int_name]                     # bowler*pressure interaction

var_p = cov.loc[p_name, p_name]
var_int = cov.loc[int_name, int_name]
cov_p_int = cov.loc[p_name, int_name]

# Bowler A (code 0) = beta_p
mean_A = beta_p
se_A = np.sqrt(var_p)
ciA = (mean_A - z94*se_A, mean_A + z94*se_A)

# Bowler B (code 1) = beta_p + beta_int
# Var(X + Y) = Var(X) + Var(Y) + 2*Cov(X, Y)
mean_B = beta_p + beta_int
var_B = var_p + var_int + 2*cov_p_int
se_B = np.sqrt(var_B)
ciB = (mean_B - z94*se_B, mean_B + z94*se_B)

# Difference B - A = beta_int
mean_diff = beta_int
se_diff = np.sqrt(var_int)
ciD = (mean_diff - z94*se_diff, mean_diff + z94*se_diff)

# Format the interval bounds explicitly so the output does not leak
# np.float64(...) reprs from printing the raw tuples.
print("\n=== Pressure Effect (log-odds) ===")
print(f"Bowler A: mean={mean_A:.4f}, SE={se_A:.4f}, 94% CI=({ciA[0]:.4f}, {ciA[1]:.4f})")
print(f"Bowler B: mean={mean_B:.4f}, SE={se_B:.4f}, 94% CI=({ciB[0]:.4f}, {ciB[1]:.4f})")
print(f"Difference (B - A): mean={mean_diff:.4f}, SE={se_diff:.4f}, 94% CI=({ciD[0]:.4f}, {ciD[1]:.4f})")

# Probability(B > A) under a normal approximation centred on the estimated
# difference with its standard error.
prob_B_gt_A = 1 - st.norm.cdf(0, loc=mean_diff, scale=se_diff)
print(f"\nP(PressureEffect_B > PressureEffect_A) ≈ {prob_B_gt_A:.3f}")

# Decision: only commit to a bowler when the whole 94% CI of the difference
# lies on one side of zero.
if ciD[0] > 0:
    decision = "BUY B — B has stronger pressure effect (94% CI)"
elif ciD[1] < 0:
    decision = "BUY A — A has stronger pressure effect (94% CI)"
else:
    decision = "AMBIGUOUS — 94% CI overlaps zero"

print("\n=== FINAL DECISION ===")
print(decision)


=== Coefficients (94% CI) ===


Unnamed: 0_level_0,coef,se,ci94_lo,ci94_hi,odds_ratio
term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Intercept,-3.0953,0.1247,-3.3299,-2.8608,0.0453
pressure_applied,-0.4636,0.3121,-1.0507,0.1235,0.629
Bowler_Code,-0.0334,0.1487,-0.3131,0.2464,0.9672
pressure_applied:Bowler_Code,2.8953,0.3456,2.2454,3.5453,18.0894
Pitch_Code,0.1088,0.0708,-0.0244,0.242,1.115
Batter_Avg_Scaled,-0.0695,0.0597,-0.1818,0.0427,0.9328



=== Pressure Effect (log-odds) ===
Bowler A: mean=-0.4636, SE=0.3121, 94% CI=(np.float64(-1.0506630544343312), np.float64(0.12346671874547172))
Bowler B: mean=2.4317, SE=0.1529, 94% CI=(np.float64(2.1441659333777947), np.float64(2.7192860118993405))
Difference (B - A): mean=2.8953, SE=0.3456, 94% CI=(np.float64(2.2453808332907097), np.float64(3.545267447675285))

P(PressureEffect_B > PressureEffect_A) ≈ 1.000

=== FINAL DECISION ===
BUY B — B has stronger pressure effect (94% HDI)


In [11]:
# Print the term labels of the coefficient vector and of its covariance matrix;
# the pressure-effect cells look entries up by these names in both objects.
print(model.params.index)
print(model.cov_params().index)


Index(['Intercept', 'pressure_applied', 'Bowler_Code',
       'pressure_applied:Bowler_Code', 'Pitch_Code', 'Batter_Avg_Scaled'],
      dtype='object')
Index(['Intercept', 'pressure_applied', 'Bowler_Code',
       'pressure_applied:Bowler_Code', 'Pitch_Code', 'Batter_Avg_Scaled'],
      dtype='object')


In [17]:
# 1. Recompute & save summary (safe reproducible block)
#
# Rebuilds the coefficient table and the per-bowler pressure effects from the
# fitted logit `model`, then writes both to CSV. Output paths are relative
# ("../data/..."), the same convention used to load the input CSV, rather than
# an absolute drive path that only exists on one machine.
#
# NOTE(review): the A/B interpretation assumes Bowler_Code only takes the
# values 0 (Bowler A) and 1 (Bowler B) — confirm against the data.

res = model  # statsmodels LogitResults

p_name = "pressure_applied"
int_name = "pressure_applied:Bowler_Code"

params = res.params
bse = res.bse
cov = res.cov_params()
z94 = st.norm.ppf(0.97)  # two-sided 94% interval -> 3% in each tail

# Coefficient table, built in one vectorised step (params and bse share the
# same term index).
coef_df = pd.DataFrame({
    "coef": params,
    "se": bse,
    "ci94_lo": params - z94 * bse,
    "ci94_hi": params + z94 * bse,
    "odds_ratio": np.exp(params),
}).rename_axis("term")
coef_df.to_csv("../data/model_coefficients_with_94CI.csv")

# Pressure effects
beta_p = params[p_name]
beta_int = params[int_name]
var_p = cov.loc[p_name, p_name]
var_int = cov.loc[int_name, int_name]
cov_p_int = cov.loc[p_name, int_name]

# Bowler A (code 0): pressure main effect only.
mean_A = beta_p
se_A = np.sqrt(var_p)
ciA = (mean_A - z94*se_A, mean_A + z94*se_A)

# Bowler B (code 1): main effect plus interaction;
# Var(X + Y) = Var(X) + Var(Y) + 2*Cov(X, Y).
mean_B = beta_p + beta_int
var_B = var_p + var_int + 2*cov_p_int
se_B = np.sqrt(var_B)
ciB = (mean_B - z94*se_B, mean_B + z94*se_B)

# Difference B - A is the interaction coefficient itself.
mean_diff = beta_int
se_diff = np.sqrt(var_int)
ciD = (mean_diff - z94*se_diff, mean_diff + z94*se_diff)
# Normal-approximation probability that the difference is positive.
prob_B_gt_A = 1 - st.norm.cdf(0, loc=mean_diff, scale=se_diff)

summary = {
    "mean_A": mean_A, "se_A": se_A, "ciA_lo": ciA[0], "ciA_hi": ciA[1],
    "mean_B": mean_B, "se_B": se_B, "ciB_lo": ciB[0], "ciB_hi": ciB[1],
    "mean_diff": mean_diff, "se_diff": se_diff, "ciD_lo": ciD[0], "ciD_hi": ciD[1],
    "prob_B_gt_A": prob_B_gt_A
}
pd.Series(summary).to_csv("../data/pressure_effect_summary.csv")
print("Saved: data/model_coefficients_with_94CI.csv and data/pressure_effect_summary.csv")
display(coef_df)
print("\nPressure-effect summary:")
print(summary)


Saved: data/model_coefficients_with_94CI.csv and data/pressure_effect_summary.csv


Unnamed: 0_level_0,coef,se,ci94_lo,ci94_hi,odds_ratio
term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Intercept,-3.0953,0.1247,-3.3299,-2.8608,0.0453
pressure_applied,-0.4636,0.3121,-1.0507,0.1235,0.629
Bowler_Code,-0.0334,0.1487,-0.3131,0.2464,0.9672
pressure_applied:Bowler_Code,2.8953,0.3456,2.2454,3.5453,18.0894
Pitch_Code,0.1088,0.0708,-0.0244,0.242,1.115
Batter_Avg_Scaled,-0.0695,0.0597,-0.1818,0.0427,0.9328



Pressure-effect summary:
{'mean_A': np.float64(-0.4635981678444297), 'se_A': np.float64(0.3121367937691813), 'ciA_lo': np.float64(-1.0506630544343312), 'ciA_hi': np.float64(0.12346671874547172), 'mean_B': np.float64(2.4317259726385676), 'se_B': np.float64(0.15289292669568), 'ciB_lo': np.float64(2.1441659333777947), 'ciB_hi': np.float64(2.7192860118993405), 'mean_diff': np.float64(2.8953241404829972), 'se_diff': np.float64(0.3455686495187301), 'ciD_lo': np.float64(2.2453808332907097), 'ciD_hi': np.float64(3.545267447675285), 'prob_B_gt_A': np.float64(1.0)}


In [16]:
# Show the kernel's current working directory; relative paths such as
# "../data/..." used elsewhere in this notebook resolve against it.
import os
print(os.getcwd())


d:\IPL_Auction_Analytics\notebooks
