In [2]:
import numpy as np
import pandas as pd
import matplotlib as mp
import requests
from bs4 import BeautifulSoup
import json

In [3]:
url = 'https://stats.espncricinfo.com/ci/engine/stats/index.html?class=6;filter=advanced;orderby=runs;size=200;template=results;trophy=117;type=batting'

In [4]:
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Referer": "https://www.espncricinfo.com/",
}
response = requests.get(url, headers=headers)
html_content = response.text

In [None]:
def extract_batting_data(html_content):
    soup = BeautifulSoup(html_content, "html.parser")

    tables = soup.select("table.engineTable")

    my_table = tables[2]

    # Get all rows from the tbody (third child of table)
    rows = my_table.find_all("tr")[1:]  # Skipping header row
    batting_data = []
    for row in rows:
        cols = row.find_all("td")
        if len(cols) < 10:
            continue  # Skip if the row doesn't have enough columns
        
        temp_data = {
            "player": cols[0].text.strip(),
            "span": cols[1].text.strip(),
            "mat": cols[2].text.strip(),
            "runs": cols[5].text.strip(),
            "avg": cols[7].text.strip(),
            "sr": cols[9].text.strip(),
            "fours": cols[13].text.strip(),
            "sixes":cols[14].text.strip(),
        }
        batting_data.append(temp_data)
    return batting_data

In [6]:
def extract_bowling_data(html_content):
    soup = BeautifulSoup(html_content, "html.parser")
    tables = soup.select("table.engineTable")
    if len(tables) < 3:
        return []
    my_table = tables[2]

    rows = my_table.find_all("tr")[1:]
    bowling_data = []
    for row in rows:
        cols = row.find_all("td")
        if len(cols) < 11:
            continue 
        temp_data = {
            "player": cols[0].text.strip().split("\n")[0],
            "span": cols[1].text.strip(),
            "mat": cols[2].text.strip(),
            "wickets": cols[7].text.strip(),
            "econ": cols[10].text.strip(),
            "sr": cols[11].text.strip(),
        }
        bowling_data.append(temp_data)
    return bowling_data 

In [7]:
data = []
batting_data = []
bowling_data = []
data = [batting_data, bowling_data]

In [None]:
#data from all major t20 leagues
#https://stats.espncricinfo.com/ci/engine/stats/index.html?class=6;filter=advanced;orderby=runs;page=1;size=200;template=results;trophy=117;type=batting
for i in range(1,11):
    link = r"https://stats.espncricinfo.com/ci/engine/stats/index.html?class=6;filter=advanced;orderby=runs;page={};size=200;template=results;trophy=117;trophy=120;trophy=142;trophy=158;trophy=159;trophy=167;trophy=205;trophy=52;trophy=730;trophy=748;trophy=765;trophy=865;trophy=89;trophy=942;trophy=985;trophy=987;type=batting"
    r = requests.get(link.format(i), headers=headers)
    html_content = r.text
    parsed_data = extract_batting_data(html_content=html_content)
    batting_data.extend(parsed_data)

In [9]:
#bowling data for all t20 leagues
# https://stats.espncricinfo.com/ci/engine/stats/index.html?class=6;filter=advanced;orderby=wickets;page=2;size=200;template=results;trophy=117;trophy=120;trophy=142;trophy=158;trophy=159;trophy=167;trophy=205;trophy=52;trophy=730;trophy=748;trophy=765;trophy=865;trophy=89;trophy=942;trophy=985;trophy=987;type=bowling
for i in range(1,11):
    link = r"https://stats.espncricinfo.com/ci/engine/stats/index.html?class=6;filter=advanced;orderby=wickets;page={};size=200;template=results;trophy=117;trophy=120;trophy=142;trophy=158;trophy=159;trophy=167;trophy=205;trophy=52;trophy=730;trophy=748;trophy=765;trophy=865;trophy=89;trophy=942;trophy=985;trophy=987;type=bowling"
    r = requests.get(link.format(i), headers=headers)
    html_content = r.text
    parsed_data = extract_bowling_data(html_content=html_content)
    bowling_data.extend(parsed_data)

In [None]:
print(json.dumps(data[1][:2], indent=4))

In [16]:
batting_df = pd.DataFrame.from_dict(data[0])
batting_df

Unnamed: 0,player,span,mat,runs,avg,sr,fours,sixes
0,CH Gayle,2007-2022,356,11448,36.92,143.92,866,839
1,V Kohli,2007-2025,323,10622,41.17,133.09,953,344
2,KA Pollard,2009-2025,512,10148,30.11,150.02,646,681
3,DA Warner,2009-2025,296,9778,37.60,139.16,998,342
4,RG Sharma,2007-2025,352,9296,30.88,132.61,848,396
...,...,...,...,...,...,...,...,...
1995,J Merlo,2019-2024,17,65,6.50,98.48,6,0
1996,PP Ojha,2007-2018,139,65,3.61,63.72,0,2
1997,SA Quadri,2010-2011,6,65,21.66,98.48,4,1
1998,Rakibul Hasan,2023-2025,22,65,10.83,120.37,5,4


In [17]:
bowling_df = pd.DataFrame.from_dict(data[1])
bowling_df

Unnamed: 0,player,span,mat,wickets,econ,sr
0,DJ Bravo,2007-2024,433,496,8.42,16.9
1,SP Narine,2011-2025,431,457,6.13,21.7
2,Rashid Khan,2016-2025,326,416,6.61,18.1
3,AD Russell,2012-2025,415,386,8.83,16.5
4,Shakib Al Hasan,2007-2025,319,336,6.90,19.3
...,...,...,...,...,...,...
1995,J Charles,2012-2025,204,2,11.04,12.5
1996,D Chauhan,2007-2009,4,2,7.00,27.0
1997,HM Chauhan,2024-2024,2,2,7.57,21.0
1998,PR Chauhan,2019-2022,9,2,9.71,21.0


In [18]:
df = [batting_df, bowling_df]
df

[             player       span  mat   runs    avg      sr fours sixes
 0          CH Gayle  2007-2022  356  11448  36.92  143.92   866   839
 1           V Kohli  2007-2025  323  10622  41.17  133.09   953   344
 2        KA Pollard  2009-2025  512  10148  30.11  150.02   646   681
 3         DA Warner  2009-2025  296   9778  37.60  139.16   998   342
 4         RG Sharma  2007-2025  352   9296  30.88  132.61   848   396
 ...             ...        ...  ...    ...    ...     ...   ...   ...
 1995        J Merlo  2019-2024   17     65   6.50   98.48     6     0
 1996        PP Ojha  2007-2018  139     65   3.61   63.72     0     2
 1997      SA Quadri  2010-2011    6     65  21.66   98.48     4     1
 1998  Rakibul Hasan  2023-2025   22     65  10.83  120.37     5     4
 1999       M Shumba  2022-2022    7     65  10.83   79.26     5     0
 
 [2000 rows x 8 columns],
                player       span  mat wickets   econ    sr
 0            DJ Bravo  2007-2024  433     496   8.42  16.9


In [20]:
for d in df:
    d = d.dropna()

In [21]:
print(df[0].columns)
print(df[1].columns)

Index(['player', 'span', 'mat', 'runs', 'avg', 'sr', 'fours', 'sixes'], dtype='object')
Index(['player', 'span', 'mat', 'wickets', 'econ', 'sr'], dtype='object')


In [22]:
# Process batting data
d = df[0] # Access Batting DataFrame


d["mat"] = pd.to_numeric(d["mat"], errors="coerce").fillna(0).astype(int)
d["runs"] = d["runs"].astype(str).str.replace(",", "", regex=True).replace(["-", "DNB", "NA", ""], "0").astype(int)
d["avg"] = pd.to_numeric(d["avg"], errors="coerce").fillna(0).astype(float)
d["sr"] = pd.to_numeric(d["sr"], errors="coerce").fillna(0).astype(float)
d["fours"] = pd.to_numeric(d["fours"], errors="coerce").fillna(0).astype(int)
d["sixes"] = pd.to_numeric(d["sixes"], errors="coerce").fillna(0).astype(int)


d[["start_year", "end_year"]] = d["span"].str.split("-", expand=True).astype(int)
d["career_length"] = d["end_year"] - d["start_year"] + 1


print(d.head())


       player       span  mat   runs    avg      sr  fours  sixes  start_year  \
0    CH Gayle  2007-2022  356  11448  36.92  143.92    866    839        2007   
1     V Kohli  2007-2025  323  10622  41.17  133.09    953    344        2007   
2  KA Pollard  2009-2025  512  10148  30.11  150.02    646    681        2009   
3   DA Warner  2009-2025  296   9778  37.60  139.16    998    342        2009   
4   RG Sharma  2007-2025  352   9296  30.88  132.61    848    396        2007   

   end_year  career_length  
0      2022             16  
1      2025             19  
2      2025             17  
3      2025             17  
4      2025             19  


In [23]:
# Process Bowling Data
d = df[1]  # Access bowling DataFrame

d["mat"] = pd.to_numeric(d["mat"], errors="coerce").fillna(0).astype(int)
d["wickets"] = (
    d["wickets"]
    .astype(str)  
    .str.replace(",", "", regex=True)  
    .replace(["-", "DNB", "NA", ""], "0")  
    .astype(int)  
)
d["econ"] = pd.to_numeric(d["econ"], errors="coerce").fillna(0).astype(float)
d["sr"] = pd.to_numeric(d["sr"], errors="coerce").fillna(0).astype(float)

d["start_year"] = d["span"].apply(lambda x: int(x.split("-")[0]) if isinstance(x, str) else 0)
d["end_year"] = d["span"].apply(lambda x: int(x.split("-")[1]) if isinstance(x, str) else 0)
d["career_length"] = d["end_year"] - d["start_year"] + 1

print(d.head())  


            player       span  mat  wickets  econ    sr  start_year  end_year  \
0         DJ Bravo  2007-2024  433      496  8.42  16.9        2007      2024   
1        SP Narine  2011-2025  431      457  6.13  21.7        2011      2025   
2      Rashid Khan  2016-2025  326      416  6.61  18.1        2016      2025   
3       AD Russell  2012-2025  415      386  8.83  16.5        2012      2025   
4  Shakib Al Hasan  2007-2025  319      336  6.90  19.3        2007      2025   

   career_length  
0             18  
1             15  
2             10  
3             14  
4             19  


In [None]:
df[0].to_csv("data/batting_data.csv", index="False")

In [None]:
df[1].to_csv("data/bowling_data.csv", index="False")

In [18]:
# Final Model Training  
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, log_loss
import joblib  # For saving the unified model


df = pd.read_csv("data/batting_data.csv")

# Feature Engineering - Adding Boundary Percentage
df["boundary_pct"] = ((4 * df["fours"] + 6 * df["sixes"]) / df["runs"]) * 100


features = ["mat", "runs", "avg", "sr", "career_length", "fours", "sixes", "boundary_pct"]

X = df[features].fillna(df[features].mean())  # Handle missing values


def apply_feature_weights(X, role_type):
    weights = {
        "Anchor":        {"runs": 1.6, "avg": 1.5, "sr": 0.7, "fours": 1.2, "sixes": 0.6, "boundary_pct": 0.8},
        "Balanced":      {"runs": 1.0, "avg": 1.0, "sr": 1.0, "fours": 1.0, "sixes": 1.0, "boundary_pct": 1.0},
        "Power Hitter":  {"runs": 0.7, "avg": 0.5, "sr": 2.0, "fours": 1.4, "sixes": 2.2, "boundary_pct": 2.0},
        "Finisher":      {"runs": 0.4, "avg": 0.8, "sr": 2.2, "fours": 1.6, "sixes": 2.0, "boundary_pct": 2.2}
    }
    
    weighted_X = X.copy()
    for feature, weight in weights[role_type].items():
        if feature in weighted_X.columns:
            weighted_X[feature] *= weight
    
    return weighted_X


# Inexperienced Players (Less than 20 matches or 500 runs)
inexperienced_players = df[(df["mat"] < 20) | (df["runs"] < 500)].copy()
experienced_players = df[(df["mat"] >= 20) & (df["runs"] >= 500)].copy()

# K-Means for Inexperienced Players
X_inexperienced = apply_feature_weights(X.loc[inexperienced_players.index], "Balanced")
kmeans_inexperienced = KMeans(n_clusters=2, random_state=42, n_init=10)
inexperienced_players.loc[:, "role"] = kmeans_inexperienced.fit_predict(X_inexperienced)

inexperienced_players["role"] = inexperienced_players["role"].map({
    0: "Inexperienced - Potential Anchor",
    1: "Inexperienced - Potential Power Hitter"
})

# Power Hitter Identification (Rule-Based)
X_experienced_power = apply_feature_weights(X.loc[experienced_players.index], "Power Hitter")
power_hitter_criteria = (
    (experienced_players["boundary_pct"] >= 60) &
    (experienced_players["sr"] >= 140)
)
experienced_players.loc[power_hitter_criteria, "role"] = "Power Hitter"

# Finisher Identification (Rule-Based)
finisher_criteria = (
    (experienced_players["sr"] >= 155) & (experienced_players["boundary_pct"] >= 55)
)
experienced_players.loc[finisher_criteria, "role"] = "Finisher"

# GMM for Remaining Experienced Players
non_power_hitters = experienced_players[experienced_players["role"].isna()]
X_experienced_gmm = apply_feature_weights(X.loc[non_power_hitters.index], "Balanced")

gmm = GaussianMixture(n_components=4, covariance_type='diag', random_state=42)
non_power_hitters.loc[:, "GMM_Cluster"] = gmm.fit_predict(X_experienced_gmm)

# GMM Role Mapping 
role_mapping = {
    0: "Anchor",
    1: "Balanced Player",
    2: "Power Hitter",
    3: "Finisher"
}
non_power_hitters.loc[:, "role"] = non_power_hitters["GMM_Cluster"].map(role_mapping)

# Combine Experienced Data Back
experienced_players.update(non_power_hitters)


# Final Combined Dataset
df_final = pd.concat([experienced_players, inexperienced_players])


X_final = df_final[features]
y_final = df_final["role"]

# Train-Test Split with Stratification
X_train, X_test, y_train, y_test = train_test_split(
    X_final, y_final, test_size=0.2, random_state=42, stratify=y_final
)

# Neural Network with Hyperparameter Tuning
nn_params = {
    'hidden_layer_sizes': [
        (256, 128),          
        (512, 256, 128),     
        (128, 64, 32, 16),   
    ],
    'activation': ['relu', 'tanh'],
    'alpha': [1e-5, 1e-4, 1e-3],
    'learning_rate_init': [0.001, 0.0005],
    'max_iter': [2000, 2500]
}
nn_model = GridSearchCV(MLPClassifier(random_state=42), nn_params, cv=3)
nn_model.fit(X_train, y_train)

# Predictions
y_pred_nn = nn_model.best_estimator_.predict(X_test)
y_pred_proba = nn_model.best_estimator_.predict_proba(X_test)


print("Neural Network Classification Report:")
print(classification_report(y_test, y_pred_nn))
print(f"Log Loss: {log_loss(y_test, y_pred_proba):.4f}")


full_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=3)),
    ('mlp_classifier', nn_model.best_estimator_)
])

# Saving the Unified Pipeline
joblib.dump(full_pipeline, "batting_model.pkl")

print("Batting data model saved successfully!")
# Fit the full pipeline
full_pipeline.fit(X_train, y_train)

# Save fitted pipeline
joblib.dump(full_pipeline, "batting_model.pkl")
print("Fitted pipeline saved!")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  non_power_hitters.loc[:, "GMM_Cluster"] = gmm.fit_predict(X_experienced_gmm)


Neural Network Classification Report:
                                        precision    recall  f1-score   support

                                Anchor       0.75      0.93      0.83        41
                       Balanced Player       0.95      0.86      0.90        22
                              Finisher       0.85      0.56      0.68        41
      Inexperienced - Potential Anchor       0.92      0.98      0.95        83
Inexperienced - Potential Power Hitter       1.00      0.98      0.99       186
                          Power Hitter       0.58      0.67      0.62        27

                              accuracy                           0.91       400
                             macro avg       0.84      0.83      0.83       400
                          weighted avg       0.91      0.91      0.90       400

Log Loss: 0.2555
Batting data model saved successfully!
Fitted pipeline saved!


In [16]:

# Load the saved model
model = joblib.load("batting_model.pkl")

In [22]:
# Final Model Training with Random Forest
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, log_loss
import joblib  # For saving the unified model


df = pd.read_csv("data/batting_data.csv")

# Feature Engineering - Adding Boundary Percentage
df["boundary_pct"] = ((4 * df["fours"] + 6 * df["sixes"]) / df["runs"]) * 100

features = ["mat", "runs", "avg", "sr", "career_length", "fours", "sixes", "boundary_pct"]
X = df[features].fillna(df[features].mean())  # Handle missing values


def apply_feature_weights(X, role_type):
    weights = {
        "Anchor":        {"runs": 1.6, "avg": 1.5, "sr": 0.7, "fours": 1.2, "sixes": 0.6, "boundary_pct": 0.8},
        "Balanced":      {"runs": 1.0, "avg": 1.0, "sr": 1.0, "fours": 1.0, "sixes": 1.0, "boundary_pct": 1.0},
        "Power Hitter":  {"runs": 0.7, "avg": 0.5, "sr": 2.0, "fours": 1.4, "sixes": 2.2, "boundary_pct": 2.0},
        "Finisher":      {"runs": 0.5, "avg": 0.8, "sr": 2.2, "fours": 1.6, "sixes": 2.0, "boundary_pct": 2.2}
    }

    weighted_X = X.copy()
    for feature, weight in weights[role_type].items():
        if feature in weighted_X.columns:
            weighted_X[feature] *= weight
    
    return weighted_X


# Inexperienced Players
inexperienced_players = df[(df["mat"] < 20) | (df["runs"] < 500)].copy()
experienced_players = df[(df["mat"] >= 20) & (df["runs"] >= 500)].copy()

# K-Means for Inexperienced Players
X_inexperienced = apply_feature_weights(X.loc[inexperienced_players.index], "Balanced")
kmeans_inexperienced = KMeans(n_clusters=2, random_state=42, n_init=10)
inexperienced_players.loc[:, "role"] = kmeans_inexperienced.fit_predict(X_inexperienced)
inexperienced_players["role"] = inexperienced_players["role"].map({
    0: "Inexperienced - Potential Anchor",
    1: "Inexperienced - Potential Power Hitter"
})

# Power Hitter Identification
X_experienced_power = apply_feature_weights(X.loc[experienced_players.index], "Power Hitter")
power_hitter_criteria = (
    (experienced_players["boundary_pct"] > 60) &
    (experienced_players["sr"] > 140)
)
experienced_players.loc[power_hitter_criteria, "role"] = "Power Hitter"

# Finisher Identification
finisher_criteria = (
    (experienced_players["sr"] > 150) &
    ((experienced_players["fours"] + experienced_players["sixes"]) > 40)
)
experienced_players.loc[finisher_criteria, "role"] = "Finisher"

# GMM for Remaining Experienced Players
non_power_hitters = experienced_players[experienced_players["role"].isna()]
X_experienced_gmm = apply_feature_weights(X.loc[non_power_hitters.index], "Balanced")

gmm = GaussianMixture(n_components=4, covariance_type='diag', random_state=42)
non_power_hitters.loc[:, "GMM_Cluster"] = gmm.fit_predict(X_experienced_gmm)

role_mapping = {
    0: "Anchor",
    1: "Balanced Player",
    2: "Power Hitter",
    3: "Finisher"
}
non_power_hitters.loc[:, "role"] = non_power_hitters["GMM_Cluster"].map(role_mapping)

# Merge back
experienced_players.update(non_power_hitters)
df_final = pd.concat([experienced_players, inexperienced_players])

# Train/Test Split
X_final = df_final[features]
y_final = df_final["role"]
X_train, X_test, y_train, y_test = train_test_split(
    X_final, y_final, test_size=0.2, random_state=42, stratify=y_final
)

# Random Forest with Hyperparameter Tuning
rf_params = {
    'n_estimators': [1000],
    'max_depth': [None],
    'min_samples_split': [2],
    'min_samples_leaf': [1],
    'max_features': ['sqrt']
}
rf_model = GridSearchCV(RandomForestClassifier(random_state=42), rf_params, cv=3, n_jobs=-1)
rf_model.fit(X_train, y_train)

# Predictions
y_pred_rf = rf_model.best_estimator_.predict(X_test)
y_pred_proba = rf_model.best_estimator_.predict_proba(X_test)

print("Random Forest Classification Report:")
print(classification_report(y_test, y_pred_rf))
print(f"Log Loss: {log_loss(y_test, y_pred_proba):.4f}")

# Full pipeline
full_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=3)),
    ('rf_classifier', rf_model.best_estimator_)
])

# Save trained pipeline
full_pipeline.fit(X_train, y_train)
joblib.dump(full_pipeline, "batting_rf_model.pkl")
print("Random Forest batting model saved successfully!")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  non_power_hitters.loc[:, "GMM_Cluster"] = gmm.fit_predict(X_experienced_gmm)


Random Forest Classification Report:
                                        precision    recall  f1-score   support

                                Anchor       0.98      1.00      0.99        41
                       Balanced Player       1.00      1.00      1.00        22
                              Finisher       0.97      0.89      0.93        44
      Inexperienced - Potential Anchor       0.98      1.00      0.99        83
Inexperienced - Potential Power Hitter       1.00      0.99      1.00       186
                          Power Hitter       0.88      0.96      0.92        24

                              accuracy                           0.98       400
                             macro avg       0.97      0.97      0.97       400
                          weighted avg       0.98      0.98      0.98       400

Log Loss: 0.1138
Random Forest batting model saved successfully!


In [4]:
rf_model = joblib.load("batting_rf_model.pkl")

In [9]:
# Example predictions on test set
y_test_pred = model.predict(X_test)
y_test_proba = model.predict_proba(X_test)

# Example new player stats (make sure order matches 'features')
new_player = pd.DataFrame([{
    "mat": 40,
    "runs": 1200,
    "avg": 30,
    "sr": 160.0,
    "career_length": 6,
    "fours": 65,
    "sixes": 30,
    "boundary_pct": ((65*4 + 30*6) / 1200) * 100
}])

pred_role = model.predict(new_player)
pred_proba = model.predict_proba(new_player)

print("Predicted role:", pred_role[0])


Predicted role: Finisher


In [None]:
# NN for bowling data
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report
import joblib

# Load and preprocess data
df = pd.read_csv("data/bowling_data.csv")

# Ensure career length
if 'career_length' not in df.columns:
    df['career_length'] = df['end_year'] - df['start_year'] + 1

# Role assignment thresholds
wicket_taker_threshold = df['sr'].quantile(0.4)
economist_threshold = df['econ'].quantile(0.4)
experience_threshold_years = 4.5
experience_threshold_matches = 50

# Binary labels
df['is_wicket_taker'] = (df['sr'] <= wicket_taker_threshold) & (df['wickets'] > df['wickets'].quantile(0.5))
df['is_economist'] = df['econ'] <= economist_threshold
df['is_experienced'] = (df['career_length'] > experience_threshold_years) & (df['mat'] > experience_threshold_matches)

# Composite role logic
def assign_role(row):
    role_parts = []
    experience = "Experienced" if row['is_experienced'] else "Inexperienced"
    role_parts.append(experience)

    if row['is_wicket_taker'] and row['is_economist']:
        role_parts.append("Elite Bowler")
    elif row['is_wicket_taker']:
        role_parts.append("Wicket Taker")
    elif row['is_economist']:
        role_parts.append("Economist")
    else:
        role_parts.append("Balanced Bowler")

    return " ".join(role_parts)

df['role'] = df.apply(assign_role, axis=1)

print("\nRole distribution:")
print(df['role'].value_counts())

# Train ML model
features = ["mat", "wickets", "econ", "sr", "career_length"]
X = df[features]
y = df['role']

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# NN hyperparameters
nn_params = {
    'hidden_layer_sizes': [
        (128, 64, 32),
        (256, 128, 64),
        (512, 256, 128)
    ],
    'activation': ['relu'],
    'alpha': [1e-5, 1e-4, 1e-3],
    'learning_rate_init': [0.001, 0.005],
    'max_iter': [2000, 2500]
}

grid = GridSearchCV(MLPClassifier(random_state=42), nn_params, cv=3)

# Unified pipeline
bowling_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=3)),
    ('mlp_classifier', grid)
])

# Fit model
bowling_pipeline.fit(X_train, y_train)

# Evaluate
y_pred = bowling_pipeline.predict(X_test)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Save unified model

joblib.dump(bowling_pipeline, "bowling_model.pkl")
print("\nUnified bowling model saved as 'bowling_model.pkl'")

# Function to classify new bowler

def classify_bowler(player_data):
    player_df = pd.DataFrame([player_data])
    model = joblib.load("bowling_model.pkl")
    return model.predict(player_df[features])[0]


Role distribution:
role
Inexperienced Balanced Bowler    733
Inexperienced Economist          526
Experienced Balanced Bowler      199
Inexperienced Wicket Taker       170
Experienced Economist            140
Inexperienced Elite Bowler       106
Experienced Wicket Taker          94
Experienced Elite Bowler          32
Name: count, dtype: int64

Classification Report:
                               precision    recall  f1-score   support

  Experienced Balanced Bowler       0.79      0.68      0.73        40
        Experienced Economist       0.82      0.82      0.82        28
     Experienced Elite Bowler       0.80      0.67      0.73         6
     Experienced Wicket Taker       0.70      0.84      0.76        19
Inexperienced Balanced Bowler       0.89      0.95      0.92       147
      Inexperienced Economist       0.95      0.91      0.93       105
   Inexperienced Elite Bowler       0.71      0.81      0.76        21
   Inexperienced Wicket Taker       0.72      0.62      0.67

In [38]:
# Final Model Training with Random Forest (Weights Removed)
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, log_loss
import joblib

# -------------------------
# Load Data
# -------------------------
df = pd.read_csv("data/batting_data.csv")

# Feature Engineering
df["boundary_pct"] = ((4 * df["fours"] + 6 * df["sixes"]) / df["runs"]) * 100
features = ["mat", "runs", "avg", "sr", "career_length", "fours", "sixes", "boundary_pct"]

# Fill missing values
X = df[features].fillna(df[features].mean())

# -------------------------
# Inexperienced vs Experienced Players
# -------------------------
inexperienced_players = df[(df["mat"] < 20) | (df["runs"] < 500)].copy()
experienced_players = df[(df["mat"] >= 20) & (df["runs"] >= 500)].copy()

# -------------------------
# K-Means for Inexperienced Players
# -------------------------
X_inexperienced = X.loc[inexperienced_players.index]  # No manual weights
kmeans_inexperienced = KMeans(n_clusters=2, random_state=42, n_init=10)
inexperienced_players.loc[:, "role"] = kmeans_inexperienced.fit_predict(X_inexperienced)
inexperienced_players["role"] = inexperienced_players["role"].map({
    0: "Inexperienced - Potential Anchor",
    1: "Inexperienced - Potential Power Hitter"
})

# -------------------------
# Rule-Based for Experienced Players
# -------------------------
# Power Hitter
power_hitter_criteria = (
    (experienced_players["boundary_pct"] > 60) &
    (experienced_players["sr"] > 140)
)
experienced_players.loc[power_hitter_criteria, "role"] = "Power Hitter"

# Finisher
finisher_criteria = (
    (experienced_players["sr"] > 150) &
    ((experienced_players["fours"] + experienced_players["sixes"]) > 40)
)
experienced_players.loc[finisher_criteria, "role"] = "Finisher"

# -------------------------
# GMM for Remaining Experienced Players
# -------------------------
non_power_hitters = experienced_players[experienced_players["role"].isna()]
X_experienced_gmm = X.loc[non_power_hitters.index]  # No manual weights

gmm = GaussianMixture(n_components=4, covariance_type='diag', random_state=42)
non_power_hitters.loc[:, "GMM_Cluster"] = gmm.fit_predict(X_experienced_gmm)

role_mapping = {
    0: "Anchor",
    1: "Balanced Player",
    2: "Power Hitter",
    3: "Finisher"
}
non_power_hitters.loc[:, "role"] = non_power_hitters["GMM_Cluster"].map(role_mapping)

# Merge back
experienced_players.update(non_power_hitters)
df_final = pd.concat([experienced_players, inexperienced_players])

# -------------------------
# Train/Test Split
# -------------------------
X_final = df_final[features]
y_final = df_final["role"]

X_train, X_test, y_train, y_test = train_test_split(
    X_final, y_final, test_size=0.2, random_state=42, stratify=y_final
)

# -------------------------
# Random Forest with GridSearch
# -------------------------
rf_params = {
    'n_estimators': [500, 1000],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'max_features': ['sqrt', 'log2']
}

rf_model = GridSearchCV(RandomForestClassifier(random_state=42), rf_params, cv=3, n_jobs=-1)
rf_model.fit(X_train, y_train)

# Predictions
y_pred_rf = rf_model.best_estimator_.predict(X_test)
y_pred_proba = rf_model.best_estimator_.predict_proba(X_test)

print("Random Forest Classification Report:")
print(classification_report(y_test, y_pred_rf))
print(f"Log Loss: {log_loss(y_test, y_pred_proba):.4f}")

# Feature Importance
feat_imp = pd.DataFrame({
    "feature": features,
    "importance": rf_model.best_estimator_.feature_importances_
}).sort_values("importance", ascending=False)
print("\nFeature Importances:")
print(feat_imp)

# -------------------------
# Full Pipeline with Scaling + PCA
# -------------------------
full_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=3)),
    ('rf_classifier', rf_model.best_estimator_)
])

# Fit and Save
full_pipeline.fit(X_train, y_train)
joblib.dump(full_pipeline, "batting_rf_model.pkl")
print("Random Forest batting model saved successfully!")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  non_power_hitters.loc[:, "GMM_Cluster"] = gmm.fit_predict(X_experienced_gmm)


Random Forest Classification Report:
                                        precision    recall  f1-score   support

                                Anchor       0.97      0.95      0.96        41
                       Balanced Player       1.00      1.00      1.00        22
                              Finisher       0.97      0.86      0.92        44
      Inexperienced - Potential Anchor       0.98      1.00      0.99        83
Inexperienced - Potential Power Hitter       1.00      0.99      1.00       186
                          Power Hitter       0.83      1.00      0.91        24

                              accuracy                           0.98       400
                             macro avg       0.96      0.97      0.96       400
                          weighted avg       0.98      0.98      0.98       400

Log Loss: 0.0865

Feature Importances:
         feature  importance
1           runs    0.468937
5          fours    0.239258
6          sixes    0.112586
0    