In [29]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
import joblib
import os

# 1. Read the raw match records (the relative path is resolved from the current working directory)
df = pd.read_csv('../data/raw/match_data.csv')

# 2. Derive the numeric regression target from the first-innings score text.
#    The first run of digits in each entry is taken as the runs scored.
first_number = df['1st_inning_score'].astype(str).str.extract(r'(\d+)')[0]
df['1st_inning_runs'] = pd.to_numeric(first_number, errors='coerce')

# Entries without any digits become NaN above; those rows cannot be used as training targets.
df = df.dropna(subset=['1st_inning_runs'])

# 3. Integer-encode the categorical columns.
#    Every column gets its own LabelEncoder instance, kept in a dict keyed by the
#    source column name so the fitted mappings can be looked up later.
categorical_columns = ['home_team', 'away_team', 'toss_won', 'decision', 'venue_name']
encoders = {column: LabelEncoder() for column in categorical_columns}

for col, encoder in encoders.items():
    # Cast to str before fitting so each entry is encoded by its string form;
    # the codes are written to a sibling "<column>_enc" column.
    as_text = df[col].astype(str)
    df[f'{col}_enc'] = encoder.fit_transform(as_text)

# 4. Assemble the feature matrix (the five encoded columns) and the target vector
feature_columns = [
    'home_team_enc',
    'away_team_enc',
    'toss_won_enc',
    'decision_enc',
    'venue_name_enc',
]
X = df[feature_columns]
y = df['1st_inning_runs']

# 5. Hold out 20% of the rows; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# 6. Fit the XGBoost regressor on the training portion only
xgb_params = dict(n_estimators=100, max_depth=5, random_state=42)
model = XGBRegressor(**xgb_params)
model.fit(X_train, y_train)

# Create the output folder if needed; exist_ok makes this a no-op when it is already there
os.makedirs('../models', exist_ok=True)

# 7. Persist the fitted model and the dict of fitted encoders as two separate files
for artifact, destination in (
    (model, '../models/score_predictor_model.pkl'),
    (encoders, '../models/score_label_encoders.pkl'),
):
    joblib.dump(artifact, destination)

# 8. Report where each artifact was written
for message in (
    "✅ Model saved to: ../models/score_predictor_model.pkl",
    "✅ Encoders saved to: ../models/score_label_encoders.pkl",
):
    print(message)


✅ Model saved to: ../models/score_predictor_model.pkl
✅ Encoders saved to: ../models/score_label_encoders.pkl


In [32]:
# shape[1] is the column count of X_train, i.e. the number of features the model was fit on
# (note the printed label says "shape", but only the feature count is shown).
n_features = X_train.shape[1]
print("Input shape used for training:", n_features)


Input shape used for training: 5


In [33]:
# Show the venue names known to the fitted venue encoder (its classes_ array).
# NOTE(review): the recorded output lists both 'Dr DY Patil Sports Academy, Mumbai'
# and 'Dr DY Patil Sports Academy, Navi Mumbai' — possibly one ground under two
# spellings; confirm whether venue names should be normalised before encoding.
venue_encoder = encoders['venue_name']
print(venue_encoder.classes_)


['Arun Jaitley Stadium, Delhi' 'Barabati Stadium, Cuttack'
 'Barsapara Cricket Stadium, Guwahati'
 'Bharat Ratna Shri Atal Bihari Vajpayee Ekana Cricket Stadium, Lucknow'
 'Brabourne Stadium, Mumbai' 'Buffalo Park, East London'
 'Diamond Oval, Kimberley' 'Dr DY Patil Sports Academy, Mumbai'
 'Dr DY Patil Sports Academy, Navi Mumbai'
 'Dr. Y.S. Rajasekhara Reddy ACA-VDCA Cricket Stadium, Visakhapatnam'
 'Dubai International Cricket Stadium' 'Eden Gardens, Kolkata'
 'Green Park, Kanpur'
 'Himachal Pradesh Cricket Association Stadium, Dharamsala'
 'Holkar Cricket Stadium, Indore'
 'JSCA International Stadium Complex, Ranchi' 'Kingsmead, Durban'
 'M.Chinnaswamy Stadium, Bengaluru'
 'MA Chidambaram Stadium, Chepauk, Chennai'
 'Maharashtra Cricket Association Stadium, Pune'
 'Mangaung Oval, Bloemfontein' 'Narendra Modi Stadium, Motera, Ahmedabad'
 'Nehru Stadium, Kochi' 'Newlands, Cape Town'
 'Punjab Cricket Association IS Bindra Stadium, Mohali, Chandigarh'
 'Rajiv Gandhi International Stad