# This is an optimized version for Cricket Score prediction!


We are going to use a Stacking Regressor for an optimized and robust output



In [77]:
import pickle
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor,StackingRegressor
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV,train_test_split,KFold
from sklearn.preprocessing import OneHotEncoder,LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import r2_score

In [78]:
df=pickle.load(open('dataset_level3.pkl','rb'))

In [79]:
df.sample()

Unnamed: 0,batting_team,bowling_team,city,current_score,balls_left,wickets_left,current_run_rate,last_five,runs_x
12861,West Indies,New Zealand,Auckland,44,102,10,14.666667,,180


# Splitting of data into Training and Testing dataset


In [80]:
X=df.iloc[:,:-1]
y=df.iloc[:,-1]


In [81]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=2)

In [82]:
from sklearn.preprocessing import OneHotEncoder
import pandas as pd

# Initialize OneHotEncoder
ohe = OneHotEncoder(sparse_output=False, drop='first')  # drop='first' to avoid dummy variable trap

# Fit and transform on the training data for the categorical columns
X_train_encoded = ohe.fit_transform(X_train[['batting_team', 'bowling_team', 'city']])
X_test_encoded = ohe.transform(X_test[['batting_team', 'bowling_team', 'city']])
    
# Create DataFrames from the one-hot encoded arrays
encoded_columns_train = pd.DataFrame(X_train_encoded, columns=ohe.get_feature_names_out(['batting_team', 'bowling_team', 'city']))
encoded_columns_test = pd.DataFrame(X_test_encoded, columns=ohe.get_feature_names_out(['batting_team', 'bowling_team', 'city']))

# Concatenate the original DataFrame with the one-hot encoded DataFrame
X_train = pd.concat([X_train.reset_index(drop=True), encoded_columns_train.reset_index(drop=True)], axis=1)
X_test = pd.concat([X_test.reset_index(drop=True), encoded_columns_test.reset_index(drop=True)], axis=1)

# Drop the original categorical columns
X_train.drop(['batting_team', 'bowling_team', 'city'], axis=1, inplace=True)
X_test.drop(['batting_team', 'bowling_team', 'city'], axis=1, inplace=True)


In [83]:
X_train.sample()

Unnamed: 0,current_score,balls_left,wickets_left,current_run_rate,last_five,batting_team_Bangladesh,batting_team_England,batting_team_India,batting_team_New Zealand,batting_team_Pakistan,...,city_Mount Maunganui,city_Mumbai,city_Nagpur,city_Nottingham,city_Pallekele,city_Southampton,city_St Lucia,city_Sydney,city_Trinidad,city_Wellington
4699,2,118,10,2.0,,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [65]:
param_grid={
    'n_estimators':[10,50,100,200,400,500,800,1000],
    'learning_rate':[0.01,0.02,0.05,0.09,0.1,0.5,0.9],
    'max_depth':[2,3,4,5,6]
}
xgb=XGBRegressor()
grid_search=GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    cv=5,
    scoring='r2',
    n_jobs=-1,
    verbose=2
)

grid_search.fit(X_train,y_train)
print('Best parameters : ', grid_search.best_params_)
print('Best R2 Score : ', grid_search.best_score_)

Fitting 5 folds for each of 280 candidates, totalling 1400 fits
Best parameters :  {'learning_rate': 0.5, 'max_depth': 6, 'n_estimators': 800}
Best R2 Score :  0.9441490694785953


In [69]:
param_grid2={
    'n_estimators':[10,50,100,200,400,500,800,1000],
    'bootstrap':[True,False],
    'max_depth':[2,3,4,5,6]
}
rf=RandomForestRegressor()
grid_search=GridSearchCV(
    estimator=rf,
    param_grid=param_grid2,
    cv=5,
    scoring='r2',
    n_jobs=-1,
    verbose=2
)

grid_search.fit(X_train,y_train)
print('Best parameters : ', grid_search.best_params_)
print('Best R2 Score : ', grid_search.best_score_)

Fitting 5 folds for each of 80 candidates, totalling 400 fits
Best parameters :  {'bootstrap': True, 'max_depth': 6, 'n_estimators': 400}
Best R2 Score :  0.5637681047762952


# We have created our stack layers of XGBoost

In [108]:
#defining layer1 of the stacking model
layer_1_models=[
    ('xgb1',XGBRegressor(n_estimators=1000,learning_rate=0.05,max_depth=12,random_state=1)),
    ('xgb2',XGBRegressor(n_estimators=1000,learning_rate=0.01,max_depth=12,random_state=2)),
    ('xgb3',XGBRegressor(n_estimators=1000,learning_rate=0.2,max_depth=12,random_state=3)),
    ('xgb4',XGBRegressor(n_estimators=1000,learning_rate=0.1,max_depth=12,random_state=4)),
    ('xgb5',XGBRegressor(n_estimators=1000,learning_rate=0.09,max_depth=12,random_state=5))
]

layer_2_models=[
    ('xgb6',XGBRegressor(n_estimators=1000,learning_rate=0.05,max_depth=6,random_state=6)),
    ('xgb7',XGBRegressor(n_estimators=1000,learning_rate=0.2,max_depth=6,random_state=7)),
    ('xgb6',XGBRegressor(n_estimators=1000,learning_rate=0.1,max_depth=6,random_state=8)),
]
final_estimator=RandomForestRegressor(n_estimators=500,max_depth=6,bootstrap=True)

In [89]:
import numpy as np

In [109]:
# Use k-fold cross validation for layer 1
kf=KFold(n_splits=5, shuffle=True, random_state=42)

#Layer-1 Creating predictions for 5 ,odels using K-fold
layer_1_predictions=np.zeros((X_train.shape[0],len(layer_1_models)))

for i,(name,model) in enumerate(layer_1_models):
    layer_1_model_predictions = np.zeros(X_train.shape[0])
    for train_idx, val_idx in kf.split(X_train):
        X_tr,X_val=X_train.iloc[train_idx],X_train.iloc[val_idx]
        y_tr,y_val=y_train.iloc[train_idx],y_train.iloc[val_idx]
        model.fit(X_tr,y_tr)
        layer_1_model_predictions[val_idx] = model.predict(X_val)
    layer_1_predictions[:,i]=layer_1_model_predictions


# layer 2- Creating predictions from 3 models using the predictions of layer 1
layer_2_predictions=np.zeros((X_train.shape[0],len(layer_2_models)))

for i,(name,model) in enumerate(layer_2_models):
    layer_2_model_predictions=np.zeros(X_train.shape[0])
    for train_idx,val_idx in kf.split(layer_1_predictions):
        X_tr,X_val=layer_1_predictions[train_idx],layer_1_predictions[val_idx]
        y_tr,y_val= y_train.iloc[train_idx], y_train.iloc[val_idx]
        model.fit(X_tr, y_tr)
        layer_2_model_predictions[val_idx]= model.predict(X_val)
    layer_2_predictions[:, i]= layer_2_model_predictions

# Final Layer - Use the Final Estimator on layer 2 predictions

final_estimator.fit(layer_2_predictions, y_train)

# Prepare testing data for layer_wise predictions
layer_1_test_predictions= np.column_stack([model.predict(X_test) for _,model in layer_1_models])
layer_2_test_predictions= np.column_stack([model.predict(layer_1_test_predictions) for _,model in layer_2_models])

# Final Prediction
final_predictions= final_estimator.predict(layer_2_test_predictions)

# Evaluate the model
r2Score= r2_score(y_test,final_predictions)

print(f"R2_Score : {r2Score}")






R2_Score : 0.9435293199839794


In [None]:
# estm1=XGBRegressor(n_estimators=800,learning_rate=0.2,max_depth=6)
# estm2=RandomForestRegressor(n_estimators=500,bootstrap=True,max_depth=2)
# stacking_reg=StackingRegressor(
#     estimators=[('xgb', estm1), ('rf', estm2)],
#     final_estimator=Ridge(),
#     cv=5,
#     n_jobs=-1,
#     passthrough=False,
#     verbose=2
# )
# stacking_reg.fit(X_train,y_train)


Making a pipeline of the whole process!


In [111]:
import numpy as np
import pandas as pd
import pickle
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split,KFold
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import r2_score
from sklearn.base import BaseEstimator, RegressorMixin

In [131]:
# Load Datraset
df=pickle.load(open('dataset_level3.pkl','rb'))
X=df.drop(columns=['runs_x'])
y=df['runs_x']
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

In [133]:
#Step 1 One Hot Encoding
categorical_features=['bowling_team','batting_team','city']
ohe=ColumnTransformer(
    transformers=[
        ('ohe',OneHotEncoder(drop='first',sparse_output=False,dtype=int),categorical_features)
    ],
    remainder='passthrough'
)

In [141]:
trf=ColumnTransformer([
    ('trf',OneHotEncoder(sparse_output=False,drop='first',dtype=int),['batting_team','bowling_team','city'])
],remainder='passthrough'
)


In [135]:
# step 2: Define Custom Stacking Model for layered predictions

class ThreeLayeredStackingRegressor(BaseEstimator,RegressorMixin):
    def __init__(self):
        #Layer1 Base Models
        self.layer_1_models=[
            XGBRegressor(n_estimators=1000, learning_rate=0.2, max_depth=12,enable_categorical=True, random_state=1),
            XGBRegressor(n_estimators=1000, learning_rate=0.2, max_depth=12,enable_categorical=True, random_state=2),
            XGBRegressor(n_estimators=1000, learning_rate=0.2, max_depth=12,enable_categorical=True, random_state=3),
            XGBRegressor(n_estimators=1000, learning_rate=0.2, max_depth=12,enable_categorical=True, random_state=4),
            XGBRegressor(n_estimators=1000, learning_rate=0.2, max_depth=12,enable_categorical=True, random_state=5)
        ]
        # Layer 2 models
        self.layer_2_models = [
            XGBRegressor(n_estimators=1000, learning_rate=0.2, max_depth=6,enable_categorical=True, random_state=6),
            XGBRegressor(n_estimators=1000, learning_rate=0.2, max_depth=6,enable_categorical=True, random_state=7),
            XGBRegressor(n_estimators=1000, learning_rate=0.2, max_depth=6,enable_categorical=True, random_state=8)
        ]
        
        # Final layer estimator
        self.final_estimator = RandomForestRegressor(n_estimators=500, max_depth=6, bootstrap=True)
    

    def fit(self,X,y):
        kf=KFold(n_splits=5,shuffle=True,random_state=42)

        #Layer1 Predictions
        layer_1_predictions=np.zeros((X.shape[0],len(self.layer_1_models)))
        for i,model in enumerate(self.layer_1_models):
            layer_1_model_predictions=np.zeros(X.shape[0])
            for train_idx, val_idx in kf.split(X):
                X_tr,X_val=X_train.iloc[train_idx],X_train.iloc[val_idx]
                y_tr=y_train.iloc[train_idx]
                model.fit(X_tr,y_tr)
                layer_1_model_predictions[val_idx]=model.predict(X_val)
            layer_1_predictions[:,i]=layer_1_model_predictions
        
        # Layer 2 Models

        layer_2_predictions=np.zeros((X.shape(0),len(layer_2_models)))
        for i,model in enumerate(layer_2_models):
            layer_2_model_predictions=np.zero(X.shape(0))
            for train_idx,val_idx in kf.split(layer_1_predictions):
                X_tr,X_val=layer_1_predictions[train_idx],layer_1_predictions[val_idx]
                y_tr=y.iloc[train_idx]
                model.fit(X_tr,y_tr)
                layer_2_model_predictions[val_idx]=model.predict(X_val)
            layer_2_predictions[:,i]=layer_2_model_predictions

        # final layer fit
        self.final_estimator.fit(layer_2_predictions,y)
        return self
    

    def predict(self,X):
        # Layer 1 test predictions
        layer_1_test_predictions=np.column_stack([model.predict(X) for model in self.layer_1_models])

        # Layer 2 test predictions 

        layer_2_test_predictions=np.column_stack([model.predict(layer_1_test_predictions) for model in  layer_2_models ])

        # Final test predictions

        return self.final_estimator.predict(layer_2_test_predictions)


In [142]:
# Pipeline Creation

pipeline=Pipeline([
    ('preprocessing',trf),
    ('stacking_model',ThreeLayeredStackingRegressor())
])

#Fit the pipeline
pipeline.fit(X_train,y_train)


ValueError: DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, the experimental DMatrix parameter`enable_categorical` must be set to `True`.  Invalid columns:batting_team: object, bowling_team: object, city: object

In [146]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.base import BaseEstimator, RegressorMixin
import numpy as np
import pandas as pd

# Step 1: One Hot Encoding
categorical_features = ['bowling_team', 'batting_team', 'city']
ohe = ColumnTransformer(
    transformers=[
        ('ohe', OneHotEncoder(drop='first', sparse_output=False), categorical_features)
    ],
    remainder='passthrough'
)

# Step 2: Define Custom Stacking Model for layered predictions
class ThreeLayeredStackingRegressor(BaseEstimator, RegressorMixin):
    def __init__(self):
        # Layer 1 Base Models
        self.layer_1_models = [
            XGBRegressor(n_estimators=1000, learning_rate=0.2, max_depth=12, enable_categorical=True, random_state=1),
            XGBRegressor(n_estimators=1000, learning_rate=0.2, max_depth=12, enable_categorical=True, random_state=2),
            XGBRegressor(n_estimators=1000, learning_rate=0.2, max_depth=12, enable_categorical=True, random_state=3),
            XGBRegressor(n_estimators=1000, learning_rate=0.2, max_depth=12, enable_categorical=True, random_state=4),
            XGBRegressor(n_estimators=1000, learning_rate=0.2, max_depth=12, enable_categorical=True, random_state=5)
        ]
        # Layer 2 models
        self.layer_2_models = [
            XGBRegressor(n_estimators=1000, learning_rate=0.2, max_depth=6, enable_categorical=True, random_state=6),
            XGBRegressor(n_estimators=1000, learning_rate=0.2, max_depth=6, enable_categorical=True, random_state=7),
            XGBRegressor(n_estimators=1000, learning_rate=0.2, max_depth=6, enable_categorical=True, random_state=8)
        ]
        
        # Final layer estimator
        #self.final_estimator = RandomForestRegressor(n_estimators=500, max_depth=6, bootstrap=True)
        self.final_estimator = Ridge(alpha=1.0)


    def fit(self, X, y):
        kf = KFold(n_splits=5, shuffle=True, random_state=42)

        # Layer 1 Predictions
        layer_1_predictions = np.zeros((X.shape[0], len(self.layer_1_models)))
        for i, model in enumerate(self.layer_1_models):
            layer_1_model_predictions = np.zeros(X.shape[0])
            for train_idx, val_idx in kf.split(X):
                X_tr, X_val = X[train_idx], X[val_idx]
                y_tr = y.iloc[train_idx]
                model.fit(X_tr, y_tr)
                layer_1_model_predictions[val_idx] = model.predict(X_val)
            layer_1_predictions[:, i] = layer_1_model_predictions
        
        # Layer 2 Models
        layer_2_predictions = np.zeros((X.shape[0], len(self.layer_2_models)))
        for i, model in enumerate(self.layer_2_models):
            layer_2_model_predictions = np.zeros(X.shape[0])
            for train_idx, val_idx in kf.split(layer_1_predictions):
                X_tr, X_val = layer_1_predictions[train_idx], layer_1_predictions[val_idx]
                y_tr = y.iloc[train_idx]
                model.fit(X_tr, y_tr)
                layer_2_model_predictions[val_idx] = model.predict(X_val)
            layer_2_predictions[:, i] = layer_2_model_predictions

        # Final layer fit
        self.final_estimator.fit(layer_2_predictions, y)
        return self

    def predict(self, X):
        # Layer 1 test predictions
        layer_1_test_predictions = np.column_stack([model.predict(X) for model in self.layer_1_models])

        # Layer 2 test predictions
        layer_2_test_predictions = np.column_stack([model.predict(layer_1_test_predictions) for model in self.layer_2_models])

        # Final test predictions
        return self.final_estimator.predict(layer_2_test_predictions)

# Pipeline Creation
pipeline = Pipeline([
    ('preprocessing', ohe),
    ('stacking_model', ThreeLayeredStackingRegressor())
])

# Fit the pipeline
pipeline.fit(X_train, y_train)


In [147]:
final_predictions=pipeline.predict(X_test)
r2score=r2_score(y_test,final_predictions)
print(f"R2 Score : {r2score}")

R2 Score : 0.9469729820451263


In [149]:
pickle.dump(pipeline,open('pipeline.pkl','wb'))