In [68]:
import numpy as np
import pandas as pd

In [69]:
df = pd.read_csv("T20_mens_dataset.csv")
df.head(10)

Unnamed: 0,batting_team,bowling_team,city,current_score,balls_left,wickets_left,crr,last_five,runs_x
0,Sri Lanka,Australia,Colombo,0,119,10,0.0,,128
1,Sri Lanka,Australia,Colombo,1,118,10,3.0,,128
2,Sri Lanka,Australia,Colombo,5,117,10,10.0,,128
3,Sri Lanka,Australia,Colombo,6,116,10,9.0,,128
4,Sri Lanka,Australia,Colombo,7,115,10,8.4,,128
5,Sri Lanka,Australia,Colombo,9,114,10,9.0,,128
6,Sri Lanka,Australia,Colombo,9,113,10,7.714286,,128
7,Sri Lanka,Australia,Colombo,9,113,10,7.714286,,128
8,Sri Lanka,Australia,Colombo,9,112,9,6.75,,128
9,Sri Lanka,Australia,Colombo,9,111,9,6.0,,128


In [70]:
df.shape

(50609, 9)

### **Dropping all rows containing null values**

In [73]:
df.isnull().sum()

batting_team         0
bowling_team         0
city                 0
current_score        0
balls_left           0
wickets_left         0
crr                  0
last_five        12053
runs_x               0
dtype: int64

In [112]:
df.dropna(inplace=True)
df.isnull().sum()

batting_team     0
bowling_team     0
city             0
current_score    0
balls_left       0
wickets_left     0
crr              0
last_five        0
runs_x           0
dtype: int64

In [72]:
df.dtypes

batting_team      object
bowling_team      object
city              object
current_score      int64
balls_left         int64
wickets_left       int64
crr              float64
last_five        float64
runs_x             int64
dtype: object

## **EDA: Exploratory Data Analysis**

In [113]:
df.describe()

Unnamed: 0,current_score,balls_left,wickets_left,crr,last_five,runs_x
count,38556.0,38556.0,38556.0,38556.0,38556.0,38556.0
mean,94.078224,45.88448,6.733634,7.563875,38.484023,160.082192
std,41.630432,26.542472,2.034015,1.722,11.810156,32.203038
min,12.0,0.0,0.0,2.27027,9.0,72.0
25%,60.0,23.0,5.0,6.375,30.0,139.0
50%,89.0,46.0,7.0,7.5,38.0,159.0
75%,123.0,69.0,8.0,8.630497,46.0,182.0
max,263.0,98.0,10.0,16.6,89.0,263.0


# **Model Building**

### **1. Train Test Split**

In [76]:
from sklearn.model_selection import train_test_split

y = df[['runs_x']]
X = df.drop(columns=['runs_x'])

In [77]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [81]:
X_test.shape

(7712, 8)

### **2. Data Transformation**

In [101]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [102]:
trf = ColumnTransformer([
    ('trf', OneHotEncoder(sparse_output=False), ['batting_team', 'bowling_team', 'city'])
], remainder='passthrough')

In [86]:
scl = StandardScaler()


### **3. Model Taining**

In [100]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [108]:
algos = [XGBRegressor(n_estimators = 1000, learning_rate = 0.15, max_depth = 12, random_state = 1), RandomForestRegressor(n_estimators=100, random_state=1, oob_score=True), LinearRegression()]

for algo in algos:
        pipe = Pipeline(steps=[
        ("step1", trf),
        ('step2', StandardScaler()),
        ('step3', algo)     
    ])
        pipe.fit(X_train, y_train)
        y_pred = pipe.predict(X_test)
        print(f'R2 score is: {r2_score(y_test, y_pred)}')
        print(f'mean absolute error is: {mean_absolute_error(y_test, y_pred)}')
        print(f'mean squared error is: {mean_squared_error(y_test, y_pred)}')

        print("-"*25)


R2 score is: 0.9891198369000512
mean absolute error is: 1.5571246997944053
mean squared error is: 11.341998688228887
-------------------------


  return fit_method(estimator, *args, **kwargs)


R2 score is: 0.9825328467890253
mean absolute error is: 1.9641168741355466
mean squared error is: 18.20858998031927
-------------------------
R2 score is: 0.7077214828220033
mean absolute error is: 13.0455764886242
mean squared error is: 304.6850059118985
-------------------------


### **Final Model Pipeline: XGBRegressor**

In [109]:
pipe = Pipeline(steps=[
        ("step1", trf),
        ('step2', StandardScaler()),
        ('step3', XGBRegressor(n_estimators = 1000, learning_rate = 0.15, max_depth = 12, random_state = 1))     
    ])

pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
print(f'R2 score is: {r2_score(y_test, y_pred)}')
print(f'mean absolute error is: {mean_absolute_error(y_test, y_pred)}')
print(f'mean squared error is: {mean_squared_error(y_test, y_pred)}')

R2 score is: 0.9891198369000512
mean absolute error is: 1.5571246997944053
mean squared error is: 11.341998688228887


### **4. Save the Model**

In [111]:
import pickle 
pickle.dump(pipe, open('model_pipeline.pkl', 'wb'))