### Model Training

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import dill
import warnings
warnings.simplefilter('ignore')
pd.set_option('display.max_columns', None)
%matplotlib inline

In [2]:
data = pd.read_csv('data.csv')
data = data.sample(axis=0, frac=1, random_state=51)
data.head()

Unnamed: 0,Batting,Bowling,City,Runs_left,Balls_left,Wicket_left,Total_run,crr,rrr,Result
128460,Rajasthan Royals,Royal Challengers Bangalore,Jaipur,-47,92,7,189,41.82,-3.07,0
180973,Delhi Capitals,Mumbai Indians,Mumbai,-129,23,-4,162,17.01,-33.65,0
170970,Kolkata Knight Riders,Rajasthan Royals,Cape Town,-41,84,3,150,27.43,-2.93,0
91041,Delhi Capitals,Sunrisers Hyderabad,Visakhapatnam,57,38,8,167,7.57,9.0,0
24252,Mumbai Indians,Kolkata Knight Riders,Abu Dhabi,36,25,6,155,7.13,8.64,0


In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV

In [4]:
models = {
    'LogisticRegression' : LogisticRegression(),
    'RandomForestClassifier' : RandomForestClassifier()
}

In [5]:
params = {
    'LogisticRegression' : {
        'penalty': ['l2'],
        'solver': ['liblinear', 'newton-cholesky'],
        'C':[100]
    },
    'RandomForestClassifier' : {
#         'n_estimators':range(10, 100, 10),
#         'criterion':['gini', 'entropy', 'log_loss'],
#         'max_depth':range(3, 25, 1),
#         'min_samples_split':range(2, 15, 1),
#         'min_samples_leaf':range(1, 15, 1),
         'max_features':['sqrt','log2']
    }
}

In [6]:
x = data.drop('Result', axis=1)
y = data.iloc[:,-1]
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8, random_state=1)

In [7]:
def transform_pipe():
    num_cols = ['Runs_left', 'Balls_left', 'Wicket_left', 'Total_run', 'crr', 'rrr']
    cat_cols = ['Batting', 'Bowling', 'City']
    num_pipe = make_pipeline(
        SimpleImputer(strategy='median'),
        StandardScaler()
    )
    cat_pipe = make_pipeline(
        SimpleImputer(strategy='most_frequent'),
        OneHotEncoder()
    )
    transformer_obj = ColumnTransformer([
        ('num_pipe', num_pipe, num_cols),
        ('cat_pipe', cat_pipe, cat_cols)
    ], remainder='passthrough')
    return transformer_obj

In [8]:
def training_pipe(x_train, x_test, y_train, y_test):
    transformer_obj = transform_pipe()
    train_arr = transformer_obj.fit_transform(x_train)
    test_arr = transformer_obj.fit_transform(x_test)
    test_result = {}
    
    for i in models:
        model = models[i]
        param = params[i]
        gs = GridSearchCV(
            model, param, scoring='accuracy', n_jobs=-1, verbose=2, cv=5, error_score='raise'
        )
        gs.fit(train_arr, y_train)
        
        model.set_params(**gs.best_params_)
        model.fit(train_arr, y_train)
        
        train_pred = model.predict(train_arr)
        train_score = accuracy_score(y_train, train_pred)*100
        
        test_pred = model.predict(test_arr)
        test_score = accuracy_score(y_test, test_pred)*100
        
        test_result[i] = {
            'train score':[train_score],
            'test score':[test_score],
            'best parameters':[gs.best_params_]
        }
    return pd.DataFrame(test_result)

In [9]:
result = training_pipe(x_train, x_test, y_train, y_test)

Fitting 5 folds for each of 2 candidates, totalling 10 fits
Fitting 5 folds for each of 2 candidates, totalling 10 fits


In [10]:
result.to_csv('train_test.csv', index=False)
print(result)

                                                LogisticRegression  \
train score                                    [67.96815744078182]   
test score                                     [67.83547226547817]   
best parameters  [{'C': 100, 'penalty': 'l2', 'solver': 'liblin...   

                     RandomForestClassifier  
train score             [99.99261660726391]  
test score              [99.80937550340975]  
best parameters  [{'max_features': 'sqrt'}]  


In [11]:
transformer_obj = transform_pipe()
transformer_obj.fit(x)
with open('preprocessor.pkl', 'wb') as file:
    dill.dump(transformer_obj, file)

In [12]:
with open('preprocessor.pkl', 'rb') as file:
    pre = dill.load(file)
train_arr = pre.transform(x_train)
param = {'C': 100, 'penalty': 'l2', 'solver': 'newton-cholesky'}
model = LogisticRegression()
model.set_params(**param)
model.fit(train_arr, y_train)
with open('LRegressor.pkl', 'wb') as file:
    dill.dump(model, file)

In [13]:
with open('preprocessor.pkl', 'rb') as file:
    pre = dill.load(file)
train_arr = pre.transform(x_train)
param = {'criterion': 'log_loss'}
model = RandomForestClassifier()
model.set_params(**param)
model.fit(train_arr, y_train)
with open('RFClassifier.pkl', 'wb') as file:
    dill.dump(model, file)

In [14]:
with open('preprocessor.pkl', 'rb') as file:
    preprocessor = dill.load(file)

with open('RFClassifier.pkl', 'rb') as file:
    model = dill.load(file)

df = data[data['Batting']=='Chennai Super Kings'].tail(10)
x_n = df.drop('Result', axis=1)
y_n = df.iloc[:,-1]

In [15]:
arr = preprocessor.transform(x_test)
model.fit(arr, y_test)
pred = model.predict(arr)
print('Testing percentage : ', np.round(accuracy_score(y_test, pred)*100, 2))

Testing percentage :  100.0


In [16]:
arr = preprocessor.transform(x_n)
model.fit(arr, y_n)
pred = model.predict(arr)
print('Prediction percentage : ', np.round(accuracy_score(y_n, pred)*100, 2))

Prediction percentage :  100.0


In [17]:
prob = model.predict_proba(arr)
print('Winning and loss probabilities : ')
print(np.round(prob*100, 2))

Winning and loss probabilities : 
[[88. 12.]
 [11. 89.]
 [18. 82.]
 [28. 72.]
 [19. 81.]
 [67. 33.]
 [19. 81.]
 [62. 38.]
 [86. 14.]
 [86. 14.]]
