<a href="https://colab.research.google.com/github/spaceo521/Data-Science-Projects/blob/main/Week6_Assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pickle
import warnings

warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GroupShuffleSplit, train_test_split

# Notebook-wide pandas display settings: show every row/column, never wrap a
# wide frame across lines, and never truncate cell contents.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)
# None (not -1) is the supported "no limit" value; -1 is rejected by current pandas.
pd.set_option('display.max_colwidth', None)

In [None]:
# --- Load the ball-by-ball data and order it within each innings ------------
df = pd.read_csv('T20I_ball_by_ball_updated.csv')
df = df.sort_values(['match_id', 'innings', 'ball']).reset_index(drop=True)

# `ball` is numeric in "<over>.<delivery>" notation (e.g. 10.3). Keep the raw
# value as `overs`, then split its text form into the over and delivery parts.
df['overs'] = df['ball']
ball_text = df['ball'].astype(str)
# regex=False: the dot is removed literally, independent of the pandas default.
df['over'] = ball_text.str[:-1].str.replace('.', '', regex=False)
df['ball'] = ball_text.str[-1:]

df['total_runs'] = df['runs_off_bat'] + df['extras']

df = df[['match_id', 'innings', 'batting_team', 'bowling_team', 'venue', 'striker',
         'overs', 'over', 'ball', 'total_runs', 'player_dismissed']]
df = df.rename(columns={'match_id': 'id'})

# Keep only deliveries where both sides are one of the nine main teams.
main_teams = ['Australia', 'New Zealand', 'England', 'South Africa',
              'West Indies', 'Sri Lanka', 'Pakistan', 'India',
              'Bangladesh']
df = df[df.batting_team.isin(main_teams) & df.bowling_team.isin(main_teams)]

# Missing values become 0; `player_dismissed` is turned into a 0/1 flag below.
df = df.replace(np.nan, 0)

# --- Per-innings running totals ----------------------------------------------
INNINGS_KEYS = ['id', 'innings']


def _sum_last30(frame, column):
    """Rolling sum of `column` over the last 30 deliveries of each innings.

    The group keys are dropped from the result's index so that it aligns with
    `frame` by row label instead of relying on row position.
    """
    rolled = frame.groupby(INNINGS_KEYS)[column].rolling(window=30, min_periods=1).sum()
    return rolled.reset_index(level=[0, 1], drop=True)


df['inn_total'] = df.groupby(INNINGS_KEYS)['total_runs'].transform('sum')

# GroupBy.cumsum keeps the original index, so the result assigns back directly
# (groupby.apply would prepend the group keys to the index on current pandas).
df['current_score'] = df.groupby(INNINGS_KEYS)['total_runs'].cumsum()

df['player_dismissed'] = np.where(df['player_dismissed'] == 0, 0, 1)
df['current_wickets'] = df.groupby(INNINGS_KEYS)['player_dismissed'].cumsum()

# --- Last-30-delivery form features ------------------------------------------
df['runs_last30balls'] = _sum_last30(df, 'total_runs')
df['wickets_last30balls'] = _sum_last30(df, 'player_dismissed')

# A dot ball is a delivery with no runs at all (bat + extras).
df['dots_last30balls'] = np.where(df['total_runs'] == 0, 1, 0)
df['dots_last30balls'] = _sum_last30(df, 'dots_last30balls')

# A delivery worth more than 3 total runs is counted as a boundary.
df['boundaries_last30balls'] = np.where(df['total_runs'] > 3, 1, 0)
df['boundaries_last30balls'] = _sum_last30(df, 'boundaries_last30balls')

df = df[['id', 'innings', 'batting_team', 'bowling_team', 'venue', 'striker',
         'overs', 'over', 'ball', 'total_runs', 'player_dismissed',
         'current_score', 'current_wickets', 'runs_last30balls',
         'wickets_last30balls', 'dots_last30balls', 'boundaries_last30balls',
         'inn_total']]

df['inn_wickets'] = df.groupby(INNINGS_KEYS)['player_dismissed'].transform('sum')

# --- Added 2 new features: 1) run rate 2) run rate in last 30 balls ----------
# `overs` is over.delivery notation, so 10.3 means 10 overs + 3 balls = 10.5
# overs, not 10.3. Convert before dividing.
# NOTE(review): delivery numbers above 6 (presumably extra deliveries in the
# over) are capped at 6 so one over never counts as more than 1.0 — confirm.
completed_overs = np.floor(df['overs'])
delivery_in_over = ((df['overs'] - completed_overs) * 10).round().clip(upper=6)
overs_bowled = completed_overs + delivery_in_over / 6
df['run_rate'] = df['current_score'] / overs_bowled

# The rolling window holds at most 30 deliveries (5 overs), fewer early in the
# innings, so the rate is taken over the deliveries actually in the window.
deliveries_in_window = (df.groupby(INNINGS_KEYS).cumcount() + 1).clip(upper=30)
df['runrate_last30balls'] = df['runs_last30balls'] * 6 / deliveries_in_window

In [None]:
# The rolling sums come back as floats; cast the four last-30-ball counters to int.
convert_dict = dict.fromkeys(
    ['runs_last30balls', 'wickets_last30balls', 'dots_last30balls', 'boundaries_last30balls'],
    int,
)

df = df.astype(convert_dict)

In [None]:
# One-hot encode the batting and bowling team columns.
team_columns = ['batting_team', 'bowling_team']
df = pd.get_dummies(df, columns=team_columns)

In [None]:
# Inspect the column names after one-hot encoding the team columns.
df.columns

Index(['id', 'innings', 'venue', 'striker', 'overs', 'over', 'ball',
       'total_runs', 'player_dismissed', 'current_score', 'current_wickets',
       'runs_last30balls', 'wickets_last30balls', 'dots_last30balls',
       'boundaries_last30balls', 'inn_total', 'inn_wickets', 'run_rate',
       'runrate_last30balls', 'batting_team_Australia',
       'batting_team_Bangladesh', 'batting_team_England', 'batting_team_India',
       'batting_team_New Zealand', 'batting_team_Pakistan',
       'batting_team_South Africa', 'batting_team_Sri Lanka',
       'batting_team_West Indies', 'bowling_team_Australia',
       'bowling_team_Bangladesh', 'bowling_team_England', 'bowling_team_India',
       'bowling_team_New Zealand', 'bowling_team_Pakistan',
       'bowling_team_South Africa', 'bowling_team_Sri Lanka',
       'bowling_team_West Indies'],
      dtype='object')

In [None]:
# Keep the match id, the one-hot team indicators, the match-state features and
# the target, in a fixed column order (teams alphabetical, batting before bowling).
team_names = ['Australia', 'Bangladesh', 'England', 'India', 'New Zealand',
              'Pakistan', 'South Africa', 'Sri Lanka', 'West Indies']
model_columns = (
    ['id']
    + [f'batting_team_{team}' for team in team_names]
    + [f'bowling_team_{team}' for team in team_names]
    + ['overs', 'current_score', 'current_wickets',
       'runs_last30balls', 'wickets_last30balls', 'dots_last30balls',
       'boundaries_last30balls', 'inn_total', 'run_rate', 'runrate_last30balls']
)
df = df[model_columns]

In [None]:
# Train Test Split

In [None]:
# Features: every column except the target and the match id.
X = df.drop(columns=['inn_total', 'id'])
# Target: the final innings total, as a NumPy array.
y = df['inn_total'].to_numpy()

In [None]:
# Display the target array (the innings total, repeated on every row of that innings).
y

array([179, 179, 179, ...,  71,  71,  71], dtype=int64)

In [None]:
# Preview the first rows of the feature matrix.
X.head()

Unnamed: 0,batting_team_Australia,batting_team_Bangladesh,batting_team_England,batting_team_India,batting_team_New Zealand,batting_team_Pakistan,batting_team_South Africa,batting_team_Sri Lanka,batting_team_West Indies,bowling_team_Australia,bowling_team_Bangladesh,bowling_team_England,bowling_team_India,bowling_team_New Zealand,bowling_team_Pakistan,bowling_team_South Africa,bowling_team_Sri Lanka,bowling_team_West Indies,overs,current_score,current_wickets,runs_last30balls,wickets_last30balls,dots_last30balls,boundaries_last30balls,run_rate,runrate_last30balls
0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0.1,0,0,0,0,1,0,0.0,0.0
1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0.2,1,0,1,0,1,0,5.0,5.0
2,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0.3,1,0,1,0,2,0,3.333333,3.333333
3,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0.4,1,0,1,0,3,0,2.5,2.5
4,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0.5,1,0,1,0,4,0,2.0,2.0


In [None]:
# Number of feature rows; should equal len(y).
len(X)

239117

In [None]:
# Number of target values; should equal len(X).
len(y)

239117

In [None]:
# split into train test sets
# Hold out whole matches rather than individual deliveries. Every row of an
# innings carries the same target (`inn_total`), so a row-level random split
# puts near-duplicates of each test row into the training set and makes the
# test error look better than it is. Grouping on the match id keeps each match
# entirely on one side of the split. (Stratifying is dropped: the target is a
# continuous regression value, not a class label.)
match_splitter = GroupShuffleSplit(n_splits=1, test_size=0.25, random_state=42)
train_rows, test_rows = next(match_splitter.split(X, y, groups=df['id']))

X_train, X_test = X.iloc[train_rows], X.iloc[test_rows]
y_train, y_test = y[train_rows], y[test_rows]

In [None]:
# Convert both feature matrices to float32 NumPy arrays in a single step.
# np.asarray accepts a DataFrame as well as an ndarray, so re-running this
# cell is harmless (calling `.values` on an already-converted array fails).
X_train = np.asarray(X_train, dtype=np.float32)
X_test = np.asarray(X_test, dtype=np.float32)

In [None]:
# Sanity check: feature/target shapes of the train and test partitions.
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(179337, 27) (59780, 27) (179337,) (59780,)


In [None]:
# Training LR Model

In [None]:
# Fit an ordinary least-squares regression on the training partition;
# fit() returns the estimator itself, which is shown as the cell output.
LR_model = LinearRegression().fit(X_train, y_train)
LR_model

In [None]:
# Persist the fitted regression model to a pickle file.
# The context manager guarantees the file is flushed and closed after the dump.
filename = 'lr-model.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(LR_model, model_file)

In [None]:
# Score the held-out rows and report the mean absolute error (in runs).
prediction = LR_model.predict(X_test)
mean_absolute_error(y_true=y_test, y_pred=prediction)

18.757883198120076

In [None]:
# Function to predict scores

# Team codes, in the same order as the one-hot batting_team_* / bowling_team_*
# feature columns (Australia, Bangladesh, England, India, New Zealand,
# Pakistan, South Africa, Sri Lanka, West Indies).
TEAM_CODES = ('AUS', 'BAN', 'ENG', 'IND', 'NZ', 'PAK', 'SA', 'SL', 'WI')


def _one_hot_team(team_code, role):
    """Return the 9-element one-hot list for `team_code`; raise ValueError if it is unknown."""
    if team_code not in TEAM_CODES:
        raise ValueError(
            f"Unknown {role} team code {team_code!r}; expected one of {TEAM_CODES}"
        )
    return [int(team_code == code) for code in TEAM_CODES]


def score_prediction(Bat_Team, Bowl_Team, overs, total_score, total_wickets, prev_30_runs, prev_30_wickets, prev_30_dots, prev_30_boundaries, RR, P30RR, margin=19):
    """Print the predicted innings total, and a +/- range, for a match situation.

    Parameters
    ----------
    Bat_Team, Bowl_Team : str
        Team codes, each one of TEAM_CODES.
    overs : float
        Overs bowled so far, as passed to the model's `overs` feature.
    total_score, total_wickets : number
        Current score and wickets fallen.
    prev_30_runs, prev_30_wickets, prev_30_dots, prev_30_boundaries : number
        Runs, wickets, dot balls and boundaries in the last 30 balls.
    RR, P30RR : float
        Current run rate and run rate over the last 30 balls.
    margin : int, optional
        Half-width of the printed range; defaults to 19.

    Raises
    ------
    ValueError
        If either team code is not in TEAM_CODES.

    Notes
    -----
    Uses the notebook-level `LR_model`; the result is printed, not returned.
    """
    # Build the row from the arguments only (never from notebook globals), in
    # the model's column order: batting one-hot, bowling one-hot, match state.
    features = (
        _one_hot_team(Bat_Team, 'batting')
        + _one_hot_team(Bowl_Team, 'bowling')
        + [overs, total_score, total_wickets, prev_30_runs, prev_30_wickets,
           prev_30_dots, prev_30_boundaries, RR, P30RR]
    )
    data = np.array([features])

    # predict() returns a 1-element array; index it before truncating to int.
    my_prediction = int(LR_model.predict(data)[0])

    print('Predicted score :', my_prediction)
    print('Predicted score range :', my_prediction - margin, 'to ', my_prediction + margin)

In [None]:
# Example match situation; every value below is passed to score_prediction.
Bat_team = 'AUS'
Bowl_team = 'IND'
overs = 10.3               # overs bowled so far (presumably over.ball notation, i.e. 10 overs and 3 balls)
current_score = 67         # runs scored so far
current_wickets = 2        # wickets fallen so far
runs_last30balls = 37      # runs scored in the last 30 balls
wickets_last30balls = 1    # wickets lost in the last 30 balls
dots_last30balls = 8       # dot balls in the last 30 balls
boundaries_last30balls = 5 # boundaries hit in the last 30 balls
run_rate = 6.38            # current run rate (67 / 10.5 overs is about 6.38)
runrate_last30balls = 7.4  # run rate over the last 30 balls (37 runs / 5 overs)

# Prints the predicted innings total and a range around it.
score_prediction(Bat_team, Bowl_team, overs, current_score, current_wickets, runs_last30balls, wickets_last30balls, dots_last30balls,
       boundaries_last30balls, run_rate, runrate_last30balls)



Predicted score : 136
Predicted score range : 117 to  155
