In [None]:
import pandas as pd
import numpy as np
import functools
import xgboost as xgb

In [None]:
def load_and_transform_data(match_path='t20i_Matches_Data.csv',
                            batting_path='t20i_Batting_Card.csv',
                            bowling_path='t20i_Bowling_Card.csv'):
    """Load the match, batting-card and bowling-card CSVs and add derived columns.

    Parameters
    ----------
    match_path, batting_path, bowling_path : str or path-like
        Locations of the three CSV files. The defaults are the file names this
        function previously read from the working directory.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(match_data, batter_data, bowler_data)``:
        match_data  - matches whose result text does not contain 'no result',
                      with total-run, runs-per-wicket, runs-per-delivery, toss
                      and date columns added.
        batter_data - batting rows whose wicketType is not 'DNB', with
                      runs-per-ball, boundary-run and match_date columns added.
        bowler_data - bowling rows with more than 6 balls, with runs-per-ball,
                      boundary-run and match_date columns added.
    """
    #Load and transform match data
    match_data=pd.read_csv(match_path)
    #na=False keeps rows with a missing result text instead of raising on them
    no_result=match_data['Match Result Text'].str.lower().str.contains('no result',regex=False,na=False)
    #.copy() so the column assignments below act on an independent frame, not a slice
    match_data=match_data[~no_result].copy()
    match_data['team1_total_runs']=match_data['Team1 Runs Scored']+match_data['Team1 Extras Rec']
    match_data['team2_total_runs']=match_data['Team2 Runs Scored']+match_data['Team2 Extras Rec']
    match_data['run_diff']=match_data['team2_total_runs']-match_data['team1_total_runs']

    #Runs per wicket for BOTH teams; an innings with no wickets lost uses the raw
    #run total rather than dividing by zero. (rpw2 was previously only filled for
    #zero-wicket innings, leaving it - and rpw_diff - NaN everywhere else.)
    for team_num in (1,2):
        runs=match_data[f'Team{team_num} Runs Scored']
        wickets=match_data[f'Team{team_num} Wickets Fell']
        match_data[f'rpw{team_num}']=(runs/wickets).where(wickets!=0,runs)
    match_data['rpw_diff']=match_data['rpw2']-match_data['rpw1']

    #Winsorise rpw_diff at its 1st and 99th percentiles
    lower_bound=match_data['rpw_diff'].quantile(.01)
    upper_bound=match_data['rpw_diff'].quantile(.99)
    match_data['rpw_diff']=match_data['rpw_diff'].clip(lower=lower_bound,upper=upper_bound)

    match_data['flip_flag']=(match_data['Match Winner']==match_data['Toss Winner'])
    match_data['Match Date']=pd.to_datetime(match_data['Match Date'])
    match_data['year']=match_data['Match Date'].dt.year
    match_data['extras_ratio']=match_data['Team1 Extras Rec']/(match_data['Team1 Extras Rec']+match_data['Team1 Runs Scored'])

    #Load and transform batter data
    batter_data=pd.read_csv(batting_path)
    batter_data['rpb']=batter_data['runs']/batter_data['balls']
    #Collapse the listed dismissal types into a single 'other' bucket
    other_dismissals=['retired not out','hitwicket','obstructing the field','retired out','timed out']
    batter_data['wicket2']=batter_data['wicketType']
    batter_data.loc[batter_data['wicketType'].isin(other_dismissals),'wicket2']='other'
    batter_data['caught_flag']=(batter_data['wicket2']=='caught')*1.0
    batter_data=batter_data[batter_data['wicket2']!='DNB'].copy()
    batter_data['fours_runs']=batter_data['fours']*4
    batter_data['sixes_runs']=batter_data['sixes']*6

    #Load and transform bowler data
    bowler_data=pd.read_csv(bowling_path)
    bowler_data=bowler_data[bowler_data['balls']>6].copy()
    bowler_data['rpb']=bowler_data['conceded']/bowler_data['balls']
    bowler_data['fours_runs']=bowler_data['fours']*4
    bowler_data['sixes_runs']=bowler_data['sixes']*6

    #Additional mapping: balls faced per (match, team), keyed as '<Match ID>_<team>'
    batter_data['match_team']=batter_data['Match ID'].astype(str)+'_'+batter_data['team']
    deliveries_received=batter_data.groupby('match_team')['balls'].sum()
    for team_num in (1,2):
        match_team_key=match_data['Match ID'].astype(str)+'_'+match_data[f'Team{team_num} Name']
        match_data[f'match_team{team_num}']=match_team_key
        match_data[f'team{team_num}_deliveries_received']=match_team_key.map(deliveries_received)
        match_data[f'team{team_num}_rpd']=match_data[f'Team{team_num} Runs Scored']/match_data[f'team{team_num}_deliveries_received']
    match_data['rpd_diff']=match_data['team2_rpd']-match_data['team1_rpd']

    #Stamp each card row with its match date; rows whose match was filtered out above get NaT
    match_dates=match_data.set_index('Match ID')['Match Date']
    batter_data['match_date']=batter_data['Match ID'].map(match_dates)
    bowler_data['match_date']=bowler_data['Match ID'].map(match_dates)

    return match_data,batter_data,bowler_data

In [None]:
#Helper functions for creating model training data
def get_historical_batter_averages(player_matchdate,batter_data,n_matches=10):
    """Summarise a batter's most recent innings played strictly before a match date.

    Parameters
    ----------
    player_matchdate : str
        '<player id>_<YYYY-MM-DD>'. The text before the first underscore is
        parsed as an int player id, the text after the last as the match date.
    batter_data : pandas.DataFrame
        Batting rows with 'batsman', 'match_date', 'runs', 'balls', 'fours'
        and 'sixes' columns.
    n_matches : int, default 10
        Size of the look-back window, selected by descending date rank.

    Returns
    -------
    dict
        Keys 'rpd', 'avg_deliveries', 'fours_rate', 'sixes_rate'. Every value
        is -1 when the player has no usable history: either no earlier rows, or
        zero balls faced across the window (which would otherwise divide by zero
        and produce NaN that the -1 sentinel check downstream cannot see).
    """
    player_id=int(player_matchdate.split('_')[0])
    match_date=np.datetime64(player_matchdate.split('_')[-1])

    #Rows with a missing match_date compare False and are excluded
    earlier=batter_data[(batter_data['batsman']==player_id)&(batter_data['match_date']<match_date)]
    #Rank on a standalone Series rather than writing a helper column onto a slice
    recent=earlier[earlier['match_date'].rank(ascending=False)<=n_matches]

    balls_faced=recent['balls'].sum()
    if len(recent)==0 or balls_faced==0:
        return {'rpd':-1,
                'avg_deliveries':-1,
                'fours_rate':-1,
                'sixes_rate':-1}

    return {'rpd':recent['runs'].sum()/balls_faced,
            'avg_deliveries':recent['balls'].mean(),
            'fours_rate':recent['fours'].sum()/balls_faced,
            'sixes_rate':recent['sixes'].sum()/balls_faced}

def get_historical_bowler_averages(player_matchdate,bowler_data,n_matches=10):
    """Summarise a bowler's most recent spells bowled strictly before a match date.

    Parameters
    ----------
    player_matchdate : str
        '<player id>_<YYYY-MM-DD>'. The text before the first underscore is
        parsed as an int player id, the text after the last as the match date.
    bowler_data : pandas.DataFrame
        Bowling rows with 'bowler id', 'match_date', 'conceded', 'balls',
        'wides', 'fours', 'sixes' and 'wickets' columns.
    n_matches : int, default 10
        Size of the look-back window, selected by descending date rank.

    Returns
    -------
    dict
        Keys 'rpd_allowed', 'wide_rate', 'fours_allowed_rate',
        'sixes_allowed_rate', 'wicket_rate' (all per ball bowled). Every value
        is -1 when the player has no usable history: either no earlier rows, or
        zero balls bowled across the window (which would otherwise divide by zero).
    """
    player_id=int(player_matchdate.split('_')[0])
    match_date=np.datetime64(player_matchdate.split('_')[-1])

    #Rows with a missing match_date compare False and are excluded
    earlier=bowler_data[(bowler_data['bowler id']==player_id)&(bowler_data['match_date']<match_date)]
    #Rank on a standalone Series rather than writing a helper column onto a slice
    recent=earlier[earlier['match_date'].rank(ascending=False)<=n_matches]

    balls_bowled=recent['balls'].sum()
    if len(recent)==0 or balls_bowled==0:
        return {'rpd_allowed':-1,
                'wide_rate':-1,
                'fours_allowed_rate':-1,
                'sixes_allowed_rate':-1,
                'wicket_rate':-1}

    return {'rpd_allowed':recent['conceded'].sum()/balls_bowled,
            'wide_rate':recent['wides'].sum()/balls_bowled,
            'fours_allowed_rate':recent['fours'].sum()/balls_bowled,
            'sixes_allowed_rate':recent['sixes'].sum()/balls_bowled,
            'wicket_rate':recent['wickets'].sum()/balls_bowled}

In [None]:
def build_model_dataset(match_data,batter_data,bowler_data):
    """Attach per-player recent-form features and the win label to ``match_data``.

    For each of the 11 listed players on both teams, the player's historical
    batting and bowling stats (from ``get_historical_batter_averages`` and
    ``get_historical_bowler_averages``) are added as
    ``team{N}_batter_{i}_<stat>`` / ``team{N}_bowler_{i}_<stat>`` columns. Stats
    reported as the -1 sentinel are replaced by the dataset-wide median of the
    per-row rate. Rows with no 'Match Winner' are then removed and
    ``team1_win_flag`` (1.0 when Team1 won) is added.

    ``match_data`` is modified IN PLACE, including the row removal, so callers
    that ignore the return value still see the finished dataset. The same frame
    is also returned so the call can be used in an assignment. The in-place row
    removal drops by index label and therefore assumes a unique index.

    Parameters
    ----------
    match_data : pandas.DataFrame
        Needs 'Match Date' (datetime), 'Team1 Playing 11', 'Team2 Playing 11'
        (string representations of lists of player-id strings), 'Team1 Name'
        and 'Match Winner'.
    batter_data, bowler_data : pandas.DataFrame
        Batting / bowling card rows passed through to the history helpers.

    Returns
    -------
    pandas.DataFrame
        The same ``match_data`` object that was passed in.
    """
    #Get median player stats to fill in for players that don't have any data
    null_fill_lookup_batters=({
        'rpd':(batter_data['runs']/batter_data['balls']).median(),
        'avg_deliveries':batter_data['balls'].median(),
        'fours_rate':(batter_data['fours']/batter_data['balls']).median(),
        'sixes_rate':(batter_data['sixes']/batter_data['balls']).median(),
        })

    null_fill_lookup_bowlers=({
        'rpd_allowed':(bowler_data['conceded']/bowler_data['balls']).median(),
        'wide_rate':(bowler_data['wides']/bowler_data['balls']).median(),
        'fours_allowed_rate':(bowler_data['fours']/bowler_data['balls']).median(),
        'sixes_allowed_rate':(bowler_data['sixes']/bowler_data['balls']).median(),
        'wicket_rate':(bowler_data['wickets']/bowler_data['balls']).median()
        })

    batter_history=functools.partial(get_historical_batter_averages,batter_data=batter_data)
    bowler_history=functools.partial(get_historical_bowler_averages,bowler_data=bowler_data)
    match_day=match_data['Match Date'].dt.date.astype(str)

    #Loop through each of the playing 11 and compile the stats over their recent games
    for teamNum in [1,2]:
        #SECURITY(review): eval() executes whatever text is in the CSV cell. Only run
        #this on trusted data; ast.literal_eval is the safe parser for list literals.
        #Parsed once per team instead of once per player slot.
        playing_11=match_data[f'Team{teamNum} Playing 11'].apply(lambda x:eval(x))
        for i in range(1,12):
            batter_prefix=f'team{teamNum}_batter_{i}'
            bowler_prefix=f'team{teamNum}_bowler_{i}'
            match_data[batter_prefix]=playing_11.apply(lambda players,slot=i-1:players[slot])
            match_data[f'{batter_prefix}_matchdate']=match_data[batter_prefix]+'_'+match_day

            #Attach batting then bowling stats for the same player
            for prefix,history,fill_lookup in ((batter_prefix,batter_history,null_fill_lookup_batters),
                                               (bowler_prefix,bowler_history,null_fill_lookup_bowlers)):
                stats=match_data[f'{batter_prefix}_matchdate'].apply(history)
                match_data[f'{prefix}_stats']=stats
                for stat,fill_value in fill_lookup.items():
                    values=stats.apply(lambda s,key=stat:s[key])
                    #.where swaps the -1 sentinel for the median without writing a
                    #float into a possibly integer-typed column via .loc
                    match_data[f'{prefix}_{stat}']=values.where(values!=-1,fill_value)

    #Remove undecided matches in place; rebinding the local name to a filtered copy
    #would leave the caller's frame without the label column added below.
    undecided=match_data.index[match_data['Match Winner'].isnull()]
    match_data.drop(index=undecided,inplace=True)
    match_data['team1_win_flag']=(match_data['Team1 Name']==match_data['Match Winner'])*1.0
    return match_data

In [None]:
def build_model(match_data,seed=42):
    """Train an XGBoost binary classifier for ``team1_win_flag`` and plot calibration.

    Parameters
    ----------
    match_data : pandas.DataFrame
        Must contain 'team1_win_flag' and the per-player feature columns (names
        containing 'batter' or 'bowler' plus one of the stat names below).
    seed : int or None, default 42
        Seeds the 70/30 train/holdout split and XGBoost's own sampling so
        repeated runs give the same model. Pass None for an unseeded split.

    Returns
    -------
    xgboost.Booster
        The trained model. Feature importances can be read from it with
        ``model.get_score(importance_type='gain')``.
    """
    input_stats=['rpd','avg_deliveries','fours_rate','sixes_rate','rpd_allowed','wide_rate','fours_allowed_rate','sixes_allowed_rate','wicket_rate']
    #Sorted (not a bare set) so the feature order, and with it column subsampling,
    #is the same on every run
    model_inputs=sorted({col for col in match_data.columns
                         if ('batter' in col or 'bowler' in col)
                         and any(item in col for item in input_stats)})

    model_data=match_data[model_inputs+['team1_win_flag']]
    train=model_data.sample(frac=.7,random_state=seed)
    #.copy() so the prediction columns added below go onto an independent frame
    test=model_data[~model_data.index.isin(train.index)].copy()
    dtrain=xgb.DMatrix(train[model_inputs],label=train['team1_win_flag'])
    dtest=xgb.DMatrix(test[model_inputs],label=test['team1_win_flag'])

    #'verbosity' replaces the removed 'silent' parameter
    params={'max_depth': 4, 'eta': .01, 'verbosity': 0, 'objective': 'binary:logistic','nthread':4,'eval_metric':'logloss','min_child_weight':1,'gamma':0.0,'subsample':.95,'colsample_bytree':.95,'reg_alpha':10,'reg_lambda':1,'tree_method':'hist'}
    if seed is not None:
        params['seed']=seed

    #NOTE(review): the holdout set also drives early stopping, so the calibration
    #plot below is not a fully independent evaluation.
    eval_list=[(dtrain,'train'),(dtest,'test')]
    model=xgb.train(params=params,dtrain=dtrain,evals=eval_list,num_boost_round=7000,early_stopping_rounds=10,verbose_eval=100)
    #NOTE(review): depending on the XGBoost version, predict() may use every trained
    #round rather than only up to the best early-stopping iteration - confirm.
    test['predicted_probability']=model.predict(dtest)

    #Evaluate model for calibration: observed win rate per 0.1-wide probability bucket
    test['rounded']=test['predicted_probability'].round(1)
    ax=test.groupby('rounded')['team1_win_flag'].mean().plot()
    ax.set(xlabel='predicted probability (rounded)',ylabel='observed team1 win rate',title='Calibration on holdout set')
    return model

In [None]:
#Full sequence of events: load the CSVs, build the per-player feature columns, train the model
match_data,batter_data,bowler_data=load_and_transform_data()
#The return value of this call is discarded; build_model below receives the same
#match_data object that is passed in here.
build_model_dataset(match_data,batter_data,bowler_data)
cricket_model=build_model(match_data)