In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [4]:
import dill
if __name__ == "__main__":
    # Load the model
    with open("model.dill", "rb") as f:
        model = dill.load(f)
print(model)

<__main__.BlendedModel object at 0x00000220FFA76020>


In [5]:
## Print multiple output per cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

In [6]:
## Importing libraries
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas()

In [7]:
df_test = pd.read_csv('test.csv')
df_batsman = pd.read_csv('dataset/batsman_level_scorecard.csv')
df_bowler = pd.read_csv('dataset/bowler_level_scorecard.csv')
df_match = pd.read_csv('dataset/match_level_scorecard.csv')

In [8]:
def giveLastNgamesPlayer(player_id, date, n, bat_or_bowl):
    '''
    Function to get last n games stats of a player before an input date.
    
    Input-
    1. player_id: id of the player to get historical data.
    2. date: date to look-back and get n games. Stats returned are before this input date.
    3. n: Number of historical games stats to return.
    4. bat_or_bowl: Kind of stats to return. {'bat': batting stats to return, 'bowl': bowling stats to return}
    
    Output-None
    
    Returns- dataframe having bowling/batting stats from last n games of a player before an input date. 
    The results are sorted by date.
    '''
    if bat_or_bowl == 'bat':
        df_topick = df_batsman
        id_col = 'batsman_id'
    else:
        df_topick = df_bowler
        id_col = 'bowler_id'
        
    return df_topick[(df_topick['match_dt']<date)&(df_topick[id_col]==float(player_id))]\
                .sort_values(by='match_dt', ascending=False).head(n)

In [9]:
def no50sLastn(player_list, date, n):
    '''
    Function to get total number of 50s scored by players in the roster of a team in last n games.
    
    Input-
    1. player_list: ':' separated list of player ids in the roster of a team.
    2. date: match date of the game to calculate this feature.
    3. n: Number of games to look-back and create this feature.
    
    Output-None
    
    Returns- int value denoting sum of 50s scored by all players in the roster.
    '''
    
    player_list = str(player_list).split(':') # split string of ':' separated ids into a list of ids
    res_list = []
    for player in player_list: # loop over each player_id in roster
        df_rel = giveLastNgamesPlayer(player_id=player, date=date, n=n, bat_or_bowl='bat') # getting batting stats from last n games for each player.
        df_rel['gte_50runs'] = np.where(df_rel['runs']>=50, 1, 0) # binary indicator to denote whether the player scored a 50 in the game (runs>=50).
        res_list.append(np.nansum(df_rel['gte_50runs']))# Sum up number of 50s for the player and append to a list. We will do this for all players.
    return np.nansum(res_list)# Sum up values of the list which is sum of 50s by all players in the roster.

df_test['team1_count_50runs_last15'] = df_test.progress_apply(lambda x: \
            no50sLastn(player_list=x['team1_roster_ids'], date=x['match_dt'], n=15), axis=1)
df_test['team2_count_50runs_last15'] = df_test.progress_apply(lambda x: \
            no50sLastn(player_list=x['team2_roster_ids'], date=x['match_dt'], n=15), axis=1)
df_test['team_count_50runs_last15'] = (df_test['team1_count_50runs_last15'])/(df_test['team2_count_50runs_last15']+1)
df_test.drop(columns=['team1_count_50runs_last15','team2_count_50runs_last15'], inplace=True)

100%|██████████| 207/207 [00:05<00:00, 37.15it/s]
100%|██████████| 207/207 [00:04<00:00, 48.80it/s]


In [10]:
def winpLastn(team_id, date, n):
    '''
    Get a team's win % in last n games. If a team has won 3 game out of their last 5, win% is 60%.
    
    Input-
    1. team_id: ID of the team to get their last n games and winner information from those games.
    2. date: match date from which to get last n historical games.
    3. n: look-back window of games.
    
    Output- None
    
    Returns- Float value denoting win% of the team in last n games.
    '''
    # filter out games with either team1/2_id as input team id, match_dt being before current game's date, sort desc by date, and get top n rows (games)
    df_rel = df_match[(df_match['match_dt']<date)&\
                      ((df_match['team1_id']==team_id)|(df_match['team2_id']==team_id))]\
                        .sort_values(by='match_dt', ascending=False).head(n) 
    win_count = df_rel[df_rel['winner_id']==team_id].shape[0] # count number of rows having winner as the input team
    if win_count == 0:
        return 0
    return round(win_count*100/df_rel.shape[0], 2) # return win% rounded to two decimal points

df_test['team1_winp_last5'] = df_test.progress_apply(lambda x: \
            winpLastn(team_id=x['team1_id'], date=x['match_dt'], n=5), axis=1)
df_test['team2_winp_last5'] = df_test.progress_apply(lambda x: \
            winpLastn(team_id=x['team2_id'], date=x['match_dt'], n=5), axis=1)
df_test['team_winp_last5'] = (df_test['team1_winp_last5']+1)/(df_test['team2_winp_last5']+1)
df_test.drop(columns=['team1_winp_last5','team2_winp_last5'], inplace=True)
df_test.shape
df_test.head(2)

100%|██████████| 207/207 [00:00<00:00, 834.56it/s]
100%|██████████| 207/207 [00:00<00:00, 981.29it/s] 


(207, 21)

Unnamed: 0,match id,team1,team1_id,team1_roster_ids,team2,team2_id,team2_roster_ids,toss winner,toss decision,venue,...,match_dt,lighting,series_name,season,ground_id,team_count_50runs_last15,team_winp_last5,team1only_avg_runs_last15,team1_winp_team2_last15,ground_avg_runs_last15
0,9272619,Me Ss,33949,4003390.0:7960994.0:3901078.0:2669316.0:373710...,Ht Hs,33928,5843200.0:4223883.0:4655384.0:6249256.0:216159...,Ht Hs,field,Be Ol Ht,...,2023-01-09,night match,Bg Bh Le,2022/23,6348,0.666667,1.0,167.933333,60.0,154.115385
1,9086958,Na,209,5836452.0:8246468.0:7500324.0:3065502.0:363350...,Si La,69,7200598.0:4403531.0:3260564.0:2420760.0:239834...,Si La,field,GA Sm Sh Gg Va,...,2022-10-16,day/night match,Si La tr of Aa,2022/23,3961,0.142857,0.207921,141.888889,0.0,142.833333


In [11]:
# derived feature computed using toss winner & toss decision to denote the inning team1 bats.
# If team1 won the toss and chose to bat or team2 won the toss and chose to bowl, the feature takes the value 1, else 2.
df_match['team1_bat_inning'] = np.where( ((df_match['team1']==df_match['toss winner'])&(df_match['toss decision']=='bat'))|\
                                               ((df_match['team2']==df_match['toss winner'])&(df_match['toss decision']=='field')) , 1, 2)

In [12]:
def teamAvgRunsLastn(team_id, date, n):
    '''
    Function to calculate a team's average runs in their last n games.
    
    Input-
    1. team_id: ID of the team to calculate average runs.
    2. date: match date of the current game for which the feature is calculated.
    3. n: look-back window of games for the team.
    
    Output- None
    
    Return- Float value denoting average of runs scored by team1 in their last n games.
    '''
    # filter out games with either team1/2_id as input team_id, match date less than current game's input date, sort desc by date, and top n rows (games) returned
    df_rel = df_match[(df_match['match_dt']<date)&\
                      ((df_match['team1_id']==team_id)|(df_match['team2_id']==team_id))]\
                        .sort_values(by='match_dt', ascending=False).head(n)
    # combine two dataframes - one where input team is batting first, and another one where input team is batting second.
    df_rel = pd.concat([ df_rel[df_rel['team1_bat_inning']==1][['inning1_runs']].rename(columns={'inning1_runs':'runs'}), \
                         df_rel[df_rel['team1_bat_inning']==2][['inning2_runs']].rename(columns={'inning2_runs':'runs'}) ] )
    return df_rel['runs'].mean() # return mean of the combined dataframe.


df_test['team1only_avg_runs_last15'] = df_test.progress_apply(lambda x: \
            teamAvgRunsLastn(x['team1_id'], x['match_dt'], 15), axis=1)
df_test['team2only_avg_runs_last15'] = df_test.progress_apply(lambda x: \
            teamAvgRunsLastn(x['team2_id'], x['match_dt'], 15), axis=1)
df_test['team_avg_runs_last15'] = (df_test['team1only_avg_runs_last15']+1) / \
                                               (df_test['team2only_avg_runs_last15'] + 1)
df_test.drop(columns=['team1only_avg_runs_last15', 'team2only_avg_runs_last15'], inplace=True)

100%|██████████| 207/207 [00:00<00:00, 393.06it/s]
100%|██████████| 207/207 [00:00<00:00, 401.77it/s]


In [13]:
def winpCrossLastn(team1_id, team2_id, date, n):
    '''
    Function to compute team1's win% against team2 from the current game in their past n encounters.
    
    Input-
    1. team1_id: ID of team1 to calculate win% of.
    2. team2_id: ID of team2 to calculate win% against.
    3: date: match date of the current game for which the feature is to be calculated.
    4. n: look-back window of games for both these teams.
    
    Output- None
    
    Returns- Float value denoting team1's win% against team2 in their past n games against each other.
    '''
    # filter out games where either team1_id is input team1 and team2_id is input team2, or where team2_id is input team1 and team1_id is input team2.
    # Also, match date is less than current games's input date, sort desc by date and get top n rows (games)
    df_rel = df_match[(df_match['match_dt']<date)&\
                      (((df_match['team1_id']==team1_id)&(df_match['team2_id']==team2_id))|((df_match['team1_id']==team2_id)&(df_match['team2_id']==team1_id)))]\
                        .sort_values(by='match_dt', ascending=False).head(n)
    win_count = df_rel[df_rel['winner_id']==team1_id].shape[0] # Counting number of rows (games) where winner is input team1.
    if win_count == 0:
        return 0
    return round(win_count*100/df_rel.shape[0], 2) # return Float denoting team1's win% against team2 in past n games rounded to 2 decimal places.

df_test['team1_winp_team2_last15'] = df_test.progress_apply(lambda x: \
                                  winpCrossLastn(x['team1_id'], x['team2_id'], x['match_dt'], 5), axis=1)

100%|██████████| 207/207 [00:00<00:00, 753.76it/s]


In [14]:
def avgRunsGround(ground_id, date, n):
    '''
    Function to calculate average runs scored in ground/venue.
    
    Input-
    1. ground_id: ID of the ground to calculate the feature for.
    2. date: match date of the current game to calculate the feature for.
    3. n: look-back window of games for the ground.
    
    Output- None
    
    Returns- Average runs scored in the ground.
    '''
    # filter out games with ground_id being the input ground_id and date earlier than current game's input date. Sort desc by date, and select top n rows (games).
    df_rel = df_match[(df_match['match_dt']<date)&(df_match['ground_id']==ground_id)].sort_values(by='match_dt', ascending=False).head(n)
    df_rel['avg_runs_inn'] = (df_rel['inning1_runs']+df_rel['inning2_runs'])/2 # take the mean of inning1_runs and inning2_runs in a separate column.
    return df_rel['avg_runs_inn'].mean() # Return the mean value of the computed column above.

df_test['ground_avg_runs_last15'] = df_test.progress_apply(lambda x: \
                                  avgRunsGround(x['ground_id'], x['match_dt'], 15), axis=1)

100%|██████████| 207/207 [00:00<00:00, 441.35it/s]


In [15]:
def avgBowlingEconomyLastn(player_list, date, n):
    
    player_list = str(player_list).split(':')  # split string of ':' separated ids into a list of ids
    economy_list = []
    for player in player_list:  # loop over each player_id in roster
        df_rel = giveLastNgamesPlayer(player_id=player, date=date, n=n, bat_or_bowl='bowl')  # getting bowling stats from last n games for each player.
        df_rel['economy'] = df_rel['runs'] / (df_rel['balls_bowled'] / 6)  # Calculate economy rate
        economy_list.append(np.nanmean(df_rel['economy']))  # Append average economy rate to the list
    return np.nanmean(economy_list)  # Return the mean of the list which is the average economy rate of all players in the roster.

# Similarly for test dataset
df_test['team1_avg_bowling_economy_last15'] = df_test.progress_apply(lambda x: \
            avgBowlingEconomyLastn(player_list=x['team1_roster_ids'], date=x['match_dt'], n=15), axis=1)
df_test['team2_avg_bowling_economy_last15'] = df_test.progress_apply(lambda x: \
            avgBowlingEconomyLastn(player_list=x['team2_roster_ids'], date=x['match_dt'], n=15), axis=1)

df_test['team_avg_bowling_economy_last15'] = 1 - (df_test['team1_avg_bowling_economy_last15']+1) / \
                                               (df_test['team2_avg_bowling_economy_last15'] + 1)
df_test.drop(columns=['team1_avg_bowling_economy_last15', 'team2_avg_bowling_economy_last15'], inplace=True)

  0%|          | 0/207 [00:00<?, ?it/s]

  economy_list.append(np.nanmean(df_rel['economy']))  # Append average economy rate to the list
100%|██████████| 207/207 [00:05<00:00, 36.67it/s]
  economy_list.append(np.nanmean(df_rel['economy']))  # Append average economy rate to the list
100%|██████████| 207/207 [00:04<00:00, 43.48it/s]


In [16]:
def avgStrikeRateLastn(player_list, date, n):
    '''
    
    '''
    player_list = str(player_list).split(':')  # split string of ':' separated ids into a list of ids
    strike_rate_list = []
    for player in player_list:  # loop over each player_id in roster
        df_rel = giveLastNgamesPlayer(player_id=player, date=date, n=n, bat_or_bowl='bat')  # getting batting stats from last n games for each player.
        df_rel['strike_rate'] = (df_rel['runs'] / df_rel['balls_faced']) * 100  # Calculate strike rate
        strike_rate_list.append(np.nanmean(df_rel['strike_rate']))  # Append average strike rate to the list
    return np.nanmean(strike_rate_list)  # Return the mean of the list which is the average strike rate of all players in the roster.

# Similarly for test dataset
df_test['team1_avg_strike_rate_last15'] = df_test.progress_apply(lambda x: \
            avgStrikeRateLastn(player_list=x['team1_roster_ids'], date=x['match_dt'], n=15), axis=1)
df_test['team2_avg_strike_rate_last15'] = df_test.progress_apply(lambda x: \
            avgStrikeRateLastn(player_list=x['team2_roster_ids'], date=x['match_dt'], n=15), axis=1)
df_test['team_avg_strike_rate_last15'] = (df_test['team1_avg_strike_rate_last15'] +1)/ \
                                           (df_test['team2_avg_strike_rate_last15'] + 1)
df_test.drop(columns=['team1_avg_strike_rate_last15', 'team2_avg_strike_rate_last15'], inplace=True)

  strike_rate_list.append(np.nanmean(df_rel['strike_rate']))  # Append average strike rate to the list
100%|██████████| 207/207 [00:05<00:00, 40.57it/s]
  strike_rate_list.append(np.nanmean(df_rel['strike_rate']))  # Append average strike rate to the list
100%|██████████| 207/207 [00:04<00:00, 42.12it/s]


In [17]:
def avgTotalBoundariesLastn(player_list, date, n):
   
    player_list = str(player_list).split(':')
    total_boundaries = 0
    total_matches = 0
    
    for player in player_list:
        df_rel = giveLastNgamesPlayer(player_id=player, date=date, n=n, bat_or_bowl='bat')
        if not df_rel.empty and 'Fours' in df_rel.columns and 'Sixes' in df_rel.columns:
            total_boundaries += df_rel['Fours'].sum() + df_rel['Sixes'].sum()
            total_matches += len(df_rel)
    
    avg_boundaries = (total_boundaries / total_matches) if total_matches else np.nan
    
    return avg_boundaries

df_test['team1_avg_total_boundaries_last15'] = df_test.progress_apply(lambda x: \
            avgTotalBoundariesLastn(player_list=x['team1_roster_ids'], date=x['match_dt'], n=15), axis=1)
df_test['team2_avg_total_boundaries_last15'] = df_test.progress_apply(lambda x: \
            avgTotalBoundariesLastn(player_list=x['team2_roster_ids'], date=x['match_dt'], n=15), axis=1)

df_test['team_avg_total_boundaries_last15'] = (df_test['team1_avg_total_boundaries_last15']+1) / \
                                               (df_test['team2_avg_total_boundaries_last15'] + 1)

df_test.drop(columns=['team1_avg_total_boundaries_last15', 'team2_avg_total_boundaries_last15'], inplace=True)

100%|██████████| 207/207 [00:03<00:00, 52.54it/s]
100%|██████████| 207/207 [00:04<00:00, 41.99it/s]


In [18]:
def avgWinMargin(team_id, date, n):
    df_team = df_match[((df_match['team1_id'] == team_id) | (df_match['team2_id'] == team_id)) & (df_match['winner_id'] == team_id)]
    df_team_last_n = df_team[df_team['match_dt'] < date].tail(n)
    win_margin_runs = np.nanmean(df_team_last_n[df_team_last_n['by'] == 'runs']['win amount'])
    win_margin_wickets = np.nanmean(df_team_last_n[df_team_last_n['by'] == 'wickets']['win amount'])
    return win_margin_runs, win_margin_wickets

df_test['team1_avg_win_margin_runs'], df_test['team1_avg_win_margin_wickets'] = zip(*df_test.progress_apply(lambda x: \
            avgWinMargin(team_id=x['team1_id'], date=x['match_dt'], n=15), axis=1))
df_test['team2_avg_win_margin_runs'], df_test['team2_avg_win_margin_wickets'] = zip(*df_test.progress_apply(lambda x: \
            avgWinMargin(team_id=x['team2_id'], date=x['match_dt'], n=15), axis=1))
df_test['team_avg_win_runs_ratio'], df_test['team_avg_win_wickets_ratio'] = (df_test['team1_avg_win_margin_runs'] + 1) / (df_test['team2_avg_win_margin_runs'] + 1),(df_test['team1_avg_win_margin_wickets']+1)/(df_test['team2_avg_win_margin_wickets']+1)
df_test.drop(columns=['team1_avg_win_margin_runs', 'team2_avg_win_margin_runs'], inplace=True)

  win_margin_runs = np.nanmean(df_team_last_n[df_team_last_n['by'] == 'runs']['win amount'])
  win_margin_wickets = np.nanmean(df_team_last_n[df_team_last_n['by'] == 'wickets']['win amount'])
100%|██████████| 207/207 [00:00<00:00, 619.76it/s]
  win_margin_runs = np.nanmean(df_team_last_n[df_team_last_n['by'] == 'runs']['win amount'])
  win_margin_wickets = np.nanmean(df_team_last_n[df_team_last_n['by'] == 'wickets']['win amount'])
100%|██████████| 207/207 [00:00<00:00, 585.97it/s]


In [19]:
df_test['toss_winner_01'] = np.where(df_test['toss winner']==df_test['team2'], 1, 0)
df_test['toss_decision_01'] = np.where(df_test['toss decision']=='bat', 1, 0)

In [20]:
def lightingEffectiveness(team_name, lighting, date, n):
    df_team = df_match[((df_match['team1'] == team_name) | (df_match['team2'] == team_name)) & 
                       (df_match['lighting'] == lighting) & 
                       (df_match['match_dt'] < date)]
    

    df_team_last_n = df_team.sort_values(by='match_dt', ascending=False).head(n)
    
   
    wins_under_lighting = df_team_last_n[df_team_last_n['winner'] == team_name]
    
    effectiveness = len(wins_under_lighting) / len(df_team_last_n) if len(df_team_last_n) > 0 else 0
    
    return effectiveness

df_test['team1_lighting_effectiveness'] = df_test.progress_apply(lambda x: 
    lightingEffectiveness(team_name=x['team1'], lighting=x['lighting'], date=x['match_dt'], n=15), axis=1)
df_test['team2_lighting_effectiveness'] = df_test.progress_apply(lambda x: 
    lightingEffectiveness(team_name=x['team2'], lighting=x['lighting'], date=x['match_dt'], n=15), axis=1)
df_test['lighting_effectiveness_ratio'] = (df_test['team1_lighting_effectiveness'] + 1) / (df_test['team2_lighting_effectiveness'] + 1)
df_test.drop(columns=['team1_lighting_effectiveness', 'team2_lighting_effectiveness'], inplace=True)

100%|██████████| 207/207 [00:00<00:00, 556.99it/s]
100%|██████████| 207/207 [00:00<00:00, 473.87it/s]


In [21]:
def get_last_5_avg_runs(df, team_name, ground_id, match_date):
    team_matches = df[((df['team1'] == team_name) | (df['team2'] == team_name)) & 
                      (df['ground_id'] == ground_id) & 
                      (df['match_dt'] < match_date)]
    
    last_5_matches = team_matches.sort_values(by='match_dt', ascending=False).head(5)
    
    # Calculate the average runs
    runs = []
    for index, row in last_5_matches.iterrows():
        if row['team1'] == team_name:
            runs.append(row['inning1_runs'])
        elif row['team2'] == team_name:
            runs.append(row['inning2_runs'])
    
    if runs:
        return sum(runs) / len(runs)
    else:
        return 0
    
# Function to calculate the ratio for each row in train_data
def calculate_runs_ratio(row, match_df):
    team1_avg_runs = get_last_5_avg_runs(match_df, row['team1'], row['ground_id'], row['match_dt'])
    team2_avg_runs = get_last_5_avg_runs(match_df, row['team2'], row['ground_id'], row['match_dt'])
    
    if team2_avg_runs == 0:
        return 2
    else:
        return team1_avg_runs / team2_avg_runs
df_test['runs_ratio_last_5'] = df_test.progress_apply(lambda row: calculate_runs_ratio(row, df_match), axis=1)

  0%|          | 0/207 [00:00<?, ?it/s]

100%|██████████| 207/207 [00:01<00:00, 190.10it/s]


In [22]:
df_test['team_avg_bowling_economy_last15'] = df_test['team_avg_bowling_economy_last15'].astype('float')

In [23]:
from sklearn.impute import KNNImputer
from sklearn.experimental import enable_iterative_imputer  # Enable IterativeImputer
from sklearn.impute import IterativeImputer
from sklearn.impute import SimpleImputer
from itertools import combinations, product
import pandas as pd
from sklearn.metrics import classification_report

# Handling missing values using SimpleImputer
numerical_features = [
    'lighting_effectiveness_ratio', 'team_count_50runs_last15', 'team_winp_last5', 
    'team_avg_runs_last15', 'team1_winp_team2_last15', 'runs_ratio_last_5',
    'team_avg_bowling_economy_last15', 'team_avg_strike_rate_last15', 
    'team_avg_win_runs_ratio', 'team_avg_win_wickets_ratio']

categorical_features = ['toss_winner_01', 'toss_decision_01']

knn_imputer = KNNImputer(n_neighbors=3)
df_test[numerical_features] = knn_imputer.fit_transform(df_test[numerical_features])


iterative_imputer = IterativeImputer(max_iter=10, random_state=42)
df_test[categorical_features] = iterative_imputer.fit_transform(df_test[categorical_features])

In [24]:
features = [
    'toss_winner_01', 'toss_decision_01','lighting_effectiveness_ratio', 'team_count_50runs_last15', 'team_winp_last5', 
    'team_avg_runs_last15', 'team1_winp_team2_last15', 'runs_ratio_last_5',
    'team_avg_bowling_economy_last15', 'team_avg_strike_rate_last15', 
    'team_avg_win_runs_ratio', 'team_avg_win_wickets_ratio'
]

In [25]:
x = model.predict(df_test[features])

In [26]:
df_test['y_pred_01'] =model.predict(df_test[features])
df_test['win_pred_score'] =model.predict_proba(df_test[features])
df_test['win_pred_score'] = np.where((df_test['y_pred_01'] == 0), (1 - df_test['win_pred_score']), df_test['win_pred_score'])
df_test['win_pred_team_id'] = np.where((df_test['y_pred_01'] == 0), (df_test['team1_id']), df_test['team2_id'])

In [27]:
df_test['dataset_type'] = 'r2'

In [28]:
# Get feature importances
feature_importances = model.feature_importances_()

# Create a DataFrame for feature importances
df_feat_importance = pd.DataFrame({'feat_name': features, 'model_feat_imp_train': feature_importances * 1.00})\
                      .sort_values(by='model_feat_imp_train', ascending=False).reset_index(drop=True).head(10)

print(df_feat_importance)

                         feat_name  model_feat_imp_train
0         team_count_50runs_last15             13.491455
1  team_avg_bowling_economy_last15              8.934594
2      team_avg_strike_rate_last15              6.632005
3             team_avg_runs_last15              5.123550
4     lighting_effectiveness_ratio              4.209973
5                  team_winp_last5              4.117647
6                   toss_winner_01              2.352220
7       team_avg_win_wickets_ratio              2.292997
8                runs_ratio_last_5              2.080130
9          team_avg_win_runs_ratio              1.436772


In [29]:
df_file1 = df_test[['match id','dataset_type','win_pred_team_id','win_pred_score',] + list(df_feat_importance['feat_name'].head(10))]
                     

renaming_dict = {}
for i,col in enumerate(list(df_feat_importance['feat_name'].head(10))):
    renaming_dict[col] = f'indep_feat_id{i+1}'
df_file1.rename(columns=renaming_dict, inplace=True)

for i in range(1,11):
    if f'indep_feat_id{i}' not in df_file1.columns:
        df_file1[f'indep_feat_id{i}'] = np.nan



df_file1['train_algorithm'] = 'XGBClassifier;LGBMClassifier;CatBoostClassifier'
df_file1['is_ensemble'] = 'yes'
df_file1['train_hps_trees'] = '95;28;20'
df_file1['train_hps_depth'] = '2;2;5'
df_file1['train_hps_lr'] = '0.05;0.05;0.001'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_file1.rename(columns=renaming_dict, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_file1['train_algorithm'] = 'XGBClassifier;LGBMClassifier;CatBoostClassifier'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_file1['is_ensemble'] = 'yes'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col

In [30]:
df_file1.shape
df_file1.head(10)

(207, 19)

Unnamed: 0,match id,dataset_type,win_pred_team_id,win_pred_score,indep_feat_id1,indep_feat_id2,indep_feat_id3,indep_feat_id4,indep_feat_id5,indep_feat_id6,indep_feat_id7,indep_feat_id8,indep_feat_id9,indep_feat_id10,train_algorithm,is_ensemble,train_hps_trees,train_hps_depth,train_hps_lr
0,9272619,r2,33928,0.545729,0.666667,0.076696,0.792694,0.916456,0.913043,1.0,1.0,0.9,1.0,1.703795,XGBClassifier;LGBMClassifier;CatBoostClassifier,yes,95;28;20,2;2;5,0.05;0.05;0.001
1,9086958,r2,69,0.552462,0.142857,-0.001459,0.832025,0.884578,1.0,0.207921,1.0,1.08871,2.0,0.782563,XGBClassifier;LGBMClassifier;CatBoostClassifier,yes,95;28;20,2;2;5,0.05;0.05;0.001
2,9433654,r2,9701,0.619801,0.666667,-0.233044,0.810792,1.069814,0.818182,0.344262,1.0,0.952941,1.217188,0.75,XGBClassifier;LGBMClassifier;CatBoostClassifier,yes,95;28;20,2;2;5,0.05;0.05;0.001
3,9097248,r2,22763,0.570798,0.5,-0.381374,0.964552,0.973162,0.654545,0.259259,0.0,0.8,2.0,0.782563,XGBClassifier;LGBMClassifier;CatBoostClassifier,yes,95;28;20,2;2;5,0.05;0.05;0.001
4,9097234,r2,23750,0.556975,0.166667,-0.082654,1.154436,1.118402,1.0,1.0,1.0,0.932184,2.0,1.735294,XGBClassifier;LGBMClassifier;CatBoostClassifier,yes,95;28;20,2;2;5,0.05;0.05;0.001
5,9516555,r2,30393,0.50365,0.941176,-0.080585,0.948897,1.02091,0.952381,1.487805,1.0,1.277778,2.0,0.851383,XGBClassifier;LGBMClassifier;CatBoostClassifier,yes,95;28;20,2;2;5,0.05;0.05;0.001
6,9587129,r2,36098,0.546145,0.8,-0.105003,0.947254,1.016289,0.969697,1.952381,1.0,1.0,0.667584,0.851883,XGBClassifier;LGBMClassifier;CatBoostClassifier,yes,95;28;20,2;2;5,0.05;0.05;0.001
7,9615003,r2,27,0.52523,0.421053,0.034552,1.012855,1.134098,1.181818,0.259259,0.0,1.073864,0.708122,1.01534,XGBClassifier;LGBMClassifier;CatBoostClassifier,yes,95;28;20,2;2;5,0.05;0.05;0.001
8,9185406,r2,30393,0.526301,0.6,0.090311,1.11027,0.982409,1.090909,1.97561,1.0,1.230769,2.0,0.973309,XGBClassifier;LGBMClassifier;CatBoostClassifier,yes,95;28;20,2;2;5,0.05;0.05;0.001
9,9128790,r2,30435,0.512594,1.4,0.047856,0.818661,1.07296,0.904762,0.512195,1.0,1.238095,0.94697,0.27027,XGBClassifier;LGBMClassifier;CatBoostClassifier,yes,95;28;20,2;2;5,0.05;0.05;0.001


In [31]:
df_file1.to_csv('output_rnd2/2024_DS_Track_File1_YoManas@IITG.csv', index=False)