### Importing Modules

In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


from sklearn.model_selection import train_test_split
# !pip install river
# from river.ensemble import AdaptiveRandomForestClassifier

from lightgbm import LGBMClassifier

from sklearn import preprocessing
from sklearn.metrics import log_loss

import gc

### Loading Data

To load the data efficiently, a copy of the dataset converted to parquet is used; this reduces the size of the data and speeds up reading.

Ref: [DR. ALVINLEENH](https://www.kaggle.com/datasets/alvinleenh/tps-rocket-league-data-float16-parquet-format)

In [2]:
# First training chunk (train_0), stored as gzip-compressed parquet.
train0_path = '/kaggle/input/tps-rocket-league-data-float16-parquet-format/train_0.parquet.gzip'
train0_df = pd.read_parquet(train0_path)
train0_df.head()

Unnamed: 0,game_num,event_id,event_time,ball_pos_x,ball_pos_y,ball_pos_z,ball_vel_x,ball_vel_y,ball_vel_z,p0_pos_x,...,boost0_timer,boost1_timer,boost2_timer,boost3_timer,boost4_timer,boost5_timer,player_scoring_next,team_scoring_next,team_A_scoring_within_10sec,team_B_scoring_within_10sec
0,1,-22,-33.3125,-0.0,0.0,1.854492,-0.0,0.0,0.0,41.8125,...,0.0,0.0,0.0,0.0,0.0,0.0,3,B,0,0
1,1,-22,-33.21875,-0.0,0.0,1.854492,-0.0,0.0,0.0,42.25,...,0.0,0.0,0.0,0.0,0.0,0.0,3,B,0,0
2,1,-22,-33.09375,-0.0,0.0,1.854492,-0.0,0.0,0.0,43.21875,...,0.0,0.0,0.0,0.0,0.0,0.0,3,B,0,0
3,1,-22,-33.0,-0.0,0.0,1.854492,-0.0,0.0,0.0,43.90625,...,0.0,0.0,0.0,0.0,0.0,0.0,3,B,0,0
4,1,-22,-32.875,-0.0,0.0,1.854492,-0.0,0.0,0.0,44.96875,...,0.0,0.0,0.0,0.0,0.0,0.0,3,B,0,0


### Feature Engineering

Preprocessing techniques inspired by [this notebook](https://www.kaggle.com/code/alvinleenh/tpsoct22-ctb-baseline-with-parquet/notebook?scriptVersionId=107048249) and [this EDA](https://www.kaggle.com/code/samuelcortinhas/tps-oct-22-rocket-league-eda/notebook), both of which offer critical insight into the data at hand.

In [3]:
def preprocessing(data):
    """Drop non-feature columns and append derived speed/distance features.

    NOTE(review): this function shares its name with the
    ``from sklearn import preprocessing`` import at the top of the notebook
    and therefore shadows it once this cell has run.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``ball_pos_{x,y,z}``, ``ball_vel_{x,y,z}`` and, for
        every player ``i`` in ``0..5``, ``p{i}_pos_{x,y,z}`` and
        ``p{i}_vel_{x,y,z}``.
        * If a ``game_num`` column is present the frame is treated as
          training data and ``game_num``, ``event_id``, ``event_time``,
          ``player_scoring_next`` and ``team_scoring_next`` are dropped.
        * Otherwise, if an ``id`` column is present it is treated as test
          data and ``id`` is dropped.

    Returns
    -------
    pandas.DataFrame
        A new frame (the caller's frame is never modified) holding the
        remaining input columns followed by, in this order:
        per player ``p{i}_speed``, ``p{i}_demo``, ``p{i}_dist_to_goal1_euclid``,
        ``p{i}_dist_to_goal2_euclid``, ``p{i}_dist_to_goal1_manhat``,
        ``p{i}_dist_to_goal2_manhat``; then ``ball_speed``,
        ``active_players_A``, ``active_players_B`` and the four ball-to-goal
        distance columns.

    Raises
    ------
    KeyError
        If a required position/velocity column is missing, or if a training
        frame lacks one of the columns that are dropped.
    """
    # Goal coordinates as (x, y, z)
    goal1_coord = (0, -102.5, 1.2)
    goal2_coord = (0, 102.5, 1.2)

    # `drop` returns a new frame, so the train/test branches already work on
    # a copy of the caller's data.
    if 'game_num' in data.columns:
        # for train
        data = data.drop(columns=['game_num', 'event_id', 'event_time',
                                  'player_scoring_next', 'team_scoring_next'])
    elif 'id' in data.columns:
        # for test
        data = data.drop(columns='id')
    else:
        # Nothing to drop: copy explicitly so the feature columns added
        # below never leak into the caller's frame.
        data = data.copy()

    def _speed(prefix):
        # Magnitude of the velocity vector of `prefix` ('ball' or 'p{i}').
        return np.sqrt((data[f'{prefix}_vel_x']**2)
                       + (data[f'{prefix}_vel_y']**2)
                       + (data[f'{prefix}_vel_z']**2))

    def _euclid(prefix, goal):
        # Straight-line distance from the position of `prefix` to `goal`.
        return np.sqrt((data[f'{prefix}_pos_x'] - goal[0])**2
                       + (data[f'{prefix}_pos_y'] - goal[1])**2
                       + (data[f'{prefix}_pos_z'] - goal[2])**2)

    def _manhat(prefix, goal):
        # Sum of per-axis absolute offsets from the position of `prefix` to `goal`.
        return (np.absolute(data[f'{prefix}_pos_x'] - goal[0])
                + np.absolute(data[f'{prefix}_pos_y'] - goal[1])
                + np.absolute(data[f'{prefix}_pos_z'] - goal[2]))

    # Per-player features
    for i in range(6):
        player = f'p{i}'

        # Absolute velocity of the player
        data[f'{player}_speed'] = _speed(player)

        # A missing x position is used as the marker for a demolished player
        data[f'{player}_demo'] = (data[f'{player}_pos_x'].isna()).astype(int)

        # Distance from goal1 and goal2
        data[f'{player}_dist_to_goal1_euclid'] = _euclid(player, goal1_coord)
        data[f'{player}_dist_to_goal2_euclid'] = _euclid(player, goal2_coord)
        data[f'{player}_dist_to_goal1_manhat'] = _manhat(player, goal1_coord)
        data[f'{player}_dist_to_goal2_manhat'] = _manhat(player, goal2_coord)

    # Ball / team features

    # Absolute velocity of ball
    data['ball_speed'] = _speed('ball')

    # Active (non-demolished) players per team: p0-p2 are team A, p3-p5 team B
    data['active_players_A'] = 3-data['p0_demo']-data['p1_demo']-data['p2_demo']
    data['active_players_B'] = 3-data['p3_demo']-data['p4_demo']-data['p5_demo']

    # Distance of ball from goal1 and goal2
    data['ball_dist_to_goal1_euclid'] = _euclid('ball', goal1_coord)
    data['ball_dist_to_goal2_euclid'] = _euclid('ball', goal2_coord)
    data['ball_dist_to_goal1_manhat'] = _manhat('ball', goal1_coord)
    data['ball_dist_to_goal2_manhat'] = _manhat('ball', goal2_coord)

    return data

In [4]:
# ### Baseline Adaptive RF Classifier

# Adaptive RF Classifier is an ensemble approach provided by River, a library for online machine learning. Because it supports incremental model training, the model can learn from all the training datasets in a continuous fashion.

In [5]:
# Feature-engineer the first training chunk, then release the raw frame
# to free memory.
train = preprocessing(train0_df)
del train0_df
gc.collect()

# The two binary targets, one per team.
target_cols = ['team_A_scoring_within_10sec','team_B_scoring_within_10sec']

# Separate targets from features (the actual train/validation split
# happens inside the training functions).
y = train[target_cols]
X = train.drop(columns=target_cols)

### Baseline LightGBM

Parameters taken from [CHENSN](https://www.kaggle.com/code/chal1ce/lightgbm-baseline-with-feature-importance/notebook?scriptVersionId=107069949)'s work, which is clear and precise.

In [6]:
# One independent binary classifier per team target.
# (An earlier experiment used river's AdaptiveRandomForestClassifier with
#  n_models=500, max_features=None in place of LightGBM.)

params = dict(
    objective='binary',
    seed=42,
    num_leaves=128,
    n_estimators=250,
    max_depth=10,
    learning_rate=0.1,
    # feature_fraction=0.75,
    subsample=0.7,
    subsample_freq=8,
    n_jobs=-1,
    reg_alpha=1,
    reg_lambda=2,
    min_child_samples=100,
)

modelA, modelB = LGBMClassifier(**params), LGBMClassifier(**params)

# Position in this list matches position in `target_cols`
# (index 0 -> team A target, index 1 -> team B target).
base_model = [modelA, modelB]

In [7]:
# Training initial model

def initial_model(X,y, model):
    """Fit one classifier per target and print its validation log loss.

    The data is split 85/15 into train/validation with a fixed random state,
    so repeated calls on the same data use the same split.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix.
    y : pandas.DataFrame
        Target frame containing every column listed in the module-level
        ``target_cols``.
    model : sequence
        One estimator per entry of ``target_cols``, matched by position.
        Each estimator is fitted in place.

    Returns
    -------
    None
        Losses are printed, not returned.
    """
    # train test split
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.15, random_state=56)

    # training and evaluation
    for i, feature in enumerate(target_cols):
        # `verbose=` is intentionally not passed to fit(): it was deprecated
        # in LightGBM 3.3 and rejected by 4.x, and without an eval_set it
        # produced no per-iteration output anyway.
        model[i].fit(X_train, y_train[feature])

        # Probability of the positive class on the held-out rows. Kept as a
        # plain array instead of being written into an integer-typed copy of
        # the targets, which newer pandas flags as an incompatible-dtype set.
        val_proba = model[i].predict_proba(X_val)[:, 1]
        loss = log_loss(y_val[feature], val_proba)
        print(f"\nLogloss for {feature} = {loss}\n")

initial_model(X,y,base_model)


Logloss for team_A_scoring_within_10sec = 0.12686100348510285


Logloss for team_B_scoring_within_10sec = 0.12064725737337144



In [8]:
# Training online model

def incremental_model(X,y,base_model):
    """Continue training each per-target model on a new chunk of data.

    The chunk is split 85/15 into train/validation with a fixed random state.
    Each estimator is refitted with itself as ``init_model`` and its log loss
    on the chunk's validation rows is printed.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix of the new chunk.
    y : pandas.DataFrame
        Target frame containing every column listed in the module-level
        ``target_cols``.
    base_model : sequence
        Already-fitted estimators, one per entry of ``target_cols``, matched
        by position. Each estimator is updated in place.

    Returns
    -------
    None
        Losses are printed, not returned.
    """
    # train test split
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.15, random_state=56)

    # training and evaluation
    for i, feature in enumerate(target_cols):
        # Passing the fitted estimator as `init_model` asks LightGBM to
        # continue from its existing booster rather than start from scratch
        # (presumably adding further boosting rounds on top - confirm in the
        # LightGBM docs).
        # `verbose=` is intentionally not passed: it was deprecated in
        # LightGBM 3.3 and rejected by 4.x, and had no effect without an
        # eval_set.
        base_model[i].fit(X_train, y_train[feature], init_model=base_model[i])

        # Positive-class probability on the held-out rows, kept as a plain
        # array to avoid writing floats into an integer-typed target copy.
        val_proba = base_model[i].predict_proba(X_val)[:, 1]
        loss = log_loss(y_val[feature], val_proba)
        print(f"\nLogloss for {feature} = {loss}\n")


# Feed the remaining training chunks (train_1 .. train_9) one at a time so
# only a single chunk has to be loaded per step.
for i in range(1,10):
    train_df = pd.read_parquet(f'/kaggle/input/tps-rocket-league-data-float16-parquet-format/train_{i}.parquet.gzip')
    train = preprocessing(train_df)
    del train_df
    gc.collect()

    X = train.drop(target_cols, axis=1)
    y = train[target_cols]
    print(f'\n\nDataset: train_{i}\n')
    incremental_model(X,y,base_model)



Dataset: train_1


Logloss for team_A_scoring_within_10sec = 0.1232967881214441


Logloss for team_B_scoring_within_10sec = 0.11269403668900066



Dataset: train_2


Logloss for team_A_scoring_within_10sec = 0.11937510031336045


Logloss for team_B_scoring_within_10sec = 0.12041581740239296



Dataset: train_3


Logloss for team_A_scoring_within_10sec = 0.129254997004758


Logloss for team_B_scoring_within_10sec = 0.1260361244978028



Dataset: train_4


Logloss for team_A_scoring_within_10sec = 0.12765727454648884


Logloss for team_B_scoring_within_10sec = 0.12413902598384331



Dataset: train_5


Logloss for team_A_scoring_within_10sec = 0.13016049379142589


Logloss for team_B_scoring_within_10sec = 0.1321816147689668



Dataset: train_6


Logloss for team_A_scoring_within_10sec = 0.13937459137890332


Logloss for team_B_scoring_within_10sec = 0.13526083963496138



Dataset: train_7


Logloss for team_A_scoring_within_10sec = 0.14126064166710497


Logloss for team_B_scoring_withi

### Prediction on Test

In [9]:
# Held-out test set, plus the sample submission (used for its ids and for
# the layout of the target columns).
test_df = pd.read_parquet('/kaggle/input/tps-rocket-league-data-float16-parquet-format/test.parquet.gzip')
submission_df = pd.read_csv('/kaggle/input/tabular-playground-series-oct-2022/sample_submission.csv')

# Same feature engineering as applied to the training chunks.
test = preprocessing(test_df)

# Start from the sample submission's target columns and overwrite each one
# with the positive-class probability from the matching per-team model.
pred_test = submission_df[target_cols].copy()
for target, team_model in zip(target_cols, base_model):
    positive_proba = team_model.predict_proba(test)[:, 1]
    pred_test.loc[:, target] = positive_proba

pred_test.head(5)

Unnamed: 0,team_A_scoring_within_10sec,team_B_scoring_within_10sec
0,0.00102,0.002494
1,0.002117,0.016953
2,0.000239,0.015794
3,0.008703,0.005521
4,0.003372,0.031378


### Submission

In [10]:
# Assemble the submission: ids from the sample file followed by the two
# predicted probability columns, then write it without the index.
submission_csv = pd.concat(
    [
        submission_df.id,
        pred_test[['team_A_scoring_within_10sec', 'team_B_scoring_within_10sec']],
    ],
    axis=1,
)
submission_csv.to_csv('submission.csv', index=False)
submission_csv.head()

Unnamed: 0,id,team_A_scoring_within_10sec,team_B_scoring_within_10sec
0,0,0.00102,0.002494
1,1,0.002117,0.016953
2,2,0.000239,0.015794
3,3,0.008703,0.005521
4,4,0.003372,0.031378
