In [1]:
import polars as pl
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
import os
from colorama import Fore, Style
import lightgbm, xgboost, catboost
import pickle
import joblib

from sklearn.base import clone, BaseEstimator, TransformerMixin
from sklearn.model_selection import GroupKFold, GroupShuffleSplit, cross_val_score, cross_val_predict
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, PolynomialFeatures, SplineTransformer
from sklearn.kernel_approximation import Nystroem
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, HistGradientBoostingRegressor, BaggingRegressor
from sklearn.inspection import permutation_importance
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer#word2vec feature

import dill#serialize and deserialize objects (such as saving and loading tree models)
import re#python's built-in regular expressions.
import gc#rubbish collection
import warnings#avoid some negligible errors
#warnings.filterwarnings() installs a warning filter; the 'ignore' action below suppresses every warning from this point on.
warnings.filterwarnings('ignore')

import random#provide some function to generate random_seed.
#Seed the random number generators so that runs are reproducible.
def seed_everything(seed):
    """Seed NumPy's global RNG and Python's built-in ``random`` module.

    Args:
        seed: Value handed unchanged to ``np.random.seed`` and ``random.seed``.

    Returns:
        None.
    """
    # NumPy first, then the standard library -- same order as before.
    for reseed in (np.random.seed, random.seed):
        reseed(seed)
#Fix the global seeds once, before anything else runs.
seed_everything(seed=2024)

#Make pandas raise an exception (instead of warning) on chained assignment.
pd.set_option("mode.chained_assignment", "raise")

#Module-level containers; none of the visible cells read or write them.
#NOTE(review): presumably leftovers from training (fitted models / out-of-fold predictions) -- confirm before removing.
saved_models = {}
oof_pred = {}




In [2]:
def predict(test: pl.DataFrame, sample_sub: pl.DataFrame):
    """Score a frame of test rows with the saved stacking model.

    Steps: preprocess/feature-engineer the rows, one-hot encode the agent
    components with the saved encoder, align the columns to the features the
    model was fitted on, predict, and write the predictions into
    ``sample_sub``.

    Args:
        test: Raw rows as a polars DataFrame. Must contain ``agent1`` and
            ``agent2`` plus the numeric columns read during feature
            engineering (``NumRows``, ``NumColumns``, ``PlayoutsPerSecond``,
            ``MovesPerSecond``, ``DurationActions``, ``DurationTurnsStdDev``,
            ``AdvantageP1``, ``Balance``, ``StateTreeComplexity``,
            ``GameTreeComplexity``).
        sample_sub: Submission frame that receives the predictions.

    Returns:
        ``sample_sub`` with a ``utility_agent1`` column holding the model
        predictions.

    Raises:
        KeyError: If a column required by feature engineering or by the
            encoder is absent from ``test``.
    """

    # ------ preprocessing helper
    def preprocess(df_polars):
        """Return a pandas frame with parsed agent fields and engineered features.

        Each agent string is matched against ``MCTS-(.*)-(.*)-(.*)-(.*)`` and
        the four capture groups become ``<p>_selection``, ``<p>_exploration``
        (cast to Float32), ``<p>_playout`` and ``<p>_bounds``. The
        ``num_*_agent1`` outcome columns are dropped when present.
        """
        agent_pattern = r'MCTS-(.*)-(.*)-(.*)-(.*)'
        # (column suffix, capture group, cast to float?)
        agent_fields = (
            ('selection', 1, False),
            ('exploration', 2, True),
            ('playout', 3, False),
            ('bounds', 4, False),
        )
        agent_exprs = []
        for prefix, agent_col in (('p1', 'agent1'), ('p2', 'agent2')):
            for suffix, group, as_float in agent_fields:
                expr = pl.col(agent_col).str.extract(agent_pattern, group).alias(f'{prefix}_{suffix}')
                if as_float:
                    expr = expr.cast(pl.Float32)
                agent_exprs.append(expr)

        df = df_polars.with_columns(agent_exprs).drop(
            ['num_wins_agent1', 'num_draws_agent1', 'num_losses_agent1'],
            strict=False  # these columns may be absent; do not fail on that
        ).to_pandas()

        # Feature engineering
        eps = 1e-15  # keeps the divisions finite when a denominator is exactly 0
        df['area'] = df['NumRows'] * df['NumColumns']
        df['row_equal_col'] = (df['NumColumns'] == df['NumRows']).astype(np.int8)
        df['Playouts/Moves'] = df['PlayoutsPerSecond'] / (df['MovesPerSecond'] + eps)
        df['EfficiencyPerPlayout'] = df['MovesPerSecond'] / (df['PlayoutsPerSecond'] + eps)
        df['TurnsDurationEfficiency'] = df['DurationActions'] / (df['DurationTurnsStdDev'] + eps)
        df['AdvantageBalanceRatio'] = df['AdvantageP1'] / (df['Balance'] + eps)
        df['ActionTimeEfficiency'] = df['DurationActions'] / (df['MovesPerSecond'] + eps)
        df['StandardizedTurnsEfficiency'] = df['DurationTurnsStdDev'] / (df['DurationActions'] + eps)
        df['AdvantageTimeImpact'] = df['AdvantageP1'] / (df['DurationActions'] + eps)
        df['DurationToComplexityRatio'] = df['DurationActions'] / (df['StateTreeComplexity'] + eps)
        df['NormalizedGameTreeComplexity'] = df['GameTreeComplexity'] / (df['StateTreeComplexity'] + eps)
        df['ComplexityBalanceInteraction'] = df['Balance'] * df['GameTreeComplexity']
        df['OverallComplexity'] = df['StateTreeComplexity'] + df['GameTreeComplexity']

        # Handle outliers.
        # The clipping runs AFTER the ratio features above, so those ratios are
        # built from the unclipped values.
        # NOTE(review): order kept as-is on the assumption that the training
        # pipeline did the same -- confirm before reordering.
        print("Dealing with outliers")
        df['PlayoutsPerSecond'] = df['PlayoutsPerSecond'].clip(0, 25000)
        df['MovesPerSecond'] = df['MovesPerSecond'].clip(0, 1000000)

        return df

    ###############################
    # Preprocess and add the pairwise agent features
    test = preprocess(test)
    test['p_selection'] = (test.p1_selection.astype(str) + '-' + test.p2_selection.astype(str)).astype('category')
    test['p_exploration'] = test.p1_exploration - test.p2_exploration
    test['p_playout'] = (test.p1_playout.astype(str) + '-' + test.p2_playout.astype(str)).astype('category')

    print("preprocess succeed")
    print(test.columns)

    ###############################
    # One-hot encode the test data

    # Load the saved encoder.
    # NOTE: joblib.load unpickles arbitrary objects -- only load artifacts you
    # produced yourself.
    encoder_path = "/kaggle/input/submission1-finalmodel/onehot_encoder.pkl"  # where the encoder was saved
    encoder = joblib.load(encoder_path)
    print("OneHotEncoder loaded successfully.")

    # Columns handed to the encoder
    categorical_columns = ['p1_selection', 'p2_selection', 'p1_playout', 'p2_playout']

    # Transform only; the encoder is used as loaded and never re-fitted here.
    encoded = encoder.transform(test[categorical_columns])
    if hasattr(encoded, "toarray"):
        # A sparse result cannot be combined with explicit column names in
        # pd.DataFrame, so densify it first.
        encoded = encoded.toarray()
    encoded_columns = encoder.get_feature_names_out(categorical_columns)
    # The frame is built with exactly the encoder's output columns, so there
    # is no encoder column left to back-fill afterwards.
    test_encoded = pd.DataFrame(encoded, columns=encoded_columns, index=test.index)

    # Append the encoded columns to the test frame
    test = pd.concat([test.reset_index(drop=True), test_encoded.reset_index(drop=True)], axis=1)

    print("onehot incoding succeed")
    print(test.columns)

    # ------------------------ Predict -----------------------
    # Load model (same pickle caveat as for the encoder above)
    model_path = "/kaggle/input/submission1-finalmodel/stacking_model.pkl"
    loaded_model = joblib.load(model_path)
    print("Model loaded successfully.")

    # Feature names recorded on the model, in the order it expects them
    train_features = list(loaded_model.feature_names_in_)

    # Align the test frame to those features. Selecting only the features that
    # are present avoids the KeyError that test[train_features] raises when
    # one is missing; reindex then restores the full order and fills the
    # missing ones with 0.
    present_features = [col for col in train_features if col in test.columns]
    missing_features = [col for col in train_features if col not in test.columns]
    if missing_features:
        print(f"Features missing from test data, filled with 0: {missing_features}")
    test_aligned = test[present_features].reindex(columns=train_features, fill_value=0)

    # Predict
    preds = loaded_model.predict(test_aligned)

    # Add predictions to submission
    sample_sub = sample_sub.with_columns(pl.Series('utility_agent1', preds))

    return sample_sub

In [3]:
# # Code for a trial run in the local environment before submitting
# test = pl.read_csv('/kaggle/input/um-game-playing-strength-of-mcts-variants/test.csv')
# sample_sub = pl.DataFrame()
# predict(test, sample_sub)

In [4]:
import kaggle_evaluation.mcts_inference_server

In [5]:
# Hand predict() to the competition's inference server.
inference_server = kaggle_evaluation.mcts_inference_server.MCTSInferenceServer(predict)

# Without a (non-empty) KAGGLE_IS_COMPETITION_RERUN environment variable, run
# the local gateway against the test and sample-submission CSVs; with it, serve.
if not os.environ.get('KAGGLE_IS_COMPETITION_RERUN'):
    inference_server.run_local_gateway(
        (
            '/kaggle/input/um-game-playing-strength-of-mcts-variants/test.csv',
            '/kaggle/input/um-game-playing-strength-of-mcts-variants/sample_submission.csv',
        )
    )
else:
    inference_server.serve()

Dealing with outliers
preprocess succeed
Index(['Id', 'GameRulesetName', 'agent1', 'agent2', 'Properties', 'Format',
       'Time', 'Discrete', 'Realtime', 'Turns',
       ...
       'ActionTimeEfficiency', 'StandardizedTurnsEfficiency',
       'AdvantageTimeImpact', 'DurationToComplexityRatio',
       'NormalizedGameTreeComplexity', 'ComplexityBalanceInteraction',
       'OverallComplexity', 'p_selection', 'p_exploration', 'p_playout'],
      dtype='object', length=834)
OneHotEncoder loaded successfully.
onehot incoding succeed
Index(['Id', 'GameRulesetName', 'agent1', 'agent2', 'Properties', 'Format',
       'Time', 'Discrete', 'Realtime', 'Turns',
       ...
       'p1_selection_UCB1', 'p1_selection_UCB1GRAVE', 'p1_selection_UCB1Tuned',
       'p2_selection_UCB1', 'p2_selection_UCB1GRAVE', 'p2_selection_UCB1Tuned',
       'p1_playout_NST', 'p1_playout_Random200', 'p2_playout_NST',
       'p2_playout_Random200'],
      dtype='object', length=844)
Model loaded successfully.
