In [None]:
#from pytorch_tabnet.tab_model import TabNetRegressor
import warnings
import os
#environment provided by competition hoster
import kaggle_evaluation.mcts_inference_server
from pathlib import Path
warnings.filterwarnings('ignore')
import numpy as np
import polars as pl
import pandas as pd
import plotly.graph_objects as go
pd.options.display.max_rows = None
pd.options.display.max_columns = None
import lightgbm as lgb
from catboost import CatBoostRegressor
from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_squared_error as mse
import matplotlib.pyplot as plt
import pyarrow as pa
plt.rcParams['axes.unicode_minus'] = False
import json


In [None]:
class FE:
    """Feature engineering for the MCTS-variants data frames.

    Expands the two agent description strings into 0/1 indicator columns,
    compacts numeric dtypes, and drops identifier, outcome and uninformative
    columns.  Every column removed by :meth:`drop_cols` is remembered in
    ``self.bad_cols`` so the same columns can later be removed from other
    frames.
    """

    # Tokens looked for at each position of the '-'-separated agent string.
    # Keys are the token positions, kept as strings because they are embedded
    # verbatim in the generated column names.
    AGENT_TOKENS = {
        '1': ['UCB1', 'UCB1Tuned', 'UCB1GRAVE', 'ProgressiveHistory'],
        '2': ['0.1', '0.6', '1.41421356237'],
        '3': ['Random200', 'MAST', 'NST'],
        '4': ['true']
    }

    # Columns that drop_cols always removes when they are present.
    REDUNDANT_COLS = [
        'Id',
        'LudRules',
        'EnglishRules',
        'num_wins_agent1',
        'num_draws_agent1',
        'num_losses_agent1',
        'agent1',
        'agent2'
    ]

    def __init__(self, batch_size):
        """Store the configuration and start with an empty dropped-column log.

        Args:
            batch_size: Kept on the instance; no method of this class reads it.
        """
        self.batch_size = batch_size
        self.bad_cols = []

    @staticmethod
    def _compact_dtype(series):
        """Return the storage dtype (``'int16'`` or ``'float32'``) for a column.

        ``'int16'`` is chosen only for integer columns without missing values
        whose values all fit the int16 range; everything else is stored as
        ``'float32'``.
        """
        if not pd.api.types.is_integer_dtype(series.dtype):
            return 'float32'
        if series.empty or series.isna().any():
            return 'float32'
        bounds = np.iinfo('int16')
        fits = bounds.min <= series.min() and series.max() <= bounds.max
        return 'int16' if fits else 'float32'

    def _remember(self, cols):
        """Append ``cols`` to ``self.bad_cols`` without creating duplicates."""
        for col in cols:
            if col not in self.bad_cols:
                self.bad_cols.append(col)

    def agent_trans(self, df):
        """Add agent indicator columns and compact the numeric dtypes.

        For ``agent1`` and ``agent2`` the string is split on ``'-'`` and, for
        every token listed in ``AGENT_TOKENS``, a 0/1 column named
        ``agent{i}_split{position}_{token}`` is added.  An agent string that
        has no token at a given position yields 0 for that position.

        Args:
            df: Frame containing string columns ``agent1`` and ``agent2``.

        Returns:
            A new frame with the indicator columns appended, boolean columns
            converted to integers, and every column that was numeric on entry
            (plus the new indicators) stored as int16 or float32.
        """
        # Columns that are non-numeric on entry are treated as categorical and
        # are excluded from the numeric down-casting at the end.
        cat_cols = set(df.select_dtypes(exclude=['number']).columns)

        flags = {}
        for i in range(1, 3):
            # Split every agent string once instead of once per indicator.
            tokens = df[f'agent{i}'].str.split('-', expand=True)
            for key, values in self.AGENT_TOKENS.items():
                position = int(key)
                if position in tokens.columns:
                    token = tokens[position]
                else:
                    # No row has a token at this position: all indicators are 0.
                    token = pd.Series(None, index=df.index, dtype=object)
                for value in values:
                    flags[f'agent{i}_split{key}_{value}'] = token == value

        # Attach all indicators in a single concat; inserting them one by one
        # fragments the frame.
        flags = pd.DataFrame(flags)
        df = pd.concat([df.drop(columns=flags.columns, errors='ignore'), flags], axis=1)

        # Convert every boolean column to 0 / 1.
        bool_cols = df.select_dtypes(include=['bool']).columns
        df = df.astype({col: int for col in bool_cols})

        # Down-cast the non-categorical columns.  The dtype is decided from the
        # column dtype and value range: a value-level isinstance(..., int)
        # check never matches NumPy integer scalars.
        numeric_cols = [col for col in df.columns if col not in cat_cols]
        df = df.astype({col: self._compact_dtype(df[col]) for col in numeric_cols})

        return df

    def drop_cols(self, df, bad_cols=None):
        """Drop redundant, all-NaN and single-valued columns.

        Args:
            df: Frame to clean.
            bad_cols: Optional explicit list of columns to drop in place of the
                automatically detected single-valued columns.  Entries that
                are not present in ``df`` are ignored.

        Returns:
            A new frame without the dropped columns.  All dropped column names
            are recorded in ``self.bad_cols``.
        """
        # Redundant feature columns.
        redundant = [col for col in self.REDUNDANT_COLS if col in df.columns]
        df = df.drop(columns=redundant)

        # Columns that contain nothing but NaN.
        nan_cols = [col for col in df.columns if df[col].isna().all()]
        df = df.drop(columns=nan_cols)

        # Columns with a single distinct value (nunique ignores NaN, so a
        # column holding one value plus NaNs also counts).
        if bad_cols is None:
            bad_cols = [col for col in df.columns if df[col].nunique() == 1]
        bad_cols = list(bad_cols)
        df = df.drop(columns=bad_cols, errors='ignore')

        self._remember(redundant + nan_cols + bad_cols)

        return df

    def info(self, df):
        """Print the shape and deep memory usage (in MB) of ``df``."""
        print(f'Shape: {df.shape}')
        mem = df.memory_usage(deep=True).sum() / 1024**2
        print(f'Memory usage: {mem:.2f} MB\n')

    def apply_fe(self, path):
        """Read the CSV at ``path`` and run the full feature-engineering pipeline.

        Returns:
            The frame after :meth:`agent_trans` and :meth:`drop_cols`.
        """
        # Load the data.
        df = pd.read_csv(path)

        # Run the processing steps.
        df = self.agent_trans(df)
        df = self.drop_cols(df)
        self.info(df)

        return df


In [None]:
class CFG:
    """Notebook-wide constants: data location, CV settings and model parameters."""

    # --- data ---------------------------------------------------------------
    train_path = Path('/kaggle/input/um-game-playing-strength-of-mcts-variants/train.csv')
    batch_size = 65536

    # --- cross-validation / feature selection --------------------------------
    early_stop = 100
    n_features = 200
    n_splits = 5

    # --- plotting -------------------------------------------------------------
    color = '#C9A9A6'

    # --- LightGBM parameters --------------------------------------------------
    lgb_p = dict(
        objective='regression',
        min_child_samples=24,
        num_iterations=500,
        learning_rate=0.03,
        extra_trees=True,
        reg_lambda=0.8,
        reg_alpha=0.1,
        num_leaves=64,
        metric='rmse',
        device='cpu',
        max_depth=8,
        max_bin=128,
        verbose=-1,
        seed=42,
    )

    # --- CatBoost parameters --------------------------------------------------
    ctb_p = dict(
        loss_function='RMSE',
        learning_rate=0.03,
        num_trees=500,
        random_state=42,
        task_type='CPU',
        reg_lambda=0.8,
        depth=8,
    )

In [None]:
class MD:
    """Grouped cross-validation training and importance-based feature ranking.

    Supports two model families, selected by the prefix of the ``title``
    argument: ``'LightGBM'`` and ``'CatBoost'``.
    """

    def __init__(self, early_stop, n_features, n_splits, lgb_p, ctb_p, color):
        """Store the training configuration.

        Args:
            early_stop: Early-stopping patience handed to both model families.
            n_features: Number of top-ranked features that are kept by
                :meth:`feature_importance`.
            n_splits: Number of GroupKFold splits.
            lgb_p: Keyword arguments for ``lgb.LGBMRegressor``.
            ctb_p: Keyword arguments for ``CatBoostRegressor``.
            color: Marker colour of the per-fold points in :meth:`plot_cv`.
        """
        self.early_stop = early_stop
        self.n_features = n_features
        self.n_splits = n_splits
        self.lgb_p = lgb_p
        self.ctb_p = ctb_p
        self.color = color

    @staticmethod
    def _model_family(title):
        """Return ``'LightGBM'`` or ``'CatBoost'`` according to the title prefix.

        Raises:
            ValueError: If ``title`` starts with neither prefix.
        """
        for family in ('LightGBM', 'CatBoost'):
            if title.startswith(family):
                return family
        raise ValueError(
            f"Unsupported model title {title!r}: expected it to start with 'LightGBM' or 'CatBoost'."
        )

    def plot_cv(self, fold_scores, title):
        """Show an interactive plot of the per-fold RMSE scores and their mean.

        Args:
            fold_scores: One RMSE value per fold.
            title: Prefix used in the figure title.
        """
        # Summary statistics come from the raw scores; rounding is for display only.
        mean_score = round(float(np.mean(fold_scores)), 3)
        std_score = round(float(np.std(fold_scores)), 3)
        fold_scores = [round(score, 3) for score in fold_scores]

        fig = go.Figure()

        fig.add_trace(go.Scatter(
            x = list(range(1, len(fold_scores) + 1)),
            y = fold_scores,
            mode = 'markers',
            name = 'Fold Scores',
            marker = dict(size = 24, color=self.color, symbol='diamond'),
            text = [f'{score:.3f}' for score in fold_scores],
            hovertemplate = 'Fold %{x}: %{text}<extra></extra>',
            hoverlabel=dict(font=dict(size=16))
        ))

        fig.add_trace(go.Scatter(
            x = [1, len(fold_scores)],
            y = [mean_score, mean_score],
            mode = 'lines',
            name = f'Mean: {mean_score:.3f}',
            line = dict(dash = 'dash', color = '#FFBF00'),
            hoverinfo = 'none'
        ))

        fig.update_layout(
            title = f'{title} | Cross-Validation RMSE Scores | Variation of CV scores: {mean_score} ± {std_score}',
            xaxis_title = 'Fold',
            yaxis_title = 'RMSE Score',
            plot_bgcolor = 'rgba(0,0,0,0)',
            paper_bgcolor = 'rgba(0,0,0,0)',
            xaxis = dict(
                gridcolor = 'lightgray',
                tickmode = 'linear',
                tick0 = 1,
                dtick = 1,
                range = [0.5, len(fold_scores) + 0.5]
            ),
            yaxis = dict(gridcolor = 'lightgray')
        )

        fig.show()

    def train_model(self, data, title):
        """Train one model per GroupKFold split and plot the fold scores.

        Args:
            data: Frame holding the features, the target ``utility_agent1`` and
                the grouping column ``GameRulesetName`` (which also stays in
                the feature matrix as a categorical feature).
            title: Model selector / plot title; must start with ``'LightGBM'``
                or ``'CatBoost'``.

        Returns:
            List with the fitted model of every fold.

        Raises:
            ValueError: If ``title`` names an unsupported model family.
        """
        # Fail early instead of hitting an unbound ``model`` inside the loop.
        family = self._model_family(title)

        # Features (X), label (y) and grouping column (group) for CV.
        X = data.drop(['utility_agent1'], axis=1)
        y = data['utility_agent1']
        group = data['GameRulesetName']

        # Convert 'GameRulesetName' and any remaining object columns to 'category'.
        X['GameRulesetName'] = X['GameRulesetName'].astype('category')
        for col in X.select_dtypes(include=['object']).columns:
            X[col] = X[col].astype('category')

        # Declare every category-typed column as categorical so the models see
        # the same set of categorical features that the frame actually holds.
        categorical_cols = X.select_dtypes(include=['category']).columns.tolist()

        cv = GroupKFold(n_splits=self.n_splits)

        models, scores = [], []

        # Out-of-fold predictions, filled fold by fold.
        oof_preds = np.zeros(len(X))

        for train_index, valid_index in cv.split(X, y, group):
            X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
            y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

            if family == 'LightGBM':
                model = lgb.LGBMRegressor(**self.lgb_p)
                model.fit(X_train, y_train,
                          eval_set=[(X_valid, y_valid)],
                          eval_metric='rmse',
                          categorical_feature=categorical_cols,
                          callbacks=[lgb.early_stopping(self.early_stop, verbose=0), lgb.log_evaluation(0)])
            else:
                model = CatBoostRegressor(**self.ctb_p, verbose=0, cat_features=categorical_cols)
                model.fit(X_train, y_train,
                          eval_set=(X_valid, y_valid),
                          early_stopping_rounds=self.early_stop, verbose=0)

            models.append(model)

            oof_preds[valid_index] = model.predict(X_valid)
            # RMSE as sqrt(MSE): the ``squared=False`` keyword is not accepted
            # by current scikit-learn releases.
            score = float(np.sqrt(mse(y_valid, oof_preds[valid_index])))
            scores.append(score)

        self.plot_cv(scores, title)

        return models

    def feature_importance(self, data, title):
        """Rank features by fold-averaged importance and list the ones to drop.

        Args:
            data: Training frame, as accepted by :meth:`train_model`.
            title: Model selector, as accepted by :meth:`train_model`.

        Returns:
            Names of all features ranked below the top ``self.n_features``.

        Raises:
            ValueError: If ``title`` names an unsupported model family.
        """
        family = self._model_family(title)

        models = self.train_model(data, title)

        # Same column order as the feature matrix built in train_model.
        feature_cols = [col for col in data.columns if col != 'utility_agent1']

        feature_importances = np.zeros(len(feature_cols))
        for model in models:
            if family == 'LightGBM':
                feature_importances += model.feature_importances_ / len(models)
            else:
                feature_importances += model.get_feature_importance() / len(models)

        feature_importance = pd.DataFrame({
            'feature': feature_cols,
            'importance': feature_importances
        })

        feature_importance = feature_importance.sort_values('importance', ascending=False).reset_index(drop=True)

        # After reset_index the labels are ranks, so this keeps ranks >= n_features.
        drop_features = feature_importance.loc[self.n_features:, 'feature'].tolist()

        return drop_features

In [None]:
# Build the feature-engineering and modelling helpers from the shared
# configuration, then load and preprocess the training data.
fe = FE(batch_size=CFG.batch_size)
md = MD(
    early_stop=CFG.early_stop,
    n_features=CFG.n_features,
    n_splits=CFG.n_splits,
    lgb_p=CFG.lgb_p,
    ctb_p=CFG.ctb_p,
    color=CFG.color,
)
train = fe.apply_fe(path=CFG.train_path)

In [None]:

# Persist the engineered training frame as CSV, written in chunks of rows.
chunksize = 10_000
output_file = 'number_clean_feature.csv'

train.to_csv(
    output_file,
    index=False,
    header=True,
    mode='w',
    chunksize=chunksize,
)

In [None]:
# Features ranked below the top CFG.n_features by the LightGBM CV models.
drop_lgb_features = md.feature_importance(data=train, title='LightGBM')


In [None]:
# Re-create the modelling helper and repeat the ranking with the CatBoost CV models.
md = MD(
    early_stop=CFG.early_stop,
    n_features=CFG.n_features,
    n_splits=CFG.n_splits,
    lgb_p=CFG.lgb_p,
    ctb_p=CFG.ctb_p,
    color=CFG.color,
)
drop_ctb_features = md.feature_importance(data=train, title='CatBoost')

In [None]:
# Only features that both model families rank as unimportant are dropped.
lgb_drop_set = set(drop_lgb_features)
ctb_drop_set = set(drop_ctb_features)
drop_features = list(lgb_drop_set & ctb_drop_set)

In [None]:
#importances = pd.DataFrame({
#    'drop_features': drop_features
#})

#print(f'Shape: {importances.shape}')
#display(importances)
#importances.to_csv('importances.csv', index=False)

In [None]:
import lightgbm as lgb
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
import torch
import numpy as np
from sklearn.model_selection import GroupKFold
from sklearn.linear_model import LinearRegression

# LightGBM model definition
class LGBM_Model:
    """Thin wrapper around ``lgb.LGBMRegressor`` with a fixed parameter set.

    Exposes the ``train`` / ``predict`` pair used by the stacking ensemble.
    """

    def __init__(self):
        # No estimator exists until train() is called.
        self.model = None
        self.params = dict(
            objective='regression',
            metric='rmse',
            boosting_type='gbdt',
            num_leaves=31,
            learning_rate=0.05,
            feature_fraction=0.9,
            bagging_fraction=0.8,
            bagging_freq=5,
            max_depth=-1,
            n_estimators=5,
            early_stopping_rounds=40,
        )

    def train(self, X_train, y_train, X_valid, y_valid):
        """Fit a fresh regressor, using the validation pair as evaluation set."""
        evaluation = [(X_valid, y_valid)]
        self.model = lgb.LGBMRegressor(**self.params)
        self.model.fit(X_train, y_train, eval_set=evaluation)

    def predict(self, X):
        """Return predictions for ``X``.

        Raises:
            ValueError: If :meth:`train` has not been called yet.
        """
        if self.model is not None:
            return self.model.predict(X)
        raise ValueError("Model has not been trained yet.")

# CatBoost model definition
class CatBoost_Model:
    """Thin wrapper around ``CatBoostRegressor`` with a fixed parameter set.

    Exposes the ``train`` / ``predict`` pair used by the stacking ensemble.
    """

    def __init__(self):
        # No estimator exists until train() is called.
        self.model = None
        self.params = dict(
            iterations=5,
            learning_rate=0.05,
            depth=6,
            loss_function='RMSE',
            early_stopping_rounds=40,
            cat_features=[],
        )

    def train(self, X_train, y_train, X_valid, y_valid):
        """Fit a fresh regressor, using the validation pair as evaluation set."""
        evaluation = (X_valid, y_valid)
        self.model = CatBoostRegressor(**self.params)
        self.model.fit(X_train, y_train, eval_set=evaluation)

    def predict(self, X):
        """Return predictions for ``X``.

        Raises:
            ValueError: If :meth:`train` has not been called yet.
        """
        if self.model is not None:
            return self.model.predict(X)
        raise ValueError("Model has not been trained yet.")

# XGBoost model definition
class XGB_Model:
    """Thin wrapper around ``XGBRegressor`` with a fixed parameter set.

    Exposes the ``train`` / ``predict`` pair used by the stacking ensemble.
    """

    def __init__(self):
        # No estimator exists until train() is called.
        self.model = None
        self.params = dict(
            objective='reg:squarederror',
            learning_rate=0.05,
            max_depth=6,
            n_estimators=5,
            early_stopping_rounds=40,
        )

    def train(self, X_train, y_train, X_valid, y_valid):
        """Fit a fresh regressor, using the validation pair as evaluation set."""
        evaluation = [(X_valid, y_valid)]
        self.model = XGBRegressor(**self.params)
        self.model.fit(X_train, y_train, eval_set=evaluation)

    def predict(self, X):
        """Return predictions for ``X``.

        Raises:
            ValueError: If :meth:`train` has not been called yet.
        """
        if self.model is not None:
            return self.model.predict(X)
        raise ValueError("Model has not been trained yet.")

# NeuMF model definition
import torch
import torch.nn as nn
import torch.optim as optim

class NeuMF_Model:
    """Small feed-forward regressor used as a stacking base model.

    Despite the name, the network built by :meth:`build_model` is a plain
    one-hidden-layer MLP (Linear -> ReLU -> Linear).
    """

    def __init__(self, input_dim, hidden_dim=4, epochs=5, lr=0.001, log_every=10):
        """Store the architecture and optimisation settings.

        Args:
            input_dim: Number of input features.
            hidden_dim: Width of the hidden layer.
            epochs: Number of full-batch optimisation steps.
            lr: Adam learning rate.
            log_every: Report train / validation loss every this many epochs;
                the final epoch is always reported.
        """
        self.model = None
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.output_dim = 1  # regression: a single output value
        self.epochs = epochs
        self.lr = lr
        self.log_every = log_every
        # Device the network lives on; updated by train().
        self.device = torch.device("cpu")

    def build_model(self):
        """Return a new, untrained Linear -> ReLU -> Linear network."""
        model = nn.Sequential(
            nn.Linear(self.input_dim, self.hidden_dim),
            nn.ReLU(),
            nn.Linear(self.hidden_dim, self.output_dim)
        )
        return model

    def _as_tensor(self, values):
        """Convert a frame / series / array to a float32 tensor on ``self.device``."""
        return torch.tensor(np.asarray(values, dtype=np.float32)).to(self.device)

    def train(self, X_train, y_train, X_valid, y_valid):
        """Fit a fresh network with full-batch Adam steps on an MSE loss.

        Args:
            X_train, y_train: Training features and targets.
            X_valid, y_valid: Validation features and targets, used only for
                the reported validation loss.
        """
        # Use the GPU when one is available, otherwise the CPU, and remember
        # the choice so predict() places its inputs on the same device.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = self.build_model().to(self.device)

        criterion = nn.MSELoss()
        optimizer = optim.Adam(self.model.parameters(), lr=self.lr)

        train_data = self._as_tensor(X_train)
        train_labels = self._as_tensor(y_train).view(-1, 1)
        valid_data = self._as_tensor(X_valid)
        valid_labels = self._as_tensor(y_valid).view(-1, 1)

        epochs = self.epochs
        for epoch in range(epochs):
            self.model.train()
            optimizer.zero_grad()
            predictions = self.model(train_data)
            loss = criterion(predictions, train_labels)
            loss.backward()
            optimizer.step()

            # Report on the configured interval and always on the last epoch,
            # so a run shorter than the interval still shows a validation loss.
            on_interval = bool(self.log_every) and (epoch + 1) % self.log_every == 0
            if on_interval or epoch + 1 == epochs:
                self.model.eval()
                with torch.no_grad():  # evaluation needs no autograd graph
                    val_predictions = self.model(valid_data)
                    val_loss = criterion(val_predictions, valid_labels)
                print(f"Epoch [{epoch+1}/{epochs}], Train Loss: {loss.item():.4f}, Validation Loss: {val_loss.item():.4f}")

    def predict(self, X):
        """Return a 1-D NumPy array with one prediction per row of ``X``.

        Raises:
            ValueError: If :meth:`train` has not been called yet.
        """
        if self.model is None:
            raise ValueError("Model has not been trained yet.")
        self.model.eval()
        data = self._as_tensor(X)
        with torch.no_grad():
            predictions = self.model(data)
        return predictions.cpu().numpy().flatten()

# Stacking model definition
class Stacking_Model:
    """Two-level stacking ensemble.

    Base models are trained per GroupKFold split; their out-of-fold
    predictions are the inputs of a meta-model.  The base models fitted in the
    last fold are kept and used at prediction time.
    """

    def __init__(self, models, meta_model=None):
        """
        Args:
            models: Base model *classes* (not instances).  Each must provide
                ``train(X_train, y_train, X_valid, y_valid)`` and
                ``predict(X)``.
            meta_model: Estimator with ``fit`` / ``predict``; defaults to
                ``LinearRegression()``.
        """
        self.models = models
        # Explicit None check: do not rely on the truthiness of an estimator.
        self.meta_model = meta_model if meta_model is not None else LinearRegression()
        self.trained_models = []  # base models kept for prediction
        self.drop_features = []   # columns removed before training

    def train(self, X, y, groups, drop_features=None, n_splits=2):
        """Train the base models with grouped CV and fit the meta-model.

        Args:
            X: Feature frame.
            y: Target series, row-aligned with ``X``.
            groups: Group labels for ``GroupKFold``, row-aligned with ``X``.
            drop_features: Optional column names removed from ``X`` before
                training; names not present are ignored.
            n_splits: Number of GroupKFold splits.

        Returns:
            Array with, for every row of ``X``, the mean of the base models'
            out-of-fold predictions.
        """
        cv = GroupKFold(n_splits=n_splits)

        # Remove unwanted features once, up front.
        self.drop_features = list(drop_features) if drop_features else []
        if self.drop_features:
            X = X.drop(columns=self.drop_features, errors='ignore')

        # Start from a clean slate so repeated calls do not accumulate models.
        self.trained_models = []

        # One row per sample, one column per base model.  Filling by
        # validation index keeps every row aligned with ``y``; the validation
        # indices of a fold are in general not contiguous.
        meta_features = np.zeros((len(X), len(self.models)))

        for fold, (train_index, valid_index) in enumerate(cv.split(X, y, groups)):
            X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
            y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

            for position, model in enumerate(self.models):
                # Create a fresh model instance for this fold.
                model_instance = model(input_dim=X_train.shape[1]) if model is NeuMF_Model else model()

                print(f"Training {model.__name__}...")
                model_instance.train(X_train, y_train, X_valid, y_valid)

                # Only the models of the last fold are kept for prediction.
                if fold == n_splits - 1:
                    self.trained_models.append(model_instance)

                meta_features[valid_index, position] = model_instance.predict(X_valid)

        oof_preds = meta_features.mean(axis=1)

        # Train the meta-learner on the row-aligned out-of-fold predictions.
        self.meta_model.fit(meta_features, y)
        print("Stacking model training completed.")
        return oof_preds

    def predict(self, X):
        """Predict with the kept base models followed by the meta-model.

        Args:
            X: Feature frame; columns listed in the ``drop_features`` given to
                :meth:`train` are removed if still present.

        Raises:
            ValueError: If :meth:`train` has not been called yet.
        """
        if not self.trained_models:
            raise ValueError("Model has not been trained yet.")
        if self.drop_features:
            X = X.drop(columns=self.drop_features, errors='ignore')

        meta_features_test = []
        for model_instance in self.trained_models:
            print(f"Predicting with {model_instance.__class__.__name__}...")
            meta_features_test.append(model_instance.predict(X))

        # One column per base model, in the same order as during training.
        meta_features_test = np.column_stack(meta_features_test)
        print("Prediction completed.")
        return self.meta_model.predict(meta_features_test)


# Global training data for the stacking model
data = train
y = data['utility_agent1']
group = data['GameRulesetName']
# The target and the grouping column are not used as stacking features.
X = data.drop(columns=['utility_agent1', 'GameRulesetName'])


# Global stacking model: the list holds model classes, not instances.
models = [LGBM_Model, CatBoost_Model, XGB_Model, NeuMF_Model]
model = Stacking_Model(models)

# Becomes non-zero once the stacking model has been trained.
counter = 0

def predict(test, submission):
    """Inference-server callback: predict ``utility_agent1`` for one test batch.

    The stacking model is trained lazily on the first call (tracked by the
    module-level ``counter``) and reused afterwards.

    Args:
        test: Batch of test rows exposing ``to_pandas()`` (presumably a polars
            DataFrame supplied by the gateway).
        submission: Object exposing ``with_columns``; receives the predictions.

    Returns:
        ``submission`` with a ``utility_agent1`` column holding the predictions.

    Raises:
        KeyError: If the batch lacks a column the model was trained on.
    """
    global counter
    if counter == 0:
        # Train the stacking model once; later calls reuse it.
        model.train(X, y, group, drop_features=drop_features, n_splits=2)
        counter += 1

    features = fe.agent_trans(test.to_pandas())

    # Select exactly the columns the base models were trained on, in training
    # order.  This removes identifier / dropped columns in one step and does
    # not fail when a name in ``drop_features`` is absent from the batch.
    dropped = set(drop_features)
    feature_cols = [col for col in X.columns if col not in dropped]
    features = features[feature_cols]

    predictions = model.predict(features)

    # Attach the predictions as a polars Series for the final submission.
    return submission.with_columns(pl.Series('utility_agent1', predictions))


    
# Wrap the predict callback in the competition's inference server.
inference_server = kaggle_evaluation.mcts_inference_server.MCTSInferenceServer(predict)
if not os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
    # Local testing: run the gateway against the bundled test / sample-submission files.
    inference_server.run_local_gateway(
        (
            '/kaggle/input/um-game-playing-strength-of-mcts-variants/test.csv',
            '/kaggle/input/um-game-playing-strength-of-mcts-variants/sample_submission.csv'
        )
    )
else:
    # Competition rerun: serve predictions to the hosted gateway.
    inference_server.serve()