# 모듈 로딩 및 파이프라이닝 함수

사용된 모듈 버전은 아래와 같습니다
- lightgbm 4.1.0
- catboost 1.2.2
- xgboost 2.0.3
- scikit-learn 1.4.1.post1
- pandas 2.1.4

In [None]:
import warnings
import pandas as pd
from catboost import CatBoostRegressor
from sklearn.model_selection import cross_val_score
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.model_selection import KFold
from sklearn import set_config
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer

# Ask scikit-learn transformers to return pandas DataFrames from transform().
set_config(transform_output="pandas")

# Suppress every warning for the rest of the session.
warnings.filterwarnings("ignore")


def convert_traffic_medium_pipe(df):
    """Fold the "(not set)" traffic medium into "(none)".

    The ``traffic_medium`` column of ``df`` is overwritten in place and the
    same DataFrame object is returned, so the function can be chained with
    ``DataFrame.pipe``.
    """
    relabelled = df["traffic_medium"].replace({"(not set)": "(none)"})
    df["traffic_medium"] = relabelled
    return df


def filter_major_browser(df):
    """Normalise the ``browser`` column to a short list of major browsers.

    Browser variants are first merged into their parent name, then every
    value that is not one of the major browsers (missing values included)
    is replaced by "Other". ``df`` is modified in place and returned.
    """
    aliases = {
        "Safari (in-app)": "Safari",
        "Opera Mini": "Opera",
    }
    keep = [
        "Chrome",
        "Safari",
        "Android Webview",
        "Firefox",
        "Internet Explorer",
        "Opera",
        "Edge",
    ]

    df["browser"] = df["browser"].replace(aliases)
    is_minor = ~df["browser"].isin(keep)
    df.loc[is_minor, "browser"] = "Other"
    return df


def filter_os(df):
    """Replace every non-major value of the ``OS`` column with "Other".

    Values outside the major operating systems listed below (missing values
    included) are overwritten. ``df`` is modified in place and returned.
    """
    keep = ["Windows", "Macintosh", "Android", "iOS", "Linux", "Chrome OS"]
    is_minor = ~df["OS"].isin(keep)
    df.loc[is_minor, "OS"] = "Other"
    return df


def convert_region(df):
    """Add a ``region`` column: the country for large countries, else the continent.

    Rows whose ``country`` is one of the large countries keep that country
    name as their region; every other row (including rows with a missing
    country) takes the value of ``continent``. ``df`` is modified in place
    and returned so the function can be used with ``DataFrame.pipe``.
    """
    large_countries = ["United States"]
    is_large = df["country"].isin(large_countries)
    # A single row-wise selection between two columns of the same frame.
    # Assigning a filtered Series and then patching the gaps relies on index
    # labels: with repeated labels it copies values onto the wrong rows or
    # raises, and with no matching row it writes strings into a float column.
    df["region"] = df["country"].where(is_large, df["continent"])
    return df

# Scikit-Learn Transformer 기반 파이프라이닝 함수

In [None]:
def get_covering_list(df, column, percentage=0.95):
    """Return the most frequent values of ``df[column]`` up to a coverage level.

    Values are ranked by relative frequency. Every value whose cumulative
    share is still below ``percentage`` is returned, plus the next value in
    the ranking (the one that reaches or crosses the threshold).

    Args:
        df: DataFrame holding the column.
        column: Name of the column to analyse.
        percentage: Cumulative share (0-1) used as the cut-off.

    Returns:
        List of values, most frequent first.
    """
    running_share = df[column].value_counts(normalize=True).cumsum()
    n_below = int((running_share < percentage).sum())
    return running_share.index[: n_below + 1].tolist()


class PercentileTransformer(TransformerMixin, BaseEstimator):
    """Collapse the infrequent values of one column into the label "other".

    ``fit`` records the frequent values of ``column`` (as selected by
    ``get_covering_list`` for the given ``percentage``). ``transform``
    replaces every other value, missing values included, with "other" and
    returns the column as a pandas categorical.

    Args:
        column: Name of the column to transform.
        percentage: Cumulative frequency share passed to ``get_covering_list``.
    """

    def __init__(self, column, percentage=0.95):
        super().__init__()
        self.percentage = percentage
        self.column = column

    def fit(self, X, y=None):
        """Learn which values of ``column`` are frequent enough to keep."""
        self.covering_values = get_covering_list(X, self.column, self.percentage)
        return self

    def transform(self, X):
        """Return a copy of ``X`` with rare values of ``column`` set to "other"."""
        X = X.copy()
        kept = self.covering_values

        # Derive the category set from the fitted values only, so every batch
        # (train fold, validation fold, test set) gets the same categories and
        # therefore the same integer codes, even when a batch happens to miss
        # some of the kept values. Going through ``astype("category")`` keeps
        # pandas' usual ordering and removes duplicates.
        categories = (
            pd.Series([*kept, "other"], dtype=object).astype("category").cat.categories
        )

        # Work on plain objects: a categorical column cannot accept the new
        # label "other" unless it is already one of its categories.
        values = X[self.column].astype(object)
        X[self.column] = values.where(values.isin(kept), "other").astype(
            pd.CategoricalDtype(categories)
        )
        return X

# 총괄 파이프라인 함수

In [None]:
def get_categorical_covering_transformer(cat_cols, percentage=0.99):
    """Build a ColumnTransformer that applies a PercentileTransformer per column.

    Args:
        cat_cols: Names of the categorical columns to process.
        percentage: Either one number used for every column, or a mapping
            (anything with a ``get`` method) from column name to that
            column's coverage percentage.

    Returns:
        A ``ColumnTransformer`` with one single-step pipeline per column,
        named after the column, and ``remainder="passthrough"``.

    Raises:
        KeyError: If ``percentage`` is a mapping with no entry for one of
            ``cat_cols``.
    """
    # Local import keeps this function self-contained.
    from numbers import Real

    # Any real number (int, float, NumPy scalar) is a shared threshold;
    # everything else is treated as a per-column mapping.
    shared_threshold = isinstance(percentage, Real)

    transformers = []
    for col in cat_cols:
        if shared_threshold:
            col_percentage = percentage
        else:
            col_percentage = percentage.get(col)
            if col_percentage is None:
                # Fail here rather than with an opaque error during fit.
                raise KeyError(f"no coverage percentage given for column {col!r}")
        transformer = make_pipeline(
            PercentileTransformer(column=col, percentage=col_percentage)
        )
        transformers.append((col, transformer, [col]))
    return ColumnTransformer(transformers=transformers, remainder="passthrough")


def get_preprocessor(cat_cols, num_cols, percentage=0.99):
    """Assemble the preprocessing ColumnTransformer for the model pipeline.

    Numeric columns are mean-imputed and then standardised; categorical
    columns go through ``get_categorical_covering_transformer``. Columns in
    neither list are passed through unchanged.

    Args:
        cat_cols: Names of the categorical feature columns.
        num_cols: Names of the numeric feature columns.
        percentage: Coverage setting forwarded to
            ``get_categorical_covering_transformer``.

    Returns:
        The unfitted ``ColumnTransformer`` with branches named "num" and "cat".
    """
    numeric_steps = [
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
    ]
    branches = [
        ("num", Pipeline(steps=numeric_steps), num_cols),
        ("cat", get_categorical_covering_transformer(cat_cols, percentage), cat_cols),
    ]
    return ColumnTransformer(transformers=branches, remainder="passthrough")


def _load_and_clean(path):
    """Read one CSV file and apply the shared cleaning steps to it."""
    df = (
        pd.read_csv(path, index_col=0)
        .pipe(convert_traffic_medium_pipe)
        .pipe(filter_major_browser)
        .pipe(filter_os)
        .pipe(convert_region)
    )
    # Only non-numeric columns get the "unknown" placeholder. Writing a string
    # into a numeric column would turn it into an object column that a
    # mean imputer or scaler can no longer process; numeric gaps stay NaN.
    non_numeric = df.select_dtypes(exclude="number").columns
    return df.fillna(dict.fromkeys(non_numeric, "unknown"))


def load_data_after_pipeline():
    """Load the train and test CSV files and clean both the same way.

    Each file is read from ``data/`` with its first column as the index,
    passed through the traffic-medium, browser, OS and region helpers, and
    has missing values in its non-numeric columns replaced by "unknown".

    Returns:
        Tuple ``(df_train, df_test)`` of cleaned DataFrames.
    """
    return _load_and_clean("data/train.csv"), _load_and_clean("data/test.csv")

# 하이퍼파라미터 정의

In [None]:
# Numeric feature columns.
num_cols = [
    "quality", "duration",
    "transaction", "transaction_revenue",
    "new", "bounced",
]

# Categorical feature columns.
cat_cols = [
    "browser", "OS", "region",
    "traffic_medium", "traffic_source",
    "continent", "subcontinent",
]

# Prefixed column names; the "cat" variants are handed to CatBoost as its
# categorical features. Presumably these mirror the names the preprocessor
# emits; verify against its output.
processed_num_cols = ["num__" + name for name in num_cols]
processed_cat_cols = ["cat__{0}__{0}".format(name) for name in cat_cols]

# Random seed shared by the models and the K-fold splitter.
seed = 42

In [None]:
# Load the cleaned frames and cast every categorical feature to the pandas
# "category" dtype.
df_train, df_test = (
    frame.astype(dict.fromkeys(cat_cols, "category"))
    for frame in load_data_after_pipeline()
)

# Feature matrices (categorical columns first, then numeric) and the target.
X, test = (frame[cat_cols + num_cols] for frame in (df_train, df_test))
y = df_train["TARGET"]

# Per-column coverage threshold used when grouping rare categories.
percentage = dict(
    browser=0.86,
    OS=0.88,
    region=0.94,
    traffic_medium=0.86,
    traffic_source=0.98,
    continent=0.99,
    subcontinent=0.94,
)
preprocessor = get_preprocessor(cat_cols, num_cols, percentage=percentage)

In [None]:
# Hyper-parameters for the LightGBM base learner.
lgbm_params = dict(
    num_leaves=172,
    max_depth=9,
    learning_rate=0.01194948251805873,
    n_estimators=619,
    min_child_samples=35,
)

# Hyper-parameters for the XGBoost base learner.
xgb_params = dict(
    max_depth=10,
    learning_rate=0.05902429978427805,
    n_estimators=114,
    subsample=0.6403714650860028,
    colsample_bytree=0.7883825622005194,
    gamma=1.3729676312858936,
    reg_lambda=9.859187480942719,
    reg_alpha=4.4524950565152785,
)

# CatBoost runs with no extra keyword arguments.
cat_params = dict()

# 학습 및 교차검증

In [None]:

# Base learners of the stacked ensemble, all seeded with the same value.
lgb = LGBMRegressor(random_state=seed, verbose=-1, **lgbm_params)
xgb = XGBRegressor(random_state=seed, enable_categorical=True, **xgb_params)
cat = CatBoostRegressor(
    random_state=seed, cat_features=processed_cat_cols, verbose=False, **cat_params
)

estimators = [
    ("lgb", lgb),
    ("xgb", xgb),
    ("cat", cat),
]


# Shuffled 5-fold split used for the outer cross-validation.
kf = KFold(n_splits=5, shuffle=True, random_state=seed)
# The base learners' predictions are blended by a linear regression.
stack = StackingRegressor(estimators=estimators, final_estimator=LinearRegression())
pipe = Pipeline([("preprocessing", preprocessor), ("stack", stack)])


def _rmse(y_true, y_pred):
    """Return the root mean squared error of the predictions."""
    # Take the square root explicitly: the ``squared=False`` flag of
    # mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6.
    return mean_squared_error(y_true, y_pred) ** 0.5


rmse_scorer = make_scorer(_rmse)
cv_scores = cross_val_score(pipe, X, y, cv=kf, scoring=rmse_scorer)
# Mean RMSE over the folds.
print(f"{cv_scores.mean()}")

# 추론

In [None]:
# Refit the whole pipeline on all training rows, then write the test-set
# predictions into the TARGET column of the sample submission and save it.
model = pipe.fit(X, y)
df_submit = pd.read_csv("data/sample_submission.csv").assign(
    TARGET=model.predict(test)
)
df_submit.to_csv("submit.csv", index=False)