## MinMaxScaler 전체 코드

In [1]:
# ============================================================
# 방법 2 + MinMaxScaler 전체 코드
# ============================================================

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.metrics import ndcg_score
import lightgbm as lgb

from xgboost import XGBRanker

df = pd.read_csv(r"C:\work\Project\sports-analysis-fighter\csv_파일\8800개_데이터.csv")

# ============================================================
# 1) 질문 단위 train / test 분리
# ============================================================

queries_0 = df[df['사용자 유형'] == 0]['질문'].unique()
queries_1 = df[df['사용자 유형'] == 1]['질문'].unique()

train_q0, test_q0 = train_test_split(queries_0, test_size=0.2, random_state=42)
train_q1, test_q1 = train_test_split(queries_1, test_size=0.2, random_state=42)

train_df = df[df['질문'].isin(list(train_q0) + list(train_q1))].sort_values('질문').copy()
test_df  = df[df['질문'].isin(list(test_q0)  + list(test_q1))].sort_values('질문').copy()

# ============================================================
# 2) 팀 인코딩 (OrdinalEncoder, train에만 fit)
# ============================================================

enc = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)

train_df[['기존_응원팀_encoded', '매칭팀_encoded']] = enc.fit_transform(
    train_df[['기존 응원 팀', '매칭팀']]
)

test_df[['기존_응원팀_encoded', '매칭팀_encoded']] = enc.transform(
    test_df[['기존 응원 팀', '매칭팀']]
)

# ============================================================
# 3) 질문 벡터화 + 차원 축소 (TF-IDF + SVD)
# ============================================================

tfidf = TfidfVectorizer(ngram_range=(1, 2), min_df=2, max_df=0.9)

Xq_train_tfidf = tfidf.fit_transform(train_df['질문'])
Xq_test_tfidf  = tfidf.transform(test_df['질문'])

svd = TruncatedSVD(n_components=50, random_state=42)

Xq_train_svd = svd.fit_transform(Xq_train_tfidf)
Xq_test_svd  = svd.transform(Xq_test_tfidf)

# ============================================================
# 4) 질문 클러스터링 → q_cluster
# ============================================================

kmeans = KMeans(n_clusters=20, random_state=42, n_init="auto")

train_df['q_cluster'] = kmeans.fit_predict(Xq_train_svd)
test_df['q_cluster']  = kmeans.predict(Xq_test_svd)

# ============================================================
# 5) feature 구성 + One-Hot
# ============================================================

base_features = [
    '사용자 유형',
    'sbert_score',
    'n2v_score',
    'vector_score',
    '기존_응원팀_encoded',
    '매칭팀_encoded',
    'q_cluster'
]

X_train = train_df[base_features].copy()
X_test  = test_df[base_features].copy()

# q_cluster One-Hot
X_train = pd.get_dummies(X_train, columns=['q_cluster'], prefix='qcl')
X_test  = pd.get_dummies(X_test, columns=['q_cluster'], prefix='qcl')

# 컬럼 정렬
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

# ============================================================
# 6) ✅ MinMaxScaler 적용 (train에만 fit)
# ============================================================

scaler = MinMaxScaler()

# pandas → numpy 변환 후 스케일링 → 다시 DataFrame
X_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index
)

X_test_scaled = pd.DataFrame(
    scaler.transform(X_test),
    columns=X_test.columns,
    index=X_test.index
)

# ============================================================
# 7) 라벨 생성
# ============================================================

y_train = train_df['llm_rank'].apply(lambda x: 4 - x if x <= 3 else 0).astype(float)
y_test  = test_df['llm_rank'].apply(lambda x: 4 - x if x <= 3 else 0).astype(float)

# ============================================================
# 8) 그룹 정보
# ============================================================

group_train = train_df.groupby('질문').size().to_list()
group_test  = test_df.groupby('질문').size().to_list()

assert sum(group_train) == len(train_df)
assert sum(group_test)  == len(test_df)

# ============================================================
# 9) XGBRanker 학습
# ============================================================

model = lgb.LGBMRanker(
    objective="lambdarank",
    n_estimators=300,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_lambda=1.0,
    force_row_wise = True ,
    random_state=42
)

model.fit(X_train_scaled, y_train, group=group_train)

# ============================================================
# 10) 예측 + NDCG 평가
# ============================================================

pred_test = model.predict(X_test_scaled)

def mean_ndcg_by_group(y_true, y_pred, group_sizes, k=5):
    scores = []
    start = 0
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    for g in group_sizes:
        end = start + g
        kk = min(k, g)
        scores.append(
            ndcg_score(
                y_true[start:end].reshape(1, -1),
                y_pred[start:end].reshape(1, -1),
                k=kk
            )
        )
        start = end
    return float(np.mean(scores))

print("NDCG@3 :", mean_ndcg_by_group(y_test, pred_test, group_test, k=3))
print("NDCG@5 :", mean_ndcg_by_group(y_test, pred_test, group_test, k=5))
print("NDCG@10:", mean_ndcg_by_group(y_test, pred_test, group_test, k=10))

[LightGBM] [Info] Total Bins 845
[LightGBM] [Info] Number of data points in the train set: 6810, number of used features: 26
NDCG@3 : 0.676259584592097
NDCG@5 : 0.661541266557055
NDCG@10: 0.6545697732809159


In [2]:
from itertools import product
from xgboost import XGBRanker
from sklearn.metrics import ndcg_score
import numpy as np

param_grid = {
    "learning_rate": [0.05, 0.1],
    "max_depth": [4, 6],
    "n_estimators": [200, 400],
    "subsample": [0.8, 1.0]
}

best_score = -1
best_params = None

for lr, md, ne, ss in product(
    param_grid["learning_rate"],
    param_grid["max_depth"],
    param_grid["n_estimators"],
    param_grid["subsample"]
):
    model = lgb.LGBMRanker(
        objective="lambdarank",
        learning_rate=lr,
        max_depth=md,
        n_estimators=ne,
        subsample=ss,
        random_state=42
    )

    model.fit(
        X_train, y_train,
        group = group_train
    )

    preds = model.predict(X_test)

    score = ndcg_score(
        [y_test],
        [preds]
    )

    if score > best_score:
        best_score = score
        best_params = {
            "learning_rate": lr,
            "max_depth": md,
            "n_estimators": ne,
            "subsample": ss
        }

print("Best NDCG:", best_score)
print("Best Params:", best_params)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000614 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 846
[LightGBM] [Info] Number of data points in the train set: 6810, number of used features: 26
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000249 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 846
[LightGBM] [Info] Number of data points in the train set: 6810, number of used features: 26
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000248 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 846
[LightGBM] [Info] Number of data points in the train set: 6810, number of used features: 26
[LightGBM] 