In [None]:
# Mount Google Drive at /content/drive; the CSV inputs and result folders
# referenced later in this notebook are paths under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Install dependencies (only needed the first time)
!pip -q install optuna tqdm ipywidgets

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/400.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.9/400.9 kB[0m [31m32.4 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.6 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m82.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
!pip -q install xgboost==1.7.6
# ↳ Be sure to restart the runtime afterwards (Colab: Runtime > Restart runtime)

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.3/200.3 MB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Sanity check: confirm which xgboost version is loaded and that
# XGBClassifier.fit accepts a `callbacks` argument (the training code passes one).
import xgboost as xgb, inspect
print(xgb.__version__)  # expect 1.7.6
"callbacks" in inspect.signature(xgb.XGBClassifier.fit).parameters  # True

1.7.6


True

In [None]:
import os, json, warnings
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import (
    average_precision_score, roc_auc_score, f1_score, accuracy_score,
    precision_recall_curve
)
import optuna
from tqdm.notebook import tqdm
import xgboost as xgb

# NOTE(review): this suppresses every Python warning for the whole session,
# including deprecation notices from the libraries used below.
warnings.filterwarnings("ignore")
# Only let Optuna log messages at WARNING level or above.
optuna.logging.set_verbosity(optuna.logging.WARNING)

# --------------------------------
# Input CSV location for each platform
# --------------------------------
FILE_PATHS = dict(
    Amazon="/content/drive/MyDrive/1014/data/new_amazon.csv",
    Coursera="/content/drive/MyDrive/1014/data/new_coursera.csv",
    Audible="/content/drive/MyDrive/1014/data/new_audible.csv",
    Hotel="/content/drive/MyDrive/1014/data/new_hotel.csv",
)

# --------------------------------
# S1 feature columns per platform (list order is the column order fed to the model)
# --------------------------------
S1_FEATURES = {
    "Amazon": [
        "Average_Rating", "Rating", "Deviation_Of_Star_Ratings", "Time_Lapsed",
        "Price", "Text_Length", "Valence", "Arousal",
        "Title_Length", "Num_of_Ratings", "Is_Photo",
        "Flesch_Reading_Ease", "FOG_Index", "Sentiment_Score",
        "new_depth", "new_breadth",
    ],
    "Coursera": [
        "Average_Rating", "Rating", "Deviation_Of_Star_Ratings", "Time_Lapsed",
        "Num_of_Reviews", "Num_of_Enrolled",
        "Num_of_top_instructor_courses", "Num_of_top_instructor_learners",
        "Text_Length", "Valence", "Arousal", "Num_of_Ratings",
        "Flesch_Reading_Ease", "FOG_Index", "Sentiment_Score",
        "new_depth", "new_breadth",
    ],
    "Audible": [
        "Average_Rating", "Rating", "Deviation_Of_Star_Ratings", "Time_Lapsed",
        "Text_Length", "Valence", "Arousal",
        "Title_Length", "Num_of_Ratings",
        "Flesch_Reading_Ease", "FOG_Index", "Sentiment_Score",
        "new_depth", "new_breadth",
    ],
    "Hotel": [
        "Average_Rating", "Rating", "Deviation_Of_Star_Ratings", "Time_Lapsed",
        "Text_Length", "Valence", "Arousal",
        "Title_Length", "Num_of_Ratings",
        "Flesch_Reading_Ease", "FOG_Index", "Sentiment_Score",
        "new_depth", "new_breadth",
        "Is_Photo", "Hotel_Grade",
        "Employee_Friendliness_Score", "Facility_Score", "Cleanliness_Score",
        "Comfort_Score", "Value_For_Money_Score", "Location_Score",
    ],
}
# --------------------------------
# Run configuration
# --------------------------------
TARGET_COLUMN: str = "binary_helpfulness"   # label column read from each CSV
TEST_SPLIT_RATIO: float = 0.2               # fraction of rows held out as the test set
RANDOM_STATE: int = 42                      # seed shared by the splits and the models
N_TRIALS: int = 50                          # Optuna trials per platform

def _make_numeric_df(df: pd.DataFrame) -> pd.DataFrame:
    num = df.apply(pd.to_numeric, errors='coerce')
    med = num.median()
    return num.fillna(med)

def find_best_threshold(y_true, prob):
    """Pick the probability cut-off with the highest F1 on the given data.

    Args:
        y_true: Ground-truth binary labels.
        prob: Predicted positive-class scores for the same samples.

    Returns:
        Tuple ``(threshold, f1)`` as Python floats: the cut-off whose F1 is
        the largest along the precision-recall curve, and that F1 value.
    """
    prec, rec, cutoffs = precision_recall_curve(y_true, prob)
    # Pad the cut-offs with 1.0 so they can be indexed in step with the
    # precision/recall arrays used to build the F1 curve.
    cutoffs = np.append(cutoffs, 1.0)
    # Clip the denominator so a zero precision+recall yields F1 = 0 instead of NaN.
    denominator = np.clip(prec + rec, 1e-9, None)
    f1_curve = (2 * prec * rec) / denominator
    best = int(np.nanargmax(f1_curve))
    return float(cutoffs[best]), float(f1_curve[best])

def run_s1_pipeline(platform, csv_path, features):
    """Tune, train, evaluate and persist the S1 XGBoost classifier for one platform.

    Pipeline:
      1. Load the CSV and coerce the requested feature columns to numeric.
      2. Stratified train/test split (``TEST_SPLIT_RATIO``, ``RANDOM_STATE``).
      3. Optuna search (``N_TRIALS`` trials) maximising the mean average
         precision over a 3-fold stratified CV of the training split.
      4. Refit with the best parameters, early-stopping on a validation slice
         carved out of the training split.
      5. Choose the F1-optimal decision threshold on that validation slice,
         then score the untouched test split.
      6. Write predictions, metrics and the model under the platform's
         result folder.

    The test split is used only for the final metrics. Early stopping and
    threshold selection use a held-out validation slice instead, so the
    reported numbers are not tuned on the data they are measured on.

    Args:
        platform: Platform name; used in log output and in the save directory.
        csv_path: Path of the CSV to load; it must contain ``TARGET_COLUMN``.
        features: Candidate feature column names. Names absent from the CSV
            are reported and skipped.

    Raises:
        ValueError: If none of the requested feature columns exist in the CSV.
    """
    print("\n" + "="*60)
    print(f"▶ Platform: {platform}")
    print("="*60)

    # Step 1. Load data
    df = pd.read_csv(csv_path)
    labels = df[TARGET_COLUMN].astype(int).values
    exists = [c for c in features if c in df.columns]
    missing = [c for c in features if c not in df.columns]
    if missing:
        # Surface schema drift instead of silently training on fewer features.
        print(f"⚠️ Missing feature columns (skipped): {missing}")
    if not exists:
        raise ValueError(
            f"None of the requested feature columns are present in {csv_path}"
        )
    X_all = _make_numeric_df(df[exists]).to_numpy()

    # Stratified train/test split (row positions are kept so predictions can be joined back)
    idx = np.arange(len(df))
    tr_idx, te_idx = train_test_split(idx, test_size=TEST_SPLIT_RATIO,
                                      random_state=RANDOM_STATE, stratify=labels)
    X_train, X_test = X_all[tr_idx], X_all[te_idx]
    y_train, y_test = labels[tr_idx], labels[te_idx]
    print(f"✅ Train={len(y_train)}, Test={len(y_test)}")

    # Validation slice taken from the training split only. It drives early
    # stopping and threshold selection for the final model, keeping the test
    # split out of every model-selection decision.
    fit_pos, val_pos = train_test_split(np.arange(len(y_train)),
                                        test_size=TEST_SPLIT_RATIO,
                                        random_state=RANDOM_STATE,
                                        stratify=y_train)
    X_fit, X_val = X_train[fit_pos], X_train[val_pos]
    y_fit, y_val = y_train[fit_pos], y_train[val_pos]
    print(f"✅ Fit={len(y_fit)}, Val={len(y_val)}")

    # ---------- Optuna (XGBoost + GPU) ----------
    def objective(trial):
        """Return the mean CV average precision for one sampled configuration."""
        params = {
            "objective": "binary:logistic",
            "eval_metric": "aucpr",          # PR AUC
            "tree_method": "gpu_hist",       # GPU
            "random_state": RANDOM_STATE,
            "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.3, log=True),
            "max_depth": trial.suggest_int("max_depth", 3, 12),
            "min_child_weight": trial.suggest_int("min_child_weight", 1, 50),
            "subsample": trial.suggest_float("subsample", 0.6, 1.0),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
            "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 10.0, log=True),
            "reg_alpha":  trial.suggest_float("reg_alpha",  1e-8, 10.0, log=True),
            "n_estimators": trial.suggest_int("n_estimators", 200, 2000, step=100),
        }

        skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=RANDOM_STATE)
        pr_aucs = []
        for tr, va in skf.split(X_train, y_train):
            model = xgb.XGBClassifier(**params, verbosity=0)
            model.fit(
                X_train[tr], y_train[tr],
                eval_set=[(X_train[va], y_train[va])],
                verbose=False,
                callbacks=[xgb.callback.EarlyStopping(rounds=100, save_best=True)]
            )
            prob = model.predict_proba(X_train[va])[:, 1]
            pr_aucs.append(average_precision_score(y_train[va], prob))
        return float(np.mean(pr_aucs))

    # Seed the sampler so the search is reproducible across runs.
    study = optuna.create_study(
        direction="maximize",
        sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE),
    )
    # Notebook tqdm bar advanced once per finished trial
    with tqdm(total=N_TRIALS, desc=f"Optuna Tuning [{platform}]", unit="trial") as pbar:
        def cb(study, trial):
            pbar.update(1)
        study.optimize(objective, n_trials=N_TRIALS, callbacks=[cb])

    best_params = study.best_params
    print("🧪 Best Params:", best_params)

    # Step 3. Final training.
    # Objective and eval_metric are repeated here because study.best_params
    # only holds the sampled values; without eval_metric the early stopping
    # would monitor the default logloss rather than the PR-AUC that was tuned.
    clf = xgb.XGBClassifier(
        **best_params,
        objective="binary:logistic",
        eval_metric="aucpr",
        tree_method="gpu_hist",
        random_state=RANDOM_STATE,
        verbosity=1
    )
    clf.fit(
        X_fit, y_fit,
        eval_set=[(X_val, y_val)],
        verbose=100,  # log every 100th round instead of flooding the notebook
        callbacks=[xgb.callback.EarlyStopping(rounds=100, save_best=True)]
    )

    # Step 4. Evaluation.
    # The threshold is chosen on validation predictions (data the trees were
    # not fitted on), then applied unchanged to the test split.
    best_th, _ = find_best_threshold(y_val, clf.predict_proba(X_val)[:, 1])
    test_prob = clf.predict_proba(X_test)[:, 1]
    test_pred = (test_prob >= best_th).astype(int)
    metrics = {
        "Accuracy": accuracy_score(y_test, test_pred),
        "PR_AUC": average_precision_score(y_test, test_prob),
        "ROC_AUC": roc_auc_score(y_test, test_prob),
        "F1_score": f1_score(y_test, test_pred),
        "Best_Threshold": float(best_th)
    }
    print("=== Test Metrics ===", metrics)

    # Step 5. Persist predictions, metrics and the fitted model
    save_dir = f"/content/drive/MyDrive/1014/result_xgboost/{platform}"
    os.makedirs(save_dir, exist_ok=True)
    pd.DataFrame({"index": te_idx, "s1_pred_proba": test_prob,
                  "y_true": y_test, "y_pred_at_best_th": test_pred}
                ).to_csv(f"{save_dir}/s1_pred_proba.csv", index=False)
    with open(f"{save_dir}/results.json", "w") as f:
        json.dump(metrics, f, indent=2)
    clf.save_model(f"{save_dir}/xgboost_model.json")

# ---- Run every platform (the platform loop also gets a notebook tqdm bar)
for platform, path in tqdm(FILE_PATHS.items(), desc="전체 플랫폼 진행 (XGBoost)", unit="platform"):
    run_s1_pipeline(platform, path, S1_FEATURES[platform])

전체 플랫폼 진행 (XGBoost):   0%|          | 0/4 [00:00<?, ?platform/s]


▶ Platform: Amazon
✅ Train=71941, Test=17986


Optuna Tuning [Amazon]:   0%|          | 0/50 [00:00<?, ?trial/s]

🧪 Best Params: {'learning_rate': 0.011344637676877104, 'max_depth': 8, 'min_child_weight': 4, 'subsample': 0.9508782589812411, 'colsample_bytree': 0.6807000753045458, 'reg_lambda': 1.7150525227726505e-07, 'reg_alpha': 0.009501046375036239, 'n_estimators': 1800}
[0]	validation_0-logloss:0.68508
[1]	validation_0-logloss:0.67728
[2]	validation_0-logloss:0.66957
[3]	validation_0-logloss:0.66222
[4]	validation_0-logloss:0.65506
[5]	validation_0-logloss:0.64807
[6]	validation_0-logloss:0.64109
[7]	validation_0-logloss:0.63431
[8]	validation_0-logloss:0.62774
[9]	validation_0-logloss:0.62121
[10]	validation_0-logloss:0.61471
[11]	validation_0-logloss:0.60836
[12]	validation_0-logloss:0.60212
[13]	validation_0-logloss:0.59612
[14]	validation_0-logloss:0.59025
[15]	validation_0-logloss:0.58446
[16]	validation_0-logloss:0.57881
[17]	validation_0-logloss:0.57317
[18]	validation_0-logloss:0.56762
[19]	validation_0-logloss:0.56223
[20]	validation_0-logloss:0.55710
[21]	validation_0-logloss:0.55187


Optuna Tuning [Coursera]:   0%|          | 0/50 [00:00<?, ?trial/s]

🧪 Best Params: {'learning_rate': 0.011092648998599828, 'max_depth': 12, 'min_child_weight': 3, 'subsample': 0.8639355397374132, 'colsample_bytree': 0.6403645176457127, 'reg_lambda': 3.223901271128392e-08, 'reg_alpha': 0.0019376451512578927, 'n_estimators': 1400}
[0]	validation_0-logloss:0.68338
[1]	validation_0-logloss:0.67386
[2]	validation_0-logloss:0.66461
[3]	validation_0-logloss:0.65560
[4]	validation_0-logloss:0.64677
[5]	validation_0-logloss:0.63807
[6]	validation_0-logloss:0.62960
[7]	validation_0-logloss:0.62121
[8]	validation_0-logloss:0.61308
[9]	validation_0-logloss:0.60501
[10]	validation_0-logloss:0.59707
[11]	validation_0-logloss:0.58950
[12]	validation_0-logloss:0.58210
[13]	validation_0-logloss:0.57477
[14]	validation_0-logloss:0.56755
[15]	validation_0-logloss:0.56038
[16]	validation_0-logloss:0.55341
[17]	validation_0-logloss:0.54669
[18]	validation_0-logloss:0.54002
[19]	validation_0-logloss:0.53339
[20]	validation_0-logloss:0.52694
[21]	validation_0-logloss:0.52070

Optuna Tuning [Audible]:   0%|          | 0/50 [00:00<?, ?trial/s]

🧪 Best Params: {'learning_rate': 0.017562415894736644, 'max_depth': 11, 'min_child_weight': 1, 'subsample': 0.8034011259919875, 'colsample_bytree': 0.7860663067724507, 'reg_lambda': 0.030790206359967643, 'reg_alpha': 0.00048735975467172917, 'n_estimators': 400}
[0]	validation_0-logloss:0.67931
[1]	validation_0-logloss:0.66658
[2]	validation_0-logloss:0.65434
[3]	validation_0-logloss:0.64194
[4]	validation_0-logloss:0.63043
[5]	validation_0-logloss:0.61879
[6]	validation_0-logloss:0.60744
[7]	validation_0-logloss:0.59708
[8]	validation_0-logloss:0.58646
[9]	validation_0-logloss:0.57611
[10]	validation_0-logloss:0.56601
[11]	validation_0-logloss:0.55623
[12]	validation_0-logloss:0.54724
[13]	validation_0-logloss:0.53807
[14]	validation_0-logloss:0.52969
[15]	validation_0-logloss:0.52106
[16]	validation_0-logloss:0.51325
[17]	validation_0-logloss:0.50511
[18]	validation_0-logloss:0.49764
[19]	validation_0-logloss:0.48992
[20]	validation_0-logloss:0.48291
[21]	validation_0-logloss:0.47566


Optuna Tuning [Hotel]:   0%|          | 0/50 [00:00<?, ?trial/s]

🧪 Best Params: {'learning_rate': 0.04195251410296209, 'max_depth': 12, 'min_child_weight': 8, 'subsample': 0.9986958543413751, 'colsample_bytree': 0.7078197250916807, 'reg_lambda': 0.7264833427951398, 'reg_alpha': 2.386473189194866, 'n_estimators': 500}
[0]	validation_0-logloss:0.66542
[1]	validation_0-logloss:0.63969
[2]	validation_0-logloss:0.61576
[3]	validation_0-logloss:0.59366
[4]	validation_0-logloss:0.57323
[5]	validation_0-logloss:0.55420
[6]	validation_0-logloss:0.53685
[7]	validation_0-logloss:0.52061
[8]	validation_0-logloss:0.50547
[9]	validation_0-logloss:0.49117
[10]	validation_0-logloss:0.47815
[11]	validation_0-logloss:0.46591
[12]	validation_0-logloss:0.45437
[13]	validation_0-logloss:0.44376
[14]	validation_0-logloss:0.43374
[15]	validation_0-logloss:0.42418
[16]	validation_0-logloss:0.41540
[17]	validation_0-logloss:0.40723
[18]	validation_0-logloss:0.39936
[19]	validation_0-logloss:0.39199
[20]	validation_0-logloss:0.38495
[21]	validation_0-logloss:0.37867
[22]	val