In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip -q install optuna catboost tqdm ipywidgets

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.9/400.9 kB[0m [31m28.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m27.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m81.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# ============================================
# CatBoost (GPU 안전 스위치 + 경고 억제 + 견고성 패치)
# ============================================
import os, json, warnings
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import (
    average_precision_score, roc_auc_score, f1_score, accuracy_score,
    precision_recall_curve
)
import optuna
from tqdm.notebook import tqdm
from catboost import CatBoostClassifier, Pool

warnings.filterwarnings("ignore")
optuna.logging.set_verbosity(optuna.logging.WARNING)

# --------------------------------
# 파일 경로
# --------------------------------
FILE_PATHS = {
    "Amazon":  '/content/drive/MyDrive/1014/data/new_amazon.csv',
    "Coursera":'/content/drive/MyDrive/1014/data/new_coursera.csv',
    "Audible": '/content/drive/MyDrive/1014/data/new_audible.csv',
    "Hotel":   '/content/drive/MyDrive/1014/data/new_hotel.csv'
}

# --------------------------------
# S1 피처
# --------------------------------
S1_FEATURES = {
    "Amazon":  ['Average_Rating','Rating','Deviation_Of_Star_Ratings','Time_Lapsed','Price','Text_Length','Valence','Arousal','Title_Length','Num_of_Ratings','Is_Photo','Flesch_Reading_Ease','FOG_Index','Sentiment_Score','new_depth','new_breadth'],
    "Coursera":['Average_Rating','Rating','Deviation_Of_Star_Ratings','Time_Lapsed','Num_of_Reviews','Num_of_Enrolled','Num_of_top_instructor_courses','Num_of_top_instructor_learners','Text_Length','Valence','Arousal','Num_of_Ratings','Flesch_Reading_Ease','FOG_Index','Sentiment_Score','new_depth','new_breadth'],
    "Audible": ['Average_Rating','Rating','Deviation_Of_Star_Ratings','Time_Lapsed','Text_Length','Valence','Arousal','Title_Length','Num_of_Ratings','Flesch_Reading_Ease','FOG_Index','Sentiment_Score','new_depth','new_breadth'],
    "Hotel":   ['Average_Rating','Rating','Deviation_Of_Star_Ratings','Time_Lapsed','Text_Length','Valence','Arousal','Title_Length','Num_of_Ratings','Flesch_Reading_Ease','FOG_Index','Sentiment_Score','new_depth','new_breadth','Is_Photo','Hotel_Grade','Employee_Friendliness_Score','Facility_Score','Cleanliness_Score','Comfort_Score','Value_For_Money_Score','Location_Score']
}

# --------------------------------
# Config
# --------------------------------
TARGET_COLUMN   = 'binary_helpfulness'
TEST_SPLIT_RATIO= 0.2
RANDOM_STATE    = 42
N_TRIALS        = 50

# GPU 사용 여부 (코랩이 CPU면 False로)
USE_GPU = True
STATIC_CTB = {
    'task_type': 'GPU' if USE_GPU else 'CPU',
    # devices는 CPU에선 넘기지 않음
    **({'devices':'0'} if USE_GPU else {}),
    'logging_level': 'Silent',           # verbose와 동시 설정 금지!
    'bootstrap_type': 'Bernoulli',
    'random_seed': RANDOM_STATE,
    'allow_writing_files': False,        # 불필요한 로그/스냅샷 생성 방지
}

def _make_numeric_df(df: pd.DataFrame) -> pd.DataFrame:
    num = df.apply(pd.to_numeric, errors='coerce')
    med = num.median()
    return num.fillna(med)

def _best_threshold_by_f1(y_true, prob):
    precision, recall, thresholds = precision_recall_curve(y_true, prob)
    thresholds = np.concatenate([thresholds, [1.0]])
    f1s = (2 * precision * recall) / np.clip(precision + recall, 1e-9, None)
    idx = int(np.nanargmax(f1s))
    return float(thresholds[idx])

def run_s1_pipeline(platform, csv_path, features):
    print("\n" + "="*60)
    print(f"▶ Platform: {platform}")
    print("="*60)

    # Step 1. 데이터 로드
    df = pd.read_csv(csv_path)
    if TARGET_COLUMN not in df.columns:
        raise ValueError(f"[{platform}] '{TARGET_COLUMN}' 컬럼이 없습니다.")
    labels = df[TARGET_COLUMN].astype(int).values

    exists = [c for c in features if c in df.columns]
    if not exists:
        raise ValueError(f"[{platform}] 사용 가능한 S1 피처가 없습니다.")

    # 수치화 + Inf/NaN 방어
    X_all = (_make_numeric_df(df[exists])
             .replace([np.inf, -np.inf], np.nan)
             .fillna(0.0)
             .to_numpy())

    # Stratified split
    idx = np.arange(len(df))
    tr_idx, te_idx = train_test_split(
        idx, test_size=TEST_SPLIT_RATIO, random_state=RANDOM_STATE, stratify=labels
    )
    X_train, X_test = X_all[tr_idx], X_all[te_idx]
    y_train, y_test = labels[tr_idx], labels[te_idx]
    print(f"✅ Train={len(y_train)}, Test={len(y_test)}")

    # ---------- Optuna (CatBoost) ----------
    # 내부 메트릭은 GPU에서 경고 없는 'AUC'로, 선택은 외부 PR AUC로
    def objective(trial):
        tuned = {
            'objective': 'Logloss',
            'eval_metric': 'AUC',  # PRAUC 대신 AUC로 경고 제거
            'subsample': trial.suggest_float('subsample', 0.6, 1.0),
            'iterations': trial.suggest_int('iterations', 100, 2000, step=100),
            'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.3, log=True),
            'depth': trial.suggest_int('depth', 3, 12),
            'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-8, 10.0, log=True),
        }
        skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=RANDOM_STATE)
        pr_aucs = []
        for tr, va in skf.split(X_train, y_train):
            model = CatBoostClassifier(**STATIC_CTB, **tuned)
            model.fit(
                Pool(X_train[tr], y_train[tr]),
                eval_set=Pool(X_train[va], y_train[va]),
                early_stopping_rounds=100,
                use_best_model=True
            )
            prob = model.predict_proba(X_train[va])[:, 1]
            pr_aucs.append(average_precision_score(y_train[va], prob))
        return float(np.mean(pr_aucs))

    study = optuna.create_study(direction="maximize")
    with tqdm(total=N_TRIALS, desc=f"Optuna Tuning [{platform}]", unit="trial") as pbar:
        study.optimize(objective, n_trials=N_TRIALS, callbacks=[lambda s, t: pbar.update(1)])

    best_params = study.best_params
    print("🧪 Best Params:", best_params)

    # Step 3. 최종 학습
    clf = CatBoostClassifier(**STATIC_CTB, **best_params)
    clf.fit(
        Pool(X_train, y_train),
        eval_set=Pool(X_test, y_test),
        early_stopping_rounds=100,
        use_best_model=True
    )

    # Step 4. 평가 (+ 임계값 F1 기준)
    train_prob = clf.predict_proba(X_train)[:, 1]
    test_prob  = clf.predict_proba(X_test)[:, 1]
    best_th = _best_threshold_by_f1(y_train, train_prob)
    test_pred = (test_prob >= best_th).astype(int)

    metrics = {
        "Accuracy": float(accuracy_score(y_test, test_pred)),
        "PR_AUC":   float(average_precision_score(y_test, test_prob)),
        "ROC_AUC":  float(roc_auc_score(y_test, test_prob)),
        "F1_score": float(f1_score(y_test, test_pred)),
        "Best_Threshold": float(best_th)
    }
    print("=== Test Metrics ===", metrics)

    # Step 5. 저장
    save_dir = f"/content/drive/MyDrive/1014/result_catboost/{platform}"
    os.makedirs(save_dir, exist_ok=True)
    pd.DataFrame({
        "index": te_idx,
        "s1_pred_proba": test_prob,
        "y_true": y_test,
        "y_pred_at_best_th": test_pred
    }).to_csv(f"{save_dir}/s1_pred_proba.csv", index=False)
    with open(f"{save_dir}/results.json", "w") as f:
        json.dump(metrics, f, indent=2)
    clf.save_model(f"{save_dir}/catboost_model.cbm")

# ---- 전체 실행
for platform, path in tqdm(FILE_PATHS.items(), desc="전체 플랫폼 진행 (CatBoost)", unit="platform"):
    run_s1_pipeline(platform, path, S1_FEATURES[platform])

전체 플랫폼 진행 (CatBoost):   0%|          | 0/4 [00:00<?, ?platform/s]


▶ Platform: Amazon
✅ Train=71941, Test=17986


Optuna Tuning [Amazon]:   0%|          | 0/50 [00:00<?, ?trial/s]

Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric perio

🧪 Best Params: {'subsample': 0.9579966729248548, 'iterations': 1600, 'learning_rate': 0.01965109541178725, 'depth': 12, 'l2_leaf_reg': 9.764166424575516}
=== Test Metrics === {'Accuracy': 0.906038029578561, 'PR_AUC': 0.49698669322164424, 'ROC_AUC': 0.8459153803420699, 'F1_score': 0.476456009913259, 'Best_Threshold': 0.2859947908172602}

▶ Platform: Coursera
✅ Train=97108, Test=24278


Optuna Tuning [Coursera]:   0%|          | 0/50 [00:00<?, ?trial/s]

Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric perio

🧪 Best Params: {'subsample': 0.9025107655723889, 'iterations': 800, 'learning_rate': 0.058951220224030584, 'depth': 7, 'l2_leaf_reg': 1.4945756473335907}
=== Test Metrics === {'Accuracy': 0.9634648653101573, 'PR_AUC': 0.5845371375016998, 'ROC_AUC': 0.9332255513431789, 'F1_score': 0.5372978612415232, 'Best_Threshold': 0.2934766514647009}

▶ Platform: Audible
✅ Train=74391, Test=18598


Optuna Tuning [Audible]:   0%|          | 0/50 [00:00<?, ?trial/s]

Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric perio

🧪 Best Params: {'subsample': 0.8037268586686506, 'iterations': 1600, 'learning_rate': 0.05369871863770585, 'depth': 11, 'l2_leaf_reg': 9.74952562224928}
=== Test Metrics === {'Accuracy': 0.9263361651790515, 'PR_AUC': 0.4574738867252533, 'ROC_AUC': 0.8550650123524427, 'F1_score': 0.4635865309318716, 'Best_Threshold': 0.2175702245198038}

▶ Platform: Hotel
✅ Train=71604, Test=17901


Optuna Tuning [Hotel]:   0%|          | 0/50 [00:00<?, ?trial/s]

Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric perio