In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


	•	stratify로 클래스 균형 맞추고 random_state=42 고정해서 분할
	•	LightGBM + Optuna(PR_AUC 기준) 튜닝
	•	Accuracy / PR_AUC / ROC_AUC / F1_score 출력
	•	테스트 확률을 s1_pred_proba.csv로 저장


In [None]:
# 의존성
!apt-get -y update
!apt-get -y install --no-install-recommends \
    cmake git \
    libboost-dev libboost-system-dev libboost-filesystem-dev \
    ocl-icd-opencl-dev opencl-headers

# 소스 클론(서브모듈 포함)
%cd /content
!rm -rf LightGBM
!git clone --recursive https://github.com/microsoft/LightGBM
%cd LightGBM

# GPU 빌드 (중요: build 폴더에서 cmake ..)
!mkdir -p build
%cd build
!cmake -DUSE_GPU=1 ..
!make -j$(nproc)

# 파이썬 패키지 설치
%cd /content/LightGBM/python-package
!python setup.py install

# 확인
import lightgbm as lgb
print("LightGBM version:", lgb.__version__)

0% [Working]            Hit:1 https://cli.github.com/packages stable InRelease
0% [Connecting to archive.ubuntu.com (91.189.91.81)] [Connecting to security.ub                                                                               Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
0% [Waiting for headers] [Connecting to security.ubuntu.com] [Waiting for heade                                                                               Hit:3 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
0% [Waiting for headers] [Connecting to security.ubuntu.com] [Waiting for heade                                                                               Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
0% [Connecting to security.ubuntu.com] [Waiting for headers] [Connecting to ppa0% [Waiting for headers] [Connecting to security.ubuntu.com (185.125.190.81)] [                                                    

In [None]:
import os, json, warnings
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import (
    average_precision_score, roc_auc_score, f1_score, accuracy_score,
    precision_recall_curve
)
import lightgbm as lgb
import optuna
from tqdm.notebook import tqdm

warnings.filterwarnings("ignore")
optuna.logging.set_verbosity(optuna.logging.WARNING)

# --------------------------------
# 파일 경로 (단일 CSV)
# --------------------------------
FILE_PATHS = {
    "Amazon":  '/content/drive/MyDrive/1014/data/new_amazon.csv',
    "Coursera":'/content/drive/MyDrive/1014/data/new_coursera.csv',
    "Audible": '/content/drive/MyDrive/1014/data/new_audible.csv',
    "Hotel":   '/content/drive/MyDrive/1014/data/new_hotel.csv'
}

# --------------------------------
# S1 피처
# --------------------------------
S1_FEATURES = {
    "Amazon":  ['Average_Rating','Rating','Deviation_Of_Star_Ratings','Time_Lapsed','Price','Text_Length','Valence','Arousal','Title_Length','Num_of_Ratings','Is_Photo','Flesch_Reading_Ease','FOG_Index','Sentiment_Score','new_depth','new_breadth'],
    "Coursera":['Average_Rating','Rating','Deviation_Of_Star_Ratings','Time_Lapsed','Num_of_Reviews','Num_of_Enrolled','Num_of_top_instructor_courses','Num_of_top_instructor_learners','Text_Length','Valence','Arousal','Num_of_Ratings','Flesch_Reading_Ease','FOG_Index','Sentiment_Score','new_depth','new_breadth'],
    "Audible": ['Average_Rating','Rating','Deviation_Of_Star_Ratings','Time_Lapsed','Text_Length','Valence','Arousal','Title_Length','Num_of_Ratings','Flesch_Reading_Ease','FOG_Index','Sentiment_Score','new_depth','new_breadth'],
    "Hotel":   ['Average_Rating','Rating','Deviation_Of_Star_Ratings','Time_Lapsed','Text_Length','Valence','Arousal','Title_Length','Num_of_Ratings','Flesch_Reading_Ease','FOG_Index','Sentiment_Score','new_depth','new_breadth','Is_Photo','Hotel_Grade','Employee_Friendliness_Score','Facility_Score','Cleanliness_Score','Comfort_Score','Value_For_Money_Score','Location_Score']
}

# --------------------------------
# 공통 Config
# --------------------------------
TARGET_COLUMN = 'binary_helpfulness'
TEST_SPLIT_RATIO = 0.2
RANDOM_STATE = 42
N_TRIALS = 50

# ✅ (선택) GPU 시도: 코랩의 LightGBM이 GPU 빌드가 아닐 수 있음
GPU_PARAMS = {"device": "gpu"}  # 필요 시 {"device_type": "gpu"}

def _make_numeric_df(df: pd.DataFrame) -> pd.DataFrame:
    num = df.apply(pd.to_numeric, errors='coerce')
    med = num.median()
    return num.fillna(med)

def find_best_threshold(y_true, prob):
    precision, recall, thresholds = precision_recall_curve(y_true, prob)
    thresholds = np.concatenate([thresholds, [1.0]])
    f1s = (2 * precision * recall) / np.clip(precision + recall, 1e-9, None)
    idx = int(np.nanargmax(f1s))
    return float(thresholds[idx]), float(f1s[idx])

def run_s1_pipeline(platform, csv_path, features):
    print("\n" + "="*60)
    print(f"▶ Platform: {platform}")
    print("="*60)

    # Step 1. 데이터 로드
    df = pd.read_csv(csv_path)
    assert TARGET_COLUMN in df.columns, f"{TARGET_COLUMN} 없음"
    labels = df[TARGET_COLUMN].astype(int).values

    exists = [c for c in features if c in df.columns]
    missing = sorted(set(features) - set(exists))
    if missing:
        print(f"[경고] 제외된 피처: {missing}")

    X_all = _make_numeric_df(df[exists]).to_numpy()

    # Stratified split
    indices = np.arange(len(df))
    train_idx, test_idx = train_test_split(
        indices,
        test_size=TEST_SPLIT_RATIO,
        random_state=RANDOM_STATE,
        stratify=labels
    )
    X_train, X_test = X_all[train_idx], X_all[test_idx]
    y_train, y_test = labels[train_idx], labels[test_idx]

    print(f"✅ Train={len(y_train)}, Test={len(y_test)}")

    # Step 2. Optuna 튜닝 (진행률 표시)
    def objective(trial):
        params = {
            "n_estimators": trial.suggest_int("n_estimators", 200, 1200),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2, log=True),
            "num_leaves": trial.suggest_int("num_leaves", 31, 255),
            "max_depth": trial.suggest_int("max_depth", -1, 16),
            "min_child_samples": trial.suggest_int("min_child_samples", 10, 200),
            "subsample": trial.suggest_float("subsample", 0.6, 1.0),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
            "reg_alpha": trial.suggest_float("reg_alpha", 0.0, 5.0),
            "reg_lambda": trial.suggest_float("reg_lambda", 0.0, 5.0),
            "random_state": RANDOM_STATE,
            "n_jobs": -1,
            "objective": "binary",
            "verbose": -1,
            **GPU_PARAMS,   # ⚠️ GPU 빌드가 아닐 경우 무시되거나 에러날 수 있음
        }
        skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
        pr_aucs = []
        for tr, va in skf.split(X_train, y_train):
            clf = lgb.LGBMClassifier(**params)
            clf.fit(
                X_train[tr], y_train[tr],
                eval_set=[(X_train[va], y_train[va])],
                eval_metric="average_precision",
                callbacks=[lgb.early_stopping(100, verbose=False)]
            )
            prob = clf.predict_proba(X_train[va])[:, 1]
            pr_aucs.append(average_precision_score(y_train[va], prob))
        return float(np.mean(pr_aucs))

    # ✅ Optuna 진행률: notebook tqdm (한 줄에서 움직임)
    with tqdm(total=N_TRIALS,
              desc=f"Optuna Tuning [{platform}]",
              unit="trial", dynamic_ncols=True, leave=True) as pbar:
        def callback(study, trial):
            pbar.update(1)
        study = optuna.create_study(direction="maximize")
        study.optimize(objective, n_trials=N_TRIALS, callbacks=[callback])

    best_params = study.best_params
    print("🧪 Best Params:", best_params)

    # Step 3. 최종 학습
    clf = lgb.LGBMClassifier(
        **best_params,
        objective="binary",
        random_state=RANDOM_STATE,
        n_jobs=-1,
        verbose=-1,
        **GPU_PARAMS,  # ⚠️ GPU 빌드가 아닐 경우 제거 필요
    )
    clf.fit(X_train, y_train)

    test_prob = clf.predict_proba(X_test)[:, 1]
    best_th, _ = find_best_threshold(y_train, clf.predict_proba(X_train)[:, 1])
    test_pred = (test_prob >= best_th).astype(int)

    # Step 4. 결과 출력
    metrics = {
        "Accuracy": accuracy_score(y_test, test_pred),
        "PR_AUC": average_precision_score(y_test, test_prob),
        "ROC_AUC": roc_auc_score(y_test, test_prob),
        "F1_score": f1_score(y_test, test_pred),
        "Best_Threshold": best_th
    }
    print("=== Test Metrics ===")
    for k, v in metrics.items():
        print(f"{k}: {v:.4f}" if isinstance(v, float) else f"{k}: {v}")

    # Step 5. 저장
    save_dir = f"/content/drive/MyDrive/1014/result/{platform}"
    os.makedirs(save_dir, exist_ok=True)

    pd.DataFrame({
        "index": test_idx,
        "s1_pred_proba": test_prob,
        "y_true": y_test,
        "y_pred_at_best_th": test_pred
    }).to_csv(f"{save_dir}/s1_pred_proba.csv", index=False)

    with open(f"{save_dir}/results.json","w") as f:
        json.dump({
            **metrics,
            "features_used": exists,
            "missing_features_ignored": missing
        }, f, indent=2)

    clf.booster_.save_model(f"{save_dir}/lgbm_model.txt")
    print(f"📁 Results saved in {save_dir}")

# ---- 전체 실행 (플랫폼 루프도 notebook tqdm)
for platform, path in tqdm(FILE_PATHS.items(),
                           desc="전체 플랫폼 진행",
                           unit="platform", dynamic_ncols=True, leave=True):
    run_s1_pipeline(platform, path, S1_FEATURES[platform])

전체 플랫폼 진행:   0%|          | 0/4 [00:00<?, ?platform/s]


▶ Platform: Amazon
✅ Train=71941, Test=17986


Optuna Tuning [Amazon]:   0%|          | 0/50 [00:00<?, ?trial/s]

🧪 Best Params: {'n_estimators': 1050, 'learning_rate': 0.01782554470338834, 'num_leaves': 114, 'max_depth': 14, 'min_child_samples': 12, 'subsample': 0.7996526580303993, 'colsample_bytree': 0.6301286277541367, 'reg_alpha': 0.5316339072769136, 'reg_lambda': 2.0077049531609035}
=== Test Metrics ===
Accuracy: 0.9040
PR_AUC: 0.5086
ROC_AUC: 0.8541
F1_score: 0.4938
Best_Threshold: 0.2677
📁 Results saved in /content/drive/MyDrive/1014/result/Amazon

▶ Platform: Coursera
✅ Train=97108, Test=24278


Optuna Tuning [Coursera]:   0%|          | 0/50 [00:00<?, ?trial/s]

🧪 Best Params: {'n_estimators': 402, 'learning_rate': 0.02924135076800882, 'num_leaves': 212, 'max_depth': 0, 'min_child_samples': 32, 'subsample': 0.8438234750629929, 'colsample_bytree': 0.8439118926302716, 'reg_alpha': 0.07787299942933497, 'reg_lambda': 4.96513316589749}
=== Test Metrics ===
Accuracy: 0.9651
PR_AUC: 0.5910
ROC_AUC: 0.9385
F1_score: 0.5374
Best_Threshold: 0.3316
📁 Results saved in /content/drive/MyDrive/1014/result/Coursera

▶ Platform: Audible
✅ Train=74391, Test=18598


Optuna Tuning [Audible]:   0%|          | 0/50 [00:00<?, ?trial/s]

🧪 Best Params: {'n_estimators': 779, 'learning_rate': 0.011334318249744048, 'num_leaves': 185, 'max_depth': 15, 'min_child_samples': 19, 'subsample': 0.8464242144199472, 'colsample_bytree': 0.6334228579618779, 'reg_alpha': 0.27200228098369583, 'reg_lambda': 0.7640181183949035}
=== Test Metrics ===
Accuracy: 0.9373
PR_AUC: 0.5365
ROC_AUC: 0.8841
F1_score: 0.5059
Best_Threshold: 0.2855
📁 Results saved in /content/drive/MyDrive/1014/result/Audible

▶ Platform: Hotel
✅ Train=71604, Test=17901


Optuna Tuning [Hotel]:   0%|          | 0/50 [00:00<?, ?trial/s]

🧪 Best Params: {'n_estimators': 393, 'learning_rate': 0.014561351088691079, 'num_leaves': 101, 'max_depth': 16, 'min_child_samples': 10, 'subsample': 0.7079351327562202, 'colsample_bytree': 0.778423140597254, 'reg_alpha': 0.6309635351325671, 'reg_lambda': 1.4099632992239302}
=== Test Metrics ===
Accuracy: 0.8635
PR_AUC: 0.2821
ROC_AUC: 0.7295
F1_score: 0.2985
Best_Threshold: 0.1722
📁 Results saved in /content/drive/MyDrive/1014/result/Hotel
