In [3]:
# 1. 라이브러리 임포트
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression

!pip install catboost
from catboost import CatBoostClassifier
from google.colab import drive

# 2. Mount Google Drive and load the preprocessed train/test CSV files
drive.mount('/content/gdrive')

# Both files are read with the same parser settings, so load them in one pass;
# the generator keeps the loop variable out of the notebook namespace.
train_df, test_df = (
    pd.read_csv(csv_path, engine='python')
    for csv_path in (
        '/content/gdrive/MyDrive/2025-1 Pattern recognition/train_processed.csv',
        '/content/gdrive/MyDrive/2025-1 Pattern recognition/test_processed.csv',
    )
)

# 3. Feature engineering helper
def add_features(X):
    """Return a copy of ``X`` extended with derived feature columns.

    Each derived column is only added when every source column it needs is
    present, so the function can be applied to frames with a partial schema.
    The input frame itself is never modified.

    Columns that may be added (in this order):
        log_<col>        log1p of the column clipped at 0, for
                         n_tokens_content, num_hrefs, num_self_hrefs
        tokens_x_hrefs   n_tokens_content * num_hrefs
        links_per_token  num_hrefs / (n_tokens_content + 1)
        subj_x_polarity  global_subjectivity * global_sentiment_polarity
        avglen_x_tokens  average_token_length * n_tokens_content
    """
    out = X.copy()

    def has_all(*names):
        # True when every requested column exists in the working frame.
        return all(name in out.columns for name in names)

    # Log transforms; negatives are clipped to 0 first so log1p stays defined.
    for base in ('n_tokens_content', 'num_hrefs', 'num_self_hrefs'):
        if not has_all(base):
            continue
        non_negative = out[base].clip(lower=0)
        out[f'log_{base}'] = np.log1p(non_negative)

    # Content length vs. link count: product and link density.
    if has_all('n_tokens_content', 'num_hrefs'):
        tokens = out['n_tokens_content']
        hrefs = out['num_hrefs']
        out['tokens_x_hrefs'] = tokens * hrefs
        # +1 in the denominator avoids dividing by zero for empty articles.
        out['links_per_token'] = hrefs / (tokens + 1)

    # Additional interaction terms.
    if has_all('global_subjectivity', 'global_sentiment_polarity'):
        out['subj_x_polarity'] = (
            out['global_subjectivity'] * out['global_sentiment_polarity']
        )

    if has_all('average_token_length', 'n_tokens_content'):
        out['avglen_x_tokens'] = (
            out['average_token_length'] * out['n_tokens_content']
        )

    return out

# 4. Split the training data and apply feature engineering
# The target is `y`; `id`, `y` and `shares` are excluded from the features.
# NOTE(review): `shares` is presumably the raw value the label is derived
# from (so keeping it would leak the target) — confirm against preprocessing.
y = train_df['y']
X = train_df.drop(columns=['id', 'y', 'shares'])

# Stratified 80/20 hold-out so both splits keep the class ratio of `y`.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Same feature engineering for the training and validation partitions.
X_train_fe, X_valid_fe = (
    add_features(part) for part in (X_train, X_valid)
)

# 5. Base learners (tuned hyperparameters)
# Hyperparameters are kept in plain dicts so each model's configuration can be
# inspected or logged independently of the estimator object.
xgb_params = {
    'n_estimators': 1000,
    'max_depth': 3,
    'learning_rate': 0.01,
    'subsample': 0.9,
    'colsample_bytree': 0.8,
    'gamma': 0.1,
    'min_child_weight': 2,
    'eval_metric': 'auc',
    'tree_method': 'hist',
    'device': 'cuda',  # XGBoost is the only base learner configured for GPU
    'random_state': 42,
}
xgb_model = XGBClassifier(**xgb_params)

# NOTE(review): LightGBM documents that bagging (`subsample`) only takes
# effect when `subsample_freq` is non-zero; it is not set here, so confirm
# whether row subsampling is actually intended for this model.
lgb_params = {
    'n_estimators': 1000,
    'learning_rate': 0.01,
    'max_depth': 3,
    'subsample': 0.9,
    'colsample_bytree': 0.8,
    'reg_alpha': 0.1,
    'reg_lambda': 0.1,
    'device': 'CPU',
    'random_state': 42,
}
lgb_model = LGBMClassifier(**lgb_params)

cat_params = {
    'iterations': 1000,
    'depth': 6,
    'learning_rate': 0.01,
    'l2_leaf_reg': 4,
    'eval_metric': 'AUC',
    'random_seed': 42,
    'verbose': 0,  # silence per-iteration training logs
    'task_type': 'CPU',
}
cat_model = CatBoostClassifier(**cat_params)

# 6. Stacking ensemble
# The three boosted models feed a logistic-regression meta-learner through
# 15-fold cross-validation. With passthrough=False the meta-learner is trained
# on the base learners' predictions only, not on the original features.
base_learners = [
    ('xgb', xgb_model),
    ('lgb', lgb_model),
    ('cat', cat_model),
]
meta_learner = LogisticRegression(max_iter=300)

stack_model = StackingClassifier(
    estimators=base_learners,
    final_estimator=meta_learner,
    cv=15,
    n_jobs=-1,
    passthrough=False,
)

# 7. Fit the stacking ensemble on the feature-engineered training split
stack_model.fit(X_train_fe, y_train)

# 8. Positive-class probabilities for the validation split
y_prob_valid = stack_model.predict_proba(X_valid_fe)[:, 1]

# 9. Threshold tuning (balanced-performance criterion)
# The selection score is the mean of accuracy, F1 and AUC. AUC is computed from
# the probabilities rather than the thresholded labels, so it is identical for
# every candidate threshold: compute it once instead of once per iteration.
auc = roc_auc_score(y_valid, y_prob_valid)

# Start from -inf so the first candidate always becomes the incumbent; this
# guarantees best_acc / best_f1 / best_auc are bound before the report below.
best_score = float('-inf')
best_thresh = 0.5

for thresh in np.arange(0.42, 0.48, 0.0005):
    preds = (y_prob_valid > thresh).astype(int)
    acc = accuracy_score(y_valid, preds)
    f1 = f1_score(y_valid, preds)
    mean_metric = (acc + f1 + auc) / 3
    # Strict '>' keeps the lowest threshold when several candidates tie.
    if mean_metric > best_score:
        best_score = mean_metric
        best_thresh = thresh
        best_acc, best_f1, best_auc = acc, f1, auc

# 10. Report
# NOTE: the threshold is selected and scored on the same validation split, so
# the accuracy / F1 printed here are optimistic estimates.
# The grid step is 0.0005, so four decimals are needed to show the value that
# was actually selected (two decimals would round it to a different threshold).
print(f"✅ Best Threshold: {best_thresh:.4f}")
print("Accuracy:", best_acc)
print("F1 Score:", best_f1)
print("AUC:", best_auc)
print("Mean Evaluation Metric (Accuracy + F1 + AUC) / 3:", best_score)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
✅ Best Threshold: 0.43
Accuracy: 0.6626126126126126
F1 Score: 0.6924024640657084
AUC: 0.7287275932678292
Mean Evaluation Metric (Accuracy + F1 + AUC) / 3: 0.69458088998205


In [4]:
# 11. Remember the exact feature columns (and their order) used for training
feature_cols = X_train_fe.columns.tolist()

# 12. Preprocess the test data
X_test_raw = test_df.drop(columns=['id'], errors='ignore')  # exclude id
X_test_fe = add_features(X_test_raw)                        # same feature engineering as training

# Build the frame that is actually fed to the model, aligned to the training
# layout. `reindex` on its own selects and orders the training columns, drops
# any extras, and creates missing ones filled with 0. Indexing with
# `[feature_cols]` first would raise KeyError on a missing column before the
# fill could ever apply.
missing_cols = [col for col in feature_cols if col not in X_test_fe.columns]
if missing_cols:
    # Surface the zero-fill instead of applying it silently.
    print(
        f"Warning: {len(missing_cols)} training feature(s) missing from the "
        f"test data were filled with 0: {missing_cols}"
    )
X_test_model = X_test_fe.reindex(columns=feature_cols, fill_value=0)

# 13. Predict: positive-class probability, then apply the tuned threshold
y_prob_test = stack_model.predict_proba(X_test_model)[:, 1]
y_pred_test = (y_prob_test > best_thresh).astype(int)

# 14. Keep test_df's existing columns as they are and append the predictions
test_df['y_predict'] = y_pred_test
test_df['y_prob'] = y_prob_test

# 15. Save to CSV and trigger a browser download from Colab
output_path = '/content/prediction.csv'
test_df.to_csv(output_path, index=False)

print(test_df[['id', 'y_predict', 'y_prob']].head())
from google.colab import files
files.download(output_path)


      id  y_predict    y_prob
0   4979          0  0.234247
1  15552          0  0.407629
2  29370          1  0.613724
3  37272          0  0.241613
4   6836          1  0.483548


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>