In [1]:
# 1) 라이브러리 불러오기
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier

import pandas as pd
import numpy as np

from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score, roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split


import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
from sklearn.model_selection import StratifiedKFold, cross_val_score

In [2]:
from google.colab import drive

drive.mount('/content/gdrive')

# Load data form drive
data = pd.read_csv('/content/gdrive/MyDrive/2025-1 Pattern recognition/train_processed.csv', engine='python')
data.info()


# 피처/타깃 분리
target_col = 'y'
feature_cols = [c for c in data.columns if c != target_col]

X = data[feature_cols]
y = data[target_col]

# 피처셋 구성: id, shares, 원본 y 모두 제거
X = data.drop(columns=['id', 'shares', 'y'])

# 칼럼명 공백 → 언더스코어
X.columns = [c.strip().replace(' ', '_') for c in X.columns]

def drop_corr_features(X, threshold):
    # 1) 원본 상관행렬 계산
    corr = X.corr()
    # 2) 상삼각(주대각선 위)만 남기기
    upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
    # 3) 양(+) 또는 음(–) 상관관계가 threshold보다 클 경우 제거 대상
    to_drop = [
        col for col in upper.columns
        if (upper[col] >  threshold).any()  # 강한 양의 상관
        or (upper[col] < -threshold).any()  # 강한 음의 상관
    ]
    return to_drop

# 사용 예시
to_drop = drop_corr_features(X, threshold=0.8)
print(f"Dropping {len(to_drop)} high-corr features (>|0.8|):", to_drop)

# 제거 후 데이터
X_reduced = X.drop(columns=to_drop)





Mounted at /content/gdrive
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22200 entries, 0 to 22199
Data columns (total 60 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   id                            22200 non-null  int64  
 1   n_tokens_title                22200 non-null  float64
 2   n_tokens_content              22200 non-null  float64
 3   n_unique_tokens               22200 non-null  float64
 4   n_non_stop_words              22200 non-null  float64
 5   n_non_stop_unique_tokens      22200 non-null  float64
 6   num_hrefs                     22200 non-null  float64
 7   num_self_hrefs                22200 non-null  float64
 8   num_imgs                      22200 non-null  float64
 9   num_videos                    22200 non-null  float64
 10  average_token_length          22200 non-null  float64
 11  num_keywords                  22200 non-null  float64
 12  kw_min_min                    222

In [3]:
# 2) 데이터 준비 (예시: 유방암 진단 데이터)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# 3) 기본 모델 정의
lgbm = LGBMClassifier(
    n_estimators=100,       # 트리 개수
    learning_rate=0.05,     # 학습률
    num_leaves=31,          # 하나의 트리가 가질 수 있는 잎사귀 최대 개수
    max_depth=-1,           # 트리 최대 깊이
    min_child_samples=20,   # 리프 하나가 갖춰야 할 최소 데이터 수
    subsample=1.0,          # row 샘플링 비율 (bagging)
    colsample_bytree=1.0,   # feature 샘플링 비율
    reg_alpha=0.0,          # L1 규제
    reg_lambda=0.0,         # L2 규제
    random_state=42
)


rf = RandomForestClassifier(
    n_estimators=100,       # 나무 개수
    max_depth=None,         # 트리 최대 깊이
    max_features='sqrt',    # 분할 시 고려할 feature 비율
    min_samples_split=2,    # 내부 노드를 분할하기 위한 최소 샘플 수
    min_samples_leaf=1,     # 리프가 되기 위한 최소 샘플 수
    bootstrap=True,
    random_state=42
)


lr   = LogisticRegression(max_iter=1000, random_state=42)

In [4]:
# --- 방법 A: Voting Ensemble ---
voting_hard = VotingClassifier(
    estimators=[('lgbm', lgbm), ('rf', rf)],
    voting='hard'               # 'soft'로 바꾸면 클래스별 확률 평균
)

voting_soft = VotingClassifier(
    estimators=[('lgbm', lgbm), ('rf', rf)],
    voting='soft'
)

# 교차검증으로 성능 확인
for name, model in [('Voting-hard', voting_hard), ('Voting-soft', voting_soft)]:
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
    print(f"{name} CV Accuracy: {scores.mean():.4f} ± {scores.std():.4f}")

# 모델 학습 및 평가
voting_soft.fit(X_train, y_train)
print("Voting-soft Test Acc:", voting_soft.score(X_test, y_test))

[LightGBM] [Info] Number of positive: 7043, number of negative: 7165
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005271 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8578
[LightGBM] [Info] Number of data points in the train set: 14208, number of used features: 57
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.495707 -> initscore=-0.017174
[LightGBM] [Info] Start training from score -0.017174
[LightGBM] [Info] Number of positive: 7043, number of negative: 7165
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002710 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8583
[LightGBM] [Info] Number of data points in the train set: 14208, number of used features: 57
[LightGBM] [Info] [bin

In [None]:
# --- 방법 B: Stacking Ensemble ---
stack = StackingClassifier(
    estimators=[
        ('lgbm', lgbm),
        ('rf', rf),
    ],
    final_estimator=lr,      # 1차 모델 예측을 입력받아 학습할 메타 모델
    cv=5,                    # 스태킹 시 내부 교차검증 설정
    passthrough=True         # True면 원본 특성도 메타 모델에 같이 투입
)

# 스태킹 교차검증
stack_scores = cross_val_score(stack, X_train, y_train, cv=5, scoring='accuracy')
print(f"Stacking CV Accuracy: {stack_scores.mean():.4f} ± {stack_scores.std():.4f}")

# 최종 학습 및 테스트
stack.fit(X_train, y_train)
print("Stacking Test Acc:", stack.score(X_test, y_test))


[LightGBM] [Info] Number of positive: 7043, number of negative: 7165
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003132 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8578
[LightGBM] [Info] Number of data points in the train set: 14208, number of used features: 57
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.495707 -> initscore=-0.017174
[LightGBM] [Info] Start training from score -0.017174
[LightGBM] [Info] Number of positive: 5634, number of negative: 5732
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002138 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8549
[LightGBM] [Info] Number of data points in the train set: 11366, number of used features: 57
[LightGBM] [Info] [bin

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[LightGBM] [Info] Number of positive: 7043, number of negative: 7165
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002654 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8583
[LightGBM] [Info] Number of data points in the train set: 14208, number of used features: 57
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.495707 -> initscore=-0.017174
[LightGBM] [Info] Start training from score -0.017174
[LightGBM] [Info] Number of positive: 5634, number of negative: 5732
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007919 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8567
[LightGBM] [Info] Number of data points in the train set: 11366, number of used features: 57
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.495689 -> initscore=-0.017245
[Light

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[LightGBM] [Info] Number of positive: 7042, number of negative: 7166
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002729 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8576
[LightGBM] [Info] Number of data points in the train set: 14208, number of used features: 57
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.495636 -> initscore=-0.017455
[LightGBM] [Info] Start training from score -0.017455
[LightGBM] [Info] Number of positive: 5634, number of negative: 5732
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002431 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8546
[LightGBM] [Info] Number of data points in the train set: 11366, number of used features: 57
[LightGBM] [Info] [bin

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[LightGBM] [Info] Number of positive: 7042, number of negative: 7166
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002626 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8578
[LightGBM] [Info] Number of data points in the train set: 14208, number of used features: 57
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.495636 -> initscore=-0.017455
[LightGBM] [Info] Start training from score -0.017455
[LightGBM] [Info] Number of positive: 5634, number of negative: 5732
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002587 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8549
[LightGBM] [Info] Number of data points in the train set: 11366, number of used features: 57
[LightGBM] [Info] [bin

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[LightGBM] [Info] Number of positive: 7042, number of negative: 7166
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009181 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8573
[LightGBM] [Info] Number of data points in the train set: 14208, number of used features: 57
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.495636 -> initscore=-0.017455
[LightGBM] [Info] Start training from score -0.017455
[LightGBM] [Info] Number of positive: 5634, number of negative: 5732
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002123 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8550
[LightGBM] [Info] Number of data points in the train set: 11366, number of used features: 57
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.495689 -> initscore=-0.017245
[Light

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Stacking CV Accuracy: 0.6093 ± 0.0057
[LightGBM] [Info] Number of positive: 8803, number of negative: 8957
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.013265 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8595
[LightGBM] [Info] Number of data points in the train set: 17760, number of used features: 57
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.495664 -> initscore=-0.017343
[LightGBM] [Info] Start training from score -0.017343
[LightGBM] [Info] Number of positive: 7043, number of negative: 7165
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002863 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8578
[LightGBM] [Info] Number of data points in the train set: 14208, number of used features: 57
[LightGBM] [Info] [binary:BoostFromScore]: pavg=