In [13]:
import lightgbm as lgb

import pandas as pd
import numpy as np

from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score, roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split


import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
from sklearn.model_selection import StratifiedKFold, cross_val_score

In [3]:
from google.colab import drive

drive.mount('/content/gdrive')

# Load data form drive
df = pd.read_csv('/content/gdrive/MyDrive/2025-1 Pattern recognition/train_processed.csv', engine='python')
df.info()

Mounted at /content/gdrive
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22200 entries, 0 to 22199
Data columns (total 60 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   id                            22200 non-null  int64  
 1   n_tokens_title                22200 non-null  float64
 2   n_tokens_content              22200 non-null  float64
 3   n_unique_tokens               22200 non-null  float64
 4   n_non_stop_words              22200 non-null  float64
 5   n_non_stop_unique_tokens      22200 non-null  float64
 6   num_hrefs                     22200 non-null  float64
 7   num_self_hrefs                22200 non-null  float64
 8   num_imgs                      22200 non-null  float64
 9   num_videos                    22200 non-null  float64
 10  average_token_length          22200 non-null  float64
 11  num_keywords                  22200 non-null  float64
 12  kw_min_min                    222

In [4]:
# 피처/타깃 분리
target_col = 'y'
feature_cols = [c for c in df.columns if c != target_col]

X = df[feature_cols]
y = df[target_col]

# 피처셋 구성: id, shares, 원본 y 모두 제거
X = df.drop(columns=['id', 'shares', 'y'])

# 칼럼명 공백 → 언더스코어
X.columns = [c.strip().replace(' ', '_') for c in X.columns]

In [5]:
def drop_corr_features(X, threshold):
    # 1) 원본 상관행렬 계산
    corr = X.corr()
    # 2) 상삼각(주대각선 위)만 남기기
    upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
    # 3) 양(+) 또는 음(–) 상관관계가 threshold보다 클 경우 제거 대상
    to_drop = [
        col for col in upper.columns
        if (upper[col] >  threshold).any()  # 강한 양의 상관
        or (upper[col] < -threshold).any()  # 강한 음의 상관
    ]
    return to_drop

# 사용 예시
to_drop = drop_corr_features(X, threshold=0.8)
print(f"Dropping {len(to_drop)} high-corr features (>|0.8|):", to_drop)

# 제거 후 데이터
X_reduced = X.drop(columns=to_drop)

Dropping 5 high-corr features (>|0.8|): ['n_non_stop_unique_tokens', 'average_token_length', 'kw_min_avg', 'self_reference_max_shares', 'self_reference_avg_sharess']


In [6]:
# 차원(행 × 열) 확인
print("Shape:", X_reduced.shape)

# 남은 컬럼 리스트
print("Columns:", X_reduced.columns.tolist())

# 통계 요약
print("Describe:\n", X_reduced.describe())

Shape: (22200, 52)
Columns: ['n_tokens_title', 'n_tokens_content', 'n_unique_tokens', 'n_non_stop_words', 'num_hrefs', 'num_self_hrefs', 'num_imgs', 'num_videos', 'num_keywords', 'kw_min_min', 'kw_max_min', 'kw_avg_min', 'kw_min_max', 'kw_max_max', 'kw_avg_max', 'kw_max_avg', 'kw_avg_avg', 'self_reference_min_shares', 'LDA_00', 'LDA_01', 'LDA_02', 'LDA_03', 'LDA_04', 'global_subjectivity', 'global_sentiment_polarity', 'global_rate_positive_words', 'global_rate_negative_words', 'rate_positive_words', 'rate_negative_words', 'avg_positive_polarity', 'min_positive_polarity', 'max_positive_polarity', 'avg_negative_polarity', 'min_negative_polarity', 'max_negative_polarity', 'title_subjectivity', 'title_sentiment_polarity', 'abs_title_subjectivity', 'abs_title_sentiment_polarity', 'channel_Business', 'channel_Entertainment', 'channel_Lifestyle', 'channel_Social_Media', 'channel_Tech', 'channel_World', 'weekday_Monday', 'weekday_Tuesday', 'weekday_Wednesday', 'weekday_Thursday', 'weekday_Frid

In [11]:
# 교차검증으로 과적합 여부 확인
clf = lgb.LGBMClassifier(
    objective='binary',       # binary classification
    metric='binary_logloss',  # Binary log loss
    learning_rate=0.09,
    num_leaves=21,            # Max number of leaves that one tree can have
    min_child_samples=31,
    reg_alpha=6.2,
    reg_lambda=4.8,
    random_state=42
)

skf = StratifiedKFold(n_splits=8, shuffle=True, random_state=42)
auc_scores = cross_val_score(clf, X_reduced, y,
                             cv=skf,
                             scoring='roc_auc',
                             n_jobs=-1)
print("5-Fold AUC scores:", np.round(auc_scores,4))
print("Mean AUC:", np.round(auc_scores.mean(),4))

5-Fold AUC scores: [0.6929 0.7208 0.721  0.7115 0.7012 0.7196 0.725  0.7178]
Mean AUC: 0.7137


In [14]:
# 최종 Train/Val 분리 & 평가
X_tr, X_val, y_tr, y_val = train_test_split(
    X_reduced, y, test_size=0.2,
    stratify=y, random_state=42
)
clf.fit(
    X_tr, y_tr,
    eval_set=[(X_val, y_val)],
    eval_metric='binary_logloss',
    callbacks=[lgb.early_stopping(stopping_rounds=30),
               lgb.log_evaluation(period=20)]
)
y_pred = clf.predict(X_val)
y_prob = clf.predict_proba(X_val)[:,1]

[LightGBM] [Info] Number of positive: 8803, number of negative: 8957
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002595 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7323
[LightGBM] [Info] Number of data points in the train set: 17760, number of used features: 52
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.495664 -> initscore=-0.017343
[LightGBM] [Info] Start training from score -0.017343
Training until validation scores don't improve for 30 rounds
[20]	valid_0's binary_logloss: 0.629225
[40]	valid_0's binary_logloss: 0.616828
[60]	valid_0's binary_logloss: 0.613087
[80]	valid_0's binary_logloss: 0.611747
[100]	valid_0's binary_logloss: 0.610939
Did not meet early stopping. Best iteration is:
[100]	valid_0's binary_logloss: 0.610939


In [15]:
from sklearn.metrics import classification_report

print("Final Validation Accuracy:", accuracy_score(y_val, y_pred))
print("Final Validation AUC     :", roc_auc_score(y_val, y_prob))
print("\nClassification Report:\n", classification_report(y_val, y_pred))

Final Validation Accuracy: 0.6641891891891892
Final Validation AUC     : 0.7265230652598326

Classification Report:
               precision    recall  f1-score   support

           0       0.67      0.66      0.67      2239
           1       0.66      0.67      0.66      2201

    accuracy                           0.66      4440
   macro avg       0.66      0.66      0.66      4440
weighted avg       0.66      0.66      0.66      4440



In [16]:
pip install optuna

Collecting optuna
  Downloading optuna-4.3.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.15.2-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.3.0-py3-none-any.whl (386 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m386.6/386.6 kB[0m [31m29.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.15.2-py3-none-any.whl (231 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m231.9/231.9 kB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, alembic, optuna
Successfully installed alembic-1.15.2 colorlog-6.9.0 optuna-4.3.0


In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import optuna
from sklearn.model_selection import StratifiedKFold, cross_val_score

# 1) 데이터 로드 및 전처리
df = pd.read_csv('/content/gdrive/MyDrive/2025-1 Pattern recognition/train_processed.csv', engine='python')
y = df['y']
X = df.drop(columns=['id', 'shares', 'y'])
# 컬럼명 공백 → 언더스코어
X.columns = [c.strip().replace(' ', '_') for c in X.columns]
# 너무 강한 상관(>|0.8|) 피처 제거
def drop_corr_features(X, threshold=0.8):
    corr  = X.corr()
    upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
    return [col for col in upper.columns
            if (upper[col] >  threshold).any()
            or (upper[col] < -threshold).any()]

to_drop    = drop_corr_features(X, threshold=0.8)
X_reduced  = X.drop(columns=to_drop)

# 2) Optuna objective 함수 정의
def objective(trial):
    params = {
        'objective': 'binary',
        'metric': 'auc',
        'boosting_type': 'gbdt',
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.3, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 20, 150),
        'max_depth': trial.suggest_int('max_depth', 3, 16),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 10.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 10.0),
        'random_state': 42,
        'n_jobs': -1
    }
    model = lgb.LGBMClassifier(**params)
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scores = cross_val_score(model, X_reduced, y, cv=cv, scoring='roc_auc', n_jobs=-1)
    return scores.mean()

# 3) 스터디 생성 및 최적화 수행
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

# 4) 최적 결과 확인
print("Best AUC    :", study.best_value)
print("Best Params :", study.best_params)

# 5) 최적 파라미터로 모델 학습
best_params = study.best_params.copy()
best_params.update({
    'objective': 'binary',
    'metric': 'binary_logloss',
    'boosting_type': 'gbdt',
    'random_state': 42,
    'n_jobs': -1
})
final_model = lgb.LGBMClassifier(**best_params)
final_model.fit(X_reduced, y)

# 6) 피처 중요도 확인
importances = pd.Series(final_model.feature_importances_, index=X_reduced.columns)
print("\nTop 10 Feature Importances:\n", importances.sort_values(ascending=False).head(10))


[I 2025-04-29 12:09:00,122] A new study created in memory with name: no-name-b5f9c76e-b33a-4dce-b027-f70ed5af8ceb
[I 2025-04-29 12:09:17,022] Trial 0 finished with value: 0.6948337889332953 and parameters: {'learning_rate': 0.0013608100191502396, 'num_leaves': 110, 'max_depth': 11, 'min_child_samples': 37, 'subsample': 0.6346053019636584, 'colsample_bytree': 0.7453399743464246, 'reg_alpha': 2.610228959906916, 'reg_lambda': 3.882420388187721}. Best is trial 0 with value: 0.6948337889332953.
[I 2025-04-29 12:09:31,572] Trial 1 finished with value: 0.6995870436201848 and parameters: {'learning_rate': 0.004440252956993858, 'num_leaves': 112, 'max_depth': 12, 'min_child_samples': 22, 'subsample': 0.7270790511446023, 'colsample_bytree': 0.7003574986298999, 'reg_alpha': 4.637886474375465, 'reg_lambda': 4.408450271556889}. Best is trial 1 with value: 0.6995870436201848.
[I 2025-04-29 12:09:44,734] Trial 2 finished with value: 0.6918387990808481 and parameters: {'learning_rate': 0.0028071980979

Best AUC    : 0.7158646161236949
Best Params : {'learning_rate': 0.09356930234213212, 'num_leaves': 21, 'max_depth': 7, 'min_child_samples': 31, 'subsample': 0.7965835646973495, 'colsample_bytree': 0.5398552348770301, 'reg_alpha': 6.215104679468559, 'reg_lambda': 4.832881988525944}
[LightGBM] [Info] Number of positive: 11004, number of negative: 11196
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010359 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7345
[LightGBM] [Info] Number of data points in the train set: 22200, number of used features: 52
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.495676 -> initscore=-0.017298
[LightGBM] [Info] Start training from score -0.017298

Top 10 Feature Importances:
 self_reference_min_shares     107
kw_avg_avg                    103
kw_max_avg                     94
LDA_02                         83
n_unique_tokens                80
LDA_00                 