### 라이브러리

In [4]:
pip install optuna

Collecting optuna
  Downloading optuna-4.3.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.15.2-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.3.0-py3-none-any.whl (386 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m386.6/386.6 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.15.2-py3-none-any.whl (231 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m231.9/231.9 kB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, alembic, optuna
Successfully installed alembic-1.15.2 colorlog-6.9.0 optuna-4.3.0


In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
import lightgbm as lgb

In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_predict

In [5]:
import optuna
from sklearn.model_selection import cross_val_score

### 데이터 가져오기 + feature scaling

In [8]:
#데이터 불러오기
df = pd.read_csv("/content/drive/MyDrive/25-1 패턴인식 프로젝트/train_processed.csv")

# feature scaling (logistic regression)
scaling_cols = [
    "n_tokens_title", "n_tokens_content",
    "num_hrefs", "num_self_hrefs", "num_imgs", "num_videos",
    "average_token_length", "num_keywords",
    "kw_min_min", "kw_max_min", "kw_avg_min",
    "kw_min_max", "kw_max_max", "kw_avg_max",
    "kw_min_avg", "kw_max_avg", "kw_avg_avg",
    "self_reference_min_shares", "self_reference_max_shares", "self_reference_avg_sharess"
]

#feature, target 분리
X = df.drop(['id', 'shares','y'], axis=1)
y = df['y']

#train-validation split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

#Scaling
scaler = StandardScaler()
X_train[scaling_cols] = scaler.fit_transform(X_train[scaling_cols])
X_val[scaling_cols] = scaler.transform(X_val[scaling_cols])

### feature engineering X, tuned xgb + rf + logisticregression  / logisticregression -> 67.1

In [9]:
#xgb, lgbm 모델 최적 파라미터
best_xgb_params = {
    'n_estimators': 427,
    'max_depth': 3,
    'learning_rate': 0.021978188969319974,
    'subsample': 0.9760848989537714,
    'colsample_bytree': 0.7301003992053027,
    'gamma': 0.2749409209747699,
    'min_child_weight': 1,
    'eval_metric': 'logloss',
    'random_state': 42
}

In [10]:
# Base models
base_learners = [
    ('xgb', XGBClassifier(**best_xgb_params)),
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('lr', LogisticRegression(max_iter=1000))
]

# Meta model
meta_model = LogisticRegression()

# Stacking
stacking_clf = StackingClassifier(
    estimators=base_learners,
    final_estimator=meta_model,
    cv=5,  # 내부에서 K-fold cross validation
    n_jobs=-1
)

# 학습 및 평가
stacking_clf.fit(X_train, y_train)
y_pred_val = stacking_clf.predict(X_val)

print("Stacking Accuracy:", accuracy_score(y_val, y_pred_val))

Stacking Accuracy: 0.6713963963963964


### feature engineering X, tuned xgb + tuned rf(grid) + tuned logisticregression(grid)  / logisticregression -> 67.0

In [11]:
# randomforest tuning
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'max_features': ['sqrt', 'log2']
}

rf = RandomForestClassifier(random_state=42)
grid = GridSearchCV(rf, param_grid, cv=3, scoring='accuracy', n_jobs=-1, verbose=1)
grid.fit(X_train, y_train)

print("Best Params:", grid.best_params_)
print("Best Score:", grid.best_score_)


Fitting 3 folds for each of 72 candidates, totalling 216 fits
Best Params: {'max_depth': 15, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 300}
Best Score: 0.6544481981981982


In [12]:
# logistic regression tuning
param_grid = {
    'C': [0.01, 0.1, 1, 10],
    'penalty': ['l2'],
    'solver': ['liblinear', 'lbfgs'],
    'max_iter': [2000]
}

lr = LogisticRegression()
grid_lr = GridSearchCV(lr, param_grid, scoring='accuracy', cv=3, n_jobs=-1, verbose=1)
grid_lr.fit(X_train, y_train)

print("Best Params:", grid_lr.best_params_)
print("Best Score:", grid_lr.best_score_)

Fitting 3 folds for each of 8 candidates, totalling 24 fits
Best Params: {'C': 1, 'max_iter': 2000, 'penalty': 'l2', 'solver': 'lbfgs'}
Best Score: 0.640427927927928


In [14]:
# Base models
base_learners = [
    ('xgb', XGBClassifier(**best_xgb_params)),
    ('rf', RandomForestClassifier(**grid.best_params_)),
    ('lr', LogisticRegression(**grid_lr.best_params_))
]

# Meta model
meta_model = LogisticRegression()

# Stacking
stacking_clf = StackingClassifier(
    estimators=base_learners,
    final_estimator=meta_model,
    cv=5,  # 내부에서 K-fold cross validation
    n_jobs=-1
)

# 학습 및 평가
stacking_clf.fit(X_train, y_train)
y_pred_val = stacking_clf.predict(X_val)

print("Stacking Accuracy:", accuracy_score(y_val, y_pred_val))

Stacking Accuracy: 0.6709459459459459


### feature engineering X, tuned xgb + tuned rf(grid) + tuned logistic regression(grid) / tuned logisticregression (grid) -> 66.9

- meta model 튜닝

In [15]:
# 베이스 모델 정의
estimators = [
    ('xgb', XGBClassifier(**best_xgb_params)),
    ('rf', RandomForestClassifier(**grid.best_params_)),
    ('lr', LogisticRegression(**grid_lr.best_params_))
]


# 각 base 모델의 예측 결과를 추출 (oof 방식으로)
from sklearn.model_selection import KFold
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# base 모델의 예측값 쌓기
def get_meta_features(estimators, X, y):
    meta_X = np.zeros((X.shape[0], len(estimators)))
    for i, (name, model) in enumerate(estimators):
        meta_X[:, i] = cross_val_predict(model, X, y, cv=kf, method='predict_proba')[:, 1]
    return meta_X

meta_X = get_meta_features(estimators, X_train, y_train)

# 메타 모델 튜닝
param_grid = {
    'C': [0.01, 0.1, 1, 10],
    'solver': ['liblinear', 'lbfgs'],
    'max_iter': [500]
}

meta_model = LogisticRegression()
meta_grid = GridSearchCV(meta_model, param_grid, cv=5, scoring='accuracy', verbose=1)
meta_grid.fit(meta_X, y_train)

print("Best meta model params:", meta_grid.best_params_)
print("Best meta model Score:", meta_grid.best_score_)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
Best meta model params: {'C': 0.01, 'max_iter': 500, 'solver': 'liblinear'}
Best meta model Score: 0.659572072072072


In [16]:
# Base models
base_learners = [
    ('xgb', XGBClassifier(**best_xgb_params)),
    ('rf', RandomForestClassifier(**grid.best_params_)),
    ('lr', LogisticRegression(**grid_lr.best_params_))
]

# Meta model
meta_model = LogisticRegression(**meta_grid.best_params_)

# Stacking
stacking_clf = StackingClassifier(
    estimators=base_learners,
    final_estimator=meta_model,
    cv=5,  # 내부에서 K-fold cross validation
    n_jobs=-1
)

# 학습 및 평가
stacking_clf.fit(X_train, y_train)
y_pred_val = stacking_clf.predict(X_val)

print("Stacking Accuracy:", accuracy_score(y_val, y_pred_val))

Stacking Accuracy: 0.6698198198198199


### feature engineering X, tuned xgb + tuned rf(optuna) + tuned logistic regression(optuna) / logisticregression -> 67.2

In [17]:
# randomforest tuning
def rf_objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 300),
        'max_depth': trial.suggest_int('max_depth', 5, 20),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 5),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2']),
        'bootstrap': trial.suggest_categorical('bootstrap', [True, False]),
    }

    model = RandomForestClassifier(random_state=42, **params)
    score = cross_val_score(model, X_train, y_train, cv=3, scoring='accuracy', n_jobs=-1)
    return score.mean()

study_rf = optuna.create_study(direction='maximize')
study_rf.optimize(rf_objective, n_trials=100)

print("Best Score:", study_rf.best_value)
print("Best Params:", study_rf.best_params)

[I 2025-05-12 12:29:16,165] A new study created in memory with name: no-name-1b29e8a6-cf0e-4e59-a468-c759c38771f0
[I 2025-05-12 12:29:42,978] Trial 0 finished with value: 0.6454954954954955 and parameters: {'n_estimators': 296, 'max_depth': 7, 'min_samples_split': 6, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'bootstrap': True}. Best is trial 0 with value: 0.6454954954954955.
[I 2025-05-12 12:30:20,177] Trial 1 finished with value: 0.6527027027027027 and parameters: {'n_estimators': 274, 'max_depth': 12, 'min_samples_split': 10, 'min_samples_leaf': 5, 'max_features': 'log2', 'bootstrap': False}. Best is trial 1 with value: 0.6527027027027027.
[I 2025-05-12 12:31:13,691] Trial 2 finished with value: 0.6524774774774774 and parameters: {'n_estimators': 225, 'max_depth': 19, 'min_samples_split': 4, 'min_samples_leaf': 5, 'max_features': 'sqrt', 'bootstrap': False}. Best is trial 1 with value: 0.6527027027027027.
[I 2025-05-12 12:31:45,293] Trial 3 finished with value: 0.653997747747747

Best Score: 0.6559684684684685
Best Params: {'n_estimators': 258, 'max_depth': 18, 'min_samples_split': 4, 'min_samples_leaf': 1, 'max_features': 'log2', 'bootstrap': False}


In [18]:
# logistic regression tuning
def lr_objective(trial):
    solver = trial.suggest_categorical('solver', ['liblinear', 'lbfgs'])
    penalty = 'l2' if solver != 'liblinear' else trial.suggest_categorical('penalty', ['l1', 'l2'])

    params = {
        'C': trial.suggest_float('C', 1e-3, 100.0, log=True),
        'solver': solver,
        'penalty': penalty,
        'max_iter': 1000
    }

    model = LogisticRegression(**params)
    score = cross_val_score(model, X_train, y_train, cv=3, scoring='accuracy', n_jobs=-1)
    return score.mean()

study_lr = optuna.create_study(direction='maximize')
study_lr.optimize(lr_objective, n_trials=300)

print("Best Params:", study_lr.best_params)
print("Best Score:", study_lr.best_value)

[I 2025-05-12 13:32:02,571] A new study created in memory with name: no-name-f9a4086b-f076-4248-b46a-21c9fd518707
[I 2025-05-12 13:32:03,124] Trial 0 finished with value: 0.6400337837837838 and parameters: {'solver': 'lbfgs', 'C': 54.34323892674367}. Best is trial 0 with value: 0.6400337837837838.
[I 2025-05-12 13:32:05,195] Trial 1 finished with value: 0.6395833333333333 and parameters: {'solver': 'liblinear', 'penalty': 'l2', 'C': 18.204261868935664}. Best is trial 0 with value: 0.6400337837837838.
[I 2025-05-12 13:32:05,699] Trial 2 finished with value: 0.6402027027027026 and parameters: {'solver': 'lbfgs', 'C': 6.442652874264538}. Best is trial 2 with value: 0.6402027027027026.
[I 2025-05-12 13:32:06,029] Trial 3 finished with value: 0.6367680180180181 and parameters: {'solver': 'lbfgs', 'C': 0.004514257593669814}. Best is trial 2 with value: 0.6402027027027026.
[I 2025-05-12 13:32:06,448] Trial 4 finished with value: 0.6318693693693693 and parameters: {'solver': 'liblinear', 'pena

Best Params: {'solver': 'lbfgs', 'C': 1.0845761956897975}
Best Score: 0.6407657657657658


In [19]:
# Base models
base_learners = [
    ('xgb', XGBClassifier(**best_xgb_params)),
    ('rf', RandomForestClassifier(**study_rf.best_params)),
    ('lr', LogisticRegression(**study_lr.best_params))
]

# Meta model
meta_model = LogisticRegression()

# Stacking
stacking_clf = StackingClassifier(
    estimators=base_learners,
    final_estimator=meta_model,
    cv=5,  # 내부에서 K-fold cross validation
    n_jobs=-1
)

# 학습 및 평가
stacking_clf.fit(X_train, y_train)
y_pred_val = stacking_clf.predict(X_val)

print("Stacking Accuracy:", accuracy_score(y_val, y_pred_val))

Stacking Accuracy: 0.6722972972972973


### 상위 20개 변수 추출 // tuned xgb + rf + logistic regression / logisticregression -> 65.9

In [20]:
# 상위 20개 추출
# XGBoost로 학습
xgb = XGBClassifier(eval_metric='logloss')
xgb.fit(X_train, y_train)

# 상위 중요 feature 20개 추출
importances = xgb.feature_importances_
indices = np.argsort(importances)[::-1]
top_features = X.columns[indices[:20]]

# 선택된 feature로 학습 데이터 구성
X_train_selected = X_train[top_features]
X_val_selected = X_val[top_features]

# 결과 출력
print("선택된 상위 20개 feature:")
print(top_features.tolist())

선택된 상위 20개 feature:
['channel_Entertainment', 'channel_Tech', 'weekday_Saturday', 'channel_Social Media', 'weekday_Sunday', 'kw_max_max', 'kw_avg_avg', 'self_reference_avg_sharess', 'kw_max_avg', 'kw_min_avg', 'channel_World', 'self_reference_min_shares', 'weekday_Tuesday', 'min_positive_polarity', 'n_non_stop_unique_tokens', 'kw_min_min', 'LDA_02', 'n_unique_tokens', 'LDA_00', 'channel_Lifestyle']


In [25]:
# Base models
base_learners = [
    ('xgb', XGBClassifier(**best_xgb_params)),
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('lr', LogisticRegression(max_iter=1000))
]

# Meta model
meta_model = LogisticRegression()

# Stacking
stacking_clf = StackingClassifier(
    estimators=base_learners,
    final_estimator=meta_model,
    cv=5,  # 내부에서 K-fold cross validation
    n_jobs=-1
)

# 학습 및 평가
stacking_clf.fit(X_train_selected, y_train)
y_pred_val = stacking_clf.predict(X_val_selected)

print("Stacking Accuracy:", accuracy_score(y_val, y_pred_val))

Stacking Accuracy: 0.6599099099099099


### 파생 변수 추가 // tuned xgb + rf + logistic regression / logisticregression -> 66.5

In [26]:
# 파생 변수 생성 함수
def add_engineered_features(df):
    df = df.copy()
    df["tokens_title_content_ratio"] = df["n_tokens_title"] / (df["n_tokens_content"] + 1e-5)
    df["token_length_interaction"] = df["n_tokens_title"] * df["average_token_length"]
    df["positive_negative_ratio"] = df["global_rate_positive_words"] / (df["global_rate_negative_words"] + 1e-5)
    df["title_polarity_strength"] = np.abs(df["title_sentiment_polarity"])
    df["extreme_sentiment"] = df["max_positive_polarity"] - df["min_negative_polarity"]
    lda_cols = ["LDA_00", "LDA_01", "LDA_02", "LDA_03", "LDA_04"]
    df["lda_spread"] = df[lda_cols].max(axis=1) - df[lda_cols].min(axis=1)
    df["unique_stop_ratio"] = df["n_non_stop_unique_tokens"] / (df["n_non_stop_words"] + 1e-5)
    df["unique_token_ratio"] = df["n_unique_tokens"] / (df["n_tokens_content"] + 1e-5)
    return df

# 파생 변수 추가
df = add_engineered_features(df)

# 피처 및 타겟 분리
X = df.drop(columns=["id", "shares", "y"])  # 타겟 컬럼은 y
y = df["y"]

# 학습/검증 분할
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [27]:
# Base models
base_learners = [
    ('xgb', XGBClassifier(**best_xgb_params)),
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('lr', LogisticRegression(max_iter=1000))
]

# Meta model
meta_model = LogisticRegression()

# Stacking
stacking_clf = StackingClassifier(
    estimators=base_learners,
    final_estimator=meta_model,
    cv=5,  # 내부에서 K-fold cross validation
    n_jobs=-1
)

# 학습 및 평가
stacking_clf.fit(X_train, y_train)
y_pred_val = stacking_clf.predict(X_val)

print("Stacking Accuracy:", accuracy_score(y_val, y_pred_val))

Stacking Accuracy: 0.6657657657657657


### feature engineering
- 파생 변수 추가 및 중요 변수 선택

In [None]:
# 파생 변수 생성 함수
def add_engineered_features(df):
    df = df.copy()
    df["tokens_title_content_ratio"] = df["n_tokens_title"] / (df["n_tokens_content"] + 1e-5)
    df["token_length_interaction"] = df["n_tokens_title"] * df["average_token_length"]
    df["positive_negative_ratio"] = df["global_rate_positive_words"] / (df["global_rate_negative_words"] + 1e-5)
    df["title_polarity_strength"] = np.abs(df["title_sentiment_polarity"])
    df["extreme_sentiment"] = df["max_positive_polarity"] - df["min_negative_polarity"]
    lda_cols = ["LDA_00", "LDA_01", "LDA_02", "LDA_03", "LDA_04"]
    df["lda_spread"] = df[lda_cols].max(axis=1) - df[lda_cols].min(axis=1)
    df["unique_stop_ratio"] = df["n_non_stop_unique_tokens"] / (df["n_non_stop_words"] + 1e-5)
    df["unique_token_ratio"] = df["n_unique_tokens"] / (df["n_tokens_content"] + 1e-5)
    return df

# 파생 변수 추가
df = add_engineered_features(df)

# 피처 및 타겟 분리
X = df.drop(columns=["id", "shares", "y"])  # 타겟 컬럼은 y
y = df["y"]

# 학습/검증 분할
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# XGBoost로 학습
xgb = XGBClassifier(eval_metric='logloss')
xgb.fit(X_train, y_train)

# 상위 중요 feature 20개 추출
importances = xgb.feature_importances_
indices = np.argsort(importances)[::-1]
top_features = X.columns[indices[:20]]

# 선택된 feature로 학습 데이터 구성
X_train_selected = X_train[top_features]
X_val_selected = X_val[top_features]

# 결과 출력
print("선택된 상위 20개 feature:")
print(top_features.tolist())

선택된 상위 20개 feature:
['channel_Entertainment', 'channel_Tech', 'weekday_Saturday', 'channel_Social Media', 'channel_World', 'kw_avg_avg', 'kw_max_max', 'weekday_Sunday', 'self_reference_min_shares', 'self_reference_avg_sharess', 'kw_min_min', 'kw_min_avg', 'kw_max_avg', 'weekday_Thursday', 'num_imgs', 'LDA_02', 'LDA_00', 'channel_Lifestyle', 'weekday_Tuesday', 'channel_Business']


- 선택된 변수 기준으로 xgb 튜닝 (GridSearchCV)

In [None]:
# GridSearchCV 설정
param_grid = {
    'max_depth': [3, 4],
    'learning_rate': [0.05, 0.1],
    'n_estimators': [100, 200],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

xgb_tune = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
grid = GridSearchCV(xgb_tune, param_grid, scoring='accuracy', cv=3, verbose=1, n_jobs=-1)
grid.fit(X_train_selected, y_train)

print("Best Params:", grid.best_params_)
print("Best CV Score:", grid.best_score_)

Fitting 3 folds for each of 32 candidates, totalling 96 fits


Parameters: { "use_label_encoder" } are not used.



Best Params: {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200, 'subsample': 0.8}
Best CV Score: 0.6537162162162161


In [None]:
best_model = grid.best_estimator_
y_pred_best = best_model.predict(X_val_selected)
print("Tuned XGBoost Accuracy:", accuracy_score(y_val, y_pred_best))

Tuned XGBoost Accuracy: 0.6497747747747747


* 선택된 변수 기준으로 xgb 튜닝 (RandomizedSearchCV)

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import make_scorer, f1_score

param_dist = {
    'n_estimators': [100, 200, 300, 400],
    'max_depth': [3, 5, 7, 9],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'gamma': [0, 0.1, 0.2, 0.5],
    'min_child_weight': [1, 3, 5]
}

xgb_model = XGBClassifier(
    eval_metric='logloss',
    random_state= 42
)

scorer = make_scorer(f1_score)

rs = RandomizedSearchCV(
    xgb_model,
    param_distributions=param_dist,
    n_iter=30,
    scoring=scorer,
    cv=3,
    verbose=1,
    n_jobs=-1
)

rs.fit(X_train_selected, y_train)

best_model = rs.best_estimator_
y_pred_best = best_model.predict(X_val_selected)

Fitting 3 folds for each of 30 candidates, totalling 90 fits


In [None]:
print("Tuned XGBoost Accuracy:", accuracy_score(y_val, y_pred_best))
print("Tuned XGBoost F1 Score:", f1_score(y_val, y_pred_best))
print("Best Params:", rs.best_params_)

Tuned XGBoost Accuracy: 0.6475225225225225
Tuned XGBoost F1 Score: 0.649809800850302
Best Params: {'subsample': 0.6, 'n_estimators': 100, 'min_child_weight': 1, 'max_depth': 7, 'learning_rate': 0.05, 'gamma': 0.5, 'colsample_bytree': 0.6}
