### ✔️ 전처리 해둔 데이터 사용

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the preprocessed train/test tables from the working directory.
train_df, test_df = (
    pd.read_csv(csv_name)
    for csv_name in ("train_processed.csv", "test_processed.csv")
)

# The target is the `y` column; every column except `id`, `y` and `shares`
# is kept as a feature.
y = train_df['y']
X = train_df.drop(['id', 'y', 'shares'], axis="columns")

# Hold out 20% of the rows for validation, stratified on the target, with a
# fixed seed so the split is the same on every run.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

### ✔️ XGBoost 모델 사용: ***65.5%***

In [20]:
import xgboost as xgb
from sklearn.metrics import f1_score, accuracy_score, classification_report

# Baseline XGBoost classifier with hand-picked hyperparameters.
baseline_params = dict(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    eval_metric='logloss',
)
xgb_model = xgb.XGBClassifier(**baseline_params)
xgb_model.fit(X_train, y_train)

# Predict on the held-out validation split and compute the metrics once.
y_pred_xgb = xgb_model.predict(X_valid)
xgb_accuracy = accuracy_score(y_valid, y_pred_xgb)
xgb_f1 = f1_score(y_valid, y_pred_xgb)

print("XGBoost Accuracy:", xgb_accuracy)
print("XGBoost F1 Score:", xgb_f1)
print("\n Classification Report:\n")
print(classification_report(y_valid, y_pred_xgb))


XGBoost Accuracy: 0.6547297297297298
XGBoost F1 Score: 0.6496

 Classification Report:

              precision    recall  f1-score   support

           0       0.66      0.66      0.66      2239
           1       0.65      0.65      0.65      2201

    accuracy                           0.65      4440
   macro avg       0.65      0.65      0.65      4440
weighted avg       0.65      0.65      0.65      4440



### ✔️ 튜닝 시도 - RandomizedSearchCV : ***66.5%***

In [17]:
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import make_scorer, f1_score

# Discrete candidate values for each XGBoost hyperparameter. The randomized
# search draws `n_iter` combinations from these lists.
param_dist = {
    'n_estimators': [100, 200, 300, 400],
    'max_depth': [3, 5, 7, 9],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'gamma': [0, 0.1, 0.2, 0.5],
    'min_child_weight': [1, 3, 5]
}

# Unfitted template estimator; the search fits copies of it per candidate.
xgb_model = XGBClassifier(
    eval_metric='logloss',
    random_state=42
)

# Candidates are ranked by F1 computed with f1_score's default settings.
scorer = make_scorer(f1_score)

rs = RandomizedSearchCV(
    xgb_model,
    param_distributions=param_dist,
    n_iter=30,
    scoring=scorer,
    cv=3,
    verbose=1,
    n_jobs=-1,
    # Without a seed the 30 sampled candidates (and therefore best_params_)
    # change on every run; fix it so Restart & Run All is reproducible.
    random_state=42
)

rs.fit(X_train, y_train)

# Best configuration found by the search, evaluated on the validation split.
best_model = rs.best_estimator_
y_pred_best = best_model.predict(X_valid)

Fitting 3 folds for each of 30 candidates, totalling 90 fits


In [19]:
# Validation metrics for the best estimator found by the randomized search.
tuned_accuracy = accuracy_score(y_valid, y_pred_best)
tuned_f1 = f1_score(y_valid, y_pred_best)

print("Tuned XGBoost Accuracy:", tuned_accuracy)
print("Tuned XGBoost F1 Score:", tuned_f1)
print("Best Params:", rs.best_params_)

Tuned XGBoost Accuracy: 0.6648648648648648
Tuned XGBoost F1 Score: 0.6635006784260515
Best Params: {'subsample': 0.6, 'n_estimators': 300, 'min_child_weight': 5, 'max_depth': 9, 'learning_rate': 0.01, 'gamma': 0.2, 'colsample_bytree': 1.0}


### ✔️ 더 정밀한 튜닝 - Optuna: ***67.5%***


In [7]:
!pip install optuna


Collecting optuna
  Downloading optuna-4.3.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.15.2-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.3.0-py3-none-any.whl (386 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m386.6/386.6 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.15.2-py3-none-any.whl (231 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m231.9/231.9 kB[0m [31m23.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, alembic, optuna
Successfully installed alembic-1.15.2 colorlog-6.9.0 optuna-4.3.0


In [10]:
# Report the best objective value and hyperparameters found by the study.
# NOTE(review): `study` is not defined in any cell visible in this notebook;
# presumably it is an Optuna study created and optimized in a cell that is not
# shown. Confirm that cell exists so this one survives Restart & Run All.
print("Best Accuracy:", study.best_value)
print("Best Params:", study.best_params)

Best Accuracy: 0.6747747747747748
Best Params: {'n_estimators': 427, 'max_depth': 3, 'learning_rate': 0.021978188969319974, 'subsample': 0.9760848989537714, 'colsample_bytree': 0.7301003992053027, 'gamma': 0.2749409209747699, 'min_child_weight': 1}


### ✔️ Final Model: 파라미터 최적 모델로 학습: ***67.5%***
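Optuna로 `n_trials=300` 탐색을 수행해 얻은 최적 파라미터로 모델을 다시 학습합니다. 단, 이 정확도는 튜닝에 사용한 것과 동일한 검증 세트에서 측정된 것으로 보이므로(Optuna의 Best Accuracy와 값이 일치) 실제 일반화 성능보다 낙관적일 수 있습니다.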

In [12]:
from xgboost import XGBClassifier

# Hyperparameters for the final model. The tuned values match the
# `study.best_params` output printed earlier in the notebook; `eval_metric`
# and `random_state` are fixed settings added on top.
best_params = dict(
    n_estimators=427,
    max_depth=3,
    learning_rate=0.021978188969319974,
    subsample=0.9760848989537714,
    colsample_bytree=0.7301003992053027,
    gamma=0.2749409209747699,
    min_child_weight=1,
    eval_metric='logloss',
    random_state=42,
)

# Train on the training split and predict the validation split.
final_model = XGBClassifier(**best_params)
final_model.fit(X_train, y_train)
y_pred_valid = final_model.predict(X_valid)

# Performance evaluation on the validation split.
final_accuracy = accuracy_score(y_valid, y_pred_valid)
final_f1 = f1_score(y_valid, y_pred_valid)

print("Final Tuned Model Accuracy:", final_accuracy)
print("Final Tuned Model F1 Score:", final_f1)
print("\nClassification Report:\n")
print(classification_report(y_valid, y_pred_valid))

Final Tuned Model Accuracy: 0.6747747747747748
Final Tuned Model F1 Score: 0.6727107887579329

Classification Report:

              precision    recall  f1-score   support

           0       0.68      0.68      0.68      2239
           1       0.67      0.67      0.67      2201

    accuracy                           0.67      4440
   macro avg       0.67      0.67      0.67      4440
weighted avg       0.67      0.67      0.67      4440



### ✔️ 튜닝된 XGBoost & Random Forest 스태킹 앙상블: ***67.1%***

In [16]:
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Base learners: the tuned XGBoost model and a 100-tree random forest.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
base_learners = [('xgb', final_model), ('rf', rf_model)]

# A logistic regression is used as the final estimator on top of the base
# learners.
stack = StackingClassifier(
    estimators=base_learners,
    final_estimator=LogisticRegression(),
)
stack.fit(X_train, y_train)

# Evaluate the stacked ensemble on the validation split.
y_pred_stack = stack.predict(X_valid)
stack_accuracy = accuracy_score(y_valid, y_pred_stack)
stack_f1 = f1_score(y_valid, y_pred_stack)

print("Stacking Accuracy:", stack_accuracy)
print("Stacking F1 Score:", stack_f1)
print(classification_report(y_valid, y_pred_stack))


Stacking Accuracy: 0.6709459459459459
Stacking F1 Score: 0.672053872053872
              precision    recall  f1-score   support

           0       0.68      0.66      0.67      2239
           1       0.66      0.68      0.67      2201

    accuracy                           0.67      4440
   macro avg       0.67      0.67      0.67      4440
weighted avg       0.67      0.67      0.67      4440

