In [1]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import optuna


In [2]:

# Load the dataset and discard the 'url' column (an unneeded string column).
df = pd.read_csv('../data/combined_batches_cleaned_26k_3.csv').drop(columns=['url'])

# Cast every boolean column to int, then replace missing values with 0.
bool_cols = df.select_dtypes('bool').columns
df = df.astype(dict.fromkeys(bool_cols, int)).fillna(0)

# Separate the label column ('url_type') from the feature columns.
y = df['url_type']
X = df.drop(columns=['url_type'])

# Hold out 20% of the rows as a test set; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [3]:

def objective(trial):
    """Optuna objective: mean 3-fold CV accuracy of a gradient-boosting model.

    Samples one hyperparameter configuration from ``trial``, builds a
    ``GradientBoostingClassifier`` with it (``random_state=42``) and scores it
    on the notebook-level ``X_train`` / ``y_train`` with ``cross_val_score``.

    Args:
        trial: The ``optuna`` trial used to sample hyperparameters.

    Returns:
        The mean accuracy over the 3 cross-validation folds.
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 20),
    }
    model = GradientBoostingClassifier(**params, random_state=42)
    # The folds are independent, so fit them in parallel. Each fold still uses
    # the same seeded estimator, so the scores are unchanged; only the
    # wall-clock time per trial drops.
    fold_scores = cross_val_score(
        model, X_train, y_train, cv=3, scoring='accuracy', n_jobs=-1
    )
    return fold_scores.mean()

# Seed the sampler so the sequence of suggested hyperparameters (and therefore
# the selected best_params) is the same on every Restart & Run All. TPESampler
# is Optuna's default sampler; the only change is the fixed seed.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)

print("Best parameters:", study.best_params)


[I 2025-06-09 11:57:33,164] A new study created in memory with name: no-name-8b3fb760-5a6f-4647-b19a-0ca1fab8cd27
[I 2025-06-09 12:12:51,327] Trial 0 finished with value: 0.8827083333333334 and parameters: {'n_estimators': 388, 'learning_rate': 0.23507347530057654, 'max_depth': 9, 'subsample': 0.802795164584618, 'min_samples_split': 16, 'min_samples_leaf': 14}. Best is trial 0 with value: 0.8827083333333334.
[I 2025-06-09 12:21:07,115] Trial 1 finished with value: 0.8691826923076924 and parameters: {'n_estimators': 440, 'learning_rate': 0.15870087674438801, 'max_depth': 4, 'subsample': 0.856275629941169, 'min_samples_split': 20, 'min_samples_leaf': 2}. Best is trial 0 with value: 0.8827083333333334.
[I 2025-06-09 12:26:22,429] Trial 2 finished with value: 0.8740224358974359 and parameters: {'n_estimators': 170, 'learning_rate': 0.18029766706856493, 'max_depth': 6, 'subsample': 0.9593993313892302, 'min_samples_split': 17, 'min_samples_leaf': 15}. Best is trial 0 with value: 0.8827083333

KeyboardInterrupt: 

In [None]:

# Refit on the full training split with the best hyperparameters found by the
# study, then predict the held-out test rows.
best_params = study.best_params
best_model = GradientBoostingClassifier(random_state=42, **best_params).fit(
    X_train, y_train
)
y_pred = best_model.predict(X_test)

In [None]:

# Evaluation report. The heading and the report are printed separately:
# passing both to one print() call inserts a separator space in front of the
# report's first line, which shifts the header row out of alignment with the
# metric rows below it.
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Confusion matrix. Fix the label order explicitly (sorted union of the labels
# present in y_test and y_pred) so that the matrix rows/columns and the heatmap
# tick labels are guaranteed to line up, and the axes show the class labels
# instead of positional indices.
class_labels = np.unique(np.concatenate([np.asarray(y_test), np.asarray(y_pred)]))
conf_matrix = confusion_matrix(y_test, y_pred, labels=class_labels)
plt.figure(figsize=(8,6))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()


In [None]:

# Pair each feature column with the importance reported by the fitted model
# and order the table from most to least important.
features = X.columns
importances = best_model.feature_importances_

importance_df = pd.DataFrame(
    {'Feature': features, 'Importance': importances}
).sort_values(by='Importance', ascending=False)

# Horizontal bar chart of the importances, one bar per feature.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=importance_df, x='Importance', y='Feature', ax=ax)
ax.set_title('Feature Importances')
fig.tight_layout()
plt.show()
