# LightGBM

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier  
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

# 데이터 로드 
final_merged_data = pd.read_csv('./data/final_merge_data.csv')


sum_clicked_features = ['highest_education', 'imd_band', 'log_sum_click', 'log_studied_credits', 'scaled_score']
mean_clicked_features = ['highest_education', 'imd_band', 'log_mean_click', 'log_studied_credits', 'scaled_score']

sum_x = final_merged_data[sum_clicked_features]
mean_x = final_merged_data[mean_clicked_features]
y = final_merged_data['final_result']

smote = SMOTE(sampling_strategy='auto', random_state=42)

sum_x_resampled, y_resampled = smote.fit_resample(sum_x, y)
mean_x_resampled, y_resampled = smote.fit_resample(mean_x, y)

sum_x_train, sum_x_test, y_train, y_test = train_test_split(sum_x_resampled, y_resampled, test_size=0.1, random_state=42, stratify=y_resampled)
mean_x_train, mean_x_test, y_train, y_test = train_test_split(mean_x_resampled, y_resampled, test_size=0.1, random_state=42, stratify=y_resampled)

# 출력 함수
def evaluate_clf(y_true, y_pred):
    print('정확도 평가')
    print('Accuracy :', accuracy_score(y_true, y_pred))
    print('F1 Score :', f1_score(y_true, y_pred, average='weighted'))
    print('Precision :', precision_score(y_true, y_pred, average='weighted'))
    print('Recall :', recall_score(y_true, y_pred, average='weighted'))

def evaluate_model(model_name, y_true, y_pred):
    acc = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred, average='weighted')
    print(f"\n=== {model_name} 성능 ===")
    print(f"Accuracy: {acc:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print("Confusion Matrix:")
    print(confusion_matrix(y_true, y_pred))
    print("Classification Report:")
    print(classification_report(y_true, y_pred))

# LightGBM(sum_click)

In [2]:
lgbm_1 = LGBMClassifier(
    num_leaves=40,
    n_estimators=200,
    min_data_in_leaf=20,
    max_depth=10,
    learning_rate=0.2,
    feature_fraction=0.9,
    random_state=42) 

lgbm_1.fit(sum_x_train, y_train)
y_pred = lgbm_1.predict(sum_x_test)

evaluate_clf(y_test, y_pred)

[LightGBM] [Info] Number of positive: 144286, number of negative: 144286
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001499 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 744
[LightGBM] [Info] Number of data points in the train set: 288572, number of used features: 5
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
정확도 평가
Accuracy : 0.8669847804391217
F1 Score : 0.8663022233781629
Precision : 0.8746351657828313
Recall : 0.8669847804391217


# LightGBM(mean_click)

In [3]:
lgbm_2 = LGBMClassifier(
    num_leaves=40,
    n_estimators=200,
    min_data_in_leaf=20,
    max_depth=10,
    learning_rate=0.2,
    feature_fraction=0.9,
    random_state=42) 

lgbm_2.fit(mean_x_train, y_train)
y_pred = lgbm_2.predict(mean_x_test)

evaluate_clf(y_test, y_pred)

[LightGBM] [Info] Number of positive: 144286, number of negative: 144286
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001536 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 725
[LightGBM] [Info] Number of data points in the train set: 288572, number of used features: 5
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
정확도 평가
Accuracy : 0.8473677644710579
F1 Score : 0.8462728739576844
Precision : 0.8575541956337271
Recall : 0.8473677644710579
