<a href="https://colab.research.google.com/github/RohEunSeo/LGAimers/blob/main/lightGBM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive

# Mount Google Drive into the Colab VM; the CSV files read later in this
# notebook are addressed through this mount point.
drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


# 라이브러리 선언

In [None]:
# Install LightGBM
!pip install lightgbm

import os
from pprint import pprint
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from tqdm import tqdm
import lightgbm as lgb  # Import LightGBM



Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



# train 데이터 불러오기

In [None]:
# Project folder on the mounted Drive, and the seed reused by the random steps below.
ROOT_DIR = "/content/drive/MyDrive/lgAimers.5(updated.ver)/modeling"
RANDOM_STATE = 110

# Read the cleaned training table (the file name says "after scaling") and display it.
train_csv_path = os.path.join(ROOT_DIR, "train_data_cleaned(스케일링후).csv")
train_data = pd.read_csv(train_csv_path)
train_data

Unnamed: 0,Process Desc._Dam,Equipment_Dam,Model.Suffix_Dam,Workorder_Dam,CURE END POSITION X Collect Result_Dam,CURE END POSITION Z Collect Result_Dam,CURE END POSITION Θ Collect Result_Dam,CURE SPEED Collect Result_Dam,CURE STANDBY POSITION X Collect Result_Dam,CURE STANDBY POSITION Z Collect Result_Dam,...,Head Clean Position X Collect Result_Fill2,Head Clean Position Y Collect Result_Fill2,Head Clean Position Z Collect Result_Fill2,Head Purge Position X Collect Result_Fill2,Head Purge Position Y Collect Result_Fill2,Head Purge Position Z Collect Result_Fill2,Machine Tact time Collect Result_Fill2,Production Qty Collect Result_Fill2,WorkMode Collect Result_Fill2,target
0,Dam Dispenser,Dam dispenser #1,AJX75334505,4F1XA938-1,0.0,0.0,0.0,0.857143,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.364023,0.014374,0.058824,Normal
1,Dam Dispenser,Dam dispenser #1,AJX75334505,3KPM0016-2,0.0,0.0,0.0,0.000000,0.0,0.0,...,1.0,0.0,1.0,1.0,0.0,1.0,0.007505,0.379877,0.000000,Normal
2,Dam Dispenser,Dam dispenser #2,AJX75334501,4E1X9167-1,1.0,1.0,1.0,0.428571,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.364023,0.020534,0.058824,Normal
3,Dam Dispenser,Dam dispenser #2,AJX75334501,3K1X0057-1,1.0,1.0,1.0,0.000000,0.0,0.0,...,1.0,0.0,1.0,1.0,0.0,1.0,0.008630,0.550308,0.000000,Normal
4,Dam Dispenser,Dam dispenser #1,AJX75334501,3HPM0007-1,0.0,0.0,0.0,0.000000,0.0,0.0,...,1.0,0.0,1.0,1.0,0.0,1.0,0.007880,0.248460,0.000000,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40094,Dam Dispenser,Dam dispenser #1,AJX75334501,3J1XF434-2,0.0,0.0,0.0,0.000000,0.0,0.0,...,1.0,0.0,1.0,1.0,0.0,1.0,0.006004,0.652977,0.000000,Normal
40095,Dam Dispenser,Dam dispenser #2,AJX75334501,4E1XC796-1,1.0,1.0,1.0,0.857143,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.364023,0.028747,0.058824,Normal
40096,Dam Dispenser,Dam dispenser #1,AJX75334501,4C1XD438-1,0.0,0.0,0.0,0.857143,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.252908,0.002053,0.058824,Normal
40097,Dam Dispenser,Dam dispenser #2,AJX75334501,3I1XA258-1,1.0,1.0,1.0,0.000000,0.0,0.0,...,1.0,0.0,1.0,1.0,0.0,1.0,0.009381,0.240246,0.000000,Normal


# 언더 샘플링

In [None]:
# Under-sampling ratios, both expressed relative to the number of AbNormal rows.
normal_ratio = 1.0       # Normal rows drawn to match the AbNormal count
additional_ratio = 0.5   # extra Normal rows kept on top of that

df_normal = train_data[train_data["target"] == "Normal"]
df_abnormal = train_data[train_data["target"] == "AbNormal"]

num_normal = len(df_normal)
num_abnormal = len(df_abnormal)
print(f"  Total: Normal: {num_normal}, AbNormal: {num_abnormal}")

# Base sample: as many Normal rows as there are AbNormal rows (times normal_ratio).
df_normal_sampled = df_normal.sample(
    n=int(num_abnormal * normal_ratio),
    replace=False,
    random_state=RANDOM_STATE,
)

# Extra Normal rows are drawn only from rows NOT already selected.
# Sampling from df_normal again with the same seed re-selects rows that are
# already in df_normal_sampled, which duplicates rows and lets identical rows
# end up on both sides of the later train/validation split.
df_normal_remaining = df_normal.drop(index=df_normal_sampled.index)
df_additional_normal = df_normal_remaining.sample(
    n=int(num_abnormal * additional_ratio),
    replace=False,
    random_state=RANDOM_STATE,
)
assert df_normal_sampled.index.intersection(df_additional_normal.index).empty, \
    "base and additional Normal samples must not share rows"

# Final training pool: base Normal + extra Normal + every AbNormal row.
df_concat = pd.concat(
    [df_normal_sampled, df_additional_normal, df_abnormal], axis=0
).reset_index(drop=True)

# Class balance of the pool. The author's earlier note said the score improved
# when the extra Normal rows were added; that observation was made with
# duplicated rows and should be re-checked.
print(df_concat.value_counts("target"))

  Total: Normal: 37774, AbNormal: 2325
target
Normal      3487
AbNormal    2325
Name: count, dtype: int64


In [None]:
# Hold out 20% of the sampled pool for validation; stratifying on the label
# keeps the Normal/AbNormal proportion the same in both parts.
split_parts = train_test_split(
    df_concat,
    stratify=df_concat["target"],
    test_size=0.2,
    random_state=RANDOM_STATE,
)
df_train, df_val = split_parts


def print_stats(df: pd.DataFrame):
    """Print the Normal / AbNormal row counts of ``df`` and their ratio.

    Args:
        df: Frame with a ``target`` column holding the labels ``"Normal"``
            and ``"AbNormal"``.

    The printed ratio is AbNormal divided by Normal. When ``df`` contains no
    Normal rows the ratio is reported as ``nan`` instead of raising
    ``ZeroDivisionError``.
    """
    label_counts = df["target"].value_counts()
    num_normal = int(label_counts.get("Normal", 0))
    num_abnormal = int(label_counts.get("AbNormal", 0))

    # Guard the division so an all-AbNormal (or empty) frame can still be reported.
    ratio = num_abnormal / num_normal if num_normal else float("nan")

    print(f"  Total: Normal: {num_normal}, AbNormal: {num_abnormal}" + f" ratio: {ratio}")


# Print statistics
# Report the class balance of each split. Each row is labelled with its split
# name; the previous header advertised "Abnormal / Normal" columns that did not
# match what print_stats actually prints.
print("  [train]")
print_stats(df_train)
print("  [validation]")
print_stats(df_val)

  	Abnormal	Normal
  Total: Normal: 2789, AbNormal: 1860 ratio: 0.6669057009680889
  Total: Normal: 698, AbNormal: 465 ratio: 0.666189111747851


# 모델 학습


In [None]:
# Define features and target
features = [col for col in df_train.columns if col != 'target']
# LightGBM automatically handles categorical features if they are labeled encoded or specified correctly
train_x = df_train[features]
train_y = df_train["target"]
val_x = df_val[features]
val_y = df_val['target']


# 범주형 데이터로 사용할 열들을 명시적으로 지정
categorical_features = [
    'Process Desc._Dam', 'Equipment_Dam', 'Model.Suffix_Dam', 'Workorder_Dam',
    'Process Desc._AutoClave', 'Equipment_AutoClave', 'Model.Suffix_AutoClave', 'Workorder_AutoClave',
    'Chamber Temp. Judge Value_AutoClave', 'Process Desc._Fill1', 'Equipment_Fill1',
    'Model.Suffix_Fill1', 'Workorder_Fill1', 'Process Desc._Fill2', 'Equipment_Fill2',
    'Model.Suffix_Fill2', 'Workorder_Fill2'
]

# 범주형 피처를 category 타입으로 변환
for col in categorical_features:
    train_x[col] = train_x[col].astype('category')
    val_x[col] = val_x[col].astype('category')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_x[col] = train_x[col].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_x[col] = val_x[col].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_x[col] = train_x[col].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
T

# 그리드 서치를 통해 최적의 하이퍼파라미터 찾기

In [None]:
# Disabled search grid: it is wrapped in a string literal, so no `param_grid`
# is assigned here and the cell merely echoes the string as its output.
# NOTE(review): this appears to be the grid behind the first recorded run
# (it is the only grid containing max_depth=-1 and num_leaves=31) - confirm,
# or delete the cell.
'''param_grid = {
    'num_leaves': [31, 50, 70],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [-1, 5, 10],
    'min_data_in_leaf': [20, 50, 100],
    'feature_fraction': [0.6, 0.8, 1.0]
}'''

"param_grid = {\n    'num_leaves': [31, 50, 70],\n    'learning_rate': [0.01, 0.05, 0.1],\n    'max_depth': [-1, 5, 10],\n    'min_data_in_leaf': [20, 50, 100],\n    'feature_fraction': [0.6, 0.8, 1.0]\n}"

In [None]:
# Search space for the LightGBM grid search: 3 values for each of 5 parameters.
# NOTE(review): a tree of depth d has at most 2**d leaves, so with
# max_depth <= 5 the num_leaves values 50 and 70 look redundant - confirm.
param_grid = dict(
    num_leaves=[30, 50, 70],            # maximum number of leaves per tree
    learning_rate=[0.01, 0.05, 0.1],    # learning rate
    max_depth=[3, 4, 5],                # maximum tree depth
    min_data_in_leaf=[20, 40, 60],      # minimum number of samples in a leaf
    feature_fraction=[0.6, 0.7, 0.8],   # share of features used per boosting step
)

In [None]:
import lightgbm as lgb
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import GridSearchCV

# Base estimator. verbose=-1 silences LightGBM's per-fit [Info] lines, which
# previously flooded the cell output (thousands of lines were truncated).
model = lgb.LGBMClassifier(n_estimators=500, random_state=RANDOM_STATE, verbose=-1)

# Candidates are ranked by F1 on the AbNormal class - the same metric this
# notebook uses for its final evaluation - instead of plain accuracy.
# The labels are strings, so pos_label has to be given explicitly.
abnormal_f1 = make_scorer(f1_score, pos_label="AbNormal")

grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring=abnormal_f1,
    cv=5,
    verbose=1,  # print search progress
)

# Run the search. Early stopping is supplied as a LightGBM callback and watches
# the fixed (val_x, val_y) hold-out set in every CV fit.
grid_search.fit(
    train_x, train_y,
    eval_set=[(val_x, val_y)],
    categorical_feature=categorical_features,
    callbacks=[lgb.early_stopping(stopping_rounds=100, verbose=False)],
)

# Best parameter combination found by the search.
print(f"Best parameters found: {grid_search.best_params_}")

# Results recorded by the author from earlier runs (obtained with scoring='accuracy'):
# run 1, cv=3, n_estimators=200: {'feature_fraction': 0.6, 'learning_rate': 0.01, 'max_depth': -1, 'min_data_in_leaf': 20, 'num_leaves': 31}
# run 2, cv=5, n_estimators=500: {'feature_fraction': 0.6, 'learning_rate': 0.01, 'max_depth': 5, 'min_data_in_leaf': 20, 'num_leaves': 31}
# latest:                        {'feature_fraction': 0.8, 'learning_rate': 0.05, 'max_depth': 5, 'min_data_in_leaf': 20, 'num_leaves': 30}

[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m
Early stopping, best iteration is:
[383]	valid_0's binary_logloss: 0.619783
[LightGBM] [Info] Number of positive: 2231, number of negative: 1488
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002612 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3922
[LightGBM] [Info] Number of data points in the train set: 3719, number of used features: 137
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.599892 -> initscore=0.405017
[LightGBM] [Info] Start training from score 0.405017
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[277]	valid_0's binary_logloss: 0.620659
[LightGBM] [Info] Number of positive: 2232, number of negative: 1488
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002542 seconds.
You can s

# 최적의 부스팅 타입

In [None]:
import lightgbm as lgb
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import GridSearchCV

# Hyper-parameters fixed from the previous grid search.
best_params = {
    'feature_fraction': 0.8,
    'learning_rate': 0.05,
    'max_depth': 5,
    'min_data_in_leaf': 20,
    'num_leaves': 30
}

# Only the boosting type is searched here. The grid gets its own name so the
# hyper-parameter grid `param_grid` defined earlier is not overwritten (which
# would silently change what the earlier search cell does if it is re-run).
boosting_param_grid = {
    'boosting_type': ['gbdt', 'dart', 'goss']
}

# Base estimator with the fixed hyper-parameters; verbose=-1 silences
# LightGBM's per-fit [Info] lines.
model = lgb.LGBMClassifier(
    n_estimators=500,
    random_state=RANDOM_STATE,
    verbose=-1,
    **best_params
)

# Rank candidates by F1 on the AbNormal class, the metric used for the final
# evaluation in this notebook, instead of accuracy.
abnormal_f1 = make_scorer(f1_score, pos_label="AbNormal")

grid_search = GridSearchCV(
    estimator=model,
    param_grid=boosting_param_grid,
    scoring=abnormal_f1,
    cv=5,
    verbose=1,  # print search progress
)

# Run the search with early stopping on the fixed hold-out set.
# NOTE(review): in the recorded output the 'dart' fits show no early-stopping
# messages, so they apparently train all 500 rounds while 'gbdt'/'goss' stop
# early - keep that in mind when comparing the candidates.
grid_search.fit(
    train_x, train_y,
    eval_set=[(val_x, val_y)],
    categorical_feature=categorical_features,
    callbacks=[lgb.early_stopping(stopping_rounds=100, verbose=False)],
)

# Best boosting type found by the search.
print(f"Best boosting type found: {grid_search.best_params_['boosting_type']}")


Fitting 5 folds for each of 3 candidates, totalling 15 fits
[LightGBM] [Info] Number of positive: 2231, number of negative: 1488
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009961 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3867
[LightGBM] [Info] Number of data points in the train set: 3719, number of used features: 137
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.599892 -> initscore=0.405017
[LightGBM] [Info] Start training from score 0.405017
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[236]	valid_0's binary_logloss: 0.60846
[LightGBM] [Info] Number of positive: 2231, number of negative: 1488
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003650 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is n



[LightGBM] [Info] Number of positive: 2231, number of negative: 1488
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003589 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3842
[LightGBM] [Info] Number of data points in the train set: 3719, number of used features: 137
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.599892 -> initscore=0.405017
[LightGBM] [Info] Start training from score 0.405017




[LightGBM] [Info] Number of positive: 2231, number of negative: 1488
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002560 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3859
[LightGBM] [Info] Number of data points in the train set: 3719, number of used features: 137
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.599892 -> initscore=0.405017
[LightGBM] [Info] Start training from score 0.405017




[LightGBM] [Info] Number of positive: 2231, number of negative: 1488
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006937 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3922
[LightGBM] [Info] Number of data points in the train set: 3719, number of used features: 137
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.599892 -> initscore=0.405017
[LightGBM] [Info] Start training from score 0.405017




[LightGBM] [Info] Number of positive: 2232, number of negative: 1488
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002474 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3904
[LightGBM] [Info] Number of data points in the train set: 3720, number of used features: 137
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.600000 -> initscore=0.405465
[LightGBM] [Info] Start training from score 0.405465




[LightGBM] [Info] Number of positive: 2231, number of negative: 1488
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002520 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3867
[LightGBM] [Info] Number of data points in the train set: 3719, number of used features: 137
[LightGBM] [Info] Using GOSS
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.599892 -> initscore=0.405017
[LightGBM] [Info] Start training from score 0.405017
Training until validation scores don't improve for 100 rounds
[LightGBM] [Info] Number of positive: 2231, number of negative: 1488
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002506 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3842
[LightGBM] [Info] Number o

# 최적의 하이퍼파라미터로 모델 재훈련

In [None]:
# Rebuild the classifier with the chosen hyper-parameters and refit it on the
# full training split.
# NOTE(review): these values equal the recorded "run 2" result, not the most
# recent search result stored in `best_params` (0.8 / 0.05 / 30) - confirm
# which set is intended.
final_params = {
    "feature_fraction": 0.6,
    "learning_rate": 0.01,
    "max_depth": 5,
    "min_data_in_leaf": 20,
    "num_leaves": 31,
    "n_estimators": 500,   # number of boosted trees
    "random_state": 110,
}
best_model = lgb.LGBMClassifier(**final_params)

# Fit without an evaluation set, so no early stopping is applied here.
best_model.fit(train_x, train_y, categorical_feature=categorical_features)

[LightGBM] [Info] Number of positive: 2789, number of negative: 1860
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.018054 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4093
[LightGBM] [Info] Number of data points in the train set: 4649, number of used features: 137
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.599914 -> initscore=0.405107
[LightGBM] [Info] Start training from score 0.405107


# 피처 중요도 계산, 시각화

In [None]:
import matplotlib.pyplot as plt

# Pair every feature name with the importance reported by the fitted model,
# most important first.
feature_importances = best_model.feature_importances_
importance_df = (
    pd.DataFrame({'Feature': features, 'Importance': feature_importances})
    .sort_values(by='Importance', ascending=False)
)

# Keep the 30 highest-ranked features for the chart.
top_30_features = importance_df.head(30)

# Horizontal bar chart; the y axis is inverted so the top feature is drawn first.
fig, ax = plt.subplots(figsize=(10, 8))
ax.barh(top_30_features['Feature'], top_30_features['Importance'])
ax.set_xlabel('Importance')
ax.set_ylabel('Feature')
ax.set_title('Top 30 Feature Importances')
ax.invert_yaxis()
plt.show()


NameError: name 'plt' is not defined

In [None]:
import matplotlib.pyplot as plt

# Importance reported by the fitted model, paired with the feature names and
# sorted from most to least important.
feature_importances = best_model.feature_importances_
importance_df = pd.DataFrame({
    'Feature': features,
    'Importance': feature_importances
}).sort_values(by='Importance', ascending=False)

# Threshold expressed as a PERCENTAGE of the total importance.
# best_model is built without an importance_type, and LightGBM's default is
# 'split' (raw split counts), so comparing the raw values with 0.5 only
# selected features whose importance was exactly 0. Convert to a percentage
# share first so the 0.5% threshold means what it says.
low_importance_threshold = 0.5
total_importance = importance_df['Importance'].sum()

if total_importance > 0:
    importance_pct = importance_df['Importance'] / total_importance * 100
    low_importance_mask = importance_pct < low_importance_threshold
else:
    # No feature was used at all: there is nothing to rank, so drop nothing.
    low_importance_mask = importance_df['Importance'] < 0

# Names of the features whose share of the total importance is below the threshold.
low_importance_features = importance_df.loc[low_importance_mask, 'Feature']

# 중요도 낮은 피처 삭제

In [None]:
# Remove the low-importance columns from the training features and display the
# reduced frame.
columns_to_drop = list(low_importance_features)
train_x_reduced = train_x.drop(columns=columns_to_drop)
train_x_reduced

# 최종 모델 학습

In [None]:
# Read the cleaned test table (the file name says "after scaling") from the
# project folder.
test_csv_path = os.path.join(ROOT_DIR, "test_data_cleaned(스케일링후).csv")
test_data = pd.read_csv(test_csv_path)

In [None]:
# Test features: start from the columns used in training, drop the
# low-importance ones, then match the column order of train_x_reduced.
# .copy() gives an independent frame so the category casts below do not write
# into a slice of test_data.
df_test_x = test_data[features]
df_test_x_reduced = df_test_x.drop(columns=low_importance_features)
df_test_x_reduced = df_test_x_reduced[train_x_reduced.columns].copy()

# Keep only the categorical columns that survived the feature reduction.
categorical_features = [col for col in categorical_features if col in train_x_reduced.columns]

# Give each categorical test column exactly the category set of the matching
# training column; test values that were never seen in training become NaN.
for col in categorical_features:
    train_x_reduced[col] = train_x_reduced[col].astype('category')
    df_test_x_reduced[col] = df_test_x_reduced[col].astype(train_x_reduced[col].dtype)

# Same hyper-parameters as best_model, now fitted on the reduced feature set.
model_reduced = lgb.LGBMClassifier(
    feature_fraction=0.6,
    learning_rate=0.01,
    max_depth=5,
    min_data_in_leaf=20,
    num_leaves=31,
    n_estimators=500,         # number of boosted trees
    random_state=RANDOM_STATE
)

# Fit the final model.
model_reduced.fit(train_x_reduced, train_y, categorical_feature=categorical_features)

In [None]:
# Predict a label for every test row with the reduced-feature model and show
# the resulting array.
test_pred = model_reduced.predict(X=df_test_x_reduced)
test_pred

# 제출 데이터 읽어오기

In [None]:
# Read the submission template from the project folder. The path is built
# from ROOT_DIR instead of repeating the absolute Drive path.
submission_template_path = os.path.join(ROOT_DIR, "submission.csv")
df_sub = pd.read_csv(submission_template_path)

# The predictions are assigned positionally, so the template must have exactly
# one row per prediction.
if len(df_sub) != len(test_pred):
    raise ValueError(
        f"submission template has {len(df_sub)} rows but there are {len(test_pred)} predictions"
    )
df_sub["target"] = test_pred

# Count how many rows were predicted as 'Normal' and as 'AbNormal'.
count_values = df_sub['target'].value_counts()
print(count_values)

# Write the submission file to the current working directory.
df_sub.to_csv("submission.csv", index=False)

In [None]:
from sklearn.metrics import f1_score

# Labels stored in the test table.
# NOTE(review): this assumes test_data["target"] holds real ground-truth
# labels - confirm; if the column is only a placeholder the score below is
# meaningless.
test_y = test_data["target"]

# F1 score with 'AbNormal' treated as the positive class.
f1 = f1_score(y_true=test_y, y_pred=test_pred, pos_label='AbNormal', average='binary')
print(f"F1-Score: {f1:.4f}")