<a href="https://colab.research.google.com/github/RohEunSeo/LGAimers/blob/main/RandomForest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive

# Mount Google Drive so the CSV files under /content/drive are reachable.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os
from pprint import pprint

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [None]:
# Notebook-wide configuration: data directory on the mounted Drive and the
# seed reused by every stochastic step below.
ROOT_DIR = "/content/drive/MyDrive/lgAimers.5(updated.ver)/modeling"
RANDOM_STATE = 110

# Load the cleaned train/test CSVs (the file names mark them as "after scaling").
train_csv_path = os.path.join(ROOT_DIR, "train_data_cleaned(스케일링후).csv")
test_csv_path = os.path.join(ROOT_DIR, "test_data_cleaned(스케일링후).csv")
train_data = pd.read_csv(train_csv_path)
test_data = pd.read_csv(test_csv_path)
train_data


Unnamed: 0,Process Desc._Dam,Equipment_Dam,Model.Suffix_Dam,Workorder_Dam,CURE END POSITION X Collect Result_Dam,CURE END POSITION Z Collect Result_Dam,CURE END POSITION Θ Collect Result_Dam,CURE SPEED Collect Result_Dam,CURE STANDBY POSITION X Collect Result_Dam,CURE STANDBY POSITION Z Collect Result_Dam,...,Head Clean Position X Collect Result_Fill2,Head Clean Position Y Collect Result_Fill2,Head Clean Position Z Collect Result_Fill2,Head Purge Position X Collect Result_Fill2,Head Purge Position Y Collect Result_Fill2,Head Purge Position Z Collect Result_Fill2,Machine Tact time Collect Result_Fill2,Production Qty Collect Result_Fill2,WorkMode Collect Result_Fill2,target
0,Dam Dispenser,Dam dispenser #1,AJX75334505,4F1XA938-1,0.0,0.0,0.0,0.857143,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.364023,0.014374,0.058824,Normal
1,Dam Dispenser,Dam dispenser #1,AJX75334505,3KPM0016-2,0.0,0.0,0.0,0.000000,0.0,0.0,...,1.0,0.0,1.0,1.0,0.0,1.0,0.007505,0.379877,0.000000,Normal
2,Dam Dispenser,Dam dispenser #2,AJX75334501,4E1X9167-1,1.0,1.0,1.0,0.428571,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.364023,0.020534,0.058824,Normal
3,Dam Dispenser,Dam dispenser #2,AJX75334501,3K1X0057-1,1.0,1.0,1.0,0.000000,0.0,0.0,...,1.0,0.0,1.0,1.0,0.0,1.0,0.008630,0.550308,0.000000,Normal
4,Dam Dispenser,Dam dispenser #1,AJX75334501,3HPM0007-1,0.0,0.0,0.0,0.000000,0.0,0.0,...,1.0,0.0,1.0,1.0,0.0,1.0,0.007880,0.248460,0.000000,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40094,Dam Dispenser,Dam dispenser #1,AJX75334501,3J1XF434-2,0.0,0.0,0.0,0.000000,0.0,0.0,...,1.0,0.0,1.0,1.0,0.0,1.0,0.006004,0.652977,0.000000,Normal
40095,Dam Dispenser,Dam dispenser #2,AJX75334501,4E1XC796-1,1.0,1.0,1.0,0.857143,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.364023,0.028747,0.058824,Normal
40096,Dam Dispenser,Dam dispenser #1,AJX75334501,4C1XD438-1,0.0,0.0,0.0,0.857143,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.252908,0.002053,0.058824,Normal
40097,Dam Dispenser,Dam dispenser #2,AJX75334501,3I1XA258-1,1.0,1.0,1.0,0.000000,0.0,0.0,...,1.0,0.0,1.0,1.0,0.0,1.0,0.009381,0.240246,0.000000,Normal


In [None]:
# Under-sampling configuration.
normal_ratio = 1.0       # Normal rows drawn per AbNormal row for the balanced core sample
additional_ratio = 0.5   # extra Normal rows to keep, as a fraction of the AbNormal count

df_normal = train_data[train_data["target"] == "Normal"]
df_abnormal = train_data[train_data["target"] == "AbNormal"]

num_normal = len(df_normal)
num_abnormal = len(df_abnormal)
print(f"  Total: Normal: {num_normal}, AbNormal: {num_abnormal}")

# Core under-sample: match the Normal count to the AbNormal count (times normal_ratio).
# Clamp to the available rows so sampling without replacement cannot fail.
num_core_normal = min(int(num_abnormal * normal_ratio), num_normal)
df_normal_sampled = df_normal.sample(n=num_core_normal, replace=False, random_state=RANDOM_STATE)

# Extra Normal rows: draw them only from the rows NOT already in the core sample.
# Sampling the full Normal pool a second time with the same seed re-selects rows
# that are already in df_normal_sampled, which yields duplicated rows that can
# end up on both sides of the later train/validation split.
df_normal_remaining = df_normal.drop(index=df_normal_sampled.index)
num_extra_normal = min(int(num_abnormal * additional_ratio), len(df_normal_remaining))
df_additional_normal = df_normal_remaining.sample(
    n=num_extra_normal, replace=False, random_state=RANDOM_STATE
)

# Combine the sampled Normal rows with every AbNormal row.
df_concat = pd.concat([df_normal_sampled, df_additional_normal, df_abnormal], axis=0).reset_index(drop=True)

# Check the resulting class counts.
# Author's note: the score went up when the additional Normal rows were included.
# NOTE(review): that observation was made before the extra rows were de-duplicated — re-check.
print(df_concat.value_counts("target"))

  Total: Normal: 37774, AbNormal: 2325
target
Normal      3487
AbNormal    2325
Name: count, dtype: int64


In [None]:
# Hold out 20% of the under-sampled data for validation, keeping the
# Normal/AbNormal proportions the same in both parts (stratified split).
split_options = {
    "test_size": 0.2,
    "stratify": df_concat["target"],
    "random_state": RANDOM_STATE,
    "shuffle": True,
}
df_train, df_val = train_test_split(df_concat, **split_options)


def print_stats(df: pd.DataFrame):
    """Print the Normal / AbNormal row counts of ``df`` and their ratio.

    Args:
        df: Frame with a ``"target"`` column holding the labels
            ``"Normal"`` and ``"AbNormal"``.

    Prints one line of the form
    ``  Total: Normal: <n>, AbNormal: <a> ratio: <a/n>``.
    The ratio is reported as ``nan`` when there are no Normal rows, instead
    of raising ``ZeroDivisionError``.
    """
    # Count by summing boolean masks; int() keeps plain-Python division semantics.
    num_normal = int((df["target"] == "Normal").sum())
    num_abnormal = int((df["target"] == "AbNormal").sum())

    ratio = num_abnormal / num_normal if num_normal else float("nan")

    print(f"  Total: Normal: {num_normal}, AbNormal: {num_abnormal}" + f" ratio: {ratio}")


# Report the class balance of the training split, then the validation split.
print("  \tAbnormal\tNormal")
for split_frame in (df_train, df_val):
    print_stats(split_frame)

  	Abnormal	Normal
  Total: Normal: 2789, AbNormal: 1860 ratio: 0.6669057009680889
  Total: Normal: 698, AbNormal: 465 ratio: 0.666189111747851


# 하이퍼파라미터 튜닝 작업

In [None]:
# Disabled hyperparameter search: the whole cell is a single triple-quoted
# string, so none of the code below executes; the notebook only echoes the
# string as the cell's value.
# NOTE(review): if re-enabled, this code reads `features`, which is first
# assigned in a later cell, so that cell's feature selection must run first.
'''from sklearn.model_selection import GridSearchCV, train_test_split
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3,4,5,6],
    'min_samples_split': [2, 4, 6, 8],
    'min_samples_leaf': [2, 3, 4],
    'bootstrap': [True, False],
}

train_x = df_train[features]
train_y = df_train["target"]

grid_search = GridSearchCV(estimator=RandomForestClassifier(random_state=RANDOM_STATE),
                           param_grid=param_grid,
                           cv=5, n_jobs=-1, verbose=0, scoring='accuracy')

grid_search.fit(train_x, train_y)

print("Best parameters found: ", grid_search.best_params_)
print("Best accuracy: ", grid_search.best_score_)'''

'from sklearn.model_selection import GridSearchCV, train_test_split\nparam_grid = {\n    \'n_estimators\': [100, 200, 300],\n    \'max_depth\': [3,4,5,6],\n    \'min_samples_split\': [2, 4, 6, 8],\n    \'min_samples_leaf\': [2, 3, 4],\n    \'bootstrap\': [True, False],\n}\n\ntrain_x = df_train[features]\ntrain_y = df_train["target"]\n\ngrid_search = GridSearchCV(estimator=RandomForestClassifier(random_state=RANDOM_STATE),\n                           param_grid=param_grid,\n                           cv=5, n_jobs=-1, verbose=0, scoring=\'accuracy\')\n\ngrid_search.fit(train_x, train_y)\n\nprint("Best parameters found: ", grid_search.best_params_)\nprint("Best accuracy: ", grid_search.best_score_)'

In [None]:
# Build the random forest with the hyperparameters the author selected as best.
best_rf_params = {
    "random_state": RANDOM_STATE,
    "bootstrap": False,
    "max_depth": 4,
    "min_samples_leaf": 2,
    "min_samples_split": 2,
    "n_estimators": 200,
}
best_model = RandomForestClassifier(**best_rf_params)


In [None]:
# Select the feature columns: every column of df_train that can be cast to int.
# Columns that cannot be cast (free-text columns and the string "target" label)
# are skipped. The cast results are collected separately so df_train itself is
# left untouched, which avoids chained-assignment warnings on the split frame.
# NOTE(review): casting to int truncates fractional values; the training CSV is
# named as already scaled, so confirm the truncation is intended. The test
# features are cast the same way, so any change must be made in both places.
int_columns = {}

for col in df_train.columns:
    try:
        int_columns[col] = df_train[col].astype(int)
    except (ValueError, TypeError, OverflowError):
        # Not convertible to int (e.g. text or NaN values): not a feature.
        continue

features = list(int_columns)

train_x = pd.DataFrame(int_columns, index=df_train.index)
train_y = df_train["target"]

best_model.fit(train_x, train_y)

In [None]:
# Re-read the test CSV from disk so the inference cells start from a fresh frame.
test_csv_path = os.path.join(ROOT_DIR, "test_data_cleaned(스케일링후).csv")
test_data = pd.read_csv(test_csv_path)

In [None]:
# Build the test feature matrix with the same columns the model was fitted on.
# Take an explicit copy so the casts below write to an independent frame rather
# than to a slice of test_data (which triggers chained-assignment warnings).
df_test_x = test_data[features].copy()

# Apply the same int cast that was used for the training features.
uncast_columns = []
for col in df_test_x.columns:
    try:
        df_test_x[col] = df_test_x[col].astype(int)
    except (ValueError, TypeError, OverflowError):
        # Best effort: keep the column as loaded, but remember it so the
        # mismatch with the training-time cast is visible instead of silent.
        uncast_columns.append(col)

if uncast_columns:
    print(f"Warning: could not cast to int, left unchanged: {uncast_columns}")

In [None]:
# Predict a label for every test row; the bare name below displays the array.
test_pred = best_model.predict(X=df_test_x)
test_pred

array(['Normal', 'Normal', 'AbNormal', ..., 'Normal', 'AbNormal',
       'Normal'], dtype=object)

In [None]:
# Read the submission template from the data directory and fill in the
# predicted labels for the test rows.
submission_template_path = os.path.join(ROOT_DIR, "submission.csv")
df_sub = pd.read_csv(submission_template_path)

# Fail with a clear message if the template and the predictions are misaligned.
if len(df_sub) != len(test_pred):
    raise ValueError(
        f"submission template has {len(df_sub)} rows but there are {len(test_pred)} predictions"
    )
df_sub["target"] = test_pred

# Count how many rows were predicted 'Normal' vs 'AbNormal'.
count_values = df_sub['target'].value_counts()
print(count_values)

# Save the submission file (written to the current working directory).
df_sub.to_csv("submission.csv", index=False)

target
Normal      13749
AbNormal     3612
Name: count, dtype: int64
