# 파일 불러오기
- 날짜 없는 원본 파일 사용

In [1]:
import pandas as pd

# Load the CSV file
file_path = "./Final Results Extended.csv"
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,Time_Now,Blanking_Util,Blanking_SKU1_Queue,Blanking_SKU2_Queue,Blanking_SKU3_Queue,Blanking_SKU4_Queue,Press1_Util,Press2_Util,Press3_Util,Press4_Util,...,SKU3_NVA_Time,SKU3_Transport_Time,SKU3_Wait_Time,SKU3_Other_Time,SKU4_VA_Time,SKU4_NVA_Time,SKU4_Transport_Time,SKU4_Wait_Time,SKU4_Other_Time,Blanking_Queue
0,24,0.846367,0.045715,0.056373,0.055737,0.035849,0.410297,0.434561,0.481388,0.399992,...,0.0,0.537617,0.45365,0.0,1.523338,0.0,0.536243,0.473453,0.0,58.361452
1,24,0.851097,0.051937,0.052934,0.038512,0.042248,0.455471,0.454445,0.387975,0.442986,...,0.0,0.536764,0.473677,0.0,1.523344,0.0,0.534992,0.46438,0.0,62.830599
2,24,0.846115,0.05221,0.047499,0.043181,0.040979,0.496717,0.450816,0.417308,0.352829,...,0.0,0.535925,0.42409,0.0,1.523403,0.0,0.535077,0.47533,0.0,59.365867
3,24,0.841306,0.051769,0.035436,0.046788,0.052362,0.433749,0.363004,0.443909,0.456036,...,0.0,0.535232,0.430992,0.0,1.523381,0.0,0.533233,0.463801,0.0,56.698528
4,24,0.859599,0.045874,0.046802,0.044507,0.05721,0.418329,0.396826,0.499273,0.472454,...,0.0,0.538142,0.502614,0.0,1.523363,0.0,0.537223,0.44932,0.0,65.784631


# 데이터 전처리

## 상수컬럼 및 전부 0인 컬럼 제거
- Time_Now에 동일한 24 값 -> 제거
- 결과적으로 22개 컬럼 제거
- Queue 변수 중 제거된 거 : 'Paint1_Queue', 'Paint2_Queue', 'Cell1_Queue', 'Cell2_Queue', 'Cell3_Queue', 'Cell4_Queue'
- Cell별 SKU 생산량 변수 중 제거된 거 : 'c_Cell1_SKU3', 'c_Cell2_SKU1', 'c_Cell2_SKU3', 'c_Cell3_SKU1', 'c_Cell3_SKU4', 'c_Cell4_SKU1', 'c_Cell4_SKU2'

In [2]:
# =========================
# 1. 상수컬럼 및 전부 0인 컬럼 제거
# =========================

# (1) 상수컬럼: 데이터가 모두 같은 값인 컬럼
const_cols = [col for col in df.columns if df[col].nunique() == 1]

# (2) 전부 0인 컬럼
zero_cols = [col for col in df.columns if (df[col] == 0).all()]

# 합치기 (중복 제거)
drop_cols = list(set(const_cols + zero_cols))

print(f"🗑 제거 대상 컬럼 수: {len(drop_cols)}개")
print(drop_cols)

# 실제 제거
df = df.drop(columns=drop_cols)

print(f"✅ 제거 후 컬럼 수: {df.shape[1]}")

🗑 제거 대상 컬럼 수: 22개
['Paint1_Queue', 'c_Cell4__SKU2', 'Cell3_Queue', 'Cell4_Queue', 'Time_Now', 'c_Cell3__SKU1', 'SKU2_Other_Time', 'SKU2_NVA_Time', 'c_Cell2__SKU1', 'SKU3_Other_Time', 'Cell1_Queue', 'SKU4_Other_Time', 'SKU1_Other_Time', 'SKU4_NVA_Time', 'SKU3_NVA_Time', 'Cell2_Queue', 'Paint2_Queue', 'c_Cell3__SKU4', 'c_Cell1__SKU3', 'c_Cell2__SKU3', 'SKU1_NVA_Time', 'c_Cell4__SKU1']
✅ 제거 후 컬럼 수: 56


## 결측치 확인 -> 1~3개씩 존재 => 해당 행 drop (행 3개 제거)

In [3]:
# =========================
# 2. 결측치 확인
# =========================
missing_summary = df.isnull().sum()
missing_summary = missing_summary[missing_summary > 0]

if missing_summary.empty:
    print("✅ 결측치 없음")
else:
    print("⚠️ 결측치 요약:")
    print(missing_summary)

⚠️ 결측치 요약:
Blanking_SKU4_Queue        1
Press1_Util                1
Press2_Util                1
Press3_Util                1
Press4_Util                1
Press1_Queue               1
Press2_Queue               1
Press3_Queue               1
Press4_Queue               1
Cell1_Util                 1
Cell2_Util                 1
Cell3_Util                 1
Cell4_Util                 1
Warehouse1_Queue           1
Warehouse_2_Queue          1
Warehouse_3_Queue          1
Warehouse_4_Queue          1
c_Cycle1                   1
c_Cycle2                   1
c_Cycle3                   1
c_Cycle4                   1
c_Cell1_SKU1               1
c_Cell1__SKU2              1
c_Cell1__SKU4              1
c_Cell2__SKU2              1
c_Cell2__SKU4              1
c_Cell3__SKU2              1
c_Cell3__SKU3              1
c_Cell4__SKU3              1
c_Cell4__SKU4              1
Paint1_Util                1
Paint2_Util                1
Quality_Util               1
Quality_Queue              1
For

In [4]:
# =========================
# 3. 결측치 제거
# =========================
if not missing_summary.empty:
    before = df.shape[0]
    df = df.dropna()  # 결측치가 하나라도 있는 행 제거
    after = df.shape[0]
    print(f"✅ 결측치가 있는 행 {before - after}개 제거 완료 (남은 행: {after})")

print()
missing_summary2 = df.isnull().sum()
missing_summary2 = missing_summary2[missing_summary2 > 0]
print(missing_summary2)

✅ 결측치가 있는 행 3개 제거 완료 (남은 행: 132673)

Series([], dtype: int64)


# Warehouse Queue -> 네이밍을 Cell Queue 로 변경 (with 넘버링 매핑)
- 이름 변경 매핑: {'Warehouse1_Queue': 'Cell1_Queue', 'Warehouse_2_Queue': 'Cell2_Queue', 'Warehouse_3_Queue': 'Cell3_Queue', 'Warehouse_4_Queue': 'Cell4_Queue'}

In [5]:
# =========================
# 1) Warehouse Queue 컬럼 찾기
# =========================
warehouse_cols = [c for c in df.columns if "warehouse" in c.lower() and "queue" in c.lower()]
print("Warehouse Queue cols:", warehouse_cols)

# =========================
# 2) Warehouse 번호 → Cell 번호로 변환
# =========================
rename_map = {}
for w in warehouse_cols:
    # 숫자 추출 (예: Warehouse1_Queue → 1)
    num = ''.join(filter(str.isdigit, w))
    if num:
        new_name = f"Cell{num}_Queue"
        rename_map[w] = new_name

# =========================
# 3) 컬럼명 변경
# =========================
df = df.rename(columns=rename_map)

print("✅ 이름 변경 매핑:", rename_map)
print("변경 후 Queue 관련 컬럼:", [c for c in df.columns if "queue" in c.lower()])

Warehouse Queue cols: ['Warehouse1_Queue', 'Warehouse_2_Queue', 'Warehouse_3_Queue', 'Warehouse_4_Queue']
✅ 이름 변경 매핑: {'Warehouse1_Queue': 'Cell1_Queue', 'Warehouse_2_Queue': 'Cell2_Queue', 'Warehouse_3_Queue': 'Cell3_Queue', 'Warehouse_4_Queue': 'Cell4_Queue'}
변경 후 Queue 관련 컬럼: ['Blanking_SKU1_Queue', 'Blanking_SKU2_Queue', 'Blanking_SKU3_Queue', 'Blanking_SKU4_Queue', 'Press1_Queue', 'Press2_Queue', 'Press3_Queue', 'Press4_Queue', 'Cell1_Queue', 'Cell2_Queue', 'Cell3_Queue', 'Cell4_Queue', 'Quality_Queue', 'Forklift_Blanking_Queue', 'Forklift_Press_Queue', 'Forklift_Assembly_Queue', 'Blanking_Queue']


# Target 컬럼 생성

## 전체 기준 병목 파악 -> 새 컬럼에 추가 (Bottleneck_actual , Bottleneck_actual_val)

In [6]:
# =========================
# 1) Queue 컬럼 찾기 (대소문자 무시)
# =========================
queue_cols = [col for col in df.columns if "queue" in col.lower()]

print(f"Queue 관련 컬럼 수: {len(queue_cols)}")
print(queue_cols[:10])  # 앞 10개만 확인

# =========================
# 2) 병목 공정 컬럼 생성
# =========================
# 각 행에서 queue 값이 가장 큰 컬럼명
df["Bottleneck_actual"] = df[queue_cols].idxmax(axis=1)

# 해당 queue의 실제 최대값도 같이 기록하고 싶다면
df["Bottleneck_val"] = df[queue_cols].max(axis=1)

# =========================
# 3) 결과 확인
# =========================
print(df[["Bottleneck_actual", "Bottleneck_val"]].tail())

Queue 관련 컬럼 수: 17
['Blanking_SKU1_Queue', 'Blanking_SKU2_Queue', 'Blanking_SKU3_Queue', 'Blanking_SKU4_Queue', 'Press1_Queue', 'Press2_Queue', 'Press3_Queue', 'Press4_Queue', 'Cell1_Queue', 'Cell2_Queue']
              Bottleneck_actual  Bottleneck_val
132670  Forklift_Blanking_Queue      153.012984
132671              Cell1_Queue      185.031493
132672              Cell1_Queue      302.349198
132673              Cell1_Queue      295.667368
132674              Cell1_Queue      166.628981


In [7]:
# Bottleneck 컬럼의 빈도 계산
bottleneck_counts = df["Bottleneck_actual"].value_counts().head(10)

bottleneck_counts

Bottleneck_actual
Cell1_Queue                77629
Forklift_Blanking_Queue    53681
Cell3_Queue                 1307
Cell4_Queue                   36
Forklift_Press_Queue           9
Press4_Queue                   6
Press2_Queue                   5
Name: count, dtype: int64

## 주요 공정 단계(블랭킹, 프레스, 조립셀) 기준 병목 파악 -> 새 컬럼에 추가
- Bottleneck_actual_Blanking , Bottleneck_actual_Press , Bottleneck_actual_Cell
- Bottleneck_val_Blanking , Bottleneck_val_Press , Bottleneck_val_Cell

In [8]:
# ================================
# 11. 공정별 실제 병목 레이블 & 값 생성
# ================================
stage_groups = {
    "Blanking": ["Blanking_SKU1_Queue", "Blanking_SKU2_Queue",
                 "Blanking_SKU3_Queue", "Blanking_SKU4_Queue"],
    "Press":    ["Press1_Queue", "Press2_Queue", "Press3_Queue", "Press4_Queue"],
    "Cell":     ["Cell1_Queue", "Cell2_Queue", "Cell3_Queue", "Cell4_Queue"]
}

for stage, cols in stage_groups.items():
    # 병목 컬럼명 (어느 라인/설비가 병목인지)
    df[f"Bottleneck_actual_{stage}"] = df[cols].idxmax(axis=1)
    # 병목 값 (queue 크기)
    df[f"Bottleneck_val_{stage}"] = df[cols].max(axis=1)

# ================================
# 12. 결과 확인
# ================================
check_cols = [c for c in df.columns if c.startswith("Bottleneck_")]
print(df[check_cols].head())

         Bottleneck_actual  Bottleneck_val Bottleneck_actual_Blanking  \
0  Forklift_Blanking_Queue      157.256741        Blanking_SKU2_Queue   
1  Forklift_Blanking_Queue      166.488725        Blanking_SKU2_Queue   
2              Cell1_Queue      335.401556        Blanking_SKU1_Queue   
3  Forklift_Blanking_Queue      147.448407        Blanking_SKU4_Queue   
4  Forklift_Blanking_Queue      150.397600        Blanking_SKU4_Queue   

   Bottleneck_val_Blanking Bottleneck_actual_Press  Bottleneck_val_Press  \
0                 0.056373            Press2_Queue             65.273786   
1                 0.052934            Press1_Queue             74.772823   
2                 0.052210            Press2_Queue             72.901066   
3                 0.052362            Press1_Queue             65.091192   
4                 0.057210            Press1_Queue             65.075312   

  Bottleneck_actual_Cell  Bottleneck_val_Cell  
0            Cell1_Queue            98.155568  
1       

## 제품군(SKU) 기준 병목 파악 -> 새 컬럼에 추가
- Bottleneck_actual_SKU1 , Bottleneck_actual_SKU2 , Bottleneck_actual_SKU3 , Bottleneck_actual_SKU4
- Bottleneck_val_SKU1 , Bottleneck_val_SKU2 , Bottleneck_val_SKU3, Bottleneck_val_SKU4

### 컬럼명 맞추기

In [9]:
# =========================
# 컬럼명 정규화: 언더스코어 여러 개 → 1개로 통일
# =========================
df.columns = df.columns.str.replace(r"__+", "_", regex=True)

# 확인
for sku in ["SKU1","SKU2","SKU3","SKU4"]:
    related_cols = [c for c in df.columns if f"Cell" in c and sku in c]
    print(sku, related_cols)

SKU1 ['c_Cell1_SKU1']
SKU2 ['c_Cell1_SKU2', 'c_Cell2_SKU2', 'c_Cell3_SKU2']
SKU3 ['c_Cell3_SKU3', 'c_Cell4_SKU3']
SKU4 ['c_Cell1_SKU4', 'c_Cell2_SKU4', 'c_Cell4_SKU4']


### 조립셀 Queue 값 -> SKU별로 분배
- 각 Cell Queue 전체량을 SKU별 담당 비율에 따라 나눠서, SKU별 Cell Queue를 생성 -> SKU별로 여러 Cell에서 받은 몫을 합쳐 최종 SKU Cell Queue를 계산
- 각 행에 대해 동적으로 SKU별 분배 수행

In [10]:
for sku in ["SKU1","SKU2","SKU3","SKU4"]:
    related_cols = [c for c in df.columns if f"_{sku}" in c and "c_Cell" in c]
    print(sku, related_cols)

SKU1 ['c_Cell1_SKU1']
SKU2 ['c_Cell1_SKU2', 'c_Cell2_SKU2', 'c_Cell3_SKU2']
SKU3 ['c_Cell3_SKU3', 'c_Cell4_SKU3']
SKU4 ['c_Cell1_SKU4', 'c_Cell2_SKU4', 'c_Cell4_SKU4']


In [11]:
# ================================
# 2. SKU 경로 정의
# ================================
sku_paths = {
    "SKU1": {"blanking": "Blanking_SKU1_Queue","press": "Press1_Queue"},
    "SKU2": {"blanking": "Blanking_SKU2_Queue","press": "Press2_Queue"},
    "SKU3": {"blanking": "Blanking_SKU3_Queue","press": "Press3_Queue"},
    "SKU4": {"blanking": "Blanking_SKU4_Queue","press": "Press4_Queue"},
}

# ================================
# 3. Cell → SKU 비율 기반 분배
# ================================
sku_cell_vals = {sku: pd.Series(0, index=df.index) for sku in sku_paths}

for cell_num in range(1, 5):  # Cell1 ~ Cell4
    cell_col = f"Cell{cell_num}_Queue"
    
    if cell_col not in df.columns:
        continue
    
    # 해당 Cell Queue 값
    cell_vals = df[cell_col]
    
    for sku in sku_paths.keys():
        ratio_col = f"c_Cell{cell_num}_{sku}"
        if ratio_col in df.columns:
            # SKU별 Cell Queue = Cell Queue × 비율
            sku_cell_vals[sku] += cell_vals * df[ratio_col]

# ================================
# 4. 결과 DataFrame 구성 (컬럼명 변경)
# ================================
sku_cell_df = pd.DataFrame({
    f"Cell_{sku}_Queue": vals for sku, vals in sku_cell_vals.items()
})

print("=== SKU별로 분배된 Cell Queue (앞부분 10행) ===")
print(sku_cell_df.head(10))

# 원래 df에 붙이고 싶다면:
df = pd.concat([df, sku_cell_df], axis=1)

=== SKU별로 분배된 Cell Queue (앞부분 10행) ===
   Cell_SKU1_Queue  Cell_SKU2_Queue  Cell_SKU3_Queue  Cell_SKU4_Queue
0     1.370939e+06     1.192535e+06     1.247366e+06     8.856453e+05
1     2.451033e+06     1.958747e+06     9.126624e+05     1.182048e+06
2     5.568672e+06     3.817909e+06     9.283182e+05     1.242360e+06
3     1.647598e+06     1.114051e+06     1.090078e+06     1.048065e+06
4     1.553549e+06     1.232362e+06     1.476219e+06     1.102118e+06
5     2.101007e+06     1.931448e+06     1.167199e+06     1.028449e+06
6     1.851074e+06     1.886183e+06     6.287604e+05     8.966812e+05
7     2.972045e+06     2.248654e+06     1.170633e+06     1.487739e+06
8     5.720726e+06     4.591313e+06     9.949638e+05     1.545606e+06
9     5.471222e+06     3.388148e+06     7.908985e+05     1.160693e+06


In [12]:
# ================================
# 7. SKU별 병목 공정 예측
# ================================
sku_paths = {
    "SKU1": {"blanking": "Blanking_SKU1_Queue","press": "Press1_Queue", "cell": "Cell_SKU1_Queue"},
    "SKU2": {"blanking": "Blanking_SKU2_Queue","press": "Press2_Queue", "cell": "Cell_SKU2_Queue"},
    "SKU3": {"blanking": "Blanking_SKU3_Queue","press": "Press3_Queue", "cell": "Cell_SKU3_Queue"},
    "SKU4": {"blanking": "Blanking_SKU4_Queue","press": "Press4_Queue", "cell": "Cell_SKU4_Queue"},
}

for sku, paths in sku_paths.items():
    # 각 SKU별 병목 후보 Stage 값 모으기
    stage_df = df[[paths["blanking"], paths["press"], paths["cell"]]].copy()
    stage_df.columns = ["Blanking", "Press", "Cell"]  # 축 이름 단순화
    
    # 각 시점별 최대값 컬럼명 → 병목 공정
    df[f"Bottleneck_actual_{sku}"] = stage_df.idxmax(axis=1)

    # 각 시점별 병목 수치 (최대값)
    df[f"Bottleneck_val_{sku}"] = stage_df.max(axis=1)

# ================================
# 8. 결과 확인
# ================================
print(df[[c for c in df.columns if c.startswith("Bottleneck_actual_")]].head())
print(df[[c for c in df.columns if c.startswith("Bottleneck_val_")]].head())

  Bottleneck_actual_Blanking Bottleneck_actual_Press Bottleneck_actual_Cell  \
0        Blanking_SKU2_Queue            Press2_Queue            Cell1_Queue   
1        Blanking_SKU2_Queue            Press1_Queue            Cell1_Queue   
2        Blanking_SKU1_Queue            Press2_Queue            Cell1_Queue   
3        Blanking_SKU4_Queue            Press1_Queue            Cell1_Queue   
4        Blanking_SKU4_Queue            Press1_Queue            Cell1_Queue   

  Bottleneck_actual_SKU1 Bottleneck_actual_SKU2 Bottleneck_actual_SKU3  \
0                   Cell                   Cell                   Cell   
1                   Cell                   Cell                   Cell   
2                   Cell                   Cell                   Cell   
3                   Cell                   Cell                   Cell   
4                   Cell                   Cell                   Cell   

  Bottleneck_actual_SKU4  
0                   Cell  
1                   Cell  

# 랜덤 분할 - Train 70 : Val 15 : Test 15

In [13]:
# from sklearn.model_selection import train_test_split

# train_val_df, test_df = train_test_split(df, test_size=0.15, random_state=42, shuffle=True)
# train_df, val_df = train_test_split(train_val_df, test_size=0.1765, random_state=42, shuffle=True)
# # 0.1765 ≈ 15% / 85%

# print(f"Train: {train_df.shape}, Val: {val_df.shape}, Test: {test_df.shape}")

In [14]:
# # 저장 경로 지정 (원하는 폴더로 바꾸세요)
# train_path = "./bottleneck_train.csv"
# val_path   = "./bottleneck_val.csv"
# test_path  = "./bottleneck_test.csv"

# # CSV 저장 (인덱스 제외)
# train_df.to_csv(train_path, index=False, encoding="utf-8-sig")
# val_df.to_csv(val_path, index=False, encoding="utf-8-sig")
# test_df.to_csv(test_path, index=False, encoding="utf-8-sig")

# print("✅ CSV 저장 완료")
# print(f"Train → {train_path} ({train_df.shape})")
# print(f"Val   → {val_path} ({val_df.shape})")
# print(f"Test  → {test_path} ({test_df.shape})")

In [15]:
import pandas as pd

# 경로 지정
train_path = "./bottleneck_train.csv"
val_path   = "./bottleneck_val.csv"
test_path  = "./bottleneck_test.csv"

# 불러오기
train_df = pd.read_csv(train_path)
val_df   = pd.read_csv(val_path)
test_df  = pd.read_csv(test_path)

# 하나로 합치고 싶으면
df = pd.concat([train_df, val_df, test_df], axis=0).reset_index(drop=True)

print("✅ 불러오기 완료")
print("Train:", train_df.shape, "Val:", val_df.shape, "Test:", test_df.shape)
print("전체 df:", df.shape)

✅ 불러오기 완료
Train: (92867, 76) Val: (19905, 76) Test: (19901, 76)
전체 df: (132673, 76)


# 독립변수 분리
- 전체 기준 병목 예측
    - 분류 : Bottleneck_actual
    - 회귀 : Bottleneck_actual_val 
- 공정 단계 기준 병목 예측
    - 분류 : Bottleneck_actual_Blanking , Bottleneck_actual_Press , Bottleneck_actual_Cell
    - 회귀 : Bottleneck_val_Blanking , Bottleneck_val_Press , Bottleneck_val_Cell 
- 제품군(SKU) 기준 병목 예측
    - 분류 : Bottleneck_actual_SKU1 , Bottleneck_actual_SKU2 , Bottleneck_actual_SKU3 , Bottleneck_actual_SKU4
    - 회귀 : Bottleneck_val_SKU1 , Bottleneck_val_SKU2 , Bottleneck_val_SKU3, Bottleneck_val_SKU4

In [16]:
# 타깃 후보 컬럼 전부 정리
target_cols = [
    # 전체 기준
    "Bottleneck_actual", "Bottleneck_val",
    # 공정 단계 기준
    "Bottleneck_actual_Blanking", "Bottleneck_actual_Press", "Bottleneck_actual_Cell",
    "Bottleneck_val_Blanking", "Bottleneck_val_Press", "Bottleneck_val_Cell",
    # SKU 기준
    "Bottleneck_actual_SKU1", "Bottleneck_actual_SKU2", "Bottleneck_actual_SKU3", "Bottleneck_actual_SKU4",
    "Bottleneck_val_SKU1", "Bottleneck_val_SKU2", "Bottleneck_val_SKU3", "Bottleneck_val_SKU4"
]

# 독립변수(X)는 이 타깃 후보들을 제외한 것들만
X_train = train_df.drop(columns=[c for c in target_cols if c in train_df.columns])
X_val   = val_df.drop(columns=[c for c in target_cols if c in val_df.columns])
X_test  = test_df.drop(columns=[c for c in target_cols if c in test_df.columns])

print("✅ 독립변수 컬럼 수:", X_train.shape[1])
print("예시 컬럼:", X_train.columns[:10].tolist())

✅ 독립변수 컬럼 수: 60
예시 컬럼: ['Blanking_Util', 'Blanking_SKU1_Queue', 'Blanking_SKU2_Queue', 'Blanking_SKU3_Queue', 'Blanking_SKU4_Queue', 'Press1_Util', 'Press2_Util', 'Press3_Util', 'Press4_Util', 'Press1_Queue']


# 전체 기준 병목 예측 모델링
- 분류 : Bottleneck_actual
- 회귀 : Bottleneck_actual_val 

### 추가 처리
- 트리 기반 모델(RandomForest, XGB) → 그대로 둠 (스케일링 불필요)
- Logistic / Ridge / MLP → StandardScaler 적용
- MLP → PyTorch 기반 GPU 학습

In [18]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, balanced_accuracy_score
from sklearn.model_selection import train_test_split

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ----- 타깃 준비 -----
y_train_cls = train_df["Bottleneck_actual"]
y_val_cls   = val_df["Bottleneck_actual"]
y_test_cls  = test_df["Bottleneck_actual"]

le = LabelEncoder()
y_train_cls = le.fit_transform(y_train_cls)
y_val_cls   = le.transform(y_val_cls)
y_test_cls  = le.transform(y_test_cls)

# ==================================================
# 1) Logistic Regression 튜닝
# ==================================================
log_reg = LogisticRegression(max_iter=5000, solver="saga")

log_params = {
    "logisticregression__C": [0.01, 0.1, 1, 10],
    "logisticregression__penalty": ["l1", "l2"],
    "logisticregression__class_weight": [None, "balanced"]
}

grid_log = GridSearchCV(
    make_pipeline(StandardScaler(), log_reg),
    param_grid=log_params,
    cv=3,
    scoring="f1_macro",
    n_jobs=-1,
    verbose=2
)
grid_log.fit(X_train, y_train_cls)

best_log_model = grid_log.best_estimator_
print("✅ Best Logistic Params:", grid_log.best_params_)

# ==================================================
# 2) XGBoost 튜닝
# ==================================================
xgb = XGBClassifier(tree_method="hist", random_state=42, use_label_encoder=False, eval_metric="mlogloss")

xgb_params = {
    "n_estimators": [200, 500],
    "max_depth": [3, 6, 10],
    "learning_rate": [0.01, 0.1, 0.2],
    "subsample": [0.8, 1.0],
    "colsample_bytree": [0.8, 1.0],
    "scale_pos_weight": [1, 2, 5]  # 불균형 대응
}

grid_xgb = GridSearchCV(
    xgb,
    param_grid=xgb_params,
    cv=3,
    scoring="f1_macro",
    n_jobs=-1,
    verbose=2
)
grid_xgb.fit(X_train, y_train_cls)

best_xgb_model = grid_xgb.best_estimator_
print("✅ Best XGBoost Params:", grid_xgb.best_params_)

# ==================================================
# 3) TorchMLP 튜닝 (수동 루프 기반)
# ==================================================
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled   = scaler.transform(X_val)
X_test_scaled  = scaler.transform(X_test)

# Dataset & DataLoader
class NumpyDataset(torch.utils.data.Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.long)
    def __len__(self): return len(self.X)
    def __getitem__(self, idx): return self.X[idx], self.y[idx]

train_loader = torch.utils.data.DataLoader(NumpyDataset(X_train_scaled, y_train_cls), batch_size=256, shuffle=True)
val_loader   = torch.utils.data.DataLoader(NumpyDataset(X_val_scaled, y_val_cls), batch_size=256, shuffle=False)
test_tensor  = torch.tensor(X_test_scaled, dtype=torch.float32).to(device)

# 모델 정의
class TorchMLPClassifier(nn.Module):
    def __init__(self, input_dim, num_classes, hidden_dim=128, dropout=0.3):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, hidden_dim // 2),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim // 2, num_classes)
        )
    def forward(self, x): return self.net(x)

def train_mlp(train_loader, val_loader, input_dim, num_classes, hidden_dim=128, dropout=0.3, lr=1e-3, epochs=50, patience=5):
    model = TorchMLPClassifier(input_dim, num_classes, hidden_dim, dropout).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-4)

    best_val_loss = np.inf
    patience_cnt = 0
    for epoch in range(1, epochs+1):
        # Train
        model.train()
        train_loss = 0
        for xb, yb in train_loader:
            xb, yb = xb.to(device), yb.to(device)
            optimizer.zero_grad()
            out = model(xb)
            loss = criterion(out, yb)
            loss.backward()
            optimizer.step()
            train_loss += loss.item() * len(xb)
        train_loss /= len(train_loader.dataset)

        # Validation
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for xb, yb in val_loader:
                xb, yb = xb.to(device), yb.to(device)
                out = model(xb)
                loss = criterion(out, yb)
                val_loss += loss.item() * len(xb)
        val_loss /= len(val_loader.dataset)

        print(f"[Epoch {epoch}] Train={train_loss:.4f}, Val={val_loss:.4f}")

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_state = model.state_dict()
            patience_cnt = 0
        else:
            patience_cnt += 1
            if patience_cnt >= patience:
                print("⏹ Early stopping triggered")
                break

    model.load_state_dict(best_state)
    return model

# 간단한 튜닝 루프
mlp_param_grid = [
    {"hidden_dim": 128, "dropout": 0.3, "lr": 1e-3},
    {"hidden_dim": 256, "dropout": 0.4, "lr": 5e-4},
    {"hidden_dim": 256, "dropout": 0.5, "lr": 1e-4}
]

best_score = -np.inf
best_mlp_params = None
best_mlp_model = None

for params in mlp_param_grid:
    print(f"\n=== TorchMLP trial: {params} ===")
    model = train_mlp(train_loader, val_loader, X_train.shape[1], len(le.classes_),
                      hidden_dim=params["hidden_dim"], dropout=params["dropout"], lr=params["lr"],
                      epochs=50)
    # 검증 성능 측정
    model.eval()
    with torch.no_grad():
        logits = model(torch.tensor(X_val_scaled, dtype=torch.float32).to(device))
        y_val_pred = torch.argmax(logits, axis=1).cpu().numpy()
    f1_val = f1_score(y_val_cls, y_val_pred, average="macro")
    print(f"Validation F1={f1_val:.3f}")

    if f1_val > best_score:
        best_score = f1_val
        best_mlp_model = model
        best_mlp_params = params

print("✅ Best TorchMLP Params:", best_mlp_params)

# ==================================================
# 최종 테스트 성능 비교
# ==================================================
models = {
    "Logistic": best_log_model,
    "XGBoost": best_xgb_model,
    "TorchMLP": best_mlp_model
}

results = []
for name, model in models.items():
    if name == "TorchMLP":
        model.eval()
        with torch.no_grad():
            logits = model(test_tensor)
            y_pred = torch.argmax(logits, axis=1).cpu().numpy()
    else:
        y_pred = model.predict(X_test)

    acc = accuracy_score(y_test_cls, y_pred)
    f1 = f1_score(y_test_cls, y_pred, average="macro")
    bal_acc = balanced_accuracy_score(y_test_cls, y_pred)
    results.append({"name": name, "Acc": acc, "F1": f1, "BalAcc": bal_acc})
    print(f"{name} | Acc={acc:.3f}, F1={f1:.3f}, BalAcc={bal_acc:.3f}")

results_df = pd.DataFrame(results)
results_df.to_csv("classification_tuned_results.csv", index=False, encoding="utf-8-sig")
print("✅ Tuned results saved to classification_tuned_results.csv")

Fitting 3 folds for each of 16 candidates, totalling 48 fits
✅ Best Logistic Params: {'logisticregression__C': 0.1, 'logisticregression__class_weight': 'balanced', 'logisticregression__penalty': 'l1'}
Fitting 3 folds for each of 216 candidates, totalling 648 fits


Parameters: { "scale_pos_weight", "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


✅ Best XGBoost Params: {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 500, 'scale_pos_weight': 1, 'subsample': 1.0}

=== TorchMLP trial: {'hidden_dim': 128, 'dropout': 0.3, 'lr': 0.001} ===
[Epoch 1] Train=0.2735, Val=0.0883
[Epoch 2] Train=0.0878, Val=0.0585
[Epoch 3] Train=0.0652, Val=0.0512
[Epoch 4] Train=0.0566, Val=0.0425
[Epoch 5] Train=0.0469, Val=0.0384
[Epoch 6] Train=0.0434, Val=0.0363
[Epoch 7] Train=0.0414, Val=0.0338
[Epoch 8] Train=0.0373, Val=0.0427
[Epoch 9] Train=0.0355, Val=0.0279
[Epoch 10] Train=0.0344, Val=0.0272
[Epoch 11] Train=0.0321, Val=0.0274
[Epoch 12] Train=0.0301, Val=0.0280
[Epoch 13] Train=0.0288, Val=0.0240
[Epoch 14] Train=0.0273, Val=0.0287
[Epoch 15] Train=0.0271, Val=0.0254
[Epoch 16] Train=0.0268, Val=0.0251
[Epoch 17] Train=0.0258, Val=0.0256
[Epoch 18] Train=0.0257, Val=0.0215
[Epoch 19] Train=0.0251, Val=0.0255
[Epoch 20] Train=0.0247, Val=0.0214
[Epoch 21] Train=0.0238, Val=0.0195
[Epoch 22] Train=0.0230, Val=0

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np
import pandas as pd
import joblib

# ----- 타깃 준비 -----
y_train_reg = train_df["Bottleneck_val"]
y_val_reg   = val_df["Bottleneck_val"]
y_test_reg  = test_df["Bottleneck_val"]

# ----- 학습/검증 합쳐서 튜닝 데이터로 사용 -----
X_train_full = pd.concat([X_train, X_val])
y_train_full = np.concatenate([y_train_reg.values, y_val_reg.values])

# ----- 랜덤포레스트 + 그리드서치 -----
rf = RandomForestRegressor(random_state=42, n_jobs=-1)

param_grid = {
    "n_estimators": [200, 500, 1000],
    "max_depth": [None, 10, 20, 50],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 5]
}

grid = GridSearchCV(
    rf,
    param_grid=param_grid,
    cv=3,
    scoring="r2",
    n_jobs=-1,
    verbose=2
)

print("🔍 RandomForest Hyperparameter Tuning...")
grid.fit(X_train_full, y_train_full)

print("✅ Best Params:", grid.best_params_)
print("✅ Best CV R2:", grid.best_score_)

# ----- 최적 모델로 테스트 -----
best_rf = grid.best_estimator_
y_pred = best_rf.predict(X_test)

mae = mean_absolute_error(y_test_reg, y_pred)
rmse = np.sqrt(mean_squared_error(y_test_reg, y_pred))
r2 = r2_score(y_test_reg, y_pred)

print(f"🎯 RandomForest Tuned | MAE={mae:.3f} | RMSE={rmse:.3f} | R2={r2:.3f}")

# ----- 모델 저장 -----
joblib.dump(best_rf, "best_model_reg_RandomForest.pkl")
print("💾 Saved tuned RandomForest model: best_model_reg_RandomForest.pkl")

🔍 RandomForest Hyperparameter Tuning...
Fitting 3 folds for each of 108 candidates, totalling 324 fits
