# 파일 불러오기

In [1]:
import pandas as pd

# Load the CSV file
file_path = "./subset_20240101_20250630_fullcols.csv"
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,Time_Now,Blanking_Util,Blanking_SKU1_Queue,Blanking_SKU2_Queue,Blanking_SKU3_Queue,Blanking_SKU4_Queue,Press1_Util,Press2_Util,Press3_Util,Press4_Util,...,SKU3_NVA_Time,SKU3_Transport_Time,SKU3_Wait_Time,SKU3_Other_Time,SKU4_VA_Time,SKU4_NVA_Time,SKU4_Transport_Time,SKU4_Wait_Time,SKU4_Other_Time,Blanking_Queue
0,2024-01-01,0.846367,0.045715,0.056373,0.055737,0.035849,0.410297,0.434561,0.481388,0.399992,...,0,0.537617,0.45365,0,1.523338,0,0.536243,0.473453,0,58.361452
1,2024-01-02,0.851097,0.051937,0.052934,0.038512,0.042248,0.455471,0.454445,0.387975,0.442986,...,0,0.536764,0.473677,0,1.523344,0,0.534992,0.46438,0,62.830599
2,2024-01-03,0.846115,0.05221,0.047499,0.043181,0.040979,0.496717,0.450816,0.417308,0.352829,...,0,0.535925,0.42409,0,1.523403,0,0.535077,0.47533,0,59.365867
3,2024-01-04,0.841306,0.051769,0.035436,0.046788,0.052362,0.433749,0.363004,0.443909,0.456036,...,0,0.535232,0.430992,0,1.523381,0,0.533233,0.463801,0,56.698528
4,2024-01-05,0.859599,0.045874,0.046802,0.044507,0.05721,0.418329,0.396826,0.499273,0.472454,...,0,0.538142,0.502614,0,1.523363,0,0.537223,0.44932,0,65.784631


# Cell Queue : 다 0 값. <- Warehouse Queue 값 입력. (넘버링 매칭 필요)

In [2]:
# =========================
# 1) Cell / Warehouse Queue 컬럼 찾기
# =========================
cell_cols = [c for c in df.columns if "cell" in c.lower() and "queue" in c.lower()]
warehouse_cols = [c for c in df.columns if "warehouse" in c.lower() and "queue" in c.lower()]

print("Cell Queue cols:", cell_cols)
print("Warehouse Queue cols:", warehouse_cols)

# =========================
# 2) 번호 매칭: Cell1 <-> Warehouse1
# =========================
for cell_col in cell_cols:
    # 번호 추출 (예: Cell1_Queue → 1)
    num = ''.join(filter(str.isdigit, cell_col))
    # 매칭되는 warehouse 컬럼 찾기
    candidates = [w for w in warehouse_cols if num and num in w]
    if candidates:
        warehouse_col = candidates[0]
        # Cell 값이 0이면 warehouse 값으로 대체
        df[cell_col] = df[cell_col].where(df[cell_col] != 0, df[warehouse_col])

# 확인
print()
print(df[cell_cols].head())
print()
print(df[warehouse_cols].head())

Cell Queue cols: ['Cell1_Queue', 'Cell2_Queue', 'Cell3_Queue', 'Cell4_Queue']
Warehouse Queue cols: ['Warehouse1_Queue', 'Warehouse_2_Queue', 'Warehouse_3_Queue', 'Warehouse_4_Queue']

   Cell1_Queue  Cell2_Queue  Cell3_Queue  Cell4_Queue
0    98.155568    19.580277    77.580864    75.124986
1   162.632374    18.164794    64.827578    69.374905
2   335.401556    20.369416    46.120108    71.720194
3   112.173074    16.248254    58.988737    77.579531
4   107.982827    17.036205   100.202849    81.049638

   Warehouse1_Queue  Warehouse_2_Queue  Warehouse_3_Queue  Warehouse_4_Queue
0         98.155568          19.580277          77.580864          75.124986
1        162.632374          18.164794          64.827578          69.374905
2        335.401556          20.369416          46.120108          71.720194
3        112.173074          16.248254          58.988737          77.579531
4        107.982827          17.036205         100.202849          81.049638


# Warehouse Queue를 다 0으로 만들기

In [3]:
# =========================
# 3) Warehouse Queue 모두 0으로 초기화
# =========================
df[warehouse_cols] = 0

# =========================
# 4) 확인
# =========================
print(df[cell_cols].head())
print()
print(df[warehouse_cols].head())

   Cell1_Queue  Cell2_Queue  Cell3_Queue  Cell4_Queue
0    98.155568    19.580277    77.580864    75.124986
1   162.632374    18.164794    64.827578    69.374905
2   335.401556    20.369416    46.120108    71.720194
3   112.173074    16.248254    58.988737    77.579531
4   107.982827    17.036205   100.202849    81.049638

   Warehouse1_Queue  Warehouse_2_Queue  Warehouse_3_Queue  Warehouse_4_Queue
0                 0                  0                  0                  0
1                 0                  0                  0                  0
2                 0                  0                  0                  0
3                 0                  0                  0                  0
4                 0                  0                  0                  0


# 병목 파악 후 새로운 컬럼에 병목 정보 추가

In [4]:
# 숫자형 컬럼만 선택
numeric_cols = df.select_dtypes(include=["number"]).columns

queue_cols = [col for col in df.columns if "Queue" in col]  # queue 컬럼만 추출
df["Bottleneck_actual"] = df[queue_cols].idxmax(axis=1)

# 결과 확인
print(df[["Time_Now", "Bottleneck_actual"]].tail())

       Time_Now        Bottleneck_actual
542  2025-06-26              Cell1_Queue
543  2025-06-27              Cell1_Queue
544  2025-06-28  Forklift_Blanking_Queue
545  2025-06-29              Cell1_Queue
546  2025-06-30              Cell1_Queue


In [5]:
# Bottleneck 컬럼의 빈도 계산
bottleneck_counts = df["Bottleneck_actual"].value_counts().head(5)

bottleneck_counts.head()

Bottleneck_actual
Cell1_Queue                336
Forklift_Blanking_Queue    204
Cell3_Queue                  7
Name: count, dtype: int64

# (레이더 차트 표현 위해) 각 queue에 대한 % 정보 추가
- 100% 기준 : 각 컬럼별 최댓값 (컬럼마다 개별적으로 정규화되는 구조)

In [6]:
# =========================
# 1) Queue 컬럼 찾기
# =========================
queue_cols = [c for c in df.columns if "queue" in c.lower()]

# =========================
# 2) 각 Queue 컬럼별 최대값 계산
# =========================
max_vals = df[queue_cols].max()

# =========================
# 3) 퍼센트 컬럼 생성
# =========================
for col in queue_cols:
    max_val = max_vals[col]
    if max_val > 0:
        df[col + "_Percent"] = df[col] / max_val * 100
    else:
        df[col + "_Percent"] = 0

# =========================
# 4) 확인
# =========================
print(df[[c for c in df.columns if c.endswith("_Percent")]].head())

   Blanking_SKU1_Queue_Percent  Blanking_SKU2_Queue_Percent  \
0                    67.871724                    80.814554   
1                    77.109346                    75.884512   
2                    77.514661                    68.093067   
3                    76.859921                    50.799931   
4                    68.107787                    67.093870   

   Blanking_SKU3_Queue_Percent  Blanking_SKU4_Queue_Percent  \
0                    82.562325                    53.886392   
1                    57.047209                    63.505043   
2                    63.963323                    61.597547   
3                    69.306315                    78.707893   
4                    65.927506                    85.995160   

   Press1_Queue_Percent  Press2_Queue_Percent  Press3_Queue_Percent  \
0             40.234508             57.483046             46.894637   
1             53.208918             46.260348             43.647717   
2             47.879128      

# 조립셀 Queue 값 -> SKU별로 분배 필요
- 각 Cell Queue 전체량을 SKU별 담당 비율에 따라 나눠서, SKU별 Cell Queue를 만든다.
- 그리고 SKU별로 여러 Cell에서 받은 몫을 합쳐 최종 SKU Cell Queue를 계산한다.

In [7]:
# ================================
# 2. SKU 경로 정의
# ================================
sku_paths = {
    "SKU1": {"blanking": "Blanking_SKU1_Queue","press": "Press1_Queue"},
    "SKU2": {"blanking": "Blanking_SKU2_Queue","press": "Press2_Queue"},
    "SKU3": {"blanking": "Blanking_SKU3_Queue","press": "Press3_Queue"},
    "SKU4": {"blanking": "Blanking_SKU4_Queue","press": "Press4_Queue"},
}

# ================================
# 3. Cell → SKU 비율 기반 분배
# ================================
sku_cell_vals = {sku: pd.Series(0, index=df.index) for sku in sku_paths}

for cell_num in range(1, 5):  # Cell1 ~ Cell4
    cell_col = f"Cell{cell_num}_Queue"
    
    if cell_col not in df.columns:
        continue
    
    # 해당 Cell Queue 값
    cell_vals = df[cell_col]
    
    for sku in sku_paths.keys():
        ratio_col = f"c_Cell{cell_num}_{sku}"
        if ratio_col in df.columns:
            # SKU별 Cell Queue = Cell Queue × 비율
            sku_cell_vals[sku] += cell_vals * df[ratio_col]

# ================================
# 4. 결과 DataFrame 구성 (컬럼명 변경)
# ================================
sku_cell_df = pd.DataFrame({
    f"Cell_{sku}_Queue": vals for sku, vals in sku_cell_vals.items()
})

print("=== SKU별로 분배된 Cell Queue (앞부분 10행) ===")
print(sku_cell_df.head(10))

# 원래 df에 붙이고 싶다면:
df = pd.concat([df, sku_cell_df], axis=1)

=== SKU별로 분배된 Cell Queue (앞부분 10행) ===
   Cell_SKU1_Queue  Cell_SKU2_Queue  Cell_SKU3_Queue  Cell_SKU4_Queue
0     1.370939e+06     1.192535e+06     1.247366e+06     8.856453e+05
1     2.451033e+06     1.958747e+06     9.126624e+05     1.182048e+06
2     5.568672e+06     3.817909e+06     9.283182e+05     1.242360e+06
3     1.647598e+06     1.114051e+06     1.090078e+06     1.048065e+06
4     1.553549e+06     1.232362e+06     1.476219e+06     1.102118e+06
5     2.101007e+06     1.931448e+06     1.167199e+06     1.028449e+06
6     1.851074e+06     1.886183e+06     6.287604e+05     8.966812e+05
7     2.972045e+06     2.248654e+06     1.170633e+06     1.487739e+06
8     5.720726e+06     4.591313e+06     9.949638e+05     1.545606e+06
9     5.471222e+06     3.388148e+06     7.908985e+05     1.160693e+06


In [8]:
sku_paths = {
    "SKU1": {"blanking": "Blanking_SKU1_Queue","press": "Press1_Queue", "Cell": "Cell_SKU1_Queue"},
    "SKU2": {"blanking": "Blanking_SKU2_Queue","press": "Press2_Queue", "Cell": "Cell_SKU2_Queue"},
    "SKU3": {"blanking": "Blanking_SKU3_Queue","press": "Press3_Queue", "Cell": "Cell_SKU3_Queue"},
    "SKU4": {"blanking": "Blanking_SKU4_Queue","press": "Press4_Queue", "Cell": "Cell_SKU4_Queue"},
}

# csv로 1차 저장

In [9]:
output_path = "./processed_bottleneck_results_01.csv"
df.to_csv(output_path, index=False, encoding="utf-8-sig")

print(f"✅ 저장 완료: {output_path}")

✅ 저장 완료: ./processed_bottleneck_results_01.csv


# 1. 날짜별 병목 공정 예측

날짜별 병목 공정 예측 (Time-based split)

In [10]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# =========================
# 1. 날짜 컬럼 처리
# =========================
df["Time_Now"] = pd.to_datetime(df["Time_Now"], errors="coerce")

# 예측 기준일: 예를 들어 2025-06-27 하루를 예측한다고 가정
target_date = pd.Timestamp("2025-06-27")

# =========================
# 2. Feature & Label 정의
# =========================
X = df[[c for c in df.columns if "Queue" in c and not c.endswith("_Percent")]]
y = df["Bottleneck_actual"]

# =========================
# 3. Train / Predict split (날짜 기준)
# =========================
train_mask = df["Time_Now"] < target_date   # 전날까지 학습
test_mask  = df["Time_Now"] == target_date  # 당일 예측

X_train, y_train = X[train_mask], y[train_mask]
X_test,  y_test  = X[test_mask], y[test_mask]

# =========================
# 4. 모델 학습
# =========================
model = RandomForestClassifier(n_estimators=200, random_state=42)
model.fit(X_train, y_train)

# =========================
# 5. 당일 데이터 예측
# =========================
df.loc[test_mask, "Bottleneck_pred"] = model.predict(X_test)

# =========================
# 6. 성능 평가
# =========================
print("=== Train 성능 (전날까지) ===")
print(classification_report(y_train, model.predict(X_train)))

print("=== Test 성능 (당일) ===")
print(classification_report(y_test, df.loc[test_mask, "Bottleneck_pred"]))

=== Train 성능 (전날까지) ===
                         precision    recall  f1-score   support

            Cell1_Queue       1.00      1.00      1.00       333
            Cell3_Queue       1.00      1.00      1.00         7
Forklift_Blanking_Queue       1.00      1.00      1.00       203

               accuracy                           1.00       543
              macro avg       1.00      1.00      1.00       543
           weighted avg       1.00      1.00      1.00       543

=== Test 성능 (당일) ===
              precision    recall  f1-score   support

 Cell1_Queue       1.00      1.00      1.00         1

    accuracy                           1.00         1
   macro avg       1.00      1.00      1.00         1
weighted avg       1.00      1.00      1.00         1



In [11]:
df["Bottleneck_pred"] = model.predict(X)
df[["Time_Now", "Bottleneck_actual", "Bottleneck_pred"]].head()

Unnamed: 0,Time_Now,Bottleneck_actual,Bottleneck_pred
0,2024-01-01,Forklift_Blanking_Queue,Forklift_Blanking_Queue
1,2024-01-02,Forklift_Blanking_Queue,Forklift_Blanking_Queue
2,2024-01-03,Cell1_Queue,Cell1_Queue
3,2024-01-04,Forklift_Blanking_Queue,Forklift_Blanking_Queue
4,2024-01-05,Forklift_Blanking_Queue,Forklift_Blanking_Queue


과거 7일치 데이터로 학습 → 앞으로 4일치 미리 예측

In [12]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# =========================
# 1. 날짜 컬럼 처리
# =========================
df["Time_Now"] = pd.to_datetime(df["Time_Now"], errors="coerce")

# =========================
# 2. Feature & Label 정의
# =========================
X_all = df[[c for c in df.columns if "Queue" in c and not c.endswith("_Percent")]]
y_all = df["Bottleneck_actual"]

# =========================
# 3. 기준일 이후 4일 예측 설정
# =========================
start_date = pd.Timestamp("2025-06-27")
future_days = pd.date_range(start=start_date, periods=4, freq="D")  # 6/27 ~ 6/30

# =========================
# 4. Rolling 방식 (최근 7일만 학습)
# =========================
model = RandomForestClassifier(n_estimators=200, random_state=42)

for day in future_days:
    # 학습 기간: 예측일 기준 직전 7일
    train_start = day - pd.Timedelta(days=7)
    train_end   = day - pd.Timedelta(days=1)

    train_mask = (df["Time_Now"] >= train_start) & (df["Time_Now"] <= train_end)
    test_mask  = df["Time_Now"] == day

    if test_mask.sum() == 0:
        print(f"⚠️ {day.date()} 데이터 없음 → 예측 불가")
        continue
    if train_mask.sum() == 0:
        print(f"⚠️ {day.date()} 학습할 데이터 없음 → 예측 불가")
        continue

    X_train, y_train = X_all[train_mask], y_all[train_mask]
    X_test,  y_test  = X_all[test_mask], y_all[test_mask]

    # 모델 학습
    model.fit(X_train, y_train)

    # 예측 수행
    df.loc[test_mask, "Bottleneck_pred"] = model.predict(X_test)

    # 성능 출력
    print(f"\n=== {day.date()} 예측 결과 ===")
    print(f"(학습 데이터 기간: {train_start.date()} ~ {train_end.date()}, {train_mask.sum()}건)")
    print(classification_report(y_test, df.loc[test_mask, "Bottleneck_pred"]))

# =========================
# 5. 결과 확인
# =========================
print(df.loc[df["Time_Now"].between(start_date, future_days[-1]), 
             ["Time_Now", "Bottleneck_actual", "Bottleneck_pred"]])


=== 2025-06-27 예측 결과 ===
(학습 데이터 기간: 2025-06-20 ~ 2025-06-26, 7건)
              precision    recall  f1-score   support

 Cell1_Queue       1.00      1.00      1.00         1

    accuracy                           1.00         1
   macro avg       1.00      1.00      1.00         1
weighted avg       1.00      1.00      1.00         1


=== 2025-06-28 예측 결과 ===
(학습 데이터 기간: 2025-06-21 ~ 2025-06-27, 7건)
                         precision    recall  f1-score   support

Forklift_Blanking_Queue       1.00      1.00      1.00         1

               accuracy                           1.00         1
              macro avg       1.00      1.00      1.00         1
           weighted avg       1.00      1.00      1.00         1


=== 2025-06-29 예측 결과 ===
(학습 데이터 기간: 2025-06-22 ~ 2025-06-28, 7건)
              precision    recall  f1-score   support

 Cell1_Queue       1.00      1.00      1.00         1

    accuracy                           1.00         1
   macro avg       1.00      1.00 

# 2. SKU 단위 병목 예측

SKU별 단순 비교 기반 병목 컬럼 생성
- 각 SKU별로 3개 후보 공정(Blanking / Press / Cell)의 queue 값만 단순 비교해서,
가장 큰 값을 가진 공정을 병목으로 정의

In [13]:
# ================================
# 7. SKU별 병목 공정 예측
# ================================
sku_paths = {
    "SKU1": {"blanking": "Blanking_SKU1_Queue","press": "Press1_Queue", "cell": "Cell_SKU1_Queue"},
    "SKU2": {"blanking": "Blanking_SKU2_Queue","press": "Press2_Queue", "cell": "Cell_SKU2_Queue"},
    "SKU3": {"blanking": "Blanking_SKU3_Queue","press": "Press3_Queue", "cell": "Cell_SKU3_Queue"},
    "SKU4": {"blanking": "Blanking_SKU4_Queue","press": "Press4_Queue", "cell": "Cell_SKU4_Queue"},
}

for sku, paths in sku_paths.items():
    # 각 SKU별 병목 후보 Stage 값 모으기
    stage_df = df[[paths["blanking"], paths["press"], paths["cell"]]].copy()
    stage_df.columns = ["Blanking", "Press", "Cell"]  # 축 이름 단순화
    
    # 각 시점별 최대값 컬럼명 → 병목 공정
    df[f"Bottleneck_actual_{sku}"] = stage_df.idxmax(axis=1)

# ================================
# 8. 결과 확인
# ================================
print(df[[c for c in df.columns if c.startswith("Bottleneck_actual_")]].head())

  Bottleneck_actual_SKU1 Bottleneck_actual_SKU2 Bottleneck_actual_SKU3  \
0                   Cell                   Cell                   Cell   
1                   Cell                   Cell                   Cell   
2                   Cell                   Cell                   Cell   
3                   Cell                   Cell                   Cell   
4                   Cell                   Cell                   Cell   

  Bottleneck_actual_SKU4  
0                   Cell  
1                   Cell  
2                   Cell  
3                   Cell  
4                   Cell  


전체 날짜에 대해 “전날까지 학습 → 오늘 예측” (SKU별)

In [14]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# 날짜 정렬
df = df.sort_values("Time_Now").reset_index(drop=True)

# Feature
X_all = df[[c for c in df.columns if "Queue" in c and not c.endswith("_Percent")]]

sku_paths = {
    "SKU1": {"blanking": "Blanking_SKU1_Queue","press": "Press1_Queue", "cell": "Cell_SKU1_Queue"},
    "SKU2": {"blanking": "Blanking_SKU2_Queue","press": "Press2_Queue", "cell": "Cell_SKU2_Queue"},
    "SKU3": {"blanking": "Blanking_SKU3_Queue","press": "Press3_Queue", "cell": "Cell_SKU3_Queue"},
    "SKU4": {"blanking": "Blanking_SKU4_Queue","press": "Press4_Queue", "cell": "Cell_SKU4_Queue"},
}

for sku, paths in sku_paths.items():
    # 실제 병목 라벨 생성
    stage_df = df[[paths["blanking"], paths["press"], paths["cell"]]].copy()
    stage_df.columns = ["Blanking", "Press", "Cell"]
    y_all = stage_df.idxmax(axis=1)
    df[f"Bottleneck_actual_{sku}"] = y_all

    preds = [None]  # 첫날은 예측 불가
    for i in range(1, len(df)):
        train_mask = df.index < i
        test_mask  = df.index == i

        X_train, y_train = X_all[train_mask], y_all[train_mask]
        X_test,  y_test  = X_all[test_mask], y_all[test_mask]

        if y_train.nunique() < 2:
            preds.append(y_train.iloc[-1])
            continue

        clf = RandomForestClassifier(n_estimators=200, random_state=42)
        clf.fit(X_train, y_train)

        y_pred_test = clf.predict(X_test)[0]
        preds.append(y_pred_test)

    df[f"Bottleneck_pred_{sku}"] = preds

    # 📌 전체 기간 기준 성능 지표 출력 (첫날 제외)
    mask_valid = df[f"Bottleneck_pred_{sku}"].notna()
    print(f"\n=== {sku} 전체 기간 예측 성능 ===")
    print(classification_report(
        df.loc[mask_valid, f"Bottleneck_actual_{sku}"],
        df.loc[mask_valid, f"Bottleneck_pred_{sku}"]
    ))


=== SKU1 전체 기간 예측 성능 ===
              precision    recall  f1-score   support

        Cell       1.00      1.00      1.00       546

    accuracy                           1.00       546
   macro avg       1.00      1.00      1.00       546
weighted avg       1.00      1.00      1.00       546


=== SKU2 전체 기간 예측 성능 ===
              precision    recall  f1-score   support

        Cell       1.00      1.00      1.00       546

    accuracy                           1.00       546
   macro avg       1.00      1.00      1.00       546
weighted avg       1.00      1.00      1.00       546


=== SKU3 전체 기간 예측 성능 ===
              precision    recall  f1-score   support

        Cell       1.00      1.00      1.00       546

    accuracy                           1.00       546
   macro avg       1.00      1.00      1.00       546
weighted avg       1.00      1.00      1.00       546


=== SKU4 전체 기간 예측 성능 ===
              precision    recall  f1-score   support

        Cell       1.0

In [15]:
# ================================
# 결과 확인 (앞부분)
# ================================
print(df[[
    "Time_Now",
    "Bottleneck_actual_SKU1", "Bottleneck_pred_SKU1",
    "Bottleneck_actual_SKU2", "Bottleneck_pred_SKU2",
    "Bottleneck_actual_SKU3", "Bottleneck_pred_SKU3",
    "Bottleneck_actual_SKU4", "Bottleneck_pred_SKU4"
]].head(20))

     Time_Now Bottleneck_actual_SKU1 Bottleneck_pred_SKU1  \
0  2024-01-01                   Cell                 None   
1  2024-01-02                   Cell                 Cell   
2  2024-01-03                   Cell                 Cell   
3  2024-01-04                   Cell                 Cell   
4  2024-01-05                   Cell                 Cell   
5  2024-01-06                   Cell                 Cell   
6  2024-01-07                   Cell                 Cell   
7  2024-01-08                   Cell                 Cell   
8  2024-01-09                   Cell                 Cell   
9  2024-01-10                   Cell                 Cell   
10 2024-01-11                   Cell                 Cell   
11 2024-01-12                   Cell                 Cell   
12 2024-01-13                   Cell                 Cell   
13 2024-01-14                   Cell                 Cell   
14 2024-01-15                   Cell                 Cell   
15 2024-01-16           

# 3. 공정 그룹별 병목 예측

- 공정별 실제 병목 번호 레이블 생성

In [16]:
# ================================
# 11. 공정별 실제 병목 번호 레이블 생성
# ================================
stage_groups = {
    "Blanking": ["Blanking_SKU1_Queue", "Blanking_SKU2_Queue",
                 "Blanking_SKU3_Queue", "Blanking_SKU4_Queue"],
    "Press":    ["Press1_Queue", "Press2_Queue", "Press3_Queue", "Press4_Queue"],
    "Cell":     ["Cell_SKU1_Queue", "Cell_SKU2_Queue", "Cell_SKU3_Queue", "Cell_SKU4_Queue"]
}

for stage, cols in stage_groups.items():
    df[f"Bottleneck_actual_{stage}"] = df[cols].idxmax(axis=1)

print(df[[c for c in df.columns if c.startswith("Bottleneck_actual_")]].head())

  Bottleneck_actual_SKU1 Bottleneck_actual_SKU2 Bottleneck_actual_SKU3  \
0                   Cell                   Cell                   Cell   
1                   Cell                   Cell                   Cell   
2                   Cell                   Cell                   Cell   
3                   Cell                   Cell                   Cell   
4                   Cell                   Cell                   Cell   

  Bottleneck_actual_SKU4 Bottleneck_actual_Blanking Bottleneck_actual_Press  \
0                   Cell        Blanking_SKU2_Queue            Press2_Queue   
1                   Cell        Blanking_SKU2_Queue            Press1_Queue   
2                   Cell        Blanking_SKU1_Queue            Press2_Queue   
3                   Cell        Blanking_SKU4_Queue            Press1_Queue   
4                   Cell        Blanking_SKU4_Queue            Press1_Queue   

  Bottleneck_actual_Cell  
0        Cell_SKU1_Queue  
1        Cell_SKU1_Queue  

공정 그룹별 병목 예측 (pred) + 성능 평가

In [17]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

df = df.sort_values("Time_Now").reset_index(drop=True)
X_all = df[[c for c in df.columns if "Queue" in c and not c.endswith("_Percent")]]

for stage, cols in stage_groups.items():
    y_all = df[f"Bottleneck_actual_{stage}"]  # 위 셀에서 만든 actual 라벨 사용
    preds = [None]

    for i in range(1, len(df)):
        train_mask = df.index < i
        test_mask  = df.index == i

        X_train, y_train = X_all[train_mask], y_all[train_mask]
        X_test,  y_test  = X_all[test_mask], y_all[test_mask]

        if y_train.nunique() < 2:
            preds.append(y_train.iloc[-1])
            continue

        clf = RandomForestClassifier(n_estimators=200, random_state=42)
        clf.fit(X_train, y_train)

        preds.append(clf.predict(X_test)[0])

    df[f"Bottleneck_pred_{stage}"] = preds

    mask_valid = df[f"Bottleneck_pred_{stage}"].notna()
    print(f"\n=== {stage} 전체 기간 예측 성능 ===")
    print(classification_report(
        df.loc[mask_valid, f"Bottleneck_actual_{stage}"],
        df.loc[mask_valid, f"Bottleneck_pred_{stage}"]
    ))


=== Blanking 전체 기간 예측 성능 ===
                     precision    recall  f1-score   support

Blanking_SKU1_Queue       0.83      0.81      0.82       129
Blanking_SKU2_Queue       0.82      0.93      0.87       152
Blanking_SKU3_Queue       0.85      0.76      0.80       127
Blanking_SKU4_Queue       0.82      0.80      0.81       138

           accuracy                           0.83       546
          macro avg       0.83      0.83      0.83       546
       weighted avg       0.83      0.83      0.83       546


=== Press 전체 기간 예측 성능 ===
              precision    recall  f1-score   support

Press1_Queue       0.82      0.94      0.88       302
Press2_Queue       0.81      0.83      0.82       149
Press3_Queue       0.86      0.56      0.67        54
Press4_Queue       0.92      0.29      0.44        41

    accuracy                           0.82       546
   macro avg       0.85      0.65      0.70       546
weighted avg       0.83      0.82      0.81       546


=== Cell 전체 기간 예

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [18]:
print(df[[
    "Time_Now",
    "Bottleneck_actual_Blanking", "Bottleneck_pred_Blanking",
    "Bottleneck_actual_Press", "Bottleneck_pred_Press",
    "Bottleneck_actual_Cell", "Bottleneck_pred_Cell"
]].head(20))

     Time_Now Bottleneck_actual_Blanking Bottleneck_pred_Blanking  \
0  2024-01-01        Blanking_SKU2_Queue                     None   
1  2024-01-02        Blanking_SKU2_Queue      Blanking_SKU2_Queue   
2  2024-01-03        Blanking_SKU1_Queue      Blanking_SKU2_Queue   
3  2024-01-04        Blanking_SKU4_Queue      Blanking_SKU2_Queue   
4  2024-01-05        Blanking_SKU4_Queue      Blanking_SKU2_Queue   
5  2024-01-06        Blanking_SKU2_Queue      Blanking_SKU2_Queue   
6  2024-01-07        Blanking_SKU2_Queue      Blanking_SKU2_Queue   
7  2024-01-08        Blanking_SKU1_Queue      Blanking_SKU2_Queue   
8  2024-01-09        Blanking_SKU2_Queue      Blanking_SKU2_Queue   
9  2024-01-10        Blanking_SKU1_Queue      Blanking_SKU1_Queue   
10 2024-01-11        Blanking_SKU4_Queue      Blanking_SKU4_Queue   
11 2024-01-12        Blanking_SKU1_Queue      Blanking_SKU1_Queue   
12 2024-01-13        Blanking_SKU3_Queue      Blanking_SKU1_Queue   
13 2024-01-14        Blanking_SKU2

# 이때까지를 종합 => 1번째 csv 저장 (results_block1_bottleneck_predictions.csv)

In [19]:
output_path = "./results_block1_bottleneck_predictions.csv"
df.to_csv(output_path, index=False, encoding="utf-8-sig")
print(f"✅ <1> 저장 완료: {output_path}")

✅ <1> 저장 완료: ./results_block1_bottleneck_predictions.csv


# Bottleneck_actual_Cell, Bottleneck_pred_Cell
- 이 컬럼들에 들어가는 값이 Cell1-4 중의 하나여야 하는데 현재는 SKU1-4 기준으로 적혀있음
- 이미 만들어진 CSV 내용을 모두 유지하되, 딱 저 2개 컬럼만 수정하고자 함
- 원본 CSV 파일 : results_block1_bottleneck_predictions.csv

In [1]:
import pandas as pd
import re
from sklearn.ensemble import RandomForestClassifier

# ================================
# 1. 데이터 불러오기
# ================================
file_path = "./results_block1_bottleneck_predictions_before.csv"
df = pd.read_csv(file_path)

# ================================
# 2. Cell1~4 Queue 컬럼만 추출
# ================================
cell_cols = [c for c in df.columns if re.match(r"Cell[1-4]_Queue", c)]

# ================================
# 3. Bottleneck_actual_Cell → 각 행에서 가장 큰 Cell
# ================================
df["Bottleneck_actual_Cell"] = df[cell_cols].idxmax(axis=1).str.replace("_Queue", "")

# ================================
# 4. Bottleneck_pred_Cell → 랜덤포레스트 예측 기반
# ================================
df = df.sort_values("Time_Now").reset_index(drop=True)
X_all = df[[c for c in df.columns if "Queue" in c and not c.endswith("_Percent")]]

y_all = df["Bottleneck_actual_Cell"]   # 학습 라벨: 실제 Cell1~4 병목

preds = [None]  # 첫 행은 예측 불가
for i in range(1, len(df)):
    train_mask = df.index < i
    test_mask  = df.index == i

    X_train, y_train = X_all[train_mask], y_all[train_mask]
    X_test = X_all[test_mask]

    # 클래스가 하나뿐이면 마지막 라벨 그대로
    if y_train.nunique() < 2:
        preds.append(y_train.iloc[-1])
        continue

    clf = RandomForestClassifier(n_estimators=200, random_state=42)
    clf.fit(X_train, y_train)

    preds.append(clf.predict(X_test)[0])

df["Bottleneck_pred_Cell"] = preds

In [2]:
# ================================
# 4-1. 저장 전에 결과 확인
# ================================
print("\n=== 병목 Cell 컬럼 확인 (앞부분 20행) ===")
print(df[["Time_Now", "Bottleneck_actual_Cell", "Bottleneck_pred_Cell"]].head(20))

print("\n=== 값 분포 확인 ===")
print("Bottleneck_actual_Cell 분포:")
print(df["Bottleneck_actual_Cell"].value_counts())

print("\nBottleneck_pred_Cell 분포:")
print(df["Bottleneck_pred_Cell"].value_counts())


=== 병목 Cell 컬럼 확인 (앞부분 20행) ===
      Time_Now Bottleneck_actual_Cell Bottleneck_pred_Cell
0   2024-01-01                  Cell1                 None
1   2024-01-02                  Cell1                Cell1
2   2024-01-03                  Cell1                Cell1
3   2024-01-04                  Cell1                Cell1
4   2024-01-05                  Cell1                Cell1
5   2024-01-06                  Cell1                Cell1
6   2024-01-07                  Cell1                Cell1
7   2024-01-08                  Cell1                Cell1
8   2024-01-09                  Cell1                Cell1
9   2024-01-10                  Cell1                Cell1
10  2024-01-11                  Cell1                Cell1
11  2024-01-12                  Cell1                Cell1
12  2024-01-13                  Cell1                Cell1
13  2024-01-14                  Cell1                Cell1
14  2024-01-15                  Cell1                Cell1
15  2024-01-16         

In [3]:
from sklearn.metrics import classification_report, accuracy_score

# ================================
# 4-2. 성능 지표 확인
# ================================
# 첫 번째 row는 예측 불가(None/NaN)일 수 있으므로 제외
mask_valid = df["Bottleneck_pred_Cell"].notna()

y_true = df.loc[mask_valid, "Bottleneck_actual_Cell"]
y_pred = df.loc[mask_valid, "Bottleneck_pred_Cell"]

print("\n=== 성능 지표 (Cell1~4 예측) ===")
print(classification_report(y_true, y_pred, digits=3))
print(f"Accuracy: {accuracy_score(y_true, y_pred):.3f}")


=== 성능 지표 (Cell1~4 예측) ===
              precision    recall  f1-score   support

       Cell1      0.966     0.996     0.981       517
       Cell3      0.846     0.407     0.550        27
       Cell4      0.000     0.000     0.000         2

    accuracy                          0.963       546
   macro avg      0.604     0.468     0.510       546
weighted avg      0.957     0.963     0.956       546

Accuracy: 0.963


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [4]:
# ================================
# 5. 결과 저장
# ================================
output_path = "./results_block1_bottleneck_predictions_fixed.csv"
df.to_csv(output_path, index=False, encoding="utf-8-sig")

print(f"✅ 수정 완료: {output_path}")
print(df[["Time_Now", "Bottleneck_actual_Cell", "Bottleneck_pred_Cell"]].head(20))

✅ 수정 완료: ./results_block1_bottleneck_predictions_fixed.csv
      Time_Now Bottleneck_actual_Cell Bottleneck_pred_Cell
0   2024-01-01                  Cell1                 None
1   2024-01-02                  Cell1                Cell1
2   2024-01-03                  Cell1                Cell1
3   2024-01-04                  Cell1                Cell1
4   2024-01-05                  Cell1                Cell1
5   2024-01-06                  Cell1                Cell1
6   2024-01-07                  Cell1                Cell1
7   2024-01-08                  Cell1                Cell1
8   2024-01-09                  Cell1                Cell1
9   2024-01-10                  Cell1                Cell1
10  2024-01-11                  Cell1                Cell1
11  2024-01-12                  Cell1                Cell1
12  2024-01-13                  Cell1                Cell1
13  2024-01-14                  Cell1                Cell1
14  2024-01-15                  Cell1                Cel

# 특정 병목 공정이 생산량(c_TotalProducts)에 얼마나 영향
# => 2번째 csv 저장 (results_block2_regression_case1.csv , results_block2_regression_case2.csv)
- 수치나 퍼센트로 표현

In [20]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import r2_score
import pandas as pd

y = df["c_TotalProducts"]   # 종속변수: 총 생산량

# =========================================================
# Case 1: 병목 위치만
# =========================================================
X_cat = df[["Bottleneck_actual"]]

try:
    encoder = OneHotEncoder(drop="first", sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(drop="first", sparse=False)

X_encoded = encoder.fit_transform(X_cat)
feature_names_cat = encoder.get_feature_names_out(["Bottleneck_actual"])
X_encoded_df = pd.DataFrame(X_encoded, columns=feature_names_cat, index=df.index)

model_cat = LinearRegression()
model_cat.fit(X_encoded_df, y)

y_pred_cat = model_cat.predict(X_encoded_df)
r2_cat = r2_score(y, y_pred_cat)

coef_df_cat = pd.DataFrame({
    "Feature": feature_names_cat,
    "Impact_on_TotalProducts": model_cat.coef_
}).sort_values(by="Impact_on_TotalProducts", key=abs, ascending=False)

print("=== Case 1: 병목 위치만 ===")
print(coef_df_cat)
print(f"\n모델 설명력 (R²): {r2_cat:.3f}")

print("\n=== 보고용 해석 ===")
for _, row in coef_df_cat.iterrows():
    feature = row["Feature"].replace("Bottleneck_actual_", "")
    impact = row["Impact_on_TotalProducts"]
    if impact > 0:
        print(f"- {feature} 병목 발생 시 → 총생산량이 약 +{impact:.0f} 증가")
    else:
        print(f"- {feature} 병목 발생 시 → 총생산량이 약 {impact:.0f} 감소")
print(f"\n※ 현재 모델은 병목 위치만 고려했을 때 생산량 변동의 약 {r2_cat*100:.1f}%를 설명합니다.\n")
print()


# =========================================================
# Case 2: 병목 위치 + Queue 값 포함
# =========================================================
queue_cols = [c for c in df.columns if c.endswith("_Queue")]
X_full = pd.concat([X_encoded_df, df[queue_cols]], axis=1)

model_full = LinearRegression()
model_full.fit(X_full, y)

y_pred_full = model_full.predict(X_full)
r2_full = r2_score(y, y_pred_full)

coef_df_full = pd.DataFrame({
    "Feature": X_full.columns,
    "Impact_on_TotalProducts": model_full.coef_
}).sort_values(by="Impact_on_TotalProducts", key=abs, ascending=False)

print("=== Case 2: 병목 위치 + Queue 값 포함 ===")
print(coef_df_full.head(15))
print(f"\n모델 설명력 (R²): {r2_full:.3f}")

print("\n=== 보고용 해석 ===")
for _, row in coef_df_full.head(10).iterrows():  # 상위 10개만 보고
    feature = row["Feature"].replace("Bottleneck_actual_", "")
    impact = row["Impact_on_TotalProducts"]
    if impact > 0:
        print(f"- {feature} 값이 커질수록 → 총생산량이 약 +{impact:.0f} 증가")
    else:
        print(f"- {feature} 값이 커질수록 → 총생산량이 약 {impact:.0f} 감소")
print(f"\n※ 현재 모델은 병목 위치와 Queue 크기를 함께 고려했을 때 생산량 변동의 약 {r2_full*100:.1f}%를 설명합니다.")

=== Case 1: 병목 위치만 ===
                                     Feature  Impact_on_TotalProducts
0              Bottleneck_actual_Cell3_Queue               599.955357
1  Bottleneck_actual_Forklift_Blanking_Queue              -293.141282

모델 설명력 (R²): 0.020

=== 보고용 해석 ===
- Cell3_Queue 병목 발생 시 → 총생산량이 약 +600 증가
- Forklift_Blanking_Queue 병목 발생 시 → 총생산량이 약 -293 감소

※ 현재 모델은 병목 위치만 고려했을 때 생산량 변동의 약 2.0%를 설명합니다.


=== Case 2: 병목 위치 + Queue 값 포함 ===
                                      Feature  Impact_on_TotalProducts
5                         Blanking_SKU4_Queue              8014.621055
4                         Blanking_SKU3_Queue              5490.032026
2                         Blanking_SKU1_Queue              3903.947551
3                         Blanking_SKU2_Queue               761.981449
20                              Quality_Queue               291.523455
24                             Blanking_Queue               171.645028
1   Bottleneck_actual_Forklift_Blanking_Queue             

In [21]:
coef_df_cat.to_csv("./results_block2_regression_case1.csv", index=False, encoding="utf-8-sig")
print("✅ <2>-Case1 저장 완료: results_block2_regression_case1.csv")

coef_df_full.to_csv("./results_block2_regression_case2.csv", index=False, encoding="utf-8-sig")
print("✅ <2>-Case2 저장 완료: results_block2_regression_case2.csv")

✅ <2>-Case1 저장 완료: results_block2_regression_case1.csv
✅ <2>-Case2 저장 완료: results_block2_regression_case2.csv


# SKU 재분배 시뮬레이션
# => 3번째 csv 저장 (results_block3_sku_redistribution.csv)
- 날짜별로 Cell에서 병목이 생긴 경우, 그날의 SKU 작업량을 다른 Cell로 분산시켜봤을 때 어떻게 바뀌는지 기록하는 시뮬레이션

In [26]:
import pandas as pd
import numpy as np
import re

# 날짜 단위 컬럼 생성
df["_date"] = df["Time_Now"].dt.floor("D")

queue_cols = [c for c in df.columns if "Queue" in c]
cell_sku_cols = [c for c in df.columns if c.startswith("c_Cell")]

# Cell ↔ SKU 매핑
cell_sku_map = {
    "c_Cell1": [1, 2, 4],
    "c_Cell2": [2, 4],
    "c_Cell3": [2, 3],
    "c_Cell4": [3, 4],
}

# 결과 저장 리스트
records = []

# ================================
# 날짜별 루프
# ================================
for d, g in df.groupby("_date"):

    if g.empty:
        continue

    # -------------------------
    # 1. 병목 Queue 탐지 (Cell 단위만)
    # -------------------------
    cell_queue_cols = [c for c in df.columns if re.match(r"Cell\d+_Queue", c)]
    if not cell_queue_cols:
        continue

    row_bneck = g[cell_queue_cols].astype(float).idxmax(axis=1)
    bneck_queue = row_bneck.value_counts().idxmax()  # 예: "Cell3_Queue"

    bneck_cell = None
    match = re.search(r"(Cell\d+)", bneck_queue)
    if match:
        bneck_cell = "c_" + match.group(1)  # 예: Cell3 → c_Cell3

    # -------------------------
    # 2. Cell×SKU 집계
    # -------------------------
    pivot = g[cell_sku_cols].sum().reset_index()
    pivot.columns = ["col", "value"]
    pivot["Cell"] = pivot["col"].str.extract(r"(c_Cell\d+)")
    pivot["SKU"]  = pivot["col"].str.extract(r"SKU(\d+)")

    cell_sku_pivot = pivot.pivot_table(
        index="Cell", columns="SKU", values="value", aggfunc="sum"
    ).fillna(0)

    redistributed = cell_sku_pivot.astype(float).copy()

    # -------------------------
    # 3. 재분배 시뮬레이션 (Cell 병목일 때만 실행)
    # -------------------------
    if bneck_cell and bneck_cell in cell_sku_pivot.index:
        sku_dist = cell_sku_pivot.loc[bneck_cell]
        for sku, qty in sku_dist.items():
            if qty == 0:
                continue
            eligible_cells = [c for c, skus in cell_sku_map.items()
                              if (int(sku) in skus and c != bneck_cell)]
            if not eligible_cells:
                continue
            share = qty / len(eligible_cells)
            redistributed.loc[bneck_cell, sku] -= qty
            for c in eligible_cells:
                redistributed.loc[c, sku] += share

    # -------------------------
    # 4. 전후 비교 기록
    # -------------------------
    before_sum = cell_sku_pivot.sum(axis=1)
    after_sum  = redistributed.sum(axis=1)

    # 비율 계산
    before_ratio = cell_sku_pivot.div(before_sum, axis=0).fillna(0)
    after_ratio  = redistributed.div(after_sum, axis=0).fillna(0)

    for cell in cell_sku_map.keys():
        before_val = before_sum.get(cell, 0)
        after_val = after_sum.get(cell, 0)
        diff = after_val - before_val
        pct_change = (diff / before_val * 100) if before_val > 0 else np.nan

        for sku in cell_sku_map[cell]:
            before_r = before_ratio.loc[cell, str(sku)] if cell in before_ratio.index else 0
            after_r  = after_ratio.loc[cell, str(sku)] if cell in after_ratio.index else 0
            diff_r   = after_r - before_r

            records.append({
                "date": d,
                "bneck_queue": bneck_queue,
                "bneck_cell": bneck_cell,
                "cell": cell,
                "sku": f"SKU{sku}",
                "before_queue": int(round(before_val)),
                "after_queue": int(round(after_val)),
                "queue_diff": int(round(diff)),
                "queue_diff_pct": pct_change,
                "before_ratio": before_r,
                "after_ratio": after_r,
                "ratio_diff": diff_r
            })

# ================================
# 최종 DataFrame 저장
# ================================
result_df = pd.DataFrame(records)
result_path = "./results_block3_sku_redistribution.csv"
result_df.to_csv(result_path, index=False, encoding="utf-8-sig")
print(f"✅ <3> 저장 완료: {result_path}")

print(result_df.head(20))

✅ <3> 저장 완료: ./results_block3_sku_redistribution.csv
         date  bneck_queue bneck_cell     cell   sku  before_queue  \
0  2024-01-01  Cell1_Queue    c_Cell1  c_Cell1  SKU1         29216   
1  2024-01-01  Cell1_Queue    c_Cell1  c_Cell1  SKU2         29216   
2  2024-01-01  Cell1_Queue    c_Cell1  c_Cell1  SKU4         29216   
3  2024-01-01  Cell1_Queue    c_Cell1  c_Cell2  SKU2          7185   
4  2024-01-01  Cell1_Queue    c_Cell1  c_Cell2  SKU4          7185   
5  2024-01-01  Cell1_Queue    c_Cell1  c_Cell3  SKU2          5610   
6  2024-01-01  Cell1_Queue    c_Cell1  c_Cell3  SKU3          5610   
7  2024-01-01  Cell1_Queue    c_Cell1  c_Cell4  SKU3         16677   
8  2024-01-01  Cell1_Queue    c_Cell1  c_Cell4  SKU4         16677   
9  2024-01-02  Cell1_Queue    c_Cell1  c_Cell1  SKU1         29866   
10 2024-01-02  Cell1_Queue    c_Cell1  c_Cell1  SKU2         29866   
11 2024-01-02  Cell1_Queue    c_Cell1  c_Cell1  SKU4         29866   
12 2024-01-02  Cell1_Queue    c_Cell1