# 생성한 모델을 비교 분석

In [279]:
import pandas as pd
from catboost import CatBoostClassifier, Pool
import lightgbm as lgb
import numpy as np
import glob
import joblib
from sklearn.svm import SVC

In [280]:
# Load the prediction dataset; later cells read the model feature columns and
# `Movie_Title` from this frame.
data = pd.read_csv(filepath_or_buffer="../model_predict_data.csv")

# 10k 모델 불러오기

## CatBoost

In [281]:
# Restore the saved 10k CatBoost classifier from its .cbm file.
# `load_model` stays the last expression so the cell still echoes the model.
cb_10k = CatBoostClassifier()
cb_10k.load_model(fname="catboost_model_10k.cbm")

<catboost.core.CatBoostClassifier at 0x286bc05adb0>

In [282]:
# Feature names stored on the loaded CatBoost model, copied into a plain list.
cb_10k_features = [*cb_10k.feature_names_]

범주형 변수 확인

In [283]:
# Translate the model's categorical feature indices back into feature names.
cb_10k_cat_features = list(
    map(cb_10k_features.__getitem__, cb_10k.get_cat_feature_indices())
)

In [284]:
# Display the categorical feature names; an empty list means the model has none.
cb_10k_cat_features

[]

## LightGBM

In [285]:
# Load the saved 10k LightGBM booster from its text model file.
lgb_10k = lgb.Booster(model_file='lightgbm_model_10k.txt')

In [286]:
# Feature names stored in the booster; used later to select the input columns.
lgb_10k_features = lgb_10k.feature_name()

lightgbm은 범주형 변수를 사용하지 않았음

In [287]:
# Inspect the objective stored in the booster's params, lower-cased.
# Falls back to "" when params is empty/None or has no "objective" key.
(lgb_10k.params or {}).get("objective", "").lower()

'binary'

# 5m 모델, 데이터 변환

In [288]:
# Load the persisted `svm_5m` model and the two PCA transformers that are
# applied to the features further down.
# NOTE(review): joblib.load unpickles arbitrary objects — only load artifacts
# that come from a trusted source.
svm_5m, pca_1, pca_2 = (
    joblib.load(artifact_path)
    for artifact_path in ("svm_model_5m.joblib", "pca_1.joblib", "pca_2.joblib")
)

## 데이터 변환

사용 컬럼

In [289]:
# Raw input columns selected for the SVM preprocessing pipeline.
use_cols = [
    # audience / screening metrics
    "wk1_Audience",
    "wk1_AudiencePerShow",
    "wk2_Audience",
    "wk2_AudiencePerShow",
    "Show_Change",
    "opening_Ho_Retention",
    "wk1_Holiday_AudienceMean",
    "wk1_Holiday_ShowMean",
    "wk2_Holiday_AudienceMean",
    "wk2_Holiday_ShowMean",
    "opening_AudienceStd",
    # distributor track-record columns
    "dist_big_flop",
    "dist_big_hit",
    "dist_small_flop",
    "dist_small_hit",
    # columns that are one-hot encoded later
    "Month",
    "Pandemic",
    "Grade",
    "Main_Country",
]

In [290]:
# Work on an independent copy so the SVM preprocessing never modifies `data`.
data_svm = data.loc[:, use_cols].copy()

In [291]:
# Columns that get a log1p transform and are then passed through the scaler.
log_cols = [
    "wk1_Audience",
    "wk1_AudiencePerShow",
    "wk2_Audience",
    "wk2_AudiencePerShow",
    "Show_Change",
    "opening_Ho_Retention",
    "wk1_Holiday_AudienceMean",
    "wk1_Holiday_ShowMean",
    "wk2_Holiday_AudienceMean",
    "wk2_Holiday_ShowMean",
    "opening_AudienceStd",
]

In [292]:
# Replace the `log_cols` columns with their log1p-transformed values.
# NOTE: the columns are overwritten, so re-running this cell transforms twice.
data_svm[log_cols] = np.log1p(data_svm.loc[:, log_cols])

In [293]:
# Load the persisted scaler that is applied to `log_cols` in the next cell.
# NOTE(review): joblib.load unpickles arbitrary objects — only load trusted files.
scaler = joblib.load('svm_scaler.joblib')

In [294]:
# Apply the loaded scaler (transform only, no re-fitting) to the `log_cols` columns.
# NOTE: the columns are overwritten, so re-running this cell scales them twice.
data_svm[log_cols] = scaler.transform(data_svm[log_cols])

In [295]:
# Summary statistics of the numeric columns as a sanity check after scaling.
data_svm.describe()

Unnamed: 0,wk1_Audience,wk1_AudiencePerShow,wk2_Audience,wk2_AudiencePerShow,Show_Change,opening_Ho_Retention,wk1_Holiday_AudienceMean,wk1_Holiday_ShowMean,wk2_Holiday_AudienceMean,wk2_Holiday_ShowMean,opening_AudienceStd,dist_big_flop,dist_big_hit,dist_small_flop,dist_small_hit,Month,Pandemic
count,29.0,29.0,29.0,29.0,29.0,29.0,29.0,29.0,29.0,29.0,29.0,29.0,29.0,29.0,29.0,29.0,29.0
mean,-0.12051,-0.195763,-0.078637,-0.174922,0.101448,0.308105,-0.184546,-0.09641,-0.013701,0.092381,-0.162943,0.37931,0.517241,0.034483,0.103448,7.586207,2.0
std,1.164085,0.673526,1.203301,0.743353,1.07013,1.092788,1.213417,1.264681,1.152806,1.14854,1.182112,0.621852,0.508548,0.185695,0.309934,1.15007,0.0
min,-2.302474,-1.2829,-2.205181,-1.456706,-1.608772,-1.446642,-2.375028,-2.445312,-2.217078,-1.956005,-2.464185,0.0,0.0,0.0,0.0,5.0,2.0
25%,-0.941096,-0.727653,-0.92985,-0.948639,-0.682911,-0.377843,-0.896739,-0.873041,-0.858157,-0.742743,-0.966764,0.0,0.0,0.0,0.0,7.0,2.0
50%,-0.20648,-0.22013,-0.234749,-0.070393,-0.120301,0.310756,-0.3061,-0.001927,-0.106304,-0.00454,-0.102888,0.0,1.0,0.0,0.0,8.0,2.0
75%,0.831401,0.119673,0.856119,0.418088,0.723581,0.865898,0.673619,0.989968,0.861609,0.960456,0.773659,1.0,1.0,0.0,0.0,8.0,2.0
max,1.921818,1.314558,1.920317,0.898289,2.801633,3.48714,1.896309,1.868067,1.797128,1.944598,1.897899,2.0,1.0,1.0,1.0,12.0,2.0


In [296]:
# Distinct Month values present in this dataset (determines which Month_* dummies appear).
sorted(data_svm["Month"].unique())

[5, 6, 7, 8, 12]

In [297]:
# Distinct Pandemic values present in this dataset.
sorted(data_svm["Pandemic"].unique())

[2]

In [298]:
# Distinct Grade (age rating) values present in this dataset.
sorted(data_svm["Grade"].unique())

['12세이상관람가', '15세이상관람가', '전체관람가', '청소년관람불가']

In [299]:
# Distinct Main_Country values present in this dataset.
sorted(data_svm["Main_Country"].unique())

['기타', '미국', '일본', '한국']

In [300]:
# One-hot encode the categorical columns. Only categories that occur in this
# dataset produce a column here.
data_svm = pd.get_dummies(
    data=data_svm,
    columns=["Month", "Pandemic", "Grade", "Main_Country"],
)

In [301]:
# Inspect the column names produced by the one-hot encoding.
data_svm.columns

Index(['wk1_Audience', 'wk1_AudiencePerShow', 'wk2_Audience',
       'wk2_AudiencePerShow', 'Show_Change', 'opening_Ho_Retention',
       'wk1_Holiday_AudienceMean', 'wk1_Holiday_ShowMean',
       'wk2_Holiday_AudienceMean', 'wk2_Holiday_ShowMean',
       'opening_AudienceStd', 'dist_big_flop', 'dist_big_hit',
       'dist_small_flop', 'dist_small_hit', 'Month_5', 'Month_6', 'Month_7',
       'Month_8', 'Month_12', 'Pandemic_2', 'Grade_12세이상관람가', 'Grade_15세이상관람가',
       'Grade_전체관람가', 'Grade_청소년관람불가', 'Main_Country_기타', 'Main_Country_미국',
       'Main_Country_일본', 'Main_Country_한국'],
      dtype='object')

In [302]:
# Full set of one-hot column names that must exist in `data_svm`; any that are
# absent from this dataset are added as all-zero columns in a later cell.
result_onehot_cols = (
    [f"Month_{month}" for month in range(2, 13)]
    + ["Pandemic_1", "Pandemic_2"]
    + ["Grade_15세이상관람가", "Grade_전체관람가", "Grade_청소년관람불가"]
    + ["Main_Country_미국", "Main_Country_일본", "Main_Country_한국"]
)

In [303]:
# Add an all-zero column for every expected one-hot column that this dataset
# did not produce (its category never occurs here).
missing_cols = list(
    filter(lambda name: name not in data_svm.columns, result_onehot_cols)
)
if len(missing_cols) > 0:
    data_svm[missing_cols] = 0

In [304]:
# Cast every one-hot indicator column to int.
# Columns are matched by *prefix* (built from `onehot_cols`) instead of by
# substring, so a column that merely contains e.g. "Month_" somewhere inside
# its name is never cast by accident.
onehot_cols = ['Month', 'Pandemic', 'Grade', 'Main_Country']
onehot_prefixes = tuple(f"{col}_" for col in onehot_cols)
onehot_columns = [col for col in data_svm.columns if col.startswith(onehot_prefixes)]
data_svm[onehot_columns] = data_svm[onehot_columns].astype(int)

In [305]:
# Column groups fed to the two loaded PCA transformers.
# NOTE(review): the column order presumably has to match the order used when
# the PCA objects were fitted — confirm against the training notebook.
pca_1_cols = ["wk2_AudiencePerShow", "wk1_AudiencePerShow"]
pca_2_cols = [
    "wk1_Holiday_AudienceMean",
    "wk2_Holiday_AudienceMean",
    "wk1_Audience",
    "opening_AudienceStd",
    "wk2_Audience",
    "wk2_Holiday_ShowMean",
    "wk1_Holiday_ShowMean",
]

In [306]:
# Project each column group with its PCA transformer. Each result is stored as
# a single named column, aligned to the rows of `data_svm`.
df_pca_1 = pd.DataFrame(
    data=pca_1.transform(data_svm[pca_1_cols]),
    index=data_svm.index,
    columns=["PC1"],
)
df_pca_2 = pd.DataFrame(
    data=pca_2.transform(data_svm[pca_2_cols]),
    index=data_svm.index,
    columns=["PC2"],
)

In [307]:
# Swap the raw PCA input columns for their principal components (PC1, PC2),
# which are appended as the last two columns.
data_svm = pd.concat(
    [data_svm.drop(columns=[*pca_1_cols, *pca_2_cols]), df_pca_1, df_pca_2],
    axis=1,
)

In [308]:
# Column names, in order, that `data_svm` is reduced to before it is passed to
# the SVM.
origin_train_cols = [
    "Show_Change",
    "opening_Ho_Retention",
    "dist_big_flop",
    "dist_big_hit",
    "dist_small_flop",
    "dist_small_hit",
    *(f"Month_{month}" for month in range(2, 13)),
    "Pandemic_1",
    "Pandemic_2",
    "Grade_15세이상관람가",
    "Grade_전체관람가",
    "Grade_청소년관람불가",
    "Main_Country_미국",
    "Main_Country_일본",
    "Main_Country_한국",
    "PC1",
    "PC2",
]

In [309]:
# Columns present in `data_svm` that are not listed in `origin_train_cols`.
set(data_svm.columns) - set(origin_train_cols)

{'Grade_12세이상관람가', 'Main_Country_기타'}

In [310]:
# Drop every column that is not listed in `origin_train_cols`.
# The list is computed instead of hard-coded, and the frame is rebound instead
# of mutated in place. That keeps the cell safe to re-run and avoids a KeyError
# when a dataset does not contain one of the previously hard-coded categories.
extra_cols = [col for col in data_svm.columns if col not in origin_train_cols]
data_svm = data_svm.drop(columns=extra_cols)

In [311]:
# Put the columns into the exact order given by `origin_train_cols`
# (raises KeyError if any of them is missing).
data_svm = data_svm.loc[:, origin_train_cols]

# 예측 진행

In [312]:
# Select, for each 10k model, exactly the feature columns stored in that model.
data_cb_10k, data_lgb_10k = data[cb_10k_features], data[lgb_10k_features]

## 10k

### CatBoost

In [313]:
# Wrap the CatBoost inputs in a Pool once and reuse it for both calls.
cb_10k_pool = Pool(data=data_cb_10k)
# Predicted class labels.
cb_10k_pred = cb_10k.predict(cb_10k_pool)
# Column index 1 of predict_proba is kept as the reported probability.
cb_10k_proba = cb_10k.predict_proba(cb_10k_pool)[:, 1]

### LightGBM

In [314]:
# Raw booster output, then a 0/1 label using a 0.5 cut-off.
lgbm_10k_proba = lgb_10k.predict(data_lgb_10k)
lgbm_10k_pred = np.greater_equal(lgbm_10k_proba, 0.5).astype(int)

### 결과

In [315]:
# Side-by-side table of both 10k models' labels and probabilities (rounded to
# four decimals), keyed by movie title.
result_10k = pd.DataFrame(
    dict(
        Movie_Title=data['Movie_Title'],
        cb_10k_pred=cb_10k_pred,
        cb_10k_proba=np.round(cb_10k_proba, 4),
        lgbm_10k_pred=lgbm_10k_pred,
        lgbm_10k_proba=np.round(lgbm_10k_proba, 4),
    )
)
result_10k

Unnamed: 0,Movie_Title,cb_10k_pred,cb_10k_proba,lgbm_10k_pred,lgbm_10k_proba
0,F1 더 무비,0,0.0,0,0.0012
1,강령: 귀신놀이,0,0.0,0,0.0007
2,극장판 귀멸의 칼날: 무한성편,0,0.0,0,0.0013
3,긴키 지방의 어느 장소에 대하여,0,0.0,0,0.0012
4,꼬마마법사 주니토니,0,0.0,0,0.0012
5,"나의 아픈, 사랑이야기",0,0.0182,0,0.0402
6,너는 나를 불태워,1,1.0,1,0.9984
7,노바디2,0,0.0,0,0.0009
8,노이즈,0,0.0,0,0.0009
9,더 폴: 디렉터스 컷,0,0.0,0,0.001


## 5m

In [316]:
# Predict with the loaded 5m model on the preprocessed `data_svm` features.
svm_5m_pred = svm_5m.predict(data_svm)

In [317]:
# Table of the 5m predictions keyed by movie title.
# NOTE(review): the column is named 'cb_5m_pred' although the values come from
# `svm_5m`; the name is kept unchanged here in case other code refers to it.
result_5m = pd.DataFrame(
    dict(Movie_Title=data['Movie_Title'], cb_5m_pred=svm_5m_pred)
)

# 실제 데이터랑 비교

In [318]:
import pandas as pd
from bs4 import BeautifulSoup
import os
import glob
def parse_board_date(title_text):
    """Turn a board-title string into a dash-separated date string.

    Keeps the text before the first "(", removes the "●" marker, replaces
    "년 " and "월 " with "-", removes "일", and strips surrounding whitespace.
    For example, "●2025년 10월 09일(목)" becomes "2025-10-09".

    Args:
        title_text: Text of one ``div.board_tit`` element.

    Returns:
        The cleaned date string, used as the CSV file name.
    """
    return (
        title_text.split('(')[0]
        .replace("●", "")
        .replace("년 ", "-")
        .replace("월 ", "-")
        .replace("일", "")
        .strip()
    )


# Every .xls file in the working directory, in reverse lexicographic order.
search_pattern = '*.xls'
file_list = sorted(glob.glob(search_pattern), reverse=True)

for file_path in file_list:
    print(f"\n--- 📂 파일 처리 시작: {file_path} ---")
    try:
        # A. Extract the dates (used as output file names). The file is read
        #    as HTML text even though its extension is .xls.
        with open(file_path, 'r', encoding='utf-8') as f:
            soup = BeautifulSoup(f.read(), 'lxml')
        date_list = [
            parse_board_date(title.get_text(strip=True))
            for title in soup.find_all('div', class_='board_tit')
        ]

        # B. Extract the tables (used as output file contents); the first
        #    parsed table is skipped.
        data_tables = pd.read_html(file_path, header=0, encoding='utf-8')[1:]

        # zip() below stops at the shorter sequence, so make a count mismatch
        # visible instead of silently saving a partial / mis-paired set.
        if len(date_list) != len(data_tables):
            print(
                f"⚠️ '{file_path}': 날짜 {len(date_list)}개, 테이블 {len(data_tables)}개로 "
                "개수가 달라 짝지을 수 있는 만큼만 저장합니다."
            )

        # C. Pair each date with its table and save it as a CSV. The last row
        #    of every table is dropped (presumably a totals row — confirm).
        for date_str, daily_df in zip(date_list, data_tables):
            csv_file_path = f"{date_str}.csv"
            daily_df.iloc[:-1].to_csv(csv_file_path, index=False, encoding='utf-8-sig')
            print(f"✅ '{csv_file_path}' 파일 저장 완료!")

    except Exception as e:
        # Deliberately best-effort: report the failure and move on to the next
        # file instead of stopping the whole run.
        print(f"🚨 '{file_path}' 처리 중 오류 발생: {e}")



--- 📂 파일 처리 시작: KOBIS_일별_박스오피스_2025-10-10.xls ---
✅ '2025-10-09.csv' 파일 저장 완료!
✅ '2025-10-08.csv' 파일 저장 완료!
✅ '2025-10-07.csv' 파일 저장 완료!
✅ '2025-10-06.csv' 파일 저장 완료!
✅ '2025-10-05.csv' 파일 저장 완료!
✅ '2025-10-04.csv' 파일 저장 완료!
✅ '2025-10-03.csv' 파일 저장 완료!
✅ '2025-10-02.csv' 파일 저장 완료!

--- 📂 파일 처리 시작: KOBIS_일별_박스오피스_2025-10-10 (4).xls ---
✅ '2025-09-10.csv' 파일 저장 완료!
✅ '2025-09-09.csv' 파일 저장 완료!
✅ '2025-09-08.csv' 파일 저장 완료!
✅ '2025-09-07.csv' 파일 저장 완료!
✅ '2025-09-06.csv' 파일 저장 완료!
✅ '2025-09-05.csv' 파일 저장 완료!
✅ '2025-09-04.csv' 파일 저장 완료!

--- 📂 파일 처리 시작: KOBIS_일별_박스오피스_2025-10-10 (3).xls ---
✅ '2025-09-17.csv' 파일 저장 완료!
✅ '2025-09-16.csv' 파일 저장 완료!
✅ '2025-09-15.csv' 파일 저장 완료!
✅ '2025-09-14.csv' 파일 저장 완료!
✅ '2025-09-13.csv' 파일 저장 완료!
✅ '2025-09-12.csv' 파일 저장 완료!
✅ '2025-09-11.csv' 파일 저장 완료!

--- 📂 파일 처리 시작: KOBIS_일별_박스오피스_2025-10-10 (2).xls ---
✅ '2025-09-24.csv' 파일 저장 완료!
✅ '2025-09-23.csv' 파일 저장 완료!
✅ '2025-09-22.csv' 파일 저장 완료!
✅ '2025-09-21.csv' 파일 저장 완료!
✅ '2025-09-20.csv' 파일 저장 완료!
✅

In [319]:
# Read every CSV matching '2025*.csv' and tag its rows with the date taken
# from the file name (base name without the 4-character ".csv" suffix), then
# stack all of them into one frame.
csv_files = glob.glob('2025*.csv')
dataframes = [
    pd.read_csv(csv_path).assign(**{'일자': os.path.basename(csv_path)[:-4]})
    for csv_path in csv_files
]
merged_df = pd.concat(dataframes, ignore_index=True)

In [320]:
# Keep only the title, cumulative audience and date columns.
merged_df = merged_df.loc[:, ['영화명', '누적관객수', '일자']]

In [321]:
# One row per movie: sort by date, then take the last entry of each title.
# NOTE(review): GroupBy.last() picks the last non-null value per column, so
# the audience and the date could come from different rows if nulls exist.
latest_df = (
    merged_df
    .sort_values('일자')
    .groupby('영화명', as_index=False)
    .last()
    .loc[:, ['영화명', '누적관객수', '일자']]
)
latest_df

Unnamed: 0,영화명,누적관객수,일자
0,#진상을 말씀드립니다,6853,2025-10-09
1,007 살인번호,13695,2025-10-09
2,10 라이브즈,60486,2025-09-10
3,100 미터.,13985,2025-10-09
4,10년만에 만난 선생님에게 조련 당한 여자,1,2025-10-01
...,...,...,...
910,휴가,7802,2025-09-23
911,흑인 물건 맛에 빠져버린 인기 블로거,1,2025-10-01
912,희생,15737,2025-09-30
913,히어,15630,2025-09-23


정답 라벨(answer): 누적관객수가 10k 미만이면 1, 5m 이상이면 2, 그 외는 0

In [322]:
# Combine both prediction tables row by row. They were built from the same
# `data` frame and share its index, so an index join cannot duplicate rows
# even if two movies have the same title (a merge on `Movie_Title` would).
result = result_5m.join(result_10k.drop(columns='Movie_Title'))

# Attach the latest cumulative audience per movie. `validate` guards against
# duplicated titles in `latest_df` silently multiplying prediction rows.
result = result.merge(
    latest_df,
    left_on='Movie_Title',
    right_on='영화명',
    how='left',
    validate='many_to_one',
)

# Ground-truth label: 1 = under 10k, 2 = 5m or more, 0 = everything in between.
audience = result["누적관객수"]
has_no_audience = audience.isna()
labels = np.select([audience >= 5_000_000, audience < 10_000], [2, 1], default=0)

# Movies without a box-office match have no ground truth. Comparisons against
# NaN are False, which would otherwise label them 0, so they are set to <NA>
# (nullable Int64) instead.
result["answer"] = pd.Series(labels, index=result.index, dtype="Int64").mask(has_no_audience)

unmatched_titles = result.loc[has_no_audience, 'Movie_Title'].tolist()
if unmatched_titles:
    print(
        f"⚠️ 박스오피스 데이터에서 찾지 못한 영화 {len(unmatched_titles)}편 "
        f"(answer=NA): {unmatched_titles}"
    )

In [323]:
# Display the comparison without the box-office lookup columns.
# The drop is not assigned, so `result` itself keeps those columns.
result.drop(columns=['영화명', '누적관객수', '일자'])

Unnamed: 0,Movie_Title,cb_5m_pred,cb_10k_pred,cb_10k_proba,lgbm_10k_pred,lgbm_10k_proba,answer
0,F1 더 무비,0,0,0.0,0,0.0012,2
1,강령: 귀신놀이,0,0,0.0,0,0.0007,0
2,극장판 귀멸의 칼날: 무한성편,0,0,0.0,0,0.0013,2
3,긴키 지방의 어느 장소에 대하여,0,0,0.0,0,0.0012,0
4,꼬마마법사 주니토니,0,0,0.0,0,0.0012,0
5,"나의 아픈, 사랑이야기",0,0,0.0182,0,0.0402,0
6,너는 나를 불태워,0,1,1.0,1,0.9984,1
7,노바디2,0,0,0.0,0,0.0009,0
8,노이즈,0,0,0.0,0,0.0009,0
9,더 폴: 디렉터스 컷,0,0,0.0,0,0.001,0


5m 모델은 실제로 5m 이상인 `F1 더 무비`와 `극장판 귀멸의 칼날: 무한성편`을 예측하지 못한다 - 규격 외의 존재여서 그런가?  
정밀도 = 1/1  
민감도 = 1/3