# 생성한 모델을 비교 분석

In [56]:
import pandas as pd
from catboost import CatBoostClassifier, Pool
import lightgbm as lgb
import numpy as np
import glob


In [57]:
# Load the rows that the saved models will score.
data = pd.read_csv(filepath_or_buffer="../model_predict_data.csv")

# 모델 불러오기

## CatBoost

In [58]:
# Create both CatBoost classifiers first, then restore each one's trained
# state from its saved .cbm file.
cb_10k = CatBoostClassifier()
cb_5m = CatBoostClassifier()

cb_10k.load_model("catboost_model_10k.cbm")
cb_5m.load_model("catboost_model_5m.cbm")

<catboost.core.CatBoostClassifier at 0x1e975fd4260>

In [59]:
# Feature names stored in each CatBoost model, copied into plain lists.
cb_10k_features, cb_5m_features = (
    list(model.feature_names_) for model in (cb_10k, cb_5m)
)

범주형 변수 확인

In [60]:
# Translate each model's categorical feature indices back into feature names.
cb_10k_cat_idx = cb_10k.get_cat_feature_indices()
cb_10k_cat_features = [cb_10k_features[pos] for pos in cb_10k_cat_idx]

cb_5m_cat_idx = cb_5m.get_cat_feature_indices()
cb_5m_cat_features = [cb_5m_features[pos] for pos in cb_5m_cat_idx]

In [61]:
# Display the categorical feature names found for the 10k CatBoost model.
cb_10k_cat_features

[]

In [62]:
# Display the categorical feature names found for the 5m CatBoost model.
cb_5m_cat_features

['Pandemic']

## LightGBM

In [63]:
# Restore both LightGBM boosters from their saved text model files.
lgb_10k, lgb_5m = (
    lgb.Booster(model_file=model_path)
    for model_path in ('lightgbm_model_10k.txt', 'lightgbm_model_5m.txt')
)

In [64]:
# Feature names each LightGBM booster reports for itself.
lgb_10k_features, lgb_5m_features = (
    booster.feature_name() for booster in (lgb_10k, lgb_5m)
)

LightGBM은 범주형 변수를 사용하지 않았음

In [65]:
# Show the objective recorded in the 10k booster's params, lower-cased;
# falls back to an empty string when params is empty or has no "objective".
(lgb_10k.params or {}).get("objective", "").lower()

'binary'

In [66]:
# Show the objective recorded in the 5m booster's params, lower-cased;
# falls back to an empty string when params is empty or has no "objective".
(lgb_5m.params or {}).get("objective", "").lower()

'binary'

# 예측 진행

In [67]:
# Give every model a frame holding exactly the columns it lists as its
# features, in the order the model reports them.
data_cb_10k = data.loc[:, cb_10k_features]
data_cb_5m = data.loc[:, cb_5m_features]
data_lgb_10k = data.loc[:, lgb_10k_features]
data_lgb_5m = data.loc[:, lgb_5m_features]

## 10k

### CatBoost

In [68]:
# Score the 10k CatBoost model: hard class labels, plus column 1 of
# predict_proba (the probability of the second class).
cb_10k_pool = Pool(data=data_cb_10k)
cb_10k_pred = cb_10k.predict(cb_10k_pool)
cb_10k_proba = cb_10k.predict_proba(cb_10k_pool)[:, 1]

### LightGBM

In [69]:
# The booster's raw prediction is used as a probability; a row is labelled 1
# when that value reaches 0.5, otherwise 0.
lgbm_10k_proba = lgb_10k.predict(data_lgb_10k)
lgbm_10k_pred = np.greater_equal(lgbm_10k_proba, 0.5).astype(int)

### 결과

In [70]:
# Side-by-side table of both 10k models' labels and probabilities
# (probabilities rounded to 4 decimals), keyed by film title.
columns_10k = {
    'Movie_Title': data['Movie_Title'],
    'cb_10k_pred': cb_10k_pred,
    'cb_10k_proba': cb_10k_proba.round(4),
    'lgbm_10k_pred': lgbm_10k_pred,
    'lgbm_10k_proba': lgbm_10k_proba.round(4),
}
result_10k = pd.DataFrame(columns_10k)
result_10k

Unnamed: 0,Movie_Title,cb_10k_pred,cb_10k_proba,lgbm_10k_pred,lgbm_10k_proba
0,F1 더 무비,0,0.0,0,0.0012
1,강령: 귀신놀이,0,0.0,0,0.0007
2,극장판 귀멸의 칼날: 무한성편,0,0.0,0,0.0013
3,긴키 지방의 어느 장소에 대하여,0,0.0,0,0.0012
4,꼬마마법사 주니토니,0,0.0,0,0.0012
5,"나의 아픈, 사랑이야기",0,0.0182,0,0.0402
6,너는 나를 불태워,1,1.0,1,0.9984
7,노바디2,0,0.0,0,0.0009
8,노이즈,0,0.0,0,0.0009
9,더 폴: 디렉터스 컷,0,0.0,0,0.001


## 5m

### CatBoost

In [71]:
# Score the 5m CatBoost model. Its pool must declare the categorical columns
# the model reports; column 1 of predict_proba is kept as the probability.
cb_5m_pool = Pool(data=data_cb_5m, cat_features=cb_5m_cat_features)
cb_5m_pred = cb_5m.predict(cb_5m_pool)
cb_5m_proba = cb_5m.predict_proba(cb_5m_pool)[:, 1]

### LightGBM

In [72]:
# The booster's raw prediction is used as a probability; a row is labelled 1
# when that value reaches 0.5, otherwise 0.
lgbm_5m_proba = lgb_5m.predict(data_lgb_5m)
lgbm_5m_pred = np.greater_equal(lgbm_5m_proba, 0.5).astype(int)

### 결과

In [73]:
# Side-by-side table of both 5m models' labels and probabilities
# (probabilities rounded to 4 decimals), keyed by film title.
columns_5m = {
    'Movie_Title': data['Movie_Title'],
    'cb_5m_pred': cb_5m_pred,
    'cb_5m_proba': cb_5m_proba.round(4),
    'lgbm_5m_pred': lgbm_5m_pred,
    'lgbm_5m_proba': lgbm_5m_proba.round(4),
}
result_5m = pd.DataFrame(columns_5m)
result_5m

Unnamed: 0,Movie_Title,cb_5m_pred,cb_5m_proba,lgbm_5m_pred,lgbm_5m_proba
0,F1 더 무비,0,0.0001,0,0.0024
1,강령: 귀신놀이,0,0.0,0,0.0024
2,극장판 귀멸의 칼날: 무한성편,0,0.0041,0,0.1967
3,긴키 지방의 어느 장소에 대하여,0,0.0,0,0.0024
4,꼬마마법사 주니토니,0,0.0,0,0.0024
5,"나의 아픈, 사랑이야기",0,0.0,0,0.0024
6,너는 나를 불태워,0,0.0,0,0.0024
7,노바디2,0,0.0,0,0.0024
8,노이즈,0,0.0027,0,0.0051
9,더 폴: 디렉터스 컷,0,0.0,0,0.0024


# 실제 데이터랑 비교

In [74]:
import pandas as pd
from bs4 import BeautifulSoup
import os
import glob
# Convert every *.xls export in the working directory into one CSV per day.
# Despite the extension the files are read as HTML: the date headings come
# from <div class="board_tit"> elements and the data from the HTML tables.
search_pattern = '*.xls'
file_list = sorted(glob.glob(search_pattern), reverse=True)

for file_path in file_list:
    print(f"\n--- 📂 파일 처리 시작: {file_path} ---")
    try:
        # A. Extract the dates (used as output file names).
        with open(file_path, 'r', encoding='utf-8') as f:
            html_content = f.read()
        soup = BeautifulSoup(html_content, 'lxml')

        # Each heading is reduced to the text before the first "(", with the
        # bullet removed and the 년/월/일 markers turned into "-" separators.
        date_list = [
            (
                title.get_text(strip=True)
                .split('(')[0]
                .replace("●", "")
                .replace("년 ", "-")
                .replace("월 ", "-")
                .replace("일", "")
                .strip()
            )
            for title in soup.find_all('div', class_='board_tit')
        ]

        # B. Extract the tables (used as output file contents).
        list_of_dataframes = pd.read_html(file_path, header=0, encoding='utf-8')
        # The first table is skipped — presumably it is not a daily table;
        # TODO confirm against an actual export.
        data_tables = list_of_dataframes[1:]

        # zip() stops at the shorter sequence, so a count mismatch would
        # silently drop days (or pair dates with the wrong tables). Surface
        # it instead of hiding it; the matching prefix is still written.
        if len(date_list) != len(data_tables):
            print(
                f"⚠️ '{file_path}': 날짜 {len(date_list)}개와 "
                f"테이블 {len(data_tables)}개의 수가 일치하지 않습니다. "
                f"짝이 맞는 앞부분만 저장합니다."
            )

        # C. Pair each date with its table and save it as a CSV file.
        for date_str, daily_df in zip(date_list, data_tables):
            csv_file_path = f"{date_str}.csv"
            # The last row of every table is dropped — presumably a totals
            # row; verify against the source export.
            daily_df.iloc[:-1].to_csv(csv_file_path, index=False, encoding='utf-8-sig')
            print(f"✅ '{csv_file_path}' 파일 저장 완료!")

    except Exception as e:
        # Deliberate best-effort: a failure in one file is reported and the
        # loop moves on to the next file instead of aborting the whole run.
        print(f"🚨 '{file_path}' 처리 중 오류 발생: {e}")



--- 📂 파일 처리 시작: KOBIS_일별_박스오피스_2025-10-10.xls ---
✅ '2025-10-09.csv' 파일 저장 완료!
✅ '2025-10-08.csv' 파일 저장 완료!
✅ '2025-10-07.csv' 파일 저장 완료!
✅ '2025-10-06.csv' 파일 저장 완료!
✅ '2025-10-05.csv' 파일 저장 완료!
✅ '2025-10-04.csv' 파일 저장 완료!
✅ '2025-10-03.csv' 파일 저장 완료!
✅ '2025-10-02.csv' 파일 저장 완료!

--- 📂 파일 처리 시작: KOBIS_일별_박스오피스_2025-10-10 (4).xls ---
✅ '2025-09-10.csv' 파일 저장 완료!
✅ '2025-09-09.csv' 파일 저장 완료!
✅ '2025-09-08.csv' 파일 저장 완료!
✅ '2025-09-07.csv' 파일 저장 완료!
✅ '2025-09-06.csv' 파일 저장 완료!
✅ '2025-09-05.csv' 파일 저장 완료!
✅ '2025-09-04.csv' 파일 저장 완료!

--- 📂 파일 처리 시작: KOBIS_일별_박스오피스_2025-10-10 (3).xls ---
✅ '2025-09-17.csv' 파일 저장 완료!
✅ '2025-09-16.csv' 파일 저장 완료!
✅ '2025-09-15.csv' 파일 저장 완료!
✅ '2025-09-14.csv' 파일 저장 완료!
✅ '2025-09-13.csv' 파일 저장 완료!
✅ '2025-09-12.csv' 파일 저장 완료!
✅ '2025-09-11.csv' 파일 저장 완료!

--- 📂 파일 처리 시작: KOBIS_일별_박스오피스_2025-10-10 (2).xls ---
✅ '2025-09-24.csv' 파일 저장 완료!
✅ '2025-09-23.csv' 파일 저장 완료!
✅ '2025-09-22.csv' 파일 저장 완료!
✅ '2025-09-21.csv' 파일 저장 완료!
✅ '2025-09-20.csv' 파일 저장 완료!
✅

In [75]:
# Read every per-day CSV, tag its rows with the date taken from the file name
# (the name without its extension), and stack everything into one frame.
csv_files = glob.glob('2025*.csv')
dataframes = [
    pd.read_csv(csv_path).assign(
        **{'일자': os.path.splitext(os.path.basename(csv_path))[0]}
    )
    for csv_path in csv_files
]
merged_df = pd.concat(dataframes, ignore_index=True)

In [76]:
# Keep only the title, cumulative audience and date columns.
merged_df = merged_df.loc[:, ['영화명', '누적관객수', '일자']]

In [80]:
# One row per film: order the rows by date, then take the last entry per
# title (groupby(...).last() picks the last non-null value of each column).
by_date = merged_df.sort_values('일자')
latest_df = by_date.groupby('영화명', as_index=False).last()
latest_df = latest_df.loc[:, ['영화명', '누적관객수', '일자']]
latest_df

Unnamed: 0,영화명,누적관객수,일자
0,#진상을 말씀드립니다,6853,2025-10-09
1,007 살인번호,13695,2025-10-09
2,10 라이브즈,60486,2025-09-10
3,100 미터.,13985,2025-10-09
4,10년만에 만난 선생님에게 조련 당한 여자,1,2025-10-01
...,...,...,...
910,휴가,7802,2025-09-23
911,흑인 물건 맛에 빠져버린 인기 블로거,1,2025-10-01
912,희생,15737,2025-09-30
913,히어,15630,2025-09-23


In [84]:
# Join the 5m and 10k prediction tables on title, then attach each film's
# latest cumulative audience. The left join keeps films that have no
# box-office row; their audience columns are NaN.
result = pd.merge(result_5m, result_10k, on='Movie_Title', how='inner')
result = result.merge(latest_df, left_on='Movie_Title', right_on='영화명', how='left')

# Ground-truth label derived from the cumulative audience:
#   1 -> below 10,000
#   2 -> above 5,000,000
#   0 -> anything in between
audience = result["누적관객수"]
result["answer"] = 0
result.loc[audience < 10_000, "answer"] = 1
result.loc[audience > 5_000_000, "answer"] = 2

# A NaN audience fails both comparisons above and would otherwise stay 0,
# making an unmatched title indistinguishable from a real mid-range film.
# Use a nullable integer column so those rows show <NA> instead.
result["answer"] = result["answer"].astype("Int64")
result.loc[audience.isna(), "answer"] = pd.NA

In [85]:
# Display the comparison without the helper columns from the box-office join
# (`result` itself is left unchanged).
result.drop(['영화명', '누적관객수', '일자'], axis=1)

Unnamed: 0,Movie_Title,cb_5m_pred,cb_5m_proba,lgbm_5m_pred,lgbm_5m_proba,cb_10k_pred,cb_10k_proba,lgbm_10k_pred,lgbm_10k_proba,answer
0,F1 더 무비,0,0.0001,0,0.0024,0,0.0,0,0.0012,2
1,강령: 귀신놀이,0,0.0,0,0.0024,0,0.0,0,0.0007,0
2,극장판 귀멸의 칼날: 무한성편,0,0.0041,0,0.1967,0,0.0,0,0.0013,2
3,긴키 지방의 어느 장소에 대하여,0,0.0,0,0.0024,0,0.0,0,0.0012,0
4,꼬마마법사 주니토니,0,0.0,0,0.0024,0,0.0,0,0.0012,0
5,"나의 아픈, 사랑이야기",0,0.0,0,0.0024,0,0.0182,0,0.0402,0
6,너는 나를 불태워,0,0.0,0,0.0024,1,1.0,1,0.9984,1
7,노바디2,0,0.0,0,0.0024,0,0.0,0,0.0009,0
8,노이즈,0,0.0027,0,0.0051,0,0.0,0,0.0009,0
9,더 폴: 디렉터스 컷,0,0.0,0,0.0024,0,0.0,0,0.001,0
