# 라이브러리

In [3]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier, Pool
import joblib

# 데이터

In [4]:
data_predict_origin = pd.read_csv("../model/model_predict_data.csv", encoding="utf-8-sig")

In [5]:
data_predict_origin.head()

Unnamed: 0,Movie_Title,wk1_Audience,wk1_AudiencePerShow,wk2_Audience,wk2_AudiencePerShow,Show_Change,opening_Ho_Retention,wk1_Holiday_AudienceMean,wk1_Holiday_ShowMean,wk2_Holiday_AudienceMean,...,e247,e248,e249,e250,e251,e252,e253,e254,e255,e256
0,F1 더 무비,478085.0,18.453893,498575,21.791818,0.88312,0.919323,140722.5,5263.0,129369.5,...,-0.000246,-0.020752,0.010559,-0.007599,-0.025391,0.008606,-0.003235,0.008972,0.028931,0.066406
1,강령: 귀신놀이,41447.0,20.733867,25043,14.01399,0.893947,0.426405,9892.0,375.0,4218.0,...,0.004852,-0.045166,-0.014221,0.001099,-0.017456,-0.019287,-0.007996,-0.032715,0.034668,0.003586
2,극장판 귀멸의 칼날: 무한성편,1623200.0,63.005085,1504807,30.722887,1.901176,0.575226,552628.5,8626.5,317886.5,...,-0.00705,-0.024902,-0.024536,0.004028,-0.035645,0.007935,-0.028687,0.023193,0.028076,-0.001259
3,긴키 지방의 어느 장소에 대하여,101649.0,32.506876,86929,20.677688,1.34442,0.766366,24380.0,664.333333,18684.0,...,-0.014587,-0.011475,-0.038818,0.00885,-0.039551,-0.00135,0.004028,-0.023438,0.036621,0.013672
4,꼬마마법사 주니토니,11403.0,22.625,23195,21.697848,2.121032,0.638604,5701.5,252.0,3641.0,...,0.013184,-0.046875,0.000866,0.047119,-0.024902,-0.006378,0.009399,0.004211,0.004547,0.02832


# I. 1만명 이진분류

## 1. 이진분류 모델

CatBoost를 사용

In [6]:
cb_10k = CatBoostClassifier()
cb_10k.load_model("../model/이진분류/catboost_model_10k.cbm")

<catboost.core.CatBoostClassifier at 0x11b8101a0>

사용되는 변수명 확인

In [7]:
cb_10k_features = list(cb_10k.feature_names_)
print(cb_10k_features)

['wk1_Audience', 'wk2_Audience', 'wk1_Holiday_AudienceMean', 'wk2_Holiday_AudienceMean', 'e1', 'e2', 'e3', 'e4', 'e5', 'e6', 'e7', 'e8', 'e9', 'e10', 'e11', 'e12', 'e13', 'e14', 'e15', 'e16', 'e17', 'e18', 'e19', 'e20', 'e21', 'e22', 'e23', 'e24', 'e25', 'e26', 'e27', 'e28', 'e29', 'e30', 'e31', 'e32']


범주형 변수는 사용되지 않는다

## 2. 예측 진행

In [8]:
data_cb_10k = data_predict_origin[cb_10k_features] #사용되는 변수만으로 데이터셋 생성
cb_10k_pool = Pool(data_cb_10k)
cb_10k_pred = cb_10k.predict(cb_10k_pool)
cb_10k_proba = cb_10k.predict_proba(cb_10k_pool)[:, 1]

## 3. 예측 결과

In [9]:
result_10k = pd.DataFrame({
    'Movie_Title': data_predict_origin['Movie_Title'],
    'cb_10k_pred': cb_10k_pred,
    'cb_10k_proba': np.round(cb_10k_proba, 4),
})
result_10k

Unnamed: 0,Movie_Title,cb_10k_pred,cb_10k_proba
0,F1 더 무비,0,0.0
1,강령: 귀신놀이,0,0.0
2,극장판 귀멸의 칼날: 무한성편,0,0.0
3,긴키 지방의 어느 장소에 대하여,0,0.0
4,꼬마마법사 주니토니,0,0.0
5,"나의 아픈, 사랑이야기",0,0.0182
6,너는 나를 불태워,1,1.0
7,노바디2,0,0.0
8,노이즈,0,0.0
9,더 폴: 디렉터스 컷,0,0.0


## 4. 결론

In [10]:
under_10k_movie = result_10k.query('cb_10k_pred == 1')["Movie_Title"].values.tolist()
under_10k_movie

['너는 나를 불태워', '봄밤', '제프 맥페트리지: 드로잉 라이프']

`너는 나를 불태워`, `봄밤`, `제프 맥페트리지: 드로잉 라이프`영화는 총 관람객수가 1만명을 넘지 못할것으로 예상되어진다

# II. 500만명 이진분류

## 1. 총 관람객수 1만명 미만 예측 영화 제거

In [11]:
data_predict_5m = data_predict_origin[~data_predict_origin['Movie_Title'].isin(under_10k_movie)].copy()
data_predict_5m.reset_index(drop=True, inplace=True)

In [12]:
data_predict_5m.head()

Unnamed: 0,Movie_Title,wk1_Audience,wk1_AudiencePerShow,wk2_Audience,wk2_AudiencePerShow,Show_Change,opening_Ho_Retention,wk1_Holiday_AudienceMean,wk1_Holiday_ShowMean,wk2_Holiday_AudienceMean,...,e247,e248,e249,e250,e251,e252,e253,e254,e255,e256
0,F1 더 무비,478085.0,18.453893,498575,21.791818,0.88312,0.919323,140722.5,5263.0,129369.5,...,-0.000246,-0.020752,0.010559,-0.007599,-0.025391,0.008606,-0.003235,0.008972,0.028931,0.066406
1,강령: 귀신놀이,41447.0,20.733867,25043,14.01399,0.893947,0.426405,9892.0,375.0,4218.0,...,0.004852,-0.045166,-0.014221,0.001099,-0.017456,-0.019287,-0.007996,-0.032715,0.034668,0.003586
2,극장판 귀멸의 칼날: 무한성편,1623200.0,63.005085,1504807,30.722887,1.901176,0.575226,552628.5,8626.5,317886.5,...,-0.00705,-0.024902,-0.024536,0.004028,-0.035645,0.007935,-0.028687,0.023193,0.028076,-0.001259
3,긴키 지방의 어느 장소에 대하여,101649.0,32.506876,86929,20.677688,1.34442,0.766366,24380.0,664.333333,18684.0,...,-0.014587,-0.011475,-0.038818,0.00885,-0.039551,-0.00135,0.004028,-0.023438,0.036621,0.013672
4,꼬마마법사 주니토니,11403.0,22.625,23195,21.697848,2.121032,0.638604,5701.5,252.0,3641.0,...,0.013184,-0.046875,0.000866,0.047119,-0.024902,-0.006378,0.009399,0.004211,0.004547,0.02832


## 2. 이진분류 모델

In [13]:
svm_5m = joblib.load('../model/이진분류/svm_model_5m.joblib') #SVM 모델
scaler = joblib.load('../model/이진분류/svm_scaler.joblib') #표준화 모델
pca_1 = joblib.load('../model/이진분류/pca_1.joblib') #PCA 모델 1
pca_2 = joblib.load('../model/이진분류/pca_2.joblib') #PCA 모델 2

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


## 3. 데이터 전처리

In [14]:
# 1) 사용할 컬럼
use_cols = ['wk1_Audience', 'wk1_AudiencePerShow', 
            'wk2_Audience', 'wk2_AudiencePerShow', 'Show_Change', 'opening_Ho_Retention', 
            'wk1_Holiday_AudienceMean', 'wk1_Holiday_ShowMean', 'wk2_Holiday_AudienceMean', 
            'wk2_Holiday_ShowMean', 'opening_AudienceStd',
            'dist_big_flop', 'dist_big_hit', 'dist_small_flop', 'dist_small_hit',
            'Month', 'Pandemic', 'Grade', 'Main_Country']
data_svm = data_predict_5m[use_cols].copy()

In [15]:
# 2) 수치형 변수 로그변환
log_cols = ['wk1_Audience', 'wk1_AudiencePerShow', 
            'wk2_Audience', 'wk2_AudiencePerShow', 'Show_Change', 'opening_Ho_Retention', 
            'wk1_Holiday_AudienceMean', 'wk1_Holiday_ShowMean', 'wk2_Holiday_AudienceMean', 
            'wk2_Holiday_ShowMean', 'opening_AudienceStd']
data_svm[log_cols] = np.log1p(data_svm[log_cols])

In [16]:
# 3) 수치형 변수 표준화
data_svm[log_cols] = scaler.transform(data_svm[log_cols])

In [17]:
# 4) 범주형 변수 원핫인코딩
data_svm = pd.get_dummies(data_svm, columns=['Month', 'Pandemic', 'Grade', 'Main_Country'])
## 부족한 원핫 인코딩 생성
result_onehot_cols = ['Month_2', 'Month_3', 'Month_4', 'Month_5', 'Month_6', 'Month_7', 'Month_8', 'Month_9', 'Month_10', 'Month_11', 'Month_12', 'Pandemic_1', 'Pandemic_2', 'Grade_15세이상관람가', 'Grade_전체관람가', 'Grade_청소년관람불가', 'Main_Country_미국', 'Main_Country_일본', 'Main_Country_한국']
missing_cols = [c for c in result_onehot_cols if c not in data_svm.columns]
if missing_cols:
    data_svm[missing_cols] = 0
## 원핫인코딩 타입 변환
onehot_cols = ['Month', 'Pandemic', 'Grade', 'Main_Country']
onehot_columns = [col for col in data_svm.columns if any(prefix in col for prefix in ['Month_', 'Pandemic_', 'Grade_', 'Main_Country_'])]
data_svm[onehot_columns] = data_svm[onehot_columns].astype(int)

In [18]:
# 5) PCA 변환
## PCA 라이브러리를 안불러와도 됨
pca_1_cols = ['wk2_AudiencePerShow', 'wk1_AudiencePerShow']
pca_2_cols = ['wk1_Holiday_AudienceMean','wk2_Holiday_AudienceMean', 'wk1_Audience', 'opening_AudienceStd', 'wk2_Audience', 'wk2_Holiday_ShowMean', 'wk1_Holiday_ShowMean']
df_pca_1 = pd.DataFrame(pca_1.transform(data_svm[pca_1_cols]), columns=["PC1"], index=data_svm.index)
df_pca_2 = pd.DataFrame(pca_2.transform(data_svm[pca_2_cols]), columns=["PC2"], index=data_svm.index)
data_svm.drop(columns=pca_1_cols + pca_2_cols, inplace=True)
data_svm = pd.concat([data_svm, df_pca_1, df_pca_2], axis=1)

In [19]:
# 6) 불필요한 변수 제거
origin_train_cols = ['Show_Change', 'opening_Ho_Retention', 'dist_big_flop', 'dist_big_hit','dist_small_flop', 'dist_small_hit', 'Month_2', 'Month_3', 'Month_4','Month_5', 'Month_6', 'Month_7', 'Month_8', 'Month_9', 'Month_10','Month_11', 'Month_12', 'Pandemic_1', 'Pandemic_2', 'Grade_15세이상관람가','Grade_전체관람가', 'Grade_청소년관람불가', 'Main_Country_미국', 'Main_Country_일본','Main_Country_한국', 'PC1', 'PC2']
data_svm.drop(columns=['Grade_12세이상관람가', 'Main_Country_기타'], inplace=True)
data_svm = data_svm[origin_train_cols].copy()


## 4. 예측 결과

In [20]:
svm_5m_pred = svm_5m.predict(data_svm)
result_5m = pd.DataFrame({
    'Movie_Title': data_predict_5m['Movie_Title'],
    'cb_5m_pred': svm_5m_pred
})

result_5m

Unnamed: 0,Movie_Title,cb_5m_pred
0,F1 더 무비,0
1,강령: 귀신놀이,0
2,극장판 귀멸의 칼날: 무한성편,0
3,긴키 지방의 어느 장소에 대하여,0
4,꼬마마법사 주니토니,0
5,"나의 아픈, 사랑이야기",0
6,노바디2,0
7,노이즈,0
8,더 폴: 디렉터스 컷,0
9,독립군: 끝나지 않은 전쟁,0


## 5. 결론

In [21]:
over_5m_movie = result_5m.query('cb_5m_pred == 1')["Movie_Title"].values.tolist()
over_5m_movie

['좀비딸']

`좀비딸`영화의 총 관람객수가 500만명을 넘을것으로 예상되어진다.

# III. 총 관객수 예측

## 1. 총 관람객수 500만명 이상 예측 영화 제거

In [22]:
data_predict_final = data_predict_5m[~data_predict_5m['Movie_Title'].isin(over_5m_movie)].copy()
data_predict_final.reset_index(drop=True, inplace=True)
data_predict_final.head()

Unnamed: 0,Movie_Title,wk1_Audience,wk1_AudiencePerShow,wk2_Audience,wk2_AudiencePerShow,Show_Change,opening_Ho_Retention,wk1_Holiday_AudienceMean,wk1_Holiday_ShowMean,wk2_Holiday_AudienceMean,...,e247,e248,e249,e250,e251,e252,e253,e254,e255,e256
0,F1 더 무비,478085.0,18.453893,498575,21.791818,0.88312,0.919323,140722.5,5263.0,129369.5,...,-0.000246,-0.020752,0.010559,-0.007599,-0.025391,0.008606,-0.003235,0.008972,0.028931,0.066406
1,강령: 귀신놀이,41447.0,20.733867,25043,14.01399,0.893947,0.426405,9892.0,375.0,4218.0,...,0.004852,-0.045166,-0.014221,0.001099,-0.017456,-0.019287,-0.007996,-0.032715,0.034668,0.003586
2,극장판 귀멸의 칼날: 무한성편,1623200.0,63.005085,1504807,30.722887,1.901176,0.575226,552628.5,8626.5,317886.5,...,-0.00705,-0.024902,-0.024536,0.004028,-0.035645,0.007935,-0.028687,0.023193,0.028076,-0.001259
3,긴키 지방의 어느 장소에 대하여,101649.0,32.506876,86929,20.677688,1.34442,0.766366,24380.0,664.333333,18684.0,...,-0.014587,-0.011475,-0.038818,0.00885,-0.039551,-0.00135,0.004028,-0.023438,0.036621,0.013672
4,꼬마마법사 주니토니,11403.0,22.625,23195,21.697848,2.121032,0.638604,5701.5,252.0,3641.0,...,0.013184,-0.046875,0.000866,0.047119,-0.024902,-0.006378,0.009399,0.004211,0.004547,0.02832


## 2. 데이터 전처리

In [23]:
predict_data = data_predict_final.copy()

In [24]:
# 1) 임베딩 벡터 제거
e_list = [f"e{i}" for i in range(1, 256+1)]
predict_data.drop(columns=e_list, inplace=True)

In [25]:
# 2) 영화 제목 제거
predict_data.drop(columns=['Movie_Title'], inplace=True)

In [26]:
# 3) 수치형, 범주형 변수 분리
num_cols = ['wk1_Audience', 'wk1_AudiencePerShow', 'wk2_Audience', 'wk2_AudiencePerShow', 'Show_Change', 'opening_Ho_Retention', 'wk1_Holiday_AudienceMean', 'wk1_Holiday_ShowMean', 'wk2_Holiday_AudienceMean', 'wk2_Holiday_ShowMean', 'opening_AudienceStd', 'Year', 'dist_big_flop', 'dist_big_hit', 'dist_small_flop', 'dist_small_hit']
cat_cols = ['Month', 'Pandemic', 'Grade', 'Main_Country']

In [27]:
# 4) 범주형 변수 원핫 인코딩
predict_data = pd.get_dummies(predict_data, columns=cat_cols, drop_first=True)
onehot_columns = [col for col in predict_data.columns if any(prefix in col for prefix in ['Month_', 'Pandemic_', 'Grade_', 'Main_Country_'])]
predict_data[onehot_columns] = predict_data[onehot_columns].astype(int)

In [28]:
# 5) 연도 변환 -> 시간의 흐름으로 보고싶음
predict_data["Year"] = predict_data["Year"] - predict_data["Year"].min()

In [29]:
# 6) 변수 변환
## 로그변환
log_cols=['wk1_Audience','wk1_AudiencePerShow','wk2_Audience','wk2_AudiencePerShow','Show_Change','wk1_Holiday_AudienceMean','wk1_Holiday_ShowMean','wk2_Holiday_AudienceMean','wk2_Holiday_ShowMean','opening_AudienceStd']
## 제곱근변환
sqrt_cols=['opening_Ho_Retention']
## 변환하지 않음
none_cols=['Year','dist_big_flop', 'dist_big_hit', 'dist_small_flop', 'dist_small_hit']
## 원핫인코딩 변수
cat_cols=['Month_2', 'Month_3', 'Month_4', 'Month_5', 'Month_6', 'Month_7', 'Month_8', 'Month_9', 'Month_10', 'Month_11', 'Month_12', 'Pandemic_1', 'Pandemic_2', 'Grade_15세이상관람가', 'Grade_전체관람가', 'Grade_청소년관람불가', 'Main_Country_미국', 'Main_Country_일본', 'Main_Country_한국']

## 변환 진행
predict_data[log_cols] = predict_data[log_cols].astype("float64")
predict_data.loc[:, log_cols] = np.log1p(predict_data[log_cols])
predict_data.loc[:, sqrt_cols] = np.sqrt(predict_data[sqrt_cols])

In [30]:
# 7) 표준화 스케일링
## 스케일링 방법에 따른 컬럼 지정
standard_cols = [ 'wk1_Audience','wk2_AudiencePerShow','wk1_Holiday_AudienceMean', 'opening_AudienceStd','wk2_Audience', 'wk2_Holiday_ShowMean','wk1_AudiencePerShow', 'wk1_Holiday_ShowMean','wk2_Holiday_AudienceMean', 'Show_Change','Year','opening_Ho_Retention']
robust_cols = ['dist_big_flop', 'dist_big_hit', 'dist_small_flop', 'dist_small_hit']

## 스케일러 파일 불러오기
standard_scaler = joblib.load('standard_scaler.joblib')
robust_scaler = joblib.load('robust_scaler.joblib')

## 스케일링
predict_data[standard_cols] = standard_scaler.transform(predict_data[standard_cols])
predict_data[robust_cols] = robust_scaler.transform(predict_data[robust_cols])

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


## 3) 예측 결과

결정계수 값이 가장 높은 릿지 모델을 사용

In [31]:
# 1) 모델 불러오기
ridge_cv_loaded = joblib.load('ridge_cv_model.joblib')

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [32]:
# 2) 부족한 원핫인코딩 변수 채우기
missing_cols = set(ridge_cv_loaded.feature_names_in_) - set(predict_data.columns)
for col in missing_cols:
    predict_data[col] = 0
## 팬데믹 변수 생성
predict_data['Pandemic_2'] = 1

In [33]:
# 3) 변수 순서 정렬
predict_data = predict_data[ridge_cv_loaded.feature_names_in_]

In [34]:
# 3) 예측 진행 및 결과
ridge_cv_pred = ridge_cv_loaded.predict(predict_data)
final_result = pd.DataFrame({
    'Movie_Title': data_predict_final['Movie_Title'],
    'Total_Audience_Predict': np.expm1(ridge_cv_pred).round(0).astype(int)
})
final_result

Unnamed: 0,Movie_Title,Total_Audience_Predict
0,F1 더 무비,1375423
1,강령: 귀신놀이,58978
2,극장판 귀멸의 칼날: 무한성편,4585204
3,긴키 지방의 어느 장소에 대하여,120556
4,꼬마마법사 주니토니,2402
5,"나의 아픈, 사랑이야기",10514
6,노바디2,81217
7,노이즈,156050
8,더 폴: 디렉터스 컷,12606
9,독립군: 끝나지 않은 전쟁,9266


# IV. 총 관객수 100만명 이상 overestimate 검사

## 1. 총 관람객수 예측 100만명 이상 추출

In [35]:
predict_upper_1m_movie = final_result.query('Total_Audience_Predict >= 1_000_000')["Movie_Title"].values.tolist()
data_predict_upper_1m = data_predict_origin[data_predict_origin['Movie_Title'].isin(predict_upper_1m_movie)].copy()

## 2. 과대평가 예측 측정

In [36]:
# 1) threshold 기준 설정
rules = {
    # 'wk1_Audience': ('<', 835815),
    'Show_Change': ('>', 1.484),
    'opening_Ho_Retention': ('>', 0.876),
    # 'wk1_Holiday_ShowMean': ('<', 3346.25),
    # 'wk2_Holiday_AudienceMean': ('>', 217009)
}

# 2) 과대평가 점수 계산 함수 정의
def calc_overestimate(row, rules):
    score = 0
    for col, (op, thr) in rules.items():
        if op == '<' and row[col] < thr:
            score += 1
        elif op == '>' and row[col] > thr:
            score += 1

    if score>=3:
        return "overestimate"
    else:
        return "no problem"

In [37]:
# 3) 예측 결과
data_predict_upper_1m['Overestimate'] = data_predict_upper_1m.apply(calc_overestimate, axis=1, rules=rules)

In [38]:
data_predict_upper_1m[['Movie_Title', 'Overestimate']]

Unnamed: 0,Movie_Title,Overestimate
0,F1 더 무비,no problem
2,극장판 귀멸의 칼날: 무한성편,no problem
11,명탐정 코난: 척안의 잔상,no problem
23,전지적 독자 시점,no problem
27,킹 오브 킹스,no problem


과대평가된 영화는 없다 (과소평가는 신경쓰지 않는다.)