# 영화 관객 수 데이터를 활용한 데이터 분석

감독, 제목, 상영 등급, 스태프 수 등의 정보로 영화 관객 수를 예측하는 모델을 만들어 보자.

In [1]:
import pandas as pd
import lightgbm as lgb

In [2]:
# Load the training set, the test set and the submission template in one pass.
train, test, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '영화 관객수/movies_train.csv',
        '영화 관객수/movies_test.csv',
        '영화 관객수/submission.csv',
    )
)

In [3]:
# Preview the first rows of the training data.
train.head()

Unnamed: 0,title,distributor,genre,release_time,time,screening_rat,director,dir_prev_bfnum,dir_prev_num,num_staff,num_actor,box_off_num
0,개들의 전쟁,롯데엔터테인먼트,액션,2012-11-22,96,청소년 관람불가,조병옥,,0,91,2,23398
1,내부자들,(주)쇼박스,느와르,2015-11-19,130,청소년 관람불가,우민호,1161602.5,2,387,3,7072501
2,은밀하게 위대하게,(주)쇼박스,액션,2013-06-05,123,15세 관람가,장철수,220775.25,4,343,4,6959083
3,나는 공무원이다,(주)NEW,코미디,2012-07-12,101,전체 관람가,구자홍,23894.0,2,20,6,217866
4,불량남녀,쇼박스(주)미디어플렉스,코미디,2010-11-04,108,15세 관람가,신근호,1.0,1,251,2,483387


In [4]:
# Preview the first rows of the test data (same columns, minus the target box_off_num).
test.head()

Unnamed: 0,title,distributor,genre,release_time,time,screening_rat,director,dir_prev_bfnum,dir_prev_num,num_staff,num_actor
0,용서는 없다,시네마서비스,느와르,2010-01-07,125,청소년 관람불가,김형준,300529.0,2,304,3
1,아빠가 여자를 좋아해,(주)쇼박스,멜로/로맨스,2010-01-14,113,12세 관람가,이광재,342700.2,4,275,3
2,하모니,CJ 엔터테인먼트,드라마,2010-01-28,115,12세 관람가,강대규,4206611.0,3,419,7
3,의형제,(주)쇼박스,액션,2010-02-04,116,15세 관람가,장훈,691342.0,2,408,2
4,평행 이론,CJ 엔터테인먼트,공포,2010-02-18,110,15세 관람가,권호영,31738.0,1,380,1


In [5]:
# Preview the submission template: one row per test title with a box_off_num column to fill in.
submission.head()

Unnamed: 0,title,box_off_num
0,용서는 없다,0
1,아빠가 여자를 좋아해,0
2,하모니,0
3,의형제,0
4,평행 이론,0


In [6]:
# Print the (rows, columns) shape of each frame, in the order they were loaded.
for frame in (train, test, submission):
    print(frame.shape)

(600, 12)
(243, 11)
(243, 2)


In [17]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric training columns.
train.describe()

Unnamed: 0,time,dir_prev_bfnum,dir_prev_num,num_staff,num_actor,box_off_num
count,600.0,270.0,600.0,600.0,600.0,600.0
mean,100.9,1050442.9,0.9,151.1,3.7,708181.8
std,18.1,1791408.3,1.2,165.7,2.4,1828005.9
min,45.0,1.0,0.0,0.0,0.0,1.0
25%,89.0,20380.0,0.0,17.0,2.0,1297.2
50%,100.0,478423.6,0.0,82.5,3.0,12591.0
75%,114.0,1286568.6,2.0,264.0,4.0,479886.8
max,180.0,17615314.0,5.0,869.0,25.0,14262766.0


In [16]:
# Render floats with one decimal place whenever pandas displays them.
pd.set_option('display.float_format', '{:.1f}'.format)

In [19]:
# Mean audience count per genre, ordered from the lowest to the highest mean.
genre_mean = train.groupby('genre')[['box_off_num']].mean()
genre_mean.sort_values('box_off_num')

Unnamed: 0_level_0,box_off_num
genre,Unnamed: 1_level_1
뮤지컬,6627.0
다큐멘터리,67172.3
서스펜스,82611.0
애니메이션,181926.7
멜로/로맨스,425968.0
미스터리,527548.2
공포,590832.5
드라마,625689.8
코미디,1193914.0
SF,1788345.7


# 전처리

In [21]:
# Count the missing values in every training column.
train.isna().sum()

title               0
distributor         0
genre               0
release_time        0
time                0
screening_rat       0
director            0
dir_prev_bfnum    330
dir_prev_num        0
num_staff           0
num_actor           0
box_off_num         0
dtype: int64

In [23]:
# dir_prev_bfnum is the only column with missing values (330 of 600 training
# rows); replace them with 0.
# NOTE(review): presumably NaN means the director has no earlier film
# (dir_prev_num == 0) - confirm before treating it as a true zero.
#
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the selection: the in-place form operates on an
# intermediate object (chained assignment), which raises a warning on recent
# pandas versions and leaves the DataFrame unchanged under Copy-on-Write.
train['dir_prev_bfnum'] = train['dir_prev_bfnum'].fillna(0)
test['dir_prev_bfnum'] = test['dir_prev_bfnum'].fillna(0)

# 변수 선택 및 모델 구축

In [24]:
# Baseline LightGBM regressor: up to 1000 trees are built one after another,
# with a fixed seed so repeated runs give the same result.
model = lgb.LGBMRegressor(
    n_estimators=1000,
    random_state=777,
)

In [25]:
# Baseline inputs: four numeric columns that contain no missing values.
features = [
    'time',
    'dir_prev_num',
    'num_staff',
    'num_actor',
]
# Kept as a one-element list, so train[target] selects a single-column DataFrame.
target = ['box_off_num']

In [27]:
# Model inputs for both sets, plus the training target.
X_train = train[features]
X_test = test[features]
y_train = train[target]

# 모델 학습 및 검증

a. lightGBM (base model)  
b. k-fold lightGBM (k-fold model)  
c. feature engineering (fe)  
d. grid search (hyperparameter tuning)  

### LightGBM
- 부스팅 모델
- 이전 모델이 잘 맞히지 못한 데이터에 가중치를 두어, 다음 모델이 순차적으로 학습을 이어 나감
- 타 부스팅 모델에 비해 속도가 빠름

In [28]:
# Fit the baseline model on the whole training set (no validation split is used here).
model.fit(X_train, y_train)

LGBMRegressor(n_estimators=1000, random_state=777)

In [29]:
# Work on a copy so the submission template itself stays untouched.
singleLGBM = submission.copy()

In [30]:
# Fill the target column with the baseline model's predictions for the test set.
# NOTE(review): the displayed result contains negative audience counts;
# consider clipping the predictions at 0 before submitting.
singleLGBM['box_off_num'] = model.predict(X_test)

In [31]:
# Display the filled-in baseline submission.
singleLGBM

Unnamed: 0,title,box_off_num
0,용서는 없다,2817995.2
1,아빠가 여자를 좋아해,375377.2
2,하모니,-569324.3
3,의형제,1581189.0
4,평행 이론,-527780.6
...,...,...
238,해에게서 소년에게,500784.4
239,울보 권투부,1013858.4
240,어떤살인,1682067.7
241,말하지 못한 비밀,300216.3


In [32]:
# Save the baseline submission without the index column.
singleLGBM.to_csv('singleLGBM.csv', index=False)

### k-fold 교차검증 (k-fold cross validation)
- 과대적합 방지에 도움 (학습에 쓰지 않은 폴드로 일반화 성능을 검증)
- 모델 간 일반적인 성능 비교

In [33]:
from sklearn.model_selection import KFold

In [34]:
# 5-fold splitter; rows are shuffled with a fixed seed so the folds are reproducible.
k_fold = KFold(
    n_splits=5,
    shuffle=True,
    random_state=777,
)

In [39]:
# Train one independent LightGBM model per cross-validation fold.
#
# A new estimator is created inside the loop on purpose: fit() returns the
# estimator itself, so refitting one shared object and appending the result
# would fill `models` with five references to the same (last-fold) model, and
# the later "mean over folds" would just be that single model's prediction.
models = []

for train_idx, val_idx in k_fold.split(X_train):
    # Positional row selection for this fold's training / validation parts.
    x_t = X_train.iloc[train_idx]
    y_t = y_train.iloc[train_idx]
    x_val = X_train.iloc[val_idx]
    y_val = y_train.iloc[val_idx]

    model = lgb.LGBMRegressor(random_state=777, n_estimators=1000)
    # Stop when the validation loss has not improved for 100 rounds; log
    # every 100th round.
    # NOTE(review): newer LightGBM releases replace these fit() keywords with
    # callbacks (lgb.early_stopping / lgb.log_evaluation) - confirm against
    # the installed version.
    model.fit(x_t, y_t, eval_set=(x_val, y_val),
              early_stopping_rounds=100,
              verbose=100)
    models.append(model)

Training until validation scores don't improve for 100 rounds
[100]	valid_0's l2: 2.70572e+12
Early stopping, best iteration is:
[6]	valid_0's l2: 2.45438e+12
Training until validation scores don't improve for 100 rounds
[100]	valid_0's l2: 3.90847e+12
Early stopping, best iteration is:
[33]	valid_0's l2: 3.72825e+12
Training until validation scores don't improve for 100 rounds
[100]	valid_0's l2: 3.50344e+12
Early stopping, best iteration is:
[8]	valid_0's l2: 2.58737e+12
Training until validation scores don't improve for 100 rounds
[100]	valid_0's l2: 1.45977e+12
Early stopping, best iteration is:
[11]	valid_0's l2: 1.26226e+12
Training until validation scores don't improve for 100 rounds
[100]	valid_0's l2: 1.77214e+12
Early stopping, best iteration is:
[22]	valid_0's l2: 1.57631e+12


In [41]:
# Predict the test set once with every entry of `models`.
# The comprehension uses its own loop name, so the notebook-level `model`
# variable is no longer rebound as a side effect of this cell.
preds = [fold_model.predict(X_test) for fold_model in models]
len(preds)

5

In [45]:
# Fresh copy of the submission template for the k-fold result.
kfoldLightGBM = submission.copy()

In [46]:
import numpy as np

In [47]:
# Element-wise mean over the prediction arrays in `preds` (axis 0 runs over the list entries).
kfoldLightGBM['box_off_num'] = np.mean(preds, axis=0)

In [48]:
# Save the k-fold submission without the index column.
kfoldLightGBM.to_csv('kfoldLightGBM.csv', index=False)

### feature engineering

In [49]:
from sklearn import preprocessing

# Learn the genre -> integer code mapping from the training data, then
# overwrite the text column with those codes.
le = preprocessing.LabelEncoder().fit(train['genre'])
train['genre'] = le.transform(train['genre'])

In [50]:
# Encode the test genres with the mapping fitted on the training data.
# NOTE(review): transform() is expected to fail on a genre that never occurs
# in train - confirm the test set has none.
test['genre'] = le.transform(test['genre'])

In [51]:
# Extended inputs: the four baseline columns plus the filled dir_prev_bfnum
# and the integer-encoded genre.
features = [
    'time',
    'dir_prev_num',
    'num_staff',
    'num_actor',
    'dir_prev_bfnum',
    'genre',
]

In [52]:
# Rebuild the model inputs with the extended feature list; the target is unchanged.
X_train = train[features]
X_test = test[features]
y_train = train[target]

In [53]:
# Train one independent LightGBM model per fold on the extended feature set.
#
# A new estimator is created inside the loop on purpose: fit() returns the
# estimator itself, so refitting one shared object and appending the result
# would fill `models` with five references to the same (last-fold) model, and
# the later "mean over folds" would just be that single model's prediction.
models = []

for train_idx, val_idx in k_fold.split(X_train):
    # Positional row selection for this fold's training / validation parts.
    x_t = X_train.iloc[train_idx]
    y_t = y_train.iloc[train_idx]
    x_val = X_train.iloc[val_idx]
    y_val = y_train.iloc[val_idx]

    model = lgb.LGBMRegressor(random_state=777, n_estimators=1000)
    # Stop when the validation loss has not improved for 100 rounds; log
    # every 100th round.
    # NOTE(review): newer LightGBM releases replace these fit() keywords with
    # callbacks (lgb.early_stopping / lgb.log_evaluation) - confirm against
    # the installed version.
    model.fit(x_t, y_t, eval_set=(x_val, y_val),
              early_stopping_rounds=100,
              verbose=100)
    models.append(model)

Training until validation scores don't improve for 100 rounds
[100]	valid_0's l2: 2.62067e+12
Early stopping, best iteration is:
[9]	valid_0's l2: 2.42668e+12
Training until validation scores don't improve for 100 rounds
[100]	valid_0's l2: 4.39227e+12
Early stopping, best iteration is:
[23]	valid_0's l2: 3.97173e+12
Training until validation scores don't improve for 100 rounds
[100]	valid_0's l2: 3.29841e+12
Early stopping, best iteration is:
[10]	valid_0's l2: 2.53643e+12
Training until validation scores don't improve for 100 rounds
[100]	valid_0's l2: 1.56499e+12
Early stopping, best iteration is:
[16]	valid_0's l2: 1.21201e+12
Training until validation scores don't improve for 100 rounds
[100]	valid_0's l2: 1.60118e+12
Early stopping, best iteration is:
[29]	valid_0's l2: 1.47528e+12


In [54]:
# Predict the test set once with every entry of `models`.
# The comprehension uses its own loop name, so the notebook-level `model`
# variable is no longer rebound as a side effect of this cell.
preds = [fold_model.predict(X_test) for fold_model in models]
len(preds)

5

In [55]:
# Fresh copy of the submission template for the feature-engineering result.
feLightGBM = submission.copy()

In [56]:
# Element-wise mean over the prediction arrays in `preds` (axis 0 runs over the list entries).
feLightGBM['box_off_num'] = np.mean(preds, axis=0)

In [57]:
# Save the feature-engineering submission without the index column.
feLightGBM.to_csv('feLightGBM.csv', index=False)

### Grid Search

In [58]:
from sklearn.model_selection import GridSearchCV

In [60]:
# Search grid: 3 learning rates x 3 min_child_samples values = 9 combinations.
params = {
    'learning_rate': [0.1, 0.01, 0.001],
    'min_child_samples': [10, 20, 30],
}

# Base estimator; everything not listed in the grid stays fixed during the search.
model = lgb.LGBMRegressor(n_estimators=1000, random_state=777)

# Rank the combinations by negative mean squared error, cross-validated with
# the shared k_fold splitter.
gs = GridSearchCV(
    model,
    params,
    scoring='neg_mean_squared_error',
    cv=k_fold,
)

In [61]:
# Run the search on the training data.
gs.fit(X_train, y_train)

GridSearchCV(cv=KFold(n_splits=5, random_state=777, shuffle=True),
             estimator=LGBMRegressor(n_estimators=1000, random_state=777),
             param_grid={'learning_rate': [0.1, 0.01, 0.001],
                         'min_child_samples': [10, 20, 30]},
             scoring='neg_mean_squared_error')

In [62]:
# Show the parameter combination the search selected.
gs.best_params_

{'learning_rate': 0.001, 'min_child_samples': 30}

In [66]:
# Model configured with the values reported by the grid search
# (learning_rate=0.001, min_child_samples=30) and a tree budget of 10000.
model = lgb.LGBMRegressor(
    learning_rate=0.001,
    min_child_samples=30,
    n_estimators=10000,
    random_state=777,
)

In [67]:
from sklearn.base import clone

# Train one independent copy of the tuned model per cross-validation fold.
#
# Each fold fits clone(model) - a new, unfitted estimator with the same
# hyper-parameters - instead of `model` itself: fit() returns the estimator
# it was called on, so refitting the shared object and appending the result
# would fill `models` with five references to the same (last-fold) model, and
# the later "mean over folds" would just be that single model's prediction.
models = []

for train_idx, val_idx in k_fold.split(X_train):
    # Positional row selection for this fold's training / validation parts.
    x_t = X_train.iloc[train_idx]
    y_t = y_train.iloc[train_idx]
    x_val = X_train.iloc[val_idx]
    y_val = y_train.iloc[val_idx]

    fold_model = clone(model)
    # Stop when the validation loss has not improved for 100 rounds; log
    # every 100th round.
    # NOTE(review): newer LightGBM releases replace these fit() keywords with
    # callbacks (lgb.early_stopping / lgb.log_evaluation) - confirm against
    # the installed version.
    fold_model.fit(x_t, y_t, eval_set=(x_val, y_val),
                   early_stopping_rounds=100,
                   verbose=100)
    models.append(fold_model)

Training until validation scores don't improve for 100 rounds
[100]	valid_0's l2: 2.6981e+12
[200]	valid_0's l2: 2.62546e+12
[300]	valid_0's l2: 2.56718e+12
[400]	valid_0's l2: 2.51397e+12
[500]	valid_0's l2: 2.47666e+12
[600]	valid_0's l2: 2.45634e+12
[700]	valid_0's l2: 2.44526e+12
[800]	valid_0's l2: 2.43828e+12
[900]	valid_0's l2: 2.42849e+12
[1000]	valid_0's l2: 2.42601e+12
[1100]	valid_0's l2: 2.42741e+12
Early stopping, best iteration is:
[1006]	valid_0's l2: 2.42583e+12
Training until validation scores don't improve for 100 rounds
[100]	valid_0's l2: 5.39165e+12
[200]	valid_0's l2: 5.12207e+12
[300]	valid_0's l2: 4.89262e+12
[400]	valid_0's l2: 4.69817e+12
[500]	valid_0's l2: 4.53817e+12
[600]	valid_0's l2: 4.40999e+12
[700]	valid_0's l2: 4.32616e+12
[800]	valid_0's l2: 4.25195e+12
[900]	valid_0's l2: 4.19287e+12
[1000]	valid_0's l2: 4.14293e+12
[1100]	valid_0's l2: 4.10072e+12
[1200]	valid_0's l2: 4.06185e+12
[1300]	valid_0's l2: 4.02619e+12
[1400]	valid_0's l2: 3.99487e+12
[1

In [68]:
# Predict the test set once with every entry of `models`.
# The comprehension uses its own loop name, so the notebook-level `model`
# variable is no longer rebound as a side effect of this cell.
preds = [fold_model.predict(X_test) for fold_model in models]
len(preds)

5

In [69]:
# Fresh copy of the submission template for the grid-search result.
gsLightGBM = submission.copy()

In [70]:
# Element-wise mean over the prediction arrays in `preds` (axis 0 runs over the list entries).
gsLightGBM['box_off_num'] = np.mean(preds, axis=0)

In [71]:
# Save the grid-search submission without the index column.
gsLightGBM.to_csv('gsLightGBM.csv', index=False)