In [3]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from tensorflow.keras.callbacks import EarlyStopping

# Load the movie dataset from a fixed local path and replace every missing
# value with 0 (the fill value is 0, applied to all columns).
# NOTE(review): this also zero-fills a missing '총관객수' target, which then
# becomes log1p(0) = 0 below — confirm that is intended.
data = pd.read_csv('/Users/sopung/Desktop/movie_clean_new5_bc_utf.csv').fillna(0)

# Regression target: log1p of the total audience count column ('총관객수').
y = data['총관객수'].pipe(np.log1p)

# Input feature columns. The order is significant: any positional row that is
# later handed to the scaler/model must list its values in this same order.
selected_features = [
    '계절성여부', '상영시간(분)', '스크린수',
    '비평가 점수', '관객 평가(네티즌평가)',
    '전체관람가', '12세이상관람가', '15세이상관람가',
    '배우파워1', '배우파워2', '감독파워',
    '다양성(독립)영화',
    '액션', '멜로/로맨스', '판타지', '드라마', '코미디', '어드벤처',
    '전쟁', '스릴러', '다큐멘터리', '공포(호러)', '애니메이션', '사극',
    'SF', '범죄', '미스터리', '뮤지컬', '가족', '서부극(웨스턴)',
    '공연', '성인물(에로)', '기타',
    '다국적',
    '배급사파워1',
]

# Feature matrix restricted to the selected columns.
X = data.loc[:, selected_features]

# Audience-count cut-off used to derive the binary "box-office success" label.
threshold = 100000

# Binary label: 1 when the raw total audience count exceeds the threshold.
y_class = (data['총관객수'] > threshold).astype(int)

# Split BEFORE scaling. The split depends only on the number of rows and
# random_state, so the train/test row assignment is the same as when the
# already-scaled matrix was split.
X_train_raw, X_test_raw, y_train_reg, y_test_reg, y_train_class, y_test_class = train_test_split(
    X, y, y_class, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only so that test-set means/variances
# do not leak into preprocessing, then apply the same transform to the test
# rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept for backward compatibility with code that still reads X_scaled: the
# full feature matrix transformed with the train-fitted scaler.
X_scaled = scaler.transform(X)

# Regression model: a fully connected network that halves its width at every
# hidden layer (4096 -> 64), with 20% dropout after each hidden layer and a
# single linear output unit predicting the log1p-transformed audience count.
regression_model = keras.Sequential([
    keras.layers.Dense(4096, activation='relu', input_shape=(X_train.shape[1],)),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(2048, activation='relu'),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(1024, activation='relu'),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(512, activation='relu'),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(256, activation='relu'),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(128, activation='relu'),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(64, activation='relu'),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(1),
])

# Train the regression model. A validation split was already being held out
# but nothing acted on it; early stopping now halts training once the
# validation loss stops improving and restores the best weights instead of
# keeping whatever the final epoch produced.
regression_early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True,
)
regression_model.compile(loss='mean_squared_error', optimizer='adam')
regression_model.fit(
    X_train,
    y_train_reg,
    epochs=200,  # upper bound; early stopping usually ends training sooner
    batch_size=64,
    validation_split=0.2,
    callbacks=[regression_early_stopping],
    verbose=1,
)

# Evaluate the regression model on the held-out test set.
# Note the scales: `mse` is computed on the log1p scale (the model's native
# output), while `mae` and `r2` are computed after mapping both targets and
# predictions back to raw audience counts with expm1.
y_pred_reg = regression_model.predict(X_test)
mse = mean_squared_error(y_test_reg, y_pred_reg)
mae = mean_absolute_error(np.expm1(y_test_reg), np.expm1(y_pred_reg))
r2 = r2_score(np.expm1(y_test_reg), np.expm1(y_pred_reg))

# Binary classification model: same hidden stack as the regression network
# (4096 -> 64 with 20% dropout after each hidden layer), ending in a single
# sigmoid unit that outputs the probability of the positive class.
classification_model = keras.Sequential([
    keras.layers.Dense(4096, activation='relu', input_shape=(X_train.shape[1],)),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(2048, activation='relu'),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(1024, activation='relu'),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(512, activation='relu'),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(256, activation='relu'),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(128, activation='relu'),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(64, activation='relu'),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(1, activation='sigmoid'),
])

# Train the classifier. The held-out validation split is now used for early
# stopping so training ends when validation loss stops improving, and the
# best-performing weights are restored rather than the last epoch's.
classification_early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True,
)
classification_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
classification_model.fit(
    X_train,
    y_train_class,
    epochs=100,  # upper bound; early stopping usually ends training sooner
    batch_size=64,
    validation_split=0.2,
    callbacks=[classification_early_stopping],
    verbose=1,
)

# Evaluate the classifier on the test set: probabilities above 0.5 are mapped
# to class 1, everything else to class 0.
y_pred_class = (classification_model.predict(X_test) > 0.5).astype(int)
accuracy = accuracy_score(y_test_class, y_pred_class)
precision = precision_score(y_test_class, y_pred_class)
recall = recall_score(y_test_class, y_pred_class)
f1 = f1_score(y_test_class, y_pred_class)

print(f'Binary Classification Accuracy: {accuracy}')
print(f'Binary Classification Precision: {precision}')
print(f'Binary Classification Recall: {recall}')
print(f'Binary Classification F1 Score: {f1}')

# Regression metrics computed earlier in the script (mse is on the log1p
# scale; mae and r2 are on the raw audience-count scale).
print(f'Regression Mean Squared Error: {mse}')
print(f'Regression Mean Absolute Error: {mae}')
print(f'Regression R2 Score: {r2}')

# Audience-count prediction
def predict_audience(new_data):
    """Predict raw audience counts for new feature rows.

    The rows are standardised with the module-level ``scaler``, passed through
    the module-level ``regression_model`` (which predicts on the log1p scale),
    and mapped back to audience counts with ``np.expm1``.

    Args:
        new_data: Feature rows accepted by ``scaler.transform``.
            NOTE(review): presumably one row per movie with columns in the
            same order as the training features — confirm against callers.

    Returns:
        ``np.expm1`` applied to whatever ``regression_model.predict`` returns
        for the scaled rows.
    """
    scaled_features = scaler.transform(new_data)
    log_prediction = regression_model.predict(scaled_features)
    return np.expm1(log_prediction)

# Example input: a single row of feature values, listed in the same order as
# `selected_features`.
new_data = np.array([[0, 120, 9, 8.5, 9.05, 0, 0, 0, 92105, 224182, 162517, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]])

# Label the row with the training column names before prediction. The scaler
# is fitted on a DataFrame, so a bare ndarray triggers scikit-learn's
# "X does not have valid feature names" warning; building the frame also
# fails loudly if the number of values does not match the feature list.
new_data_frame = pd.DataFrame(new_data, columns=selected_features)
predicted_audience = predict_audience(new_data_frame)

# The prediction is a one-element array; pull out the scalar so the message
# shows a plain number instead of an array repr such as "[12345.6]".
predicted_audience_value = float(np.ravel(predicted_audience)[0])
print(f'Predicted Audience: {predicted_audience_value}')

# Box-office success prediction: compare the predicted count with `threshold`.
predicted_success = (predicted_audience > threshold).astype(int)
# Reduce to a plain bool so the conditional below does not rely on the
# truthiness of a NumPy array.
is_predicted_success = bool(np.ravel(predicted_success)[0] == 1)
print(f'Predicted Success: {"Yes" if is_predicted_success else "No"}')


Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

