### 1. 데이터 로드 및 모델 학습

In [None]:
# 데이터 분석
import pandas as pd
import numpy as np
import pickle
import os

# 모델 학습
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report

# 불균형 데이터
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

# 모델 선정 -> 불균형 데이터에 강력한 트리 기반 모델
from lightgbm import LGBMClassifier

# --- Load the feature-engineered dataset and split it into train/test sets
df = pd.read_csv('./data/netflix_feature_engineered.csv')

# Encode the target as integers: 'Yes' -> 1, any other value -> 0.
# A vectorized comparison replaces the per-row Python lambda.
df['Churn'] = (df['Churn'] == 'Yes').astype('int64')

# Columns excluded from the feature set.
drop_cols = [
    'Region',
    'Device Used Most Often',
    'Payment History (On-Time/Delayed)',
    'Monthly Income ($)',
    'Number of Profiles Created',
    'Age_group',
    'Support Queries Logged',
    'Daily Watch Time (Hours)',
]

# errors='ignore' skips any listed column that is absent from the CSV, which
# also means a misspelled name here is silently kept as a feature.
df = df.drop(columns=drop_cols, errors='ignore')

X = df.drop(columns='Churn')
y = df['Churn']

# Stratify on the target so both partitions keep the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# --- Build the modelling pipeline
# Feature groups are chosen by dtype: numeric columns are standardized,
# object-dtype columns are one-hot encoded.
numerical_features = list(X.select_dtypes(include=np.number).columns)
categorical_features = list(X.select_dtypes(include='object').columns)

column_steps = [
    ('num', StandardScaler(), numerical_features),
    # handle_unknown='ignore': categories not seen during fit do not raise
    # at transform time.
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
]

# Columns that fall into neither group are passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=column_steps,
    remainder='passthrough',
)

# NOTE(review): the SMOTE step below already equalizes the class counts
# before the classifier is fit, so class_weight='balanced' presumably adds
# nothing on top of it - confirm before relying on either one alone.
model = LGBMClassifier(random_state=42, class_weight='balanced')

# The imblearn Pipeline is used (rather than sklearn's) because it accepts a
# resampling step such as SMOTE between the preprocessor and the classifier.
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('classifier', model),
]
pipeline = ImbPipeline(steps=pipeline_steps)

# --- Train the model
pipeline.fit(X_train, y_train)

[LightGBM] [Info] Number of positive: 74803, number of negative: 74803
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004241 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2339
[LightGBM] [Info] Number of data points in the train set: 149606, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
              precision    recall  f1-score   support

           0       0.20      0.68      0.31      3875
           1       0.95      0.67      0.78     32059

    accuracy                           0.67     35934
   macro avg       0.57      0.68      0.55     35934
weighted avg       0.87      0.67      0.73     35934





### 2. 예측 및 평가 시행

In [None]:
# --- Predict on the held-out split and print per-class metrics
y_pred = pipeline.predict(X_test)

# NOTE(review): this Series gets a fresh 0..n-1 index, so it lines up with
# y_test by position only, not by index label - confirm that is intended
# wherever y_pred_ser is used.
y_pred_ser = pd.Series(data=y_pred)

report = classification_report(y_test, y_pred)
print(report)

### 3. 학습된 모델과 데이터 저장 (필요시) 

In [None]:
# --- Persist the fitted pipeline and the train/test split
# 1. Make sure the output directory exists (no error if it already does)
output_dir = './model'
os.makedirs(output_dir, exist_ok=True)

# 2. Serialize the fitted pipeline (preprocessor + SMOTE + classifier)
pipeline_path = os.path.join(output_dir, 'churn_lgbm_pipeline.pkl')
with open(pipeline_path, 'wb') as pipeline_file:
    pickle.dump(pipeline, pipeline_file)

# 3. Serialize the exact split so later evaluation uses the same rows
data_path = os.path.join(output_dir, 'churn_data_split.pkl')
data_to_save = dict(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)
with open(data_path, 'wb') as data_file:
    pickle.dump(data_to_save, data_file)

### 4. 로컬에 저장된 모델 로드 후 사용

In [None]:
# --- Reload the saved pipeline and data split, then re-run the evaluation
target_dir = './model'
pipeline_path = os.path.join(target_dir, 'churn_lgbm_pipeline.pkl')
data_path = os.path.join(target_dir, 'churn_data_split.pkl')

# SECURITY: pickle.load can execute arbitrary code contained in the file.
# Only load pickles that were written by a trusted source.
with open(pipeline_path, 'rb') as pipeline_file:
    loaded_pipeline = pickle.load(pipeline_file)

with open(data_path, 'rb') as data_file:
    loaded_data = pickle.load(data_file)

# Only the test partition is needed to reproduce the evaluation.
loaded_X_test, loaded_y_test = loaded_data['X_test'], loaded_data['y_test']
loaded_pred = loaded_pipeline.predict(loaded_X_test)

print(classification_report(loaded_y_test, loaded_pred))
print(loaded_pred)

              precision    recall  f1-score   support

           0       0.20      0.68      0.31      3875
           1       0.95      0.67      0.78     32059

    accuracy                           0.67     35934
   macro avg       0.57      0.68      0.55     35934
weighted avg       0.87      0.67      0.73     35934

[1 0 0 ... 1 0 1]


