### 1. 데이터 로드 및 모델 학습

In [None]:
# 데이터 분석
import pandas as pd
import numpy as np
import pickle
import os

# 모델 학습
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report

# 불균형 데이터
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

# 모델 선정 -> 불균형 데이터에 강력한 트리 기반 모델
from lightgbm import LGBMClassifier

# --- Load the data and split it
# Read the feature-engineered dataset (path is relative to the working directory).
df = pd.read_csv('./data/netflix_feature_engineered.csv')

# Binarize the target: 'Yes' becomes 1, every other value (missing included) becomes 0.
# NOTE(review): labels other than 'Yes' are folded into 0 without any warning -
# confirm the column only ever holds the expected two labels.
df['Churn'] = df['Churn'].eq('Yes').astype('int64')

y = df['Churn']
X = df.drop(columns='Churn')

# Hold out 30% of the rows for evaluation; stratifying on y keeps the class
# ratio the same in both splits, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

# --- Build the model pipeline
# Columns are routed by dtype: numeric ones are standardized, object ones are
# one-hot encoded.
numerical_features = list(X.select_dtypes(include=np.number).columns)
categorical_features = list(X.select_dtypes(include='object').columns)

scaling_step = ('num', StandardScaler(), numerical_features)
# handle_unknown='ignore' lets categories unseen during fit be encoded
# instead of raising at predict time.
encoding_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)

preprocessor = ColumnTransformer(
    transformers=[scaling_step, encoding_step],
    # Columns matched by neither selector are forwarded untouched.
    remainder='passthrough',
)

# NOTE(review): SMOTE already equalizes the classes before the classifier sees
# them (the recorded fit log reports identical positive/negative counts), so
# class_weight='balanced' presumably adds nothing here - confirm before removing.
model = LGBMClassifier(random_state=42, class_weight='balanced')

# imblearn's Pipeline is used so the SMOTE sampler can sit between the
# preprocessing step and the classifier.
pipeline = ImbPipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('smote', SMOTE(random_state=42)),
        ('classifier', model),
    ]
)

# --- Train the model
# NOTE(review): the recorded output of this cell shows a UnicodeDecodeError
# raised in a subprocess reader thread during fit; training still completed.
# Presumably a helper subprocess emitted non-UTF-8 console output - confirm.
pipeline.fit(X_train, y_train)

Exception in thread Thread-4 (_readerthread):
Traceback (most recent call last):
  File "c:\Users\Playdata\anaconda3\envs\ml_env\Lib\threading.py", line 1075, in _bootstrap_inner
    self.run()
  File "c:\Users\Playdata\anaconda3\envs\ml_env\Lib\site-packages\ipykernel\ipkernel.py", line 772, in run_closure
    _threading_Thread_run(self)
  File "c:\Users\Playdata\anaconda3\envs\ml_env\Lib\threading.py", line 1012, in run
    self._target(*self._args, **self._kwargs)
  File "c:\Users\Playdata\anaconda3\envs\ml_env\Lib\subprocess.py", line 1599, in _readerthread
    buffer.append(fh.read())
                  ^^^^^^^^^
  File "<frozen codecs>", line 322, in decode
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xc0 in position 4: invalid start byte


[LightGBM] [Info] Number of positive: 74803, number of negative: 74803
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.020170 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8367
[LightGBM] [Info] Number of data points in the train set: 149606, number of used features: 34
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


0,1,2
,steps,"[('preprocessor', ...), ('smote', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,sampling_strategy,'auto'
,random_state,42
,k_neighbors,5

0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.1
,n_estimators,100
,subsample_for_bin,200000
,objective,
,class_weight,'balanced'
,min_split_gain,0.0
,min_child_weight,0.001


### 2. 예측 및 평가 시행

In [2]:
# --- Predict and evaluate
# Score the held-out split with the fitted pipeline.
y_pred = pipeline.predict(X_test)

# Series copy of the predictions; it is not referenced again in the visible cells.
y_pred_ser = pd.Series(y_pred)

# Per-class precision / recall / F1 on the test split.
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.21      0.71      0.32      3875
           1       0.95      0.68      0.79     32059

    accuracy                           0.68     35934
   macro avg       0.58      0.69      0.56     35934
weighted avg       0.87      0.68      0.74     35934





### 3. 학습된 모델과 데이터 저장 (필요시) 

In [3]:
# --- Persist the fitted pipeline and the data split
# 1. Make sure the output directory exists (no error if it is already there).
output_dir = './model'
os.makedirs(output_dir, exist_ok=True)

# 2. Decide where each artifact goes and bundle the train/test split.
pipeline_path = os.path.join(output_dir, 'churn_lgbm_pipeline.pkl')
data_path = os.path.join(output_dir, 'churn_data_split.pkl')
data_to_save = dict(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

# 3. Pickle the pipeline first, then the split, one file per artifact.
for target_path, payload in ((pipeline_path, pipeline), (data_path, data_to_save)):
    with open(target_path, 'wb') as fh:
        pickle.dump(payload, fh)

### 4. 로컬에 저장된 모델 로드 후 사용

In [4]:
# --- Reload the saved pipeline and data split, then re-evaluate
target_dir = './model'
data_path = os.path.join(target_dir, 'churn_data_split.pkl')
pipeline_path = os.path.join(target_dir, 'churn_lgbm_pipeline.pkl')

# NOTE: unpickling can execute arbitrary code, so only load files that were
# produced by a trusted source.
with open(pipeline_path, 'rb') as pipeline_file:
    loaded_pipeline = pickle.load(pipeline_file)

with open(data_path, 'rb') as data_file:
    loaded_data = pickle.load(data_file)

# Only the test split is needed to re-check the restored pipeline.
loaded_X_test, loaded_y_test = loaded_data['X_test'], loaded_data['y_test']
loaded_pred = loaded_pipeline.predict(loaded_X_test)

# Print the evaluation report followed by the raw prediction array.
print(classification_report(loaded_y_test, loaded_pred))
print(loaded_pred)

              precision    recall  f1-score   support

           0       0.21      0.71      0.32      3875
           1       0.95      0.68      0.79     32059

    accuracy                           0.68     35934
   macro avg       0.58      0.69      0.56     35934
weighted avg       0.87      0.68      0.74     35934

[1 0 0 ... 1 0 1]


