In [21]:
import os

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder


# =============================================
# 1. Load the source dataset (enhanced_data_not_clean_FE_delete.csv)
# =============================================
df = pd.read_csv("../data/enhanced_data_not_clean_FE_delete.csv")   # 8000 × 26


# =============================================
# 2. Missing-value handling (rule-based)
# =============================================
# NOTE(review): the medians below are computed on the full dataset, i.e.
# before the train/test split further down — confirm this is acceptable.

# 2-1. Usage metrics: fill gaps with the column median
for usage_col in ("listening_time", "songs_played_per_day"):
    df[usage_col] = df[usage_col].fillna(df[usage_col].median())

# 2-2. Payment-failure / crash counters: a missing value is filled with 0
# 2-3. Boolean event flags: a missing value is filled with False
constant_fills = {
    "payment_failure_count": 0,
    "app_crash_count_30d": 0,
    "customer_support_contact": False,
    "promotional_email_click": False,
}
for filled_col, fill_value in constant_fills.items():
    df[filled_col] = df[filled_col].fillna(fill_value)


# =============================================
# 3. IQR-based winsorizing (outlier capping)
# =============================================
# NOTE(review): the clipping bounds are derived from the full dataset, before
# the train/test split performed later — confirm this is acceptable.

# Every int64/float64 column except the identifier and the target.
num_cols = [
    col for col in df.columns
    if df[col].dtype in ["int64", "float64"]
    and col not in ["user_id", "is_churned"]
]

for col in num_cols:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1

    # When Q1 == Q3 (e.g. 0/1 flags or zero-inflated counters) the bounds
    # collapse to a single value and clipping would overwrite the whole
    # column with that constant, so such columns are left untouched.
    # `not IQR > 0` also skips a NaN IQR (column with no valid values).
    if not IQR > 0:
        continue

    # Tukey fences: values beyond 1.5 * IQR from the quartiles are capped.
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR

    df[col] = df[col].clip(lower, upper)


# =============================================
# 4. Preprocessing pipeline (TargetEncoder replaced by one-hot encoding)
# =============================================

# Numeric inputs: every int64/float64 column except the target and the id.
numerical_features = [
    name
    for name, dtype in df.dtypes.items()
    if dtype in ["int64", "float64"] and name not in ["is_churned", "user_id"]
]

# Categorical inputs; "country" used to be target-encoded and is now
# one-hot encoded like the others.
categorical_features = ["gender", "device_type", "subscription_type", "country"]

# Numeric branch: median imputation only.
numeric_pipeline = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="median"))]
)

# Categorical branch: most-frequent imputation, then one-hot encoding that
# ignores categories not seen during fit.
categorical_imputer = SimpleImputer(strategy="most_frequent")
categorical_encoder = OneHotEncoder(handle_unknown="ignore")
categorical_pipeline = Pipeline(
    steps=[("imputer", categorical_imputer), ("ohe", categorical_encoder)]
)

# Columns not listed in either feature list are not passed to any branch.
column_branches = [
    ("num", numeric_pipeline, numerical_features),
    ("cat_ohe", categorical_pipeline, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_branches)


# =============================================
# 5. Train/test split + preprocessing helper
# =============================================
def preprocess_and_split(df, test_size=0.2, random_state=42):
    """Split ``df`` into train/test sets and run both through ``preprocessor``.

    The target column is ``is_churned``; the split is stratified on it.
    The module-level ``preprocessor`` is fitted in place on the training
    part only and then applied to the test part.

    Args:
        df: DataFrame containing the feature columns and ``is_churned``.
        test_size: Forwarded to ``train_test_split``.
        random_state: Forwarded to ``train_test_split``.

    Returns:
        Tuple ``(X_train_processed, X_test_processed, y_train, y_test,
        preprocessor)`` where the last item is the same module-level
        ``preprocessor`` object, now fitted.
    """
    target = df["is_churned"]
    features = df.drop(columns=["is_churned"])

    split_parts = train_test_split(
        features,
        target,
        test_size=test_size,
        random_state=random_state,
        stratify=target,
    )
    features_train, features_test, target_train, target_test = split_parts

    # Fit on the training part only, then reuse the fitted state for the test part.
    train_matrix = preprocessor.fit_transform(features_train, target_train)
    test_matrix = preprocessor.transform(features_test)

    return train_matrix, test_matrix, target_train, target_test, preprocessor


# =============================================
# 6. Run the split + preprocessing
# =============================================
(
    X_train_processed,
    X_test_processed,
    y_train,
    y_test,
    preprocess_model,
) = preprocess_and_split(df)

print("완료!")
# Report the shape of each resulting piece, in the same order as before.
for shape_label, piece in (
    ("X_train shape:", X_train_processed),
    ("y_train shape:", y_train),
    ("X_test shape:", X_test_processed),
    ("y_test shape:", y_test),
):
    print(shape_label, piece.shape)

완료!
X_train shape: (6400, 33)
y_train shape: (6400,)
X_test shape: (1600, 33)
y_test shape: (1600,)


In [None]:
# Shapes of the processed splits. `X_train` / `X_test` only exist as locals
# inside preprocess_and_split, so the notebook-level names returned by it
# (X_train_processed / X_test_processed) are used here.
print("X_train:", X_train_processed.shape)
print("X_test :", X_test_processed.shape)
print("y_train:", y_train.shape)
print("y_test :", y_test.shape)

In [None]:
import pickle

# Directory the pickled artifacts are written to / read from
save_path = (r"../data/")

# =========================
# 1️⃣ Save the data splits and the preprocessor
# =========================
def save_processed_data(X_train, X_test, y_train, y_test, preprocessor, save_path=save_path):
    """Pickle the four data splits and the preprocessor into ``save_path``.

    Writes ``X_train_processed.pkl``, ``X_test_processed.pkl``,
    ``y_train.pkl``, ``y_test.pkl`` and ``preprocessor.pkl``. The target
    directory is created first if it does not exist yet.

    Args:
        X_train: Object stored as ``X_train_processed.pkl``.
        X_test: Object stored as ``X_test_processed.pkl``.
        y_train: Object stored as ``y_train.pkl``.
        y_test: Object stored as ``y_test.pkl``.
        preprocessor: Object stored as ``preprocessor.pkl``.
        save_path: Target directory; defaults to the module-level ``save_path``.

    Returns:
        None. Prints a confirmation message once all files are written.
    """
    # An empty path means "current directory"; os.makedirs("") would raise.
    if save_path:
        os.makedirs(save_path, exist_ok=True)

    artifacts = {
        'X_train_processed.pkl': X_train,
        'X_test_processed.pkl': X_test,
        'y_train.pkl': y_train,
        'y_test.pkl': y_test,
        'preprocessor.pkl': preprocessor,
    }
    for filename, artifact in artifacts.items():
        with open(os.path.join(save_path, filename), 'wb') as f:
            pickle.dump(artifact, f)

    print("✅ 데이터와 전처리 객체 저장 완료!")


In [23]:
# Persist the processed splits and the fitted preprocessor using the default save_path.
save_processed_data(X_train_processed, X_test_processed, y_train, y_test, preprocess_model)

✅ 데이터와 전처리 객체 저장 완료!


In [None]:
# =========================
# 2️⃣ Load the data splits and the preprocessor
# =========================
def load_processed_data(save_path=save_path):
    """Load the pickled data splits and preprocessor from ``save_path``.

    Reads ``X_train_processed.pkl``, ``X_test_processed.pkl``,
    ``y_train.pkl``, ``y_test.pkl`` and ``preprocessor.pkl``.

    Unpickling can execute arbitrary code, so only point this at files
    written by a trusted source.

    Args:
        save_path: Directory containing the pickle files; defaults to the
            module-level ``save_path``.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test, preprocessor)``.

    Raises:
        FileNotFoundError: If any of the expected files is missing.
    """
    # Order matters: it defines the order of the returned tuple.
    filenames = (
        'X_train_processed.pkl',
        'X_test_processed.pkl',
        'y_train.pkl',
        'y_test.pkl',
        'preprocessor.pkl',
    )

    loaded = []
    for filename in filenames:
        with open(os.path.join(save_path, filename), 'rb') as f:
            loaded.append(pickle.load(f))
    X_train, X_test, y_train, y_test, preprocessor = loaded

    print("✅ 데이터와 전처리 객체 불러오기 완료!")
    return X_train, X_test, y_train, y_test, preprocessor