In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import random
import re
from xgboost import XGBClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE

# 엑셀 파일 불러오기
file_path = "../data/NewspaperChurn.xlsx"
df = pd.read_excel(file_path)

### 전처리

In [3]:
# columns drop
df = df.drop(['SubscriptionID', 'Ethnicity', 'Language', 'Address', 'State', 'City', 'County', 'Zip Code', 'Source Channel'], axis=1)

# Subscriber열 No : 1, Yes :0
def preprocess_subscriber(df):
    df['Subscriber'] = df['Subscriber'].map({'NO': 1, 'YES': 0})
    return df
    
# reward program 열 1 이상은 1, 미만은 0, 결측치도 0
def preprocess_reward_program(df):
    df['reward program'] = df['reward program'].apply(lambda x: 1 if x >= 1 else 0)
    return df
    
# 고객유형코드
def preprocess_nielsen_prizm(df):
    nielsen_mapping = {
        'FM': 0,  # Female Middle-aged
        'MW': 1,  # Male Working-age
        'MM': 2,  # Male Middle-aged
        'FW': 3,  # Female Working-age
        'YW': 4,  # Young Woman
        'YM': 5,  # Young Man
        'ME': 6,  # Male Elderly
        'FE': 7,  # Female Elderly
        'YE': 8   # Young Elderly
    }
    
    # 'Nielsen Prizm'을 매핑된 숫자로 변환
    df['Nielsen Prizm'] = df['Nielsen Prizm'].map(nielsen_mapping)
    
    # 129개의 결측치는 날림 (삭제) # 이거는 나중에 삭제 ?? # 위에서 따로
    df.dropna(subset=['Nielsen Prizm'], inplace=True)

    return df

# 2. 근로 중 여부 (중이면 1, 아니면 0)
def preprocess_working(df):
    df['Working'] = df['Nielsen Prizm'].apply(lambda x: 1 if x in [1, 3] else 0)
    return df

# 3. 성별 (남 : 0, 여: 1)
def preprocess_gender(df):
    # 기본 성별 처리
    df['Gender'] = df['Nielsen Prizm'].apply(lambda x: 0 if x in [1, 2, 5, 6] else 1)
    
    # 'YE' (Young Elderly) 처리: 25명 남성 (0), 26명 여성 (1) 랜덤으로 배분
    ye_indices = df[df['Nielsen Prizm'] == 8].index  # 'YE'에 해당하는 인덱스 찾기
    
    # 랜덤 시드를 고정하여 항상 동일한 결과가 나오도록 설정
    random.seed(42)  # 시드 값 (고정된 숫자)
    
    # 'YE'에 해당하는 인덱스의 길이에 맞는 gender_assignment 리스트 생성
    gender_assignment = [0] * (len(ye_indices) // 2) + [1] * (len(ye_indices) - len(ye_indices) // 2)  # 남성 25명, 여성 26명
    random.shuffle(gender_assignment)  # 섞기
    
    # YE 인덱스에 해당하는 성별 값을 할당
    df.loc[ye_indices, 'Gender'] = gender_assignment
    
    return df

# weekly fee
def preprocess_fee(value):
    if pd.isnull(value):
        return None
    
    match = re.findall(r'\d+\.\d+|\d+', str(value))
    num = [float(x) for x in match]

    if len(num) == 2:
        return (num[0] + num[1]) / 2
    elif len(num) == 1:
        return num[0]
    else:
        return None
    
def nan_fee(value):
    avg_fee = np.mean(df['weekly fee'])
    if pd.isnull(value):
        return avg_fee
    else:
        return value
        
# Deliveryperiod 전처리
def preprocess_deliveryperiod(df):
    # 온라인 배송 타입 정의
    online = {
        '7DayT', 'Fri-SunT', 'Sun-FriT', 'Thu-SunT', 
        'SoooTFST', 'Fri-SunT', 'SooooFST', 'SoooooST', 'SunOnlyT', 'SooooooT'
    }


    period_map = {
        '7Day': 7, '7DAY': 7, '7day': 7, '7DayOL': 7, '7DayT': 7,
        'Fri-Sun': 6, 'Fri-SunT': 6, 'Sun-Fri': 6, 'Sun-FriT': 6,
        'oMTWTFo': 5, 'Mon-Fri': 5,
        'Thu-Sun': 4, 'THU-SUN': 4, 'thu-sun': 4, 'SoooTFS': 4, 'SoooTFST': 4, 'Thu-SunT': 4,
        'SooooFS': 3, 'SooooFST': 3, 'Fri-SunT': 3, 'Fri-Sun': 3,
        'SatSun': 2, 'SoooooS': 2, 'SoooooST': 2,
        'SunOnly': 1, 'SunOnlyT': 1, 'sunonly': 1, 'SUNONLY': 1, 'Soooooo': 1, 'SooooooT': 1
    }

    df['Is_Online'] = df['Deliveryperiod'].apply(lambda x: 1 if x in online else 0)
    df['Deliveryperiod'] = df['Deliveryperiod'].map(period_map)

    return df

            
# dummy for children (자녀가 있으면 1, 없으면 0)       
def preprocess_dummy(df):
    df['dummy for Children'] = np.where(df['dummy for Children'] == 'Y', 1, 0)
    return df
    
    
#hh income 전처리
def preprocess_hh_income(df):
    """
    HH Income 열을 전처리하여:
    1. `plus`나 `under`가 포함된 값을 평균값으로 대체
    2. 범위 데이터(`$30,000 - $39,999`)는 평균값으로 변환
    3. 기호(`$`, `,`) 제거
    """
    # 1. "plus" 또는 "under"가 포함된 행을 마스킹
    mask = df['HH Income'].str.contains('plus|under', case=False, na=False)
    
    # 2. 범위 데이터를 평균값으로 변환 (plus/under 제외)
    def process_value(value):
        try:
            # 기호 제거
            clean_value = str(value).replace('$', '').replace(',', '')
            
            # 범위 처리 (예: "30000-39999")
            if '-' in clean_value:
                low, high = map(int, clean_value.split('-'))
                return (low + high) / 2
            # 숫자 처리
            elif clean_value.isdigit():
                return float(clean_value)
            else:
                return np.nan  # plus/under는 NaN 처리
        except:
            return np.nan
    
    # 3. 전체 열을 숫자로 변환 (plus/under는 NaN)
    df['HH Income_Processed'] = df['HH Income'].apply(process_value)
    
    # 4. 평균값 계산 (plus/under 제외)
    valid_mean = df.loc[~mask, 'HH Income_Processed'].mean()
    
    # 5. plus/under가 포함된 행을 평균값으로 대체
    df.loc[mask, 'HH Income_Processed'] = valid_mean
    
    # 6. 원본 열 대체
    df['HH Income'] = df['HH Income_Processed'].round(0).astype(int)
    df = df.drop('HH Income_Processed', axis=1)
    
    return df


#Home Ownership 전처리
def preprocess_home_ownership(df):
    df['Home Ownership'] = df['Home Ownership'].map({'OWNER': 1, 'RENTER': 0})
    return df

def preprocess_age(value):
    if pd.isnull(value):
        return None
    
    match = re.findall(r'\d[\d,]*', str(value))
    num = [int(x.replace(',', '')) for x in match]

    if len(num) == 2:
        return (num[0] + num[1]) / 2
    elif len(num) == 1:
        return num[0]
    else:
        return None
    
def nan_age(value):
    avg_age = np.mean(df['Age range'])
    if pd.isnull(value):
        return avg_age
    else:
        return value

df['Age range'] = df['Age range'].apply(preprocess_age)
df['Age range'] = df['Age range'].apply(nan_age)
df['Age range'] = np.floor(df['Age range']).astype(int)
df.rename(columns={'Age range': 'Age'}, inplace=True)
df = preprocess_subscriber(df)
df = preprocess_reward_program(df)
df = preprocess_nielsen_prizm(df)
df = preprocess_working(df)
df = preprocess_gender(df)
df['weekly fee'] = df['weekly fee'].apply(preprocess_fee)
df['weekly fee'] = df['weekly fee'].apply(nan_fee)
df = preprocess_deliveryperiod(df)
df = preprocess_dummy(df)
df = preprocess_hh_income(df)
df = preprocess_home_ownership(df)

### 타깃 설정

In [4]:
target = df['Subscriber']
df = df.drop('Subscriber', axis=1)

### 모든 모델 평가

In [5]:
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# 데이터 분리
X_train, X_test, y_train, y_test = train_test_split(df, target, test_size=0.2, random_state=0)

# 1. 데이터 정규화
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Smote
smote_tomek = SMOTE(random_state=0)
X_train_scaled, y_train= smote_tomek.fit_resample(X_train_scaled, y_train)

# 3. 모델 학습 및 평가 함수 정의
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"AUC-ROC: {roc_auc:.4f}")
    print(classification_report(y_test, y_pred))

# 4. 로지스틱 회귀 (가중치 추가)
print("\n[로지스틱 회귀 (Cost-Sensitive)]")
lr_model = LogisticRegression(random_state=0, class_weight='balanced')
lr_model.fit(X_train_scaled, y_train)
evaluate_model(lr_model, X_test_scaled, y_test)

# 5. KNN 모델
print("\n[KNN]")
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train_scaled, y_train)
evaluate_model(knn_model, X_test_scaled, y_test)

# 6. 랜덤 포레스트 (가중치 추가)
print("\n[랜덤 포레스트 (Cost-Sensitive)]")
rf_model = RandomForestClassifier(random_state=0, class_weight='balanced')
rf_model.fit(X_train_scaled, y_train)
evaluate_model(rf_model, X_test_scaled, y_test)

# 7. XGBoost 모델 (가중치 추가)
print("\n[XGBoost (Cost-Sensitive)]")
xgb_model = XGBClassifier(random_state=0, eval_metric='logloss', scale_pos_weight=4)
xgb_model.fit(X_train_scaled, y_train)
evaluate_model(xgb_model, X_test_scaled, y_test)

# 8. 다중 퍼셉트론 (MLPClassifier, 가중치 추가)
print("\n[다중 퍼셉트론 (MLP, Cost-Sensitive)]")
mlp_model = MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=300, random_state=0)
mlp_model.fit(X_train_scaled, y_train)
evaluate_model(mlp_model, X_test_scaled, y_test)

# 9. SVM (가중치 추가)
print("\n[SVM (Cost-Sensitive)]")
svm_model = SVC(kernel='rbf', probability=True, class_weight='balanced', random_state=0)
svm_model.fit(X_train_scaled, y_train)
evaluate_model(svm_model, X_test_scaled, y_test)


[로지스틱 회귀 (Cost-Sensitive)]
Accuracy: 0.6809
Precision: 0.8771
Recall: 0.7084
F1 Score: 0.7838
AUC-ROC: 0.6333
              precision    recall  f1-score   support

           0       0.30      0.56      0.39       577
           1       0.88      0.71      0.78      2569

    accuracy                           0.68      3146
   macro avg       0.59      0.63      0.59      3146
weighted avg       0.77      0.68      0.71      3146


[KNN]
Accuracy: 0.6771
Precision: 0.8916
Recall: 0.6882
F1 Score: 0.7768
AUC-ROC: 0.6578
              precision    recall  f1-score   support

           0       0.31      0.63      0.42       577
           1       0.89      0.69      0.78      2569

    accuracy                           0.68      3146
   macro avg       0.60      0.66      0.60      3146
weighted avg       0.79      0.68      0.71      3146


[랜덤 포레스트 (Cost-Sensitive)]
Accuracy: 0.8147
Precision: 0.8796
Recall: 0.8957
F1 Score: 0.8876
AUC-ROC: 0.6749
              precision    recall 

### MLP 튜닝

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.neighbors import KNeighborsClassifier

# 데이터 분리
X_train, X_test, y_train, y_test = train_test_split(df, target, test_size=0.2, random_state=0)

scaler = StandardScaler()
X_resample_train_scaled = scaler.fit_transform(X_train)
X_resample_test_scaled = scaler.transform(X_test)

smote = SMOTE(random_state=42)
X_resample, y_resample = smote.fit_resample(X_resample_train_scaled, y_train)

# MLPClassifier를 위한 하이퍼파라미터 그리드
param_grid = {
    'hidden_layer_sizes': [(64, 32, 16), (128, 64, 32), (32, 16, 8)],  # 은닉층 구조 다양화
    'activation': ['relu', 'tanh'],
    'solver': ['adam', 'sgd'],   
    'learning_rate_init': [0.001, 0.005, 0.01],  # 학습률 튜닝
    'alpha': [0.0001, 0.001, 0.01],  # 정규화 강도
    'batch_size': [32, 64, 128],  # 미니 배치 크기 조정
    'max_iter': [500, 1000],  # 에포크 증가
}

# 모델 정의
mlp = MLPClassifier(early_stopping=True, random_state=0)

# 그리드 서치 설정
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
grid_search = GridSearchCV(mlp, param_grid, scoring='f1_weighted', cv=cv, n_jobs=-1, verbose=2)

# 모델 학습 및 최적 파라미터 탐색
grid_search.fit(X_resample, y_resample)

print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best F1-Score: {grid_search.best_score_:.4f}")

# 최적 모델로 예측
best_mlp = grid_search.best_estimator_
y_pred = best_mlp.predict(X_resample_test_scaled)

# 평가
accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy:", accuracy)
print(classification_report(y_test, y_pred))

### SVM 튜닝

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import classification_report, accuracy_score
from sklearn.svm import SVC

# 데이터 분리
X_train, X_test, y_train, y_test = train_test_split(df,target, test_size=0.2, random_state=0)

# 데이터 스케일링
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# SVM 하이퍼파라미터 그리드
param_grid = {
    'C': [0.1, 1, 10, 100],               # 규제 강도
    'kernel': ['linear', 'rbf', 'poly'],  # 커널 함수
    'gamma': ['scale', 'auto'],           # 감마 설정
    'degree': [2, 3],                     # 다항 커널 차수
}

# 모델 정의
svc = SVC(random_state=0)

# 그리드 서치 설정
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
grid_search = GridSearchCV(svc, param_grid, scoring='f1_weighted', cv=cv, n_jobs=-1, verbose=2)

# 모델 학습 및 최적 파라미터 탐색
grid_search.fit(X_train_scaled, y_train)

print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best F1-Score: {grid_search.best_score_:.4f}")

# 최적 모델로 예측
best_svc = grid_search.best_estimator_
y_pred = best_svc.predict(X_test_scaled)

# 평가
accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy:", accuracy)
print(classification_report(y_test, y_pred))

### Gradient Boosting 튜닝

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, accuracy_score

# 데이터 분리
X_train, X_test, y_train, y_test = train_test_split(df, target, test_size=0.2, random_state=42)

# 데이터 스케일링
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# SMOTE로 데이터 증강
smote = SMOTE(random_state=42)
X_resample, y_resample = smote.fit_resample(X_train_scaled, y_train)

# 1. 기본 Gradient Boosting 모델
gbc = GradientBoostingClassifier(random_state=42)
gbc.fit(X_resample, y_resample)
y_pred_gbc = gbc.predict(X_test_scaled)

print("\n[Gradient Boosting 기본 모델 평가]")
print("정확도:", accuracy_score(y_test, y_pred_gbc))
print(classification_report(y_test, y_pred_gbc))

# 2. 하이퍼파라미터 튜닝 (GridSearchCV)
param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'subsample': [0.8, 1.0],
    'max_features': ['auto', 'sqrt', 'log2']
}

grid_search = GridSearchCV(GradientBoostingClassifier(random_state=42),
                           param_grid, scoring='f1_weighted', cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_resample, y_resample)

print("\n[Gradient Boosting 최적 파라미터]")
print(grid_search.best_params_)
best_gbc = grid_search.best_estimator_

# 최적 모델로 예측
y_pred_best_gbc = best_gbc.predict(X_test_scaled)
print("\n[최적 Gradient Boosting 모델 평가]")
print("정확도:", accuracy_score(y_test, y_pred_best_gbc))
print(classification_report(y_test, y_pred_best_gbc))

### LightGBM 튜닝

In [None]:
# 필수 라이브러리 임포트
import lightgbm as lgb
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler

X_train, X_test, y_train, y_test = train_test_split(df, target, test_size=0.2, random_state=0)

scaler = StandardScaler()
X_resample_train_scaled = scaler.fit_transform(X_train)
X_resample_test_scaled = scaler.transform(X_test)

smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_resample_train_scaled, y_train)  # Fix: Resample both X_train and y_train together


# 4. LightGBM 모델 설정
lgbm = lgb.LGBMClassifier(random_state=0, is_unbalance=True)

# 5. 하이퍼파라미터 그리드 설정
param_grid = {
    'n_estimators': [100, 200, 300],        # 트리 개수
    'learning_rate': [0.01, 0.05, 0.1],     # 학습률
    'num_leaves': [31, 63, 127],            # 리프 노드 수
    'max_depth': [-1, 10, 20],              # 트리 깊이
    'scale_pos_weight': [1, 10, 20]         # 불균형 데이터 보정
}

# 6. 교차 검증 설정 (Stratified K-Fold 사용)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

# 7. 그리드 서치 설정
grid_search = GridSearchCV(
    lgbm, param_grid, scoring='f1_weighted', 
    cv=cv, n_jobs=-1, verbose=2
)

# 8. 모델 학습
grid_search.fit(X_train, y_train)

# 10. 최적 모델로 예측
best_lgbm = grid_search.best_estimator_
y_pred = best_lgbm.predict(X_resample_test_scaled)

# 11. 모델 평가
accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy:", accuracy)
print(classification_report(y_test, y_pred))

### KNN 튜닝

In [None]:
import hyperopt
import hyperopt.hp

def objective_knn(params):
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    model = KNeighborsClassifier(
        n_neighbors=int(params['n_neighbors']),
        weights=params['weights'],
        metric=params['metric']
    )
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    acc = accuracy_score(y_test, y_pred)
    
    return {'loss': -acc, 'status': hyperopt.STATUS_OK}

space = {
    'n_neighbors': hp.quniform('n_neighbors', 3, 20, 1),
    'weights': hp.choice('weights', ['uniform', 'distance']),
    'metric': hp.choice('metric', ['euclidean', 'manhattan']),
    'algorithm': hp.choice('algorithm', ['auto', 'ball_tree', 'kd_tree', 'brute'])
}

trials = Trials()
best = fmin(
    fn=objective_knn,
    space=space,
    algo=tpe.suggest,
    max_evals=100,
    trials=trials
)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

weights_options = ['uniform', 'distance']
metric_options = ['euclidean', 'manhattan']

best['n_neighbors'] = int(best['n_neighbors'])
best['weights'] = weights_options[best['weights']]
best['metric'] = metric_options[best['metric']]

print("Best Parameters:", best)

knn_best = KNeighborsClassifier(algorithm='auto', metric='euclidean', n_neighbors=13, weights='uniform')

knn_best.fit(X_train_scaled, y_train)
knn_best_pred = knn_best.predict(X_test_scaled)

print(accuracy_score(y_test, knn_best_pred))
print(classification_report(y_test, knn_best_pred))

### Booting 튜닝

In [None]:
# df 로드
import pandas as pd
import matplotlib.pyplot as plt

# 학습, 테스트 데이터 나누기
from sklearn.model_selection import train_test_split


X_train,X_test,y_train,y_test=train_test_split(df, target, test_size=0.2, random_state=42,stratify=target)


# 스케일러
from sklearn.preprocessing import StandardScaler

# 연속형 변수
continuous_cols = ['HH Income', 'Year Of Residence', 'Age', 'weekly fee', 'Deliveryperiod']
scaler=StandardScaler()
X_train[continuous_cols] = scaler.fit_transform(X_train[continuous_cols].astype(float))
X_test[continuous_cols] = scaler.transform(X_test[continuous_cols].astype(float))



# 학습 및 성능평가
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import numpy as np
from scipy.stats import randint, uniform
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score,precision_score,recall_score,f1_score


# 개별 모델들
rf = RandomForestClassifier(random_state=42)
xgb = XGBClassifier(eval_metric='logloss', random_state=42)
lgbm = LGBMClassifier(random_state=42)

# VotingClassifier에 모델들을 추가
vclf = VotingClassifier(estimators=[('rf', rf), ('xgb', xgb), ('lgbm', lgbm)], voting='soft')  # 또는 hard로 변경 가능

# 파라미터 지정
params = {
    'voting': ['soft', 'hard'],
    # RandomForest 하이퍼파라미터
    'rf__n_estimators': randint(100, 301), 
    'rf__max_depth': [3,4,5],  

    # XGBoost 하이퍼파라미터
    'xgb__max_depth': [3, 4, 5], 
    'xgb__learning_rate': uniform(0.01, 0.3),
    'xgb__reg_alpha': uniform(0.01, 0.3),  # L1 Regularization
    'xgb__reg_lambda': uniform(0.01, 0.3), # L2 Regularization  
    
    # LightGBM 하이퍼파라미터
    'lgbm__num_leaves': randint(20, 40),
    'lgbm__learning_rate': uniform(0.01, 0.3),
    'lgbm__lambda_l1': uniform(0.01, 0.3),  # L1 Regularization
    'lgbm__lambda_l2': uniform(0.01, 0.3),  # L2 Regularization  

}

# RandomizedSearchCV 사용
random_search = RandomizedSearchCV(vclf, params, n_iter=40, scoring='f1', cv=5, n_jobs=-1, random_state=42)
random_search.fit(X_train, y_train)

print("최적의 파라미터:", random_search.best_params_)
print("최적화된 f1-score:", random_search.best_score_)

# 최적의 모델로 예측
y_pred = random_search.best_estimator_.predict(X_test)
y_pred_prob=random_search.best_estimator_.predict_proba(X_test)


print("Accuracy:",accuracy_score(y_test,y_pred))
print("Precision:",precision_score(y_test,y_pred))
print("Recall:",recall_score(y_test,y_pred))
print("F1-score",f1_score(y_test,y_pred))

# classification_report 출력
print("Classification Report:\n", classification_report(y_test, y_pred))


### 랜덤포레스트 튜닝

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                             f1_score, roc_auc_score, classification_report)
from imblearn.over_sampling import SMOTE
import pandas as pd

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)
pd.set_option('display.float_format', '{:.4f}'.format)



X_train, X_test, y_train, y_test = train_test_split(
    df, target,
    test_size=0.2,
    stratify=target,
    random_state=42
)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

smote = SMOTE(sampling_strategy=0.5, random_state=42)
X_res_scaled, y_res = smote.fit_resample(X_train_scaled, y_train)


rf_clf = RandomForestClassifier(
    random_state=42,
    class_weight={0:6, 1:1},  
    bootstrap=True
)

param_grid = {
    'n_estimators': [200, 300],  
    'max_depth': [5, 7, None],  
    'min_samples_split': [5, 10],  
    'min_samples_leaf': [2, 4],  
    'max_features': ['sqrt', 0.8]  
}


skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)  
grid_search_rf = GridSearchCV(
    estimator=rf_clf,
    param_grid=param_grid,
    cv=skf,
    scoring='roc_auc',
    n_jobs=-1,
    verbose=2  
)


grid_search_rf.fit(X_res_scaled, y_res)


print("\n🏆 최적 하이퍼파라미터:", grid_search_rf.best_params_)
print("\n🔍 상위 10개 파라미터 조합:")
results_df = pd.DataFrame(grid_search_rf.cv_results_)
print(results_df[['params', 'mean_test_score', 'std_test_score', 'rank_test_score']]
      .sort_values('rank_test_score')
      .head(10))


best_rf_model = grid_search_rf.best_estimator_

def evaluate_rf(model, X_scaled, y_true, dataset_name):
    y_pred = model.predict(X_scaled)  
    y_proba = model.predict_proba(X_scaled)[:, 1]  
    
    print(f"\n📊 {dataset_name} 성능 평가:")
    print(f"Accuracy: {accuracy_score(y_true, y_pred):.4f}")
    print(f"Precision: {precision_score(y_true, y_pred):.4f}")
    print(f"Recall: {recall_score(y_true, y_pred):.4f}")
    print(f"F1-Score: {f1_score(y_true, y_pred):.4f}")
    print(f"ROC-AUC: {roc_auc_score(y_true, y_proba):.4f}")
    
    print("\n              precision    recall  f1-score   support")
    report = classification_report(y_true, y_pred, target_names=['Class 0', 'Class 1'], digits=4)
    print(report)

print("\n=== 스케일링 적용 모델 성능 ===")
evaluate_rf(best_rf_model, X_train_scaled, y_train, "훈련 데이터")
evaluate_rf(best_rf_model, X_test_scaled, y_test, "테스트 데이터")


### KNN 튜닝

In [None]:
import hyperopt
from hyperopt import fmin, tpe, hp, Trials
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from sklearn.neighbors import KNeighborsClassifier

X_train, X_test, y_train, y_test = train_test_split(df, target, test_size=0.2, random_state=42)

def objective_knn(params):
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    model = KNeighborsClassifier(
        n_neighbors=int(params['n_neighbors']),
        weights=params['weights'],
        metric=params['metric']
    )
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    acc = accuracy_score(y_test, y_pred)
    
    return {'loss': -acc, 'status': hyperopt.STATUS_OK}

space = {
    'n_neighbors': hp.quniform('n_neighbors', 3, 20, 1),
    'weights': hp.choice('weights', ['uniform', 'distance']),
    'metric': hp.choice('metric', ['euclidean', 'manhattan']),
    'algorithm': hp.choice('algorithm', ['auto', 'ball_tree', 'kd_tree', 'brute'])
}

trials = Trials()
best = fmin(
    fn=objective_knn,
    space=space,
    algo=tpe.suggest,
    max_evals=100,
    trials=trials
)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

weights_options = ['uniform', 'distance']
metric_options = ['euclidean', 'manhattan']

best['n_neighbors'] = int(best['n_neighbors'])
best['weights'] = weights_options[best['weights']]
best['metric'] = metric_options[best['metric']]

print("Best Parameters:", best)

knn_best = KNeighborsClassifier(algorithm='auto', metric='euclidean', n_neighbors=13, weights='uniform')

knn_best.fit(X_train_scaled, y_train)
knn_best_pred = knn_best.predict(X_test_scaled)

print(accuracy_score(y_test, knn_best_pred))
print(classification_report(y_test, knn_best_pred))

### CatBoost 튜닝

In [None]:
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.metrics import classification_report, f1_score, make_scorer
from catboost import CatBoostClassifier

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


smote = SMOTE(random_state=42)
X_res_scaled, y_res = smote.fit_resample(X_train_scaled, y_train)


param_grid = {
    'depth': [6, 10],
    'iterations': [500, 1000],
    'learning_rate': [0.03, 0.1],
    'l2_leaf_reg': [1, 3],
    'class_weights': [{0:3, 1:1}]
}


f1_scorer = make_scorer(f1_score, pos_label=0)


catboost = CatBoostClassifier(
    random_seed=42,
    eval_metric='F1',
    early_stopping_rounds=50,
    verbose=0
)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(
    catboost,
    param_grid,
    scoring=f1_scorer,
    cv=cv,
    n_jobs=-1,
    verbose=2
)


grid_search.fit(X_res_scaled, y_res)

print(f"최적 파라미터: {grid_search.best_params_}")
print(f"최고 F1-Score(0): {grid_search.best_score_:.4f}")

best_cb = grid_search.best_estimator_
y_pred = best_cb.predict(X_test_scaled)

print("\n테스트 성능:")
print(classification_report(y_test, y_pred))
