In [1]:
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
import lightgbm
from sklearn.preprocessing import LabelEncoder, StandardScaler
import numpy as np
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, StackingClassifier
import catboost
from bayes_opt import BayesianOptimization

from sklearn.metrics import classification_report, confusion_matrix
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import plotly.express as px
import umap

from sklearn.linear_model import LogisticRegression

In [2]:
# data 불러오기 - 데이터 전처리 완료 
data = pd.read_csv('./preprocessing_train.csv')

In [7]:
results = {}
stratified_k_fold = StratifiedKFold(random_state=0, shuffle=True)  # n_splits: 5 (default)

for i, (train_index, test_index) in enumerate(stratified_k_fold.split(data, data.Churn)):
    temp = data.iloc[train_index]
    test = data.iloc[test_index]
    train, valid = train_test_split(temp, test_size= len(test)/len(temp), random_state=0)

    rf = RandomForestClassifier(random_state=0)
    
    parameters = {
        'max_depth': np.random.randint(1, 15, 5),
        'n_estimators': np.random.randint(50, 500, 20),
        }

    random_search = RandomizedSearchCV(rf, parameters, random_state=0)
    random_search.fit(train.drop(columns=['Churn']), train.Churn)

    index_best_param = random_search.cv_results_.get('rank_test_score').argmax()
    best_params = random_search.cv_results_.get('params')[index_best_param]

    rf =RandomForestClassifier(
        n_estimators = best_params.get('n_estimators'),
        max_depth = best_params.get('max_depth'),
        random_state=0
    ).fit(train.drop(columns=['Churn']), train.Churn)

    confusion_matrix_result = confusion_matrix(
        test.Churn,
        rf.predict(test.drop(columns=['Churn']))
    )

    importances = pd.Series(rf.feature_importances_, index=train.drop(columns=['Churn']).columns).sort_values(ascending=False)[:10]

    TP = confusion_matrix_result[0, 0]
    FN = confusion_matrix_result[0, 1]
    FP = confusion_matrix_result[1, 0]
    TN = confusion_matrix_result[1, 1]

    precision = TP/(TP+FP)
    recall = TP/(TP+FN)
    f1 = 2*precision*recall/(precision+recall)

    accuracy = (TP+TN) / (TP+TN+FP+FN)

    results.update({
        f'fold{i+1}':{
        "precision": float(precision),
        "recall": float(recall),
        "f1": float(f1),
        "accuracy": float(accuracy),
        "best_params": best_params,
        "\n importances": importances
        }
    })

    print(f'fold:{i+1}| prescision: {precision: .2f}, recall: {recall: .2f}, f1: {f1: .2f}, accuracy: {accuracy: .2f}')

fold:1| prescision:  0.72, recall:  1.00, f1:  0.83, accuracy:  0.72
fold:2| prescision:  0.72, recall:  1.00, f1:  0.83, accuracy:  0.72
fold:3| prescision:  0.72, recall:  1.00, f1:  0.83, accuracy:  0.72
fold:4| prescision:  0.72, recall:  1.00, f1:  0.84, accuracy:  0.72
fold:5| prescision:  0.72, recall:  1.00, f1:  0.83, accuracy:  0.72


In [8]:
for fold, result in results.items():
    print(f'{fold}: {result}')

fold1: {'precision': 0.7154406891981445, 'recall': 1.0, 'f1': 0.8341188287105, 'accuracy': 0.7154406891981445, 'best_params': {'n_estimators': 270, 'max_depth': 2}, '\n importances': CurrentEquipmentDays    0.307794
MonthsInService         0.224676
MonthlyMinutes          0.142052
PercChangeMinutes       0.066403
HandsetWebCapable       0.064346
CreditRating            0.033311
CustomerCareCalls       0.032896
AgeHH1                  0.023135
HandsetPrice            0.020097
MonthlyRevenue          0.017458
dtype: float64}
fold2: {'precision': 0.7155732273028496, 'recall': 1.0, 'f1': 0.8342088998763907, 'accuracy': 0.7155732273028496, 'best_params': {'n_estimators': 77, 'max_depth': 1}, '\n importances': MonthsInService         0.194805
MonthlyMinutes          0.142857
CreditRating            0.090909
HandsetWebCapable       0.090909
CurrentEquipmentDays    0.090909
CustomerCareCalls       0.051948
HandsetModels           0.051948
MonthlyRevenue          0.038961
AgeHH1                