In [10]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Load the raw training and holdout data
traindf = pd.read_csv("./dataset/cell2celltrain.csv")
testdf = pd.read_csv("./dataset/cell2cellholdout.csv")

# Drop the unused columns from both frames, then drop every training row that
# still contains a missing value
drop_col = [
    'CallForwardingCalls', 'ThreewayCalls', 'OffPeakCallsInOut', 'HandsetRefurbished',
    'TruckOwner', 'RVOwner', 'BuysViaMailOrder', 'RespondsToMailOffers',
    'OptOutMailings', 'OwnsComputer', 'OwnsMotorcycle',
]
traindf = traindf.drop(columns=drop_col).dropna(axis=0)
testdf = testdf.drop(columns=drop_col)

# Choose the columns that get preprocessed.
# Columns listed in remain_col are excluded from both the numeric and the
# categorical preprocessing lists ('CallForwardingCalls' is already removed by
# drop_col above, so that entry has no effect).
remain_col = ['CustomerID', 'CallForwardingCalls','RetentionCalls','RetentionOffersAccepted','ReferralsMadeBySubscriber','AdjustmentsToCreditRating']
number = traindf.select_dtypes(include=[np.number]).columns
prenumber = [x for x in number if x not in remain_col]
# Exclude the target by name instead of popping index 0: the positional pop
# silently removes the wrong column whenever 'Churn' is not the first
# non-numeric column.
cate = [x for x in traindf.columns if x not in number and x != 'Churn']
precate = [x for x in cate if x not in remain_col]

# 이상치 처리 함수
def outlier_iqr(df, whis=2.0):
    """Replace IQR outliers in every column of ``df`` with that column's median.

    For each column the fences are ``q1 - whis * IQR`` and ``q3 + whis * IQR``,
    with the quartiles computed by ``np.nanquantile`` (missing values are
    ignored). Values strictly outside the fences are overwritten with the
    median of the column as it was *before* any replacement.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are all processed; it is modified in place.
    whis : float, default 2.0
        Multiplier applied to the IQR to position the fences.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after replacement.
    """
    for col in df.columns:
        values = df[col]
        q1 = np.nanquantile(values, q=0.25)
        q3 = np.nanquantile(values, q=0.75)
        iqr = q3 - q1
        lower_bound = q1 - iqr * whis
        upper_bound = q3 + iqr * whis

        # Compute the fill value and the mask before writing anything: `values`
        # may be a view of `df`, so a median taken after the first write could
        # already include replaced entries.
        median = values.median()
        is_outlier = (values < lower_bound) | (values > upper_bound)

        # Skip the write entirely when nothing is out of range
        if is_outlier.any():
            df.loc[is_outlier, col] = median
    return df

# 데이터 인코딩
def encoding(df, numcol, catecol):
    """Label-encode the target and categorical columns, then standardize the numeric ones.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a 'Churn' column plus the columns named in ``numcol``
        and ``catecol``; it is modified in place.
    numcol : list of str
        Columns passed together through a freshly fitted ``StandardScaler``.
    catecol : iterable of str
        Columns that are each replaced by their ``LabelEncoder`` codes.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the encoded and scaled columns.
    """
    label_encoder = LabelEncoder()
    # The target goes first, followed by each categorical column; the single
    # encoder is refit on every column, so no fitted mapping is kept.
    for column in ['Churn', *catecol]:
        df[column] = label_encoder.fit_transform(df[column])

    df[numcol] = StandardScaler().fit_transform(df[numcol])
    return df

# Replace outliers in the numeric columns. outlier_iqr writes into the frame it
# receives, so hand it an explicit copy of the column slice instead of the
# slice itself (which would trigger pandas' SettingWithCopyWarning).
traindf[prenumber] = outlier_iqr(traindf[prenumber].copy())
# Label-encode the target and categorical columns and scale the numeric columns
traindf = encoding(traindf, prenumber, precate)
# Persist the preprocessed training data (plain string: there is nothing to interpolate)
traindf.to_csv("./dataset/dataset_outlier_rm.csv", index=False)

In [11]:
%%writefile dataloader.py
import pandas as pd

def load_dataset(path='./dataset/cell2celltrain.csv'):
    """Read a CSV and split it into features ``X`` and the ``churn`` target ``y``.

    Parameters
    ----------
    path : str or path-like, default './dataset/cell2celltrain.csv'
        CSV file to read. The default keeps the original behaviour; pass another
        path (for example a preprocessed export) to load that file instead.

    Returns
    -------
    X : pandas.DataFrame
        Every column except ``churn``, with lower-cased column names.
    y : pandas.Series
        The ``churn`` column.

    Raises
    ------
    KeyError
        If no column is named ``churn`` after lower-casing.
    """
    data = pd.read_csv(path)
    # Lower-case the column names so the target can be addressed as 'churn'
    data.columns = data.columns.str.lower()
    # The target column becomes y, everything else becomes X
    X = data.drop(columns='churn')
    y = data['churn']

    return X, y

def load_testset(path='./dataset/cell2cellholdout.csv'):
    """Read the holdout CSV and split it into features ``X`` and the ``churn`` target ``y``.

    Parameters
    ----------
    path : str or path-like, default './dataset/cell2cellholdout.csv'
        CSV file to read. The default keeps the original behaviour.

    Returns
    -------
    X : pandas.DataFrame
        Every column except ``churn``, with lower-cased column names.
    y : pandas.Series
        The ``churn`` column.
        NOTE(review): confirm the holdout file actually carries labels here.

    Raises
    ------
    KeyError
        If no column is named ``churn`` after lower-casing.
    """
    data = pd.read_csv(path)
    # Lower-case the column names so the target can be addressed as 'churn'
    data.columns = data.columns.str.lower()
    # The target column becomes y, everything else becomes X
    X = data.drop(columns='churn')
    y = data['churn']

    return X, y

Overwriting dataloader.py


In [14]:
from sklearn.model_selection import train_test_split

from dataloader import load_dataset, load_testset

X, y = load_dataset()
X_test, y_test = load_testset()

# Hold out 25% of the training data for validation. Stratifying on y keeps the
# class ratio the same in both parts, and the fixed seed makes the split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.25, stratify=y, random_state=0)

In [16]:
from sklearn.tree import DecisionTreeClassifier
# tree = DecisionTreeClassifier(random_state=0)
# tree.fit(X_train, y_train)

In [17]:
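# NOTE: neither install is needed. `dataloader` is the local dataloader.py written by the
# %%writefile cell above, and the metric functions are imported from sklearn.metrics.
# !pip install metrics
# !pip install dataloader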

In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Baseline candidates as (label, unfitted estimator) pairs, in evaluation order.
# Every estimator that takes a seed is pinned to random_state=0.
models = list({
    "LR": LogisticRegression(random_state=0, max_iter=3000),
    "GBM": GradientBoostingClassifier(random_state=0),
    "RF": RandomForestClassifier(random_state=0),
    "KNN": KNeighborsClassifier(),
    "XGB": XGBClassifier(random_state=0),
    "TREE": DecisionTreeClassifier(random_state=0),
}.items())

In [19]:
baseline_result_train_dict = {}
baseline_result_valid_dict = {}


def _score_split(model, features, target):
    """Return [accuracy, precision, recall, f1, roc-auc] of a fitted model on one split."""
    pred = model.predict(features)
    # Column 1 of predict_proba feeds ROC-AUC — presumably the churn-positive
    # class; confirm against model.classes_.
    pred_proba = model.predict_proba(features)[:, 1]
    return [
        accuracy_score(target, pred),
        # zero_division=0 yields the same 0 score as the default but without the
        # undefined-metric warning when a model predicts no positives.
        precision_score(target, pred, zero_division=0),
        recall_score(target, pred, zero_division=0),
        f1_score(target, pred, zero_division=0),
        roc_auc_score(target, pred_proba),
    ]


for name, model in models:
    print(f"Training {name}...")
    model.fit(X_train, y_train)

    # Score both splits so the train dict is filled too and the train/valid gap
    # can be inspected per model.
    baseline_result_train_dict[name] = _score_split(model, X_train, y_train)
    baseline_result_valid_dict[name] = _score_split(model, X_valid, y_valid)

Training LR...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Training GBM...
Training RF...
Training KNN...
Training XGB...
Training TREE...


In [12]:
# One column per model; the row labels follow the order in which the scores
# were stored for each model.
metric_names = ['accuracy', 'precision', 'recall', 'f1', 'roc-auc']
pd.DataFrame(baseline_result_valid_dict, index=metric_names)

Unnamed: 0,LR,GBM,RF,KNN,XGB,TREE
accuracy,0.713137,0.72182,0.720855,0.651552,0.709037,0.621161
precision,0.333333,0.639889,0.607748,0.299845,0.47793,0.349385
recall,0.001966,0.064869,0.070486,0.162595,0.176355,0.374895
f1,0.003908,0.117797,0.126321,0.210852,0.257641,0.361691
roc-auc,0.589393,0.662326,0.644168,0.513366,0.645137,0.547423


In [20]:
import joblib
import os

# Create the 'models' output directory; exist_ok=True makes re-running this cell a no-op.
os.makedirs('models', exist_ok=True)

In [13]:
from sklearn.model_selection import RandomizedSearchCV

def _boosting_search_space():
    """Return a fresh copy of the hyper-parameter grid shared by both boosting models."""
    return {
        'learning_rate': [0.001, 0.01, 0.1, 0.2],
        'n_estimators': [1000, 2000, 3000, 4000, 5000],
        'max_depth': range(1, 6),
        'subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1],
    }


# Both models sample from the same grid; each gets its own dict object
param_gbm = _boosting_search_space()
param_xgb = _boosting_search_space()

# Settings common to both searches: 60 sampled candidates, 4-fold CV, ranked by ROC-AUC
search_options = dict(cv=4, scoring='roc_auc', n_jobs=-1, n_iter=60, random_state=0)

# Initialize RandomizedSearchCV for each model
random_search_gbm = RandomizedSearchCV(
    GradientBoostingClassifier(random_state=0), param_gbm, **search_options
)
random_search_xgb = RandomizedSearchCV(
    XGBClassifier(random_state=0), param_xgb, **search_options
)

print("GradientBoostingClassifier 튜닝...")
random_search_gbm.fit(X_train, y_train)

print("XGBClassifier 튜닝...")
random_search_xgb.fit(X_train, y_train);

GradientBoostingClassifier 튜닝...
Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "C:\Users\Playdata\miniconda3\envs\ml\Lib\site-packages\IPython\core\interactiveshell.py", line 3577, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "C:\Users\Playdata\AppData\Local\Temp\ipykernel_13004\2276646693.py", line 22, in <module>
    random_search_gbm.fit(X_train, y_train)
  File "C:\Users\Playdata\miniconda3\envs\ml\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Playdata\miniconda3\envs\ml\Lib\site-packages\sklearn\model_selection\_search.py", line 1019, in fit
    self._run_search(evaluate_candidates)
  File "C:\Users\Playdata\miniconda3\envs\ml\Lib\site-packages\sklearn\model_selection\_search.py", line 1960, in _run_search
    evaluate_candidates(
  File "C:\Users\Playdata\miniconda3\envs\ml\Lib\site-packages\sklearn\model_selection\_search.py", line 965, in evaluate_candidates
 

In [4]:
# Pull the best estimator out of each finished randomized search
best_gbm, best_xgb = (
    search.best_estimator_ for search in (random_search_gbm, random_search_xgb)
)

Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "C:\Users\Playdata\miniconda3\envs\ml\Lib\site-packages\IPython\core\interactiveshell.py", line 3577, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "C:\Users\Playdata\AppData\Local\Temp\ipykernel_13004\3945663569.py", line 1, in <module>
    best_gbm = random_search_gbm.best_estimator_
               ^^^^^^^^^^^^^^^^^
NameError: name 'random_search_gbm' is not defined

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\Playdata\miniconda3\envs\ml\Lib\site-packages\IPython\core\interactiveshell.py", line 2168, in showtraceback
    stb = self.InteractiveTB.structured_traceback(
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Playdata\miniconda3\envs\ml\Lib\site-packages\IPython\core\ultratb.py", line 1457, in structured_traceback
    return FormattedTB.structured_traceback(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Us

In [3]:
# Serialize each tuned model to its own file under models/
for model_path, tuned_model in (
    ('models/best_gbm.pkl', best_gbm),
    ('models/best_xgb.pkl', best_xgb),
):
    joblib.dump(tuned_model, model_path)

print("Models saved successfully.")

Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "C:\Users\Playdata\miniconda3\envs\ml\Lib\site-packages\IPython\core\interactiveshell.py", line 3577, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "C:\Users\Playdata\AppData\Local\Temp\ipykernel_13004\4074536209.py", line 1, in <module>
    joblib.dump(best_gbm, 'models/best_gbm.pkl')
                ^^^^^^^^
NameError: name 'best_gbm' is not defined

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\Playdata\miniconda3\envs\ml\Lib\site-packages\IPython\core\interactiveshell.py", line 2168, in showtraceback
    stb = self.InteractiveTB.structured_traceback(
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Playdata\miniconda3\envs\ml\Lib\site-packages\IPython\core\ultratb.py", line 1457, in structured_traceback
    return FormattedTB.structured_traceback(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Playdata\mini