# Library

In [1]:
!pip install optuna



In [2]:
# Classification
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib
from matplotlib import rc
import statsmodels.api as sm
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import KFold, cross_val_score, train_test_split, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, VotingClassifier, StackingClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.dummy import DummyClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, roc_curve
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, LabelEncoder, OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from xgboost import XGBClassifier
import optuna
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import f1_score
from optuna.integration import OptunaSearchCV


# Data

In [3]:
'''1.데이터 가져오기/데이터 파일 로드'''


# Google Drive 파일 로드를 위한 Google Drive 마운트
from google.colab import drive

drive.mount('/content/drive')


# csv 파일 읽기
def r_csv(*file_names):
    path = '/content/drive/MyDrive/Colab Notebooks/data/'
    data_list = []
    for file_name in file_names:
        data = pd.read_csv(path + file_name, encoding='EUC-KR')
        data_list.append(data)
    return data_list

# csv 파일 읽기
origin_train_data, origin_test_data, origin_sample_submission = r_csv('train.csv', 'test.csv', 'sample_submission.csv')

# 원본 보존을 위해 카피해서 사용
train, test, ss = origin_train_data.copy(), origin_test_data.copy(), origin_sample_submission.copy()

train.drop(['user_id'], axis=1, inplace=True)
test.drop(['user_id'], axis=1, inplace=True)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Preprocessing

In [4]:
#단계별로 인코딩
train['preferred_difficulty_level'] = train['preferred_difficulty_level'].map({'Low':0, 'Medium':1,'High':2})
test['preferred_difficulty_level'] = test['preferred_difficulty_level'].map({'Low':0, 'Medium':1,'High':2})

#일반은 0, 프리미엄은 1로 인코딩
train['subscription_type'] = train['subscription_type'].map({'Basic':0, 'Premium':1})
test['subscription_type'] = test['subscription_type'].map({'Basic':0, 'Premium':1})

# Feature Engineering

In [5]:
def fe(df):
    # 평균 로그인 시간 / 가입 기간
    df['login_duration_ratio'] = df['average_login_time'] / (df['subscription_duration'] * 30)

    # 완료한 코스 수 대비 가입 기간
    df['course_duration_ratio'] = df['total_completed_courses'] / (df['subscription_duration'] * 30)

    # 평균 학습 시간 대비 활동적인 학습 일수
    df['learning_time_active_days_ratio'] = df['average_time_per_learning_session'] / df['monthly_active_learning_days']

    # 고객 문의 이력 대비 가입 기간
    df['inquiry_to_duration_ratio'] = df['customer_inquiry_history'] / (df['subscription_duration'] * 30)

    # 가입 기간 대비 중단된 학습 세션 수
    df['abandoned_sessions_to_duration_ratio'] = df['abandoned_learning_sessions'] / (df['subscription_duration']* 30)

    # 월별 이용 시간
    df['monthly_login_time'] = df['average_login_time'] * df['monthly_active_learning_days']

    # 평균 로그인 시간 대비 커뮤니티 레벨
    df['community_login_ratio'] = df['community_engagement_level'] / df['average_login_time']

    # 평균 로그인 시간 대비 완료한 코스 수
    df['login_to_course_ratio'] = df['total_completed_courses'] / df['average_login_time']

    return df


train = fe(train)
test = fe(test)

# Modeling


In [6]:
X = train.drop(['target', 'customer_inquiry_history', 'preferred_difficulty_level'], axis=1)
# -> 0.523
y = train['target']

In [7]:
X.columns

Index(['subscription_duration', 'recent_login_time', 'average_login_time',
       'average_time_per_learning_session', 'monthly_active_learning_days',
       'total_completed_courses', 'recent_learning_achievement',
       'abandoned_learning_sessions', 'community_engagement_level',
       'subscription_type', 'payment_pattern', 'login_duration_ratio',
       'course_duration_ratio', 'learning_time_active_days_ratio',
       'inquiry_to_duration_ratio', 'abandoned_sessions_to_duration_ratio',
       'monthly_login_time', 'community_login_ratio', 'login_to_course_ratio'],
      dtype='object')

In [8]:
# KNN 학습 모델
score_list = []
def knn_objective(trial):

    param_grid = {
        'n_neighbors': trial.suggest_int('n_neighbors', 1, 50),
        'weights': trial.suggest_categorical('weights', ['uniform', 'distance']),
        'metric': trial.suggest_categorical('metric',['euclidean', 'manhattan']),
        'p': trial.suggest_int('p', 1, 5)
    }

    model = KNeighborsClassifier(**param_grid)

    fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    scores = []
    for num, (train_idx, valid_idx) in enumerate(fold.split(X, y)):
      X_train, X_test = X.iloc[train_idx], X.iloc[valid_idx]
      y_train, y_test = y.iloc[train_idx], y.iloc[valid_idx]

      model.fit(X_train, y_train)

      preds = model.predict(X_test)

      score = f1_score(y_test, preds, average='macro')

      scores.append(score)

    return np.mean(scores)

In [9]:
study = optuna.create_study(direction='maximize')
study.optimize(knn_objective, n_trials=100)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)
print('Best trial:', study.best_value)

[I 2023-12-20 15:24:06,421] A new study created in memory with name: no-name-2d751e27-8ab8-4b77-8639-bc4dc033ac8a
[I 2023-12-20 15:24:13,275] Trial 0 finished with value: 0.4790542950225326 and parameters: {'n_neighbors': 23, 'weights': 'distance', 'metric': 'manhattan', 'p': 1}. Best is trial 0 with value: 0.4790542950225326.
[I 2023-12-20 15:24:16,294] Trial 1 finished with value: 0.4932551203263005 and parameters: {'n_neighbors': 17, 'weights': 'uniform', 'metric': 'euclidean', 'p': 4}. Best is trial 1 with value: 0.4932551203263005.
[I 2023-12-20 15:24:18,612] Trial 2 finished with value: 0.44119185190000776 and parameters: {'n_neighbors': 48, 'weights': 'distance', 'metric': 'euclidean', 'p': 5}. Best is trial 1 with value: 0.4932551203263005.
[I 2023-12-20 15:24:20,970] Trial 3 finished with value: 0.44119185190000776 and parameters: {'n_neighbors': 48, 'weights': 'distance', 'metric': 'euclidean', 'p': 4}. Best is trial 1 with value: 0.4932551203263005.
[I 2023-12-20 15:24:23,80

Number of finished trials: 100
Best trial: {'n_neighbors': 1, 'weights': 'uniform', 'metric': 'manhattan', 'p': 3}
Best trial: 0.5267570543989903


In [10]:
# shuffle = True로 했기 때문에 하이퍼파라미터 값은 계속해서 변경 됨 --> 최상의 값 : 0.5267570543989903

In [11]:
# 해당 코드의 파라미터 값이 위 코드 최상의 파라미터값과 다른 이유는 같은 점수값의 모델이 다수이기 때문입니다.
knn_params = {
 'n_neighbors': 1, 'weights': 'distance', 'metric': 'manhattan', 'p': 5
}
knn = KNeighborsClassifier(
  **knn_params
)
knn.fit(X, y)

# Post Precessing

In [12]:
final_test = knn.predict(test[X.columns])
test['target'] = final_test
test['target'].value_counts()

1    6105
0    3895
Name: target, dtype: int64

In [13]:
ss['target']= final_test
print(ss['target'].value_counts())
path = "./20231208_knn1.csv"
ss.to_csv(path,index=False)

1    6105
0    3895
Name: target, dtype: int64
