In [2]:
# 라이브러리
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# 데이터 불러오기
# EUC-KR : 대표적인 한글 완성형 인코딩
train = pd.read_csv('/content/drive/MyDrive/colab/data/train.csv', encoding='EUC-KR')
test = pd.read_csv('/content/drive/MyDrive/colab/data/test.csv', encoding='EUC-KR')

In [5]:
# 데이터 정보 확인
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 15 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   user_id                            10000 non-null  object 
 1   subscription_duration              10000 non-null  int64  
 2   recent_login_time                  10000 non-null  int64  
 3   average_login_time                 10000 non-null  float64
 4   average_time_per_learning_session  10000 non-null  float64
 5   monthly_active_learning_days       10000 non-null  int64  
 6   total_completed_courses            10000 non-null  int64  
 7   recent_learning_achievement        10000 non-null  float64
 8   abandoned_learning_sessions        10000 non-null  int64  
 9   community_engagement_level         10000 non-null  int64  
 10  preferred_difficulty_level         10000 non-null  object 
 11  subscription_type                  10000 non-null  obje

In [6]:
train.head()

Unnamed: 0,user_id,subscription_duration,recent_login_time,average_login_time,average_time_per_learning_session,monthly_active_learning_days,total_completed_courses,recent_learning_achievement,abandoned_learning_sessions,community_engagement_level,preferred_difficulty_level,subscription_type,customer_inquiry_history,payment_pattern,target
0,b919c29d,13,14,14.946163,8.427187,18,16,68.360455,3,4,Low,Basic,4,5,0
1,a0a60abb,16,18,18.453224,72.646087,16,13,97.567322,2,3,Medium,Basic,1,6,1
2,b9f171ae,22,1,16.195228,21.774492,13,14,94.358763,3,4,Medium,Premium,0,7,1
3,5dc0ba8b,1,19,17.628656,42.659066,19,18,70.153228,0,3,Low,Basic,1,0,1
4,65c83654,4,5,21.390656,30.744287,19,10,81.917908,2,4,Medium,Basic,3,0,1


In [7]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   user_id                            10000 non-null  object 
 1   subscription_duration              10000 non-null  int64  
 2   recent_login_time                  10000 non-null  int64  
 3   average_login_time                 10000 non-null  float64
 4   average_time_per_learning_session  10000 non-null  float64
 5   monthly_active_learning_days       10000 non-null  int64  
 6   total_completed_courses            10000 non-null  int64  
 7   recent_learning_achievement        10000 non-null  float64
 8   abandoned_learning_sessions        10000 non-null  int64  
 9   community_engagement_level         10000 non-null  int64  
 10  preferred_difficulty_level         10000 non-null  object 
 11  subscription_type                  10000 non-null  obje

In [8]:
train.shape

(10000, 15)

## X, Y 변수 분리

In [9]:
# test데이터 copy
test2 = test.copy()

In [10]:
# test 데이터에서 '구독유형' 변수 삭제
test2 = test2.drop(['subscription_type'], axis=1)

In [11]:
# y값이 목표변수
y_train = train['subscription_type']

In [12]:
y_train=pd.DataFrame(y_train)

In [13]:
ID = test['user_id']

In [17]:
# x값은 종속변수
x_train = train.drop(['subscription_type'], axis=1)

## 범주형 라벨인코딩

In [19]:
# 범주형 컬럼 확인
print(x_train['preferred_difficulty_level'].unique())

['Low' 'Medium' 'High']


In [21]:
# train 라벨인코딩
x_train['preferred_difficulty_level']=x_train['preferred_difficulty_level'].map({'Low':0,'Medium':1,'High':2})
y_train['subscription_type'] = y_train['subscription_type'].map({'Basic':0, 'Premium':1})

# test 라벨인코딩
test['preferred_difficulty_level']=test['preferred_difficulty_level'].map({'Low':0,'Medium':1,'High':2})

In [22]:
# 적용되었는지 확인
x_train.head()

Unnamed: 0,user_id,subscription_duration,recent_login_time,average_login_time,average_time_per_learning_session,monthly_active_learning_days,total_completed_courses,recent_learning_achievement,abandoned_learning_sessions,community_engagement_level,preferred_difficulty_level,customer_inquiry_history,payment_pattern,target
0,b919c29d,13,14,14.946163,8.427187,18,16,68.360455,3,4,0,4,5,0
1,a0a60abb,16,18,18.453224,72.646087,16,13,97.567322,2,3,1,1,6,1
2,b9f171ae,22,1,16.195228,21.774492,13,14,94.358763,3,4,1,0,7,1
3,5dc0ba8b,1,19,17.628656,42.659066,19,18,70.153228,0,3,0,1,0,1
4,65c83654,4,5,21.390656,30.744287,19,10,81.917908,2,4,1,3,0,1


## StandardScaler 정규화 스케일링

In [23]:
# 정규화 스케일링 라이브러리
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

In [24]:
model = MinMaxScaler()

In [25]:
col = x_train[['average_login_time', 'average_time_per_learning_session', 'recent_learning_achievement']]
model.fit(col)

In [26]:
scaled = model.transform(x_train[['average_login_time', 'average_time_per_learning_session', 'recent_learning_achievement']])

In [27]:
scaled_df = pd.DataFrame(scaled, columns=[['average_login_time', 'average_time_per_learning_session', 'recent_learning_achievement']])

In [28]:
# 정규화 스케일링
x_train['average_login_time'] = scaled_df['average_login_time']
x_train['average_time_per_learning_session'] = scaled_df['average_time_per_learning_session']
x_train['recent_learning_achievement'] = scaled_df['recent_learning_achievement']

In [29]:
# user_id drop
x_train = x_train.drop(columns=['user_id'])

In [30]:
# 적용되었는 확인
x_train.head()

Unnamed: 0,subscription_duration,recent_login_time,average_login_time,average_time_per_learning_session,monthly_active_learning_days,total_completed_courses,recent_learning_achievement,abandoned_learning_sessions,community_engagement_level,preferred_difficulty_level,customer_inquiry_history,payment_pattern,target
0,13,14,0.51071,0.016719,18,16,0.422657,3,4,0,4,5,0
1,16,18,0.653087,0.144299,16,13,0.803441,2,3,1,1,6,1
2,22,1,0.561419,0.043235,13,14,0.761609,3,4,1,0,7,1
3,1,19,0.619612,0.084726,19,18,0.446031,0,3,0,1,0,1
4,4,5,0.772338,0.061055,19,10,0.599412,2,4,1,3,0,1


In [31]:
# 상관관계 확인
x_train.corr()

Unnamed: 0,subscription_duration,recent_login_time,average_login_time,average_time_per_learning_session,monthly_active_learning_days,total_completed_courses,recent_learning_achievement,abandoned_learning_sessions,community_engagement_level,preferred_difficulty_level,customer_inquiry_history,payment_pattern,target
subscription_duration,1.0,0.014754,-3.1e-05,0.006324,0.002193,-0.001662,0.007875,0.014414,0.002114,-0.016938,-0.005174,-0.003704,-0.00671
recent_login_time,0.014754,1.0,-0.006113,-0.00465,-0.015338,-0.000142,0.005768,-0.015882,-0.005486,0.00886,0.011653,0.008737,-0.004184
average_login_time,-3.1e-05,-0.006113,1.0,-0.007382,0.002294,-0.012416,-0.005967,0.013661,0.003129,-0.008765,0.009768,0.012508,-0.008527
average_time_per_learning_session,0.006324,-0.00465,-0.007382,1.0,-0.001678,0.081332,-0.008231,-0.014488,0.086485,-0.086796,-0.001397,-0.00246,0.118261
monthly_active_learning_days,0.002193,-0.015338,0.002294,-0.001678,1.0,0.004816,0.03011,-0.007512,0.0007,-0.012973,-0.016704,-0.003457,0.004876
total_completed_courses,-0.001662,-0.000142,-0.012416,0.081332,0.004816,1.0,-0.002416,0.012322,0.266026,-0.255945,-0.014949,-0.013646,-0.00404
recent_learning_achievement,0.007875,0.005768,-0.005967,-0.008231,0.03011,-0.002416,1.0,0.002115,-0.00081,-0.00494,0.005612,-0.003968,0.002294
abandoned_learning_sessions,0.014414,-0.015882,0.013661,-0.014488,-0.007512,0.012322,0.002115,1.0,0.015877,-0.023684,-0.007322,0.010299,-0.019636
community_engagement_level,0.002114,-0.005486,0.003129,0.086485,0.0007,0.266026,-0.00081,0.015877,1.0,-0.217334,0.005574,-0.015914,0.009641
preferred_difficulty_level,-0.016938,0.00886,-0.008765,-0.086796,-0.012973,-0.255945,-0.00494,-0.023684,-0.217334,1.0,0.012926,0.018934,0.01286


## 데이터 분리

In [32]:
# 라이브러리
from sklearn.model_selection import train_test_split

In [33]:
X_TRAIN, X_TEST, Y_TRAIN, Y_TEST = train_test_split(x_train, y_train, test_size=0.2, random_state=12)

In [34]:
ID = pd.DataFrame(ID)

In [36]:
test2 = test2.drop(columns=['user_id'])

## 모델링

In [37]:
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, f1_score

In [38]:
# XGBoost 분류기 생성
xgb = XGBClassifier()
scorer = make_scorer(f1_score, average='macro')

In [39]:
# 탐색할 하이퍼파라미터 그리드 설정
param_grid = {
    'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9],
    'min_child_weight': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'subsample': [0.3, 0.5, 0.55, 0.6, 0.7],
    'colsample_bytree': [0.3, 0.4, 0.5, 0.6, 0.7, 0.8],
    'n_estimators': [104, 109, 210, 300, 350, 500]
}

In [40]:
# 그리드 서치 수행
grid_search = GridSearchCV(estimator=xgb, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_TRAIN, Y_TRAIN)

# 최적의 하이퍼파라미터 및 성능 확인
print(grid_search.best_params_)
print(grid_search.best_score_)

Fitting 3 folds for each of 16200 candidates, totalling 48600 fits
{'colsample_bytree': 0.5, 'max_depth': 1, 'min_child_weight': 3, 'n_estimators': 109, 'subsample': 0.6}
0.769500586620221


In [None]:
model = xgb.XGBClassifier(
    n_estimators=500,
    max_depth=5,
    random_state=12,
    n_jobs=-1
)
model.fit(X_TRAIN, Y_TRAIN)

train_pred = model.predict(X_TRAIN)
test_pred = model.predict(X_TEST)

print(accuracy_score(Y_TRAIN, train_pred))
print(accuracy_score(Y_TEST, test_pred))