In [3]:
# 라이브러리
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
from google.colab import drive
# Mount Google Drive at /content/drive so the CSV files stored there can be read.
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# Load the train and test CSV files from the mounted Drive folder.
# The folder and the encoding are defined once so both reads stay in sync.
# EUC-KR is a widely used legacy (precomposed) Korean text encoding.
DATA_DIR = '/content/drive/MyDrive/colab/data'
CSV_ENCODING = 'EUC-KR'

train = pd.read_csv(f'{DATA_DIR}/train.csv', encoding=CSV_ENCODING)
test = pd.read_csv(f'{DATA_DIR}/test.csv', encoding=CSV_ENCODING)

In [6]:
# Inspect the training data: column names, non-null counts and dtypes.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 15 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   user_id                            10000 non-null  object 
 1   subscription_duration              10000 non-null  int64  
 2   recent_login_time                  10000 non-null  int64  
 3   average_login_time                 10000 non-null  float64
 4   average_time_per_learning_session  10000 non-null  float64
 5   monthly_active_learning_days       10000 non-null  int64  
 6   total_completed_courses            10000 non-null  int64  
 7   recent_learning_achievement        10000 non-null  float64
 8   abandoned_learning_sessions        10000 non-null  int64  
 9   community_engagement_level         10000 non-null  int64  
 10  preferred_difficulty_level         10000 non-null  object 
 11  subscription_type                  10000 non-null  obje

In [7]:
# Preview the first rows of the training data.
train.head()

Unnamed: 0,user_id,subscription_duration,recent_login_time,average_login_time,average_time_per_learning_session,monthly_active_learning_days,total_completed_courses,recent_learning_achievement,abandoned_learning_sessions,community_engagement_level,preferred_difficulty_level,subscription_type,customer_inquiry_history,payment_pattern,target
0,b919c29d,13,14,14.946163,8.427187,18,16,68.360455,3,4,Low,Basic,4,5,0
1,a0a60abb,16,18,18.453224,72.646087,16,13,97.567322,2,3,Medium,Basic,1,6,1
2,b9f171ae,22,1,16.195228,21.774492,13,14,94.358763,3,4,Medium,Premium,0,7,1
3,5dc0ba8b,1,19,17.628656,42.659066,19,18,70.153228,0,3,Low,Basic,1,0,1
4,65c83654,4,5,21.390656,30.744287,19,10,81.917908,2,4,Medium,Basic,3,0,1


In [8]:
# Inspect the test data: column names, non-null counts and dtypes.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   user_id                            10000 non-null  object 
 1   subscription_duration              10000 non-null  int64  
 2   recent_login_time                  10000 non-null  int64  
 3   average_login_time                 10000 non-null  float64
 4   average_time_per_learning_session  10000 non-null  float64
 5   monthly_active_learning_days       10000 non-null  int64  
 6   total_completed_courses            10000 non-null  int64  
 7   recent_learning_achievement        10000 non-null  float64
 8   abandoned_learning_sessions        10000 non-null  int64  
 9   community_engagement_level         10000 non-null  int64  
 10  preferred_difficulty_level         10000 non-null  object 
 11  subscription_type                  10000 non-null  obje

## X, Y 변수 분리

In [9]:
# y is the target (label) variable: the `target` column of the training data.
y_train = train.loc[:, 'target']

In [10]:
# Wrap the target in a one-column DataFrame.
y_train = pd.DataFrame(data=y_train)

In [11]:
# Keep the test user ids aside; `user_id` is dropped from `test` in a later cell.
ID = test.loc[:, 'user_id']

In [12]:
# x holds the independent (feature) variables: every column except the label.
# Dropping `target` by name instead of slicing the first 14 columns by position
# does not depend on column order, and it returns a new DataFrame, so the column
# assignments in later cells do not write into a slice of `train`
# (which is what raises SettingWithCopyWarning).
x_train = train.drop(columns=['target'])

## 범주형 라벨인코딩

In [13]:
# Show the distinct values of each categorical column before encoding them.
for categorical_column in ('subscription_type', 'preferred_difficulty_level'):
    print(x_train[categorical_column].unique())

['Basic' 'Premium']
['Low' 'Medium' 'High']


In [14]:
# Integer-encode the two categorical columns. The mapping is defined once and
# applied to both the training features and the test data so they always share
# identical codes.
category_codes = {
    'preferred_difficulty_level': {'Low': 1, 'Medium': 2, 'High': 3},
    'subscription_type': {'Basic': 1, 'Premium': 2},
}

for frame in (x_train, test):
    for column, codes in category_codes.items():
        # Skip columns that are already numeric: mapping the encoded integers
        # a second time (e.g. when the cell is re-run) would turn them into NaN.
        if pd.api.types.is_numeric_dtype(frame[column]):
            continue
        frame[column] = frame[column].map(codes)

In [15]:
# Check that the encoding was applied.
x_train.head()

Unnamed: 0,user_id,subscription_duration,recent_login_time,average_login_time,average_time_per_learning_session,monthly_active_learning_days,total_completed_courses,recent_learning_achievement,abandoned_learning_sessions,community_engagement_level,preferred_difficulty_level,subscription_type,customer_inquiry_history,payment_pattern
0,b919c29d,13,14,14.946163,8.427187,18,16,68.360455,3,4,1,1,4,5
1,a0a60abb,16,18,18.453224,72.646087,16,13,97.567322,2,3,2,1,1,6
2,b9f171ae,22,1,16.195228,21.774492,13,14,94.358763,3,4,2,2,0,7
3,5dc0ba8b,1,19,17.628656,42.659066,19,18,70.153228,0,3,1,1,1,0
4,65c83654,4,5,21.390656,30.744287,19,10,81.917908,2,4,2,1,3,0


## MinMaxScaler 정규화 스케일링

In [16]:
# 정규화 스케일링 라이브러리
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

In [17]:
# Min-max scaler for the float feature columns (here `model` is the scaler object).
model = MinMaxScaler()

In [18]:
# Fit the scaler on the three float-typed feature columns.
# NOTE(review): the fit uses every row of x_train, i.e. it happens before the
# train_test_split cell, so the held-out rows also influence the fitted min/max.
col = x_train.loc[
    :,
    [
        'average_login_time',
        'average_time_per_learning_session',
        'recent_learning_achievement',
    ],
]
model.fit(col)

In [19]:
# Rescale the three float columns of x_train with the fitted scaler.
# NOTE(review): no visible cell applies `model.transform` to the same columns of
# `test`; confirm this happens before prediction, otherwise the train and test
# features end up on different scales.
scale_columns = [
    'average_login_time',
    'average_time_per_learning_session',
    'recent_learning_achievement',
]
scaled = model.transform(x_train[scale_columns])

In [20]:
# Wrap the scaled values in a DataFrame.
# `columns` must be a flat list: a nested list ([[...]]) makes pandas build
# MultiIndex columns, so `scaled_df[name]` would return a one-column DataFrame
# instead of a Series.
# Reusing x_train's index keeps the later column assignment row-aligned.
scaled_df = pd.DataFrame(
    scaled,
    columns=[
        'average_login_time',
        'average_time_per_learning_session',
        'recent_learning_achievement',
    ],
    index=x_train.index,
)

In [21]:
# Overwrite the three float columns of x_train with their min-max-scaled values.
for scaled_column in (
    'average_login_time',
    'average_time_per_learning_session',
    'recent_learning_achievement',
):
    x_train[scaled_column] = scaled_df[scaled_column]

In [22]:
# Remove the identifier column from the training features.
# errors='ignore' keeps the cell re-runnable: without it a second run raises
# KeyError because `user_id` has already been dropped.
x_train = x_train.drop(columns=['user_id'], errors='ignore')

In [23]:
# Check that the scaling and the user_id drop were applied.
x_train.head()

Unnamed: 0,subscription_duration,recent_login_time,average_login_time,average_time_per_learning_session,monthly_active_learning_days,total_completed_courses,recent_learning_achievement,abandoned_learning_sessions,community_engagement_level,preferred_difficulty_level,subscription_type,customer_inquiry_history,payment_pattern
0,13,14,0.51071,0.016719,18,16,0.422657,3,4,1,1,4,5
1,16,18,0.653087,0.144299,16,13,0.803441,2,3,2,1,1,6
2,22,1,0.561419,0.043235,13,14,0.761609,3,4,2,2,0,7
3,1,19,0.619612,0.084726,19,18,0.446031,0,3,1,1,1,0
4,4,5,0.772338,0.061055,19,10,0.599412,2,4,2,1,3,0


In [24]:
# Inspect the pairwise correlations between the feature columns.
x_train.corr()

Unnamed: 0,subscription_duration,recent_login_time,average_login_time,average_time_per_learning_session,monthly_active_learning_days,total_completed_courses,recent_learning_achievement,abandoned_learning_sessions,community_engagement_level,preferred_difficulty_level,subscription_type,customer_inquiry_history,payment_pattern
subscription_duration,1.0,0.014754,-3.1e-05,0.006324,0.002193,-0.001662,0.007875,0.014414,0.002114,-0.016938,-0.002887,-0.005174,-0.003704
recent_login_time,0.014754,1.0,-0.006113,-0.00465,-0.015338,-0.000142,0.005768,-0.015882,-0.005486,0.00886,-0.018172,0.011653,0.008737
average_login_time,-3.1e-05,-0.006113,1.0,-0.007382,0.002294,-0.012416,-0.005967,0.013661,0.003129,-0.008765,0.008284,0.009768,0.012508
average_time_per_learning_session,0.006324,-0.00465,-0.007382,1.0,-0.001678,0.081332,-0.008231,-0.014488,0.086485,-0.086796,0.141833,-0.001397,-0.00246
monthly_active_learning_days,0.002193,-0.015338,0.002294,-0.001678,1.0,0.004816,0.03011,-0.007512,0.0007,-0.012973,-0.017154,-0.016704,-0.003457
total_completed_courses,-0.001662,-0.000142,-0.012416,0.081332,0.004816,1.0,-0.002416,0.012322,0.266026,-0.255945,0.406623,-0.014949,-0.013646
recent_learning_achievement,0.007875,0.005768,-0.005967,-0.008231,0.03011,-0.002416,1.0,0.002115,-0.00081,-0.00494,-0.007014,0.005612,-0.003968
abandoned_learning_sessions,0.014414,-0.015882,0.013661,-0.014488,-0.007512,0.012322,0.002115,1.0,0.015877,-0.023684,-0.001664,-0.007322,0.010299
community_engagement_level,0.002114,-0.005486,0.003129,0.086485,0.0007,0.266026,-0.00081,0.015877,1.0,-0.217334,0.450195,0.005574,-0.015914
preferred_difficulty_level,-0.016938,0.00886,-0.008765,-0.086796,-0.012973,-0.255945,-0.00494,-0.023684,-0.217334,1.0,-0.002068,0.012926,0.018934


## 데이터 분리

In [25]:
# 라이브러리
from sklearn.model_selection import train_test_split

In [26]:
# Hold out 20% of the labelled rows for validation; the fixed seed keeps the
# split reproducible.
X_TRAIN, X_TEST, Y_TRAIN, Y_TEST = train_test_split(
    x_train,
    y_train,
    test_size=0.2,
    random_state=12,
)

In [27]:
# Wrap the saved test user ids in a one-column DataFrame.
ID = pd.DataFrame(data=ID)

In [28]:
# Remove the identifier column from the test data (the ids were saved in `ID`).
# errors='ignore' keeps the cell re-runnable: without it a second run raises
# KeyError because `user_id` has already been dropped.
test = test.drop(columns=['user_id'], errors='ignore')