# ML

In [30]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
import random

random_number = random.randint(1, 100)

from sklearn.model_selection import train_test_split

pd.set_option('display.float_format', '{:.2f}'.format) 

In [31]:
df_netflix = pd.read_csv('./data/netflix_user_data_clean_kbs.csv')

rename_map = {
    "Customer ID": "고객번호",
    "Subscription Length (Months)": "구독기간_개월",
    "Customer Satisfaction Score (1-10)": "고객만족도_1_10",
    "Daily Watch Time (Hours)": "일일시청시간_시간",
    "Engagement Rate (1-10)": "참여도_1_10",
    "Device Used Most Often": "주이용기기",
    "Genre Preference": "선호장르",
    "Region": "지역",
    "Payment History (On-Time/Delayed)": "결제이력_정시_지연",
    "Subscription Plan": "구독플랜",
    "Churn status": "이탈여부",
    "Support Queries Logged": "지원문의_건수",
    "Age": "나이",
    "Monthly Income ($)": "월소득_달러",
    "Promotional Offers Used": "프로모션_사용",
    "Number of Profiles Created": "생성프로필수"
}

df_netflix = df_netflix.drop('Customer ID', axis=1) # 쓸일 없을듯

In [32]:
# df_netflix.info()

## ## 라벨인코딩

In [33]:
from sklearn.preprocessing import LabelEncoder # 전처리

encoders = [
    ('Device Used Most Often', LabelEncoder()),    
    ('Genre Preference', LabelEncoder()),
    ('Region', LabelEncoder()),
    ('Payment History (On-Time/Delayed)', LabelEncoder()),
    ('Subscription Plan', LabelEncoder()),
    ('Churn status', LabelEncoder())
]


for i, (col_name, le) in enumerate(encoders):
    df_netflix[col_name + '_le'] = le.fit_transform(df_netflix[col_name])


## ## 데이터 분류

In [34]:

x = df_netflix[['Subscription Length (Months)', 'Customer Satisfaction Score (1-10)',
       'Daily Watch Time (Hours)', 'Engagement Rate (1-10)',
       'Support Queries Logged', 'Age', 'Monthly Income ($)',
       'Promotional Offers Used', 'Number of Profiles Created',
       'Device Used Most Often_le', 'Genre Preference_le', 'Region_le',
       'Payment History (On-Time/Delayed)_le', 'Subscription Plan_le']]
y = df_netflix['Churn status_le']


In [35]:
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=random_number)

In [36]:
from imblearn.over_sampling import SMOTE

print(f'Before SMOTE: {np.bincount(y)}')

smote = SMOTE(random_state=random_number)

x_train, y_train = smote.fit_resample(x_train, y_train)


print(f'After SMOTE: {np.bincount(y_train)}')
print(x.shape, y.shape)
x_train.shape, y_train.shape

Before SMOTE: [ 12042 102071]
After SMOTE: [76532 76532]
(114113, 14) (114113,)


((153064, 14), (153064,))

## ## 데이터 정규화

In [37]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()

x_train_scaled = sc.fit_transform(x_train)
x_test_scaled = sc.transform(x_test)

## ## 로지스틱 회귀

In [38]:

from sklearn.linear_model import LogisticRegression


lr_clf = LogisticRegression()

lr_clf.fit(x_train_scaled, y_train)

lr_clf.score(x_train_scaled, y_train), lr_clf.score(x_test_scaled, y_test)

(0.7080044948518267, 0.6912264713098952)

In [39]:
encoders[5][1].classes_

array(['No', 'Yes'], dtype=object)

In [40]:
x_test[:20]

Unnamed: 0,Subscription Length (Months),Customer Satisfaction Score (1-10),Daily Watch Time (Hours),Engagement Rate (1-10),Support Queries Logged,Age,Monthly Income ($),Promotional Offers Used,Number of Profiles Created,Device Used Most Often_le,Genre Preference_le,Region_le,Payment History (On-Time/Delayed)_le,Subscription Plan_le
54119,12.0,1.0,1.0,9.0,3.0,50.0,3271.0,2.0,4.0,3,3,3,1,2
87007,12.0,3.0,1.0,4.0,3.0,40.0,4050.0,3.0,3.0,2,3,0,1,2
92304,3.0,9.0,4.0,1.0,6.0,40.0,902.0,4.0,4.0,4,4,2,1,0
19471,12.0,6.0,3.0,1.0,0.0,30.0,705.0,2.0,3.0,0,4,2,1,1
88072,12.0,4.0,4.0,2.0,7.0,70.0,6126.0,4.0,4.0,2,4,0,0,2
34451,1.0,3.0,2.0,3.0,3.0,30.0,2490.0,2.0,2.0,1,1,4,0,0
5425,12.0,10.0,5.0,4.0,1.0,50.0,7392.0,4.0,4.0,1,5,1,0,2
82056,1.0,5.0,3.0,2.0,5.0,30.0,6089.0,3.0,2.0,1,6,0,0,0
50389,3.0,6.0,5.0,4.0,1.0,60.0,2034.0,4.0,5.0,2,2,2,0,2
8943,1.0,10.0,3.0,2.0,7.0,60.0,2437.0,0.0,5.0,2,6,0,0,2


In [41]:
y_pred = lr_clf.predict(x_test_scaled)




# print(y_pred) # ['Bream', 'Smelt', 'Smelt']
print(lr_clf.classes_)
# print(lr_clf.predict_proba(x_test_scaled[6:16])) # 클래스별 확율

[0 1]


In [42]:
print(lr_clf.coef_)         # feature의 가중치
print(lr_clf.intercept_)    # 절편의 가중치

[[ 0.04893188  0.01569848  0.00689245  0.01381346  0.4441018  -1.08579999
   0.02428848 -0.37159206 -0.00477188  0.01553582  0.00343254  0.00446083
   0.04628848  0.02536627]]
[0.03803641]


In [43]:
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, y_pred)
print("혼동 행렬:")
print(cm)

tn = cm[0][0]
tp = cm[1][1]
fp = cm[0][1]
fn = cm[1][0]



# tn, tp, fp, fn

accuracy = (tp + tn) / (tn + tp + fp + fn)
precision = tp / (tp + fp)

accuracy, precision


혼동 행렬:
[[ 2138   852]
 [ 7957 17582]]


(np.float64(0.6912264713098952), np.float64(0.953781056742975))