# 4. 다중 분류 - Logistic Regression

## 4-1. 데이터 준비하기

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Load the fish dataset; ori_data keeps the frame exactly as read, while
# data is an independent copy used by the following cells.
ori_data = pd.read_csv('data/02_fish/fish.csv')
data = ori_data.copy(deep=True)
# Preview the first five rows.
data.head(n=5)

Unnamed: 0,Species,Weight,Vertical_Length,Diagonal_Length,Cross_Length,Height,Width
0,Bream,242.0,23.2,25.4,30.0,11.52,4.02
1,Bream,290.0,24.0,26.3,31.2,12.48,4.3056
2,Bream,340.0,23.9,26.5,31.1,12.3778,4.6961
3,Bream,363.0,26.3,29.0,33.5,12.73,4.4555
4,Bream,430.0,26.5,29.0,34.0,12.444,5.134


In [2]:
# Build the input features: select the six measurement columns
# (the Species column is not included).
input_df = data[[
    'Weight',
    'Vertical_Length',
    'Diagonal_Length',
    'Cross_Length',
    'Height',
    'Width',
]]
# Preview the first five rows.
input_df.head(n=5)

Unnamed: 0,Weight,Vertical_Length,Diagonal_Length,Cross_Length,Height,Width
0,242.0,23.2,25.4,30.0,11.52,4.02
1,290.0,24.0,26.3,31.2,12.48,4.3056
2,340.0,23.9,26.5,31.1,12.3778,4.6961
3,363.0,26.3,29.0,33.5,12.73,4.4555
4,430.0,26.5,29.0,34.0,12.444,5.134


In [3]:
# Convert the input features to a 2-D NumPy array.
input_data = input_df.to_numpy()
# Preview the first five rows.
input_data[:5]

array([[242.    ,  23.2   ,  25.4   ,  30.    ,  11.52  ,   4.02  ],
       [290.    ,  24.    ,  26.3   ,  31.2   ,  12.48  ,   4.3056],
       [340.    ,  23.9   ,  26.5   ,  31.1   ,  12.3778,   4.6961],
       [363.    ,  26.3   ,  29.    ,  33.5   ,  12.73  ,   4.4555],
       [430.    ,  26.5   ,  29.    ,  34.    ,  12.444 ,   5.134 ]])

In [4]:
# Convert the target labels (the Species column) to a 1-D NumPy array.
target_data = data['Species'].to_numpy()
# Preview the first five labels.
target_data[:5]

array(['Bream', 'Bream', 'Bream', 'Bream', 'Bream'], dtype=object)

## 4-2. 데이터 나누기

In [5]:
from sklearn.model_selection import train_test_split

# Split into training and test sets, stratifying on the species labels and
# fixing random_state so the split is repeatable.
train_input, test_input, train_target, test_target = train_test_split(
    input_data,
    target_data,
    stratify=target_data,
    random_state=42,
)
# Show the shapes of the two input arrays.
(train_input.shape, test_input.shape)

((119, 6), (40, 6))

## 4-3. 데이터 전처리 - 표준화

In [7]:
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training inputs only and scale them in one step,
# then apply the same fitted scaler to the test inputs.
ss = StandardScaler()
train_scaled = ss.fit_transform(train_input)
test_scaled = ss.transform(test_input)

In [8]:
# Preview the first five rows of the scaled training inputs.
train_scaled[:5]

array([[-0.75628803, -0.64716022, -0.66065677, -0.62357446, -0.78015159,
        -0.45043644],
       [-0.45991057, -0.12483205, -0.1248453 , -0.24414603, -0.4293487 ,
         0.03516919],
       [ 0.07356886, -0.00991985,  0.0212851 ,  0.2165885 ,  0.79541208,
         0.37481797],
       [ 1.54063728,  1.00339682,  1.0441979 ,  1.23743166,  2.29283234,
         1.34130358],
       [-0.87483902, -0.79341211, -0.75807703, -0.82232269, -0.80672937,
        -0.5697143 ]])

## 4-4. 모델 학습 및 평가

In [41]:
from sklearn.linear_model import LogisticRegression

# Create the classifier with C=20 and max_iter=1000, then fit it on the
# scaled training inputs and their species labels.
lr = LogisticRegression(C=20, max_iter=1000)
lr.fit(train_scaled, train_target)

LogisticRegression(C=20, max_iter=1000)

In [42]:
# Score the fitted model on the scaled test set.
lr.score(test_scaled, test_target)

0.975

In [32]:
# Show the class labels stored on the fitted model.
# NOTE(review): the columns of predict_proba / decision_function presumably
# follow this order — confirm against the scikit-learn documentation.
lr.classes_

array(['Bream', 'Parkki', 'Perch', 'Pike', 'Roach', 'Smelt', 'Whitefish'],
      dtype=object)

In [33]:
# Predict the species for the first five test samples.
lr.predict(test_scaled[:5])

array(['Roach', 'Perch', 'Perch', 'Parkki', 'Parkki'], dtype=object)

In [34]:
# Per-class probabilities for the first five test samples,
# displayed rounded to three decimal places.
proba = lr.predict_proba(test_scaled[:5])
np.round(proba, decimals=3)

array([[0.   , 0.029, 0.231, 0.002, 0.691, 0.01 , 0.036],
       [0.   , 0.033, 0.563, 0.001, 0.36 , 0.003, 0.041],
       [0.   , 0.062, 0.554, 0.001, 0.338, 0.017, 0.028],
       [0.003, 0.931, 0.001, 0.   , 0.05 , 0.   , 0.014],
       [0.001, 0.885, 0.004, 0.   , 0.092, 0.002, 0.017]])

In [35]:
# Show the learned coefficients and intercepts as a tuple.
lr.coef_, lr.intercept_

(array([[-1.44151471, -1.61554573, -0.78704717,  3.12219706,  8.05222867,
         -0.51441249],
        [ 0.0540286 , -0.79147455, -1.39886916, -3.3505043 ,  6.5628741 ,
         -1.69683318],
        [ 2.86628839,  2.39927841,  4.9803279 , -9.12942406, -6.44285264,
          4.44411986],
        [-0.61817284,  2.66193984,  2.64162566,  2.84384137, -3.38902286,
         -2.29446439],
        [-1.97439561, -1.10966889, -4.68968845,  5.31581174, -0.97972708,
          2.15547895],
        [-1.36028883,  1.4156424 ,  0.57616558,  0.46322005, -4.77888694,
         -4.47065894],
        [ 2.474055  , -2.96017148, -1.32251436,  0.73485815,  0.97538673,
          2.37677018]]),
 array([ 0.06997354, -0.184137  ,  2.66062997, -0.64745721,  2.20829215,
        -6.56487118,  2.45756973]))

### 다중 분류는 클래스마다 z값을 하나씩 계산
* 이진 분류에서는 z값을 시그모이드 함수를 이용해 0과 1 사이의 값으로 변환
* 다중 분류에서는 클래스별 z값을 소프트맥스 함수를 이용해 확률로 변환 (각 값이 0~1 사이가 되고 전체 합이 1이 되도록 함)

In [36]:
# Raw decision scores (the z values) for the first five test samples,
# one score per class in each row.
decision = lr.decision_function(test_scaled[:5])
decision

array([[-4.44583898,  0.41174738,  2.50420277, -2.09900932,  3.59931299,
        -0.62589087,  0.65547603],
       [-4.3075289 ,  0.82940819,  3.6772554 , -2.84480394,  3.2295491 ,
        -1.64835249,  1.06447264],
       [-5.57729539,  1.42128826,  3.61237611, -3.33130325,  3.11792219,
         0.14706522,  0.60994685],
       [-0.1809579 ,  5.49678219, -1.44529013, -5.07400398,  2.5661694 ,
        -2.69638904,  1.33368946],
       [-1.98422348,  4.9340323 , -0.53707309, -4.94249124,  2.66785461,
        -1.113885  ,  0.97578591]])

In [37]:
from scipy.special import softmax

# Turn each row of decision scores into probabilities with softmax
# (normalizing along the last axis of the 2-D array), then display the
# result rounded to three decimal places.
proba = softmax(decision, axis=-1)
proba.round(decimals=3)

array([[0.   , 0.029, 0.231, 0.002, 0.691, 0.01 , 0.036],
       [0.   , 0.033, 0.563, 0.001, 0.36 , 0.003, 0.041],
       [0.   , 0.062, 0.554, 0.001, 0.338, 0.017, 0.028],
       [0.003, 0.931, 0.001, 0.   , 0.05 , 0.   , 0.014],
       [0.001, 0.885, 0.004, 0.   , 0.092, 0.002, 0.017]])