In [229]:
import numpy as np
import pandas as pd
import plotly.express as px
import math

# Generate a synthetic two-feature dataset for binary classification.
# A dedicated, seeded Generator makes the data (and therefore every metric
# computed from it) reproducible under Restart Kernel -> Run All.
n_data = 500
rng = np.random.default_rng(2025)

x1 = rng.uniform(-1, 1, n_data)  # independent variable 1
x2 = rng.uniform(1, 3, n_data)  # independent variable 2
# Latent continuous response: linear in x1 and x2 plus Gaussian noise
y = 2.5*x1 + 3.75*x2 + rng.normal(0, 1.25, n_data)
# Threshold the latent response at 8 to obtain the 0/1 label used for classification
y_prob = np.where(y >= 8, 1, 0)

data = pd.DataFrame({'X1': x1, 'X2': x2, 'Y': y_prob})

In [230]:
# Scatter plot of the samples in the (X1, X2) plane, coloured by the label Y
fig = px.scatter(
    data,
    x='X1',
    y='X2',
    color='Y',
    color_continuous_scale=px.colors.sequential.Redor_r,
    title="Logistic Regression Example Data",
)
# Small markers and a fixed canvas size
fig.update_traces(marker=dict(size=4.5)).update_layout(width=800, height=500)

fig.show()

In [231]:
# Separate the independent variables (features) from the dependent variable (label)

X = data.loc[:, ['X1', 'X2']]
Y_prob = data.loc[:, 'Y']

In [232]:
# Split into 80% training data and 20% test data (fixed random_state for a stable split)

from sklearn.model_selection import train_test_split

(X_train, X_test,
 Y_train, Y_test) = train_test_split(X, Y_prob, test_size=0.2, random_state=2025)

**1. Linear Regression**

In [233]:
# Linear Regression: fit an ordinary linear model on the training split

from sklearn.linear_model import LinearRegression

linear_regression = LinearRegression()
linear_regression.fit(X=X_train, y=Y_train)

In [234]:
# Print the fitted linear regression equation

linear_intercept = linear_regression.intercept_
linear_coef = linear_regression.coef_
# One coefficient per feature column, in the order X1, X2
linear_coef_x1, linear_coef_x2 = linear_coef[:2]

print(f'y = {linear_coef_x1} * x1 + {linear_coef_x2} * x2 + {linear_intercept}')

y = 0.29998069887107637 * x1 + 0.5628504249536987 * x2 + -0.6875412107273049


In [235]:
# Draw the line on which the linear regression output y equals 0.5
#    y = w1 * x1 + w2 * x2 + b =  0.5
# =>               w2 * x2     =  0.5 - (w1 * x1 + b)
# =>                    x2     = [0.5 - (w1 * x1 + b)] / w2

import plotly.graph_objects as go

# Two end points (min and max of x1) are enough to draw a straight line
x1_ = np.array([np.min(x1), np.max(x1)])
x2_ = (0.5 - (linear_coef_x1 * x1_ + linear_intercept)) / linear_coef_x2

# Draw on a copy of the base scatter figure. Adding the trace to `fig` itself
# would stack one more 'Regression Line' each time this cell is re-run.
fig_linear = go.Figure(fig)
fig_linear.add_trace(go.Scatter(x=x1_,
                                y=x2_,
                                line_shape='linear',
                                line=dict(
                                    color='#E72',
                                    width=1.5
                                ),
                                marker={'size': 0.1},
                                name='Regression Line'))

fig_linear.update_layout(title='1. Linear Regression Result Data')
fig_linear.show()

In [236]:
# Performance metrics (TP, TN, FP, FN, Recall, Precision, F1 Score)
# Count TP, TN, FP, FN on the test split

# Evaluate the fitted linear equation for every test row at once
linear_score = linear_coef_x1 * X_test['X1'] + linear_coef_x2 * X_test['X2'] + linear_intercept

# A row counts as "predicted 1" when the linear output is at least 0.5
predicted_pos = (linear_score >= 0.5).to_numpy()
actual_pos = (Y_test >= 1).to_numpy()

TP = (predicted_pos & actual_pos).sum()
TN = (~predicted_pos & ~actual_pos).sum()
FP = (predicted_pos & ~actual_pos).sum()
FN = (~predicted_pos & actual_pos).sum()

print(f'True  Positive : {TP}\nTrue  Negative : {TN}\nFalse Positive : {FP}\nFalse Negative : {FN}')

True  Positive : 36
True  Negative : 50
False Positive : 6
False Negative : 8


In [237]:
# Function that computes and prints Recall, Precision and F1 Score

def compute_scores(TP, TN, FP, FN):
    """Print recall, precision and F1 score for binary confusion counts.

    Args:
        TP: number of true positives.
        TN: number of true negatives (accepted for a uniform call signature;
            none of the three printed scores depends on it).
        FP: number of false positives.
        FN: number of false negatives.

    Returns:
        None. The three scores are written to stdout.

    A score whose denominator is zero (no actual positives, no predicted
    positives, or recall + precision == 0) is reported as 0.0 rather than
    failing with a division by zero.
    """
    actual_positives = TP + FN
    predicted_positives = TP + FP

    recall = TP / actual_positives if actual_positives else 0.0
    precision = TP / predicted_positives if predicted_positives else 0.0

    # F1 is the harmonic mean of recall and precision; undefined when both are 0
    score_sum = recall + precision
    f1_score = 2 * recall * precision / score_sum if score_sum else 0.0

    print(f'Recall    : {recall}\nPrecision : {precision}\nF1 Score  : {f1_score}')

In [238]:
# Print recall, precision and F1 score for the TP/TN/FP/FN counts currently held
# in the kernel (run right after the linear-regression counting cell)
compute_scores(TP, TN, FP, FN)

Recall    : 0.8181818181818182
Precision : 0.8571428571428571
F1 Score  : 0.8372093023255814


**2. Logistic Regression**

In [239]:
# Logistic Regression: fit a classifier with default settings on the training split

from sklearn.linear_model import LogisticRegression

logistic_regression = LogisticRegression()
logistic_regression.fit(X=X_train, y=Y_train)

In [240]:
# Print the fitted Logistic Regression equation

# coef_ and intercept_ are indexed with [0] to take the first (only used) row / entry
logistic_intercept = logistic_regression.intercept_[0]
logistic_coef = logistic_regression.coef_[0]
logistic_coef_x1, logistic_coef_x2 = logistic_coef[:2]

print(f'z = {logistic_coef_x1} * x1 + {logistic_coef_x2} * x2 + {logistic_intercept}')
print('y = sigmoid(z) = 1 / (1 + exp(-z))')

z = 2.551902154974755 * x1 + 4.179550081886626 * x2 + -8.885509357887493
y = sigmoid(z) = 1 / (1 + exp(-z))


In [241]:
# Predict class labels for the test split and show the first ten of them

Y_logistic_pred = logistic_regression.predict(X=X_test)

print(f'prediction examples : {Y_logistic_pred[:10]}')

prediction examples : [0 0 1 0 1 1 0 0 1 1]


In [242]:
# Performance metrics (TP, TN, FP, FN, Recall, Precision, F1 Score)
# Count TP, TN, FP, FN on the test split

# Compare predictions and labels position by position
actual_label = Y_test.to_numpy()
pred_is_1 = Y_logistic_pred == 1
pred_is_0 = Y_logistic_pred == 0

TP = (pred_is_1 & (actual_label == 1)).sum()
TN = (pred_is_0 & (actual_label == 0)).sum()
FP = (pred_is_1 & (actual_label == 0)).sum()
FN = (pred_is_0 & (actual_label == 1)).sum()

print(f'True  Positive : {TP}\nTrue  Negative : {TN}\nFalse Positive : {FP}\nFalse Negative : {FN}')

True  Positive : 35
True  Negative : 50
False Positive : 6
False Negative : 9


In [243]:
# Print recall, precision and F1 score for the TP/TN/FP/FN counts currently held
# in the kernel (run right after the logistic-regression counting cell)
compute_scores(TP, TN, FP, FN)

Recall    : 0.7954545454545454
Precision : 0.8536585365853658
F1 Score  : 0.8235294117647058


**3. MSE (Mean Squared Error) 분석**

In [244]:
# Predictions obtained by evaluating the fitted linear regression equation on the test split

Y_linear_formula_pred = (
    X_test['X1'] * linear_coef_x1
    + X_test['X2'] * linear_coef_x2
    + linear_intercept
)
print(Y_linear_formula_pred)

34    -0.154010
400    0.352182
239    0.656882
413   -0.172176
65     0.850232
         ...   
251    0.627895
282    0.246434
17     0.531347
93     0.935018
312    1.212668
Length: 100, dtype: float64


In [245]:
# Scatter plot: linear regression output (x axis) against the true label (y axis)

fig = px.scatter(
    x=Y_linear_formula_pred,
    y=Y_test,
    title='Linear Regression Prediction & True Values on dataset',
)

# Reference line y = x between (0, 0) and (1, 1)
identity_line = go.Scatter(
    x=[0, 1],
    y=[0, 1],
    line_shape='linear',
    line=dict(color='#309', width=1.5),
    marker=dict(size=0.1),
    name='y = x',
)
fig.add_trace(identity_line)

fig.update_layout(
    xaxis_title="Prediction",
    yaxis_title="True Value",
    width=600,
    height=600,
)
fig.show()

In [246]:
# Predictions obtained by evaluating the fitted logistic regression equation on the test split

# Linear part z, then the sigmoid 1 / (1 + exp(-z))
Z_logistic_formula_pred = (
    X_test['X1'] * logistic_coef_x1
    + X_test['X2'] * logistic_coef_x2
    + logistic_intercept
)
Y_logistic_formula_pred = 1.0 / (1.0 + np.exp(-Z_logistic_formula_pred))
print(Y_logistic_formula_pred)

34     0.005850
400    0.235628
239    0.768656
413    0.004805
65     0.915808
         ...   
251    0.767128
282    0.100663
17     0.588415
93     0.964797
312    0.995909
Length: 100, dtype: float64


In [247]:
# Scatter plot: logistic regression output (x axis) against the true label (y axis)

fig = px.scatter(
    x=Y_logistic_formula_pred,
    y=Y_test,
    title='Logistic Regression Prediction & True Values on dataset',
)

# Reference line y = x between (0, 0) and (1, 1)
identity_line = go.Scatter(
    x=[0, 1],
    y=[0, 1],
    line_shape='linear',
    line=dict(color='#309', width=1.5),
    marker=dict(size=0.1),
    name='y = x',
)
fig.add_trace(identity_line)

fig.update_layout(
    xaxis_title="Prediction",
    yaxis_title="True Value",
    width=600,
    height=600,
)

fig.show()

In [248]:
# Compare MSE for Linear/Logistic Regression on the test split

from sklearn.metrics import mean_squared_error

# Pass the arguments by keyword in the documented (y_true, y_pred) roles.
# The squared error is symmetric, so the values are the same as with the
# swapped positional call, but the intent is now explicit.
MSE_linear = mean_squared_error(y_true=Y_test, y_pred=Y_linear_formula_pred)
MSE_logistic = mean_squared_error(y_true=Y_test, y_pred=Y_logistic_formula_pred)

print(f'Linear   Regression MSE : {MSE_linear}')
print(f'Logistic Regression MSE : {MSE_logistic}')

Linear   Regression MSE : 0.11244970690143269
Logistic Regression MSE : 0.09787727233221447
