In [1004]:
import numpy as np
import pandas as pd
import plotly.express as px
import math

In [1005]:
# Covariance Matrix 등 표시할 때 줄이 넘어가는 것 방지

np.set_printoptions(linewidth=65536)

In [1006]:
# 여러 번 반복해도 똑같은 데이터 생성

np.random.seed(2025)

In [1007]:
# 데이터 생성

n_data = 10000

# features
x1 = np.random.uniform(-1, 1, n_data)   # feature 1
x2 = np.random.uniform(10, 20, n_data)  # feature 2
x3 = np.random.uniform(0, 3, n_data)    # feature 3
x4 = 3.0 * x1 + 4.0 * x2 + np.random.normal(0, 1.5, n_data)     # feature 4
x5 = 3.0 * x1 - 2.5 * x3 + np.random.normal(0, 1.25, n_data)    # feature 5
x6 = 10.0 * x2 + 5.0 * x3 + np.random.normal(0, 1.75, n_data)   # feature 6
x7 = 2.0 * x4 + 1.5 * x5 + 10.0 * x6 + np.random.normal(0, 3, n_data)  # feature 7

# target features
Z = x1 + 3.5 * x2 - 1.5 * x3 + 0.25 * x4 + 2.25 * x5 - 5.0 * x6 + 0.05 * x7 + np.random.normal(0, 20.0, n_data)
Y = np.where(Z > -650, 1, 0)

In [1008]:
# features 를 DataFrame 으로 묶기

X = pd.DataFrame(np.array([x1, x2, x3, x4, x5, x6, x7]).T,
                 columns = ['x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7'])

In [1009]:
X

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7
0,-0.729024,11.799575,2.210763,46.591230,-6.715616,129.019107,1373.959493
1,0.775703,18.325762,1.015722,75.860538,0.362405,188.361382,2038.659560
2,0.865211,13.171807,0.760341,54.255391,-0.075004,135.691516,1464.496608
3,-0.108864,14.929943,0.135729,59.800570,2.189315,149.282323,1614.552124
4,-0.223529,11.246902,2.806034,44.228312,-5.223614,125.049275,1328.552497
...,...,...,...,...,...,...,...
9995,-0.865921,13.634483,1.797690,53.288799,-7.188415,146.525520,1559.898396
9996,0.259917,19.905819,2.766057,80.774763,-4.985581,211.762857,2271.385457
9997,-0.476120,10.509216,1.620520,40.019763,-3.729654,113.509149,1205.795188
9998,-0.151336,17.137833,1.188643,65.893734,-3.705811,176.294488,1887.561149


In [1010]:
# X 에 대한 데이터 분포 출력

import plotly.figure_factory as ff

for col in X.columns:
    bin_size = (X[col].max() - X[col].min()) / 100

    fig = ff.create_distplot(hist_data=[X[col].values.tolist()],
                             group_labels=[col],
                             bin_size=bin_size)
    fig.update_layout(width=900, height=520)

    fig.show()

In [1011]:
print(Y)

[1 0 1 ... 1 0 1]


In [1012]:
# 상관계수 출력

all_data = pd.DataFrame(np.array([x1, x2, x3, x4, x5, x6, x7, Y]).T,
                        columns = ['x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'y'])

corr = all_data.corr()

fig = ff.create_annotated_heatmap(
            z=corr.round(decimals=3).to_numpy(),
            x=corr.columns.tolist(),
            y=corr.index.tolist(),
            xgap=1,
            ygap=1,
            zmax=1,
            zmin=-1,
            showscale=True,
            colorscale=['#F53', '#BBC', '#57F'],
            hoverongaps=True)

fig.update_layout(title='correlation', width=700, height=450)

In [1013]:
# 1. 각 feature 가 평균 0, 표준편차 1 scaling 된 데이터로 변환

X_scaled = (X - np.mean(X, axis=0)) / np.std(X, axis=0)  # 또는 scikit-learn의 StandardScaler 사용 가능

In [1014]:
X_scaled

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7
0,-1.275738,-1.108602,0.805628,-1.141918,-0.967487,-0.971998,-0.999966
1,1.344625,1.163369,-0.563076,1.358784,1.347090,1.065152,1.119108
2,1.500496,-0.630885,-0.855568,-0.487110,1.204053,-0.742942,-0.711332
3,-0.195779,-0.018823,-1.570949,-0.013342,1.944506,-0.276386,-0.232953
4,-0.395459,-1.301005,1.487404,-1.343800,-0.479589,-1.108277,-1.144724
...,...,...,...,...,...,...,...
9995,-1.514134,-0.469813,0.332527,-0.569693,-1.122097,-0.371024,-0.407190
9996,0.446423,1.713436,1.441617,1.778643,-0.401750,1.868497,1.861041
9997,-0.835327,-1.557816,0.129611,-1.703368,0.008949,-1.504436,-1.536076
9998,-0.269741,0.749813,-0.365027,0.507243,0.016746,0.650910,0.637404


In [1015]:
# 2. Covariance Matrix 계산

cov_matrix = np.cov(X_scaled, rowvar=False)

In [1016]:
print(cov_matrix)

[[ 1.00010001 -0.00490141  0.0011496   0.14103372  0.56468024 -0.00459686  0.01463943]
 [-0.00490141  1.00010001  0.0051673   0.98090967 -0.00150635  0.98693946  0.98983796]
 [ 0.0011496   0.0051673   1.00010001  0.00801632 -0.71504238  0.15501967  0.13418846]
 [ 0.14103372  0.98090967  0.00801632  1.00010001  0.07866411  0.96844361  0.97527477]
 [ 0.56468024 -0.00150635 -0.71504238  0.07866411  1.00010001 -0.10877214 -0.08054929]
 [-0.00459686  0.98693946  0.15501967  0.96844361 -0.10877214  1.00010001  0.99957492]
 [ 0.01463943  0.98983796  0.13418846  0.97527477 -0.08054929  0.99957492  1.00010001]]


In [1017]:
# 3. Eigenvalue, Eigenvector 계산

eigenvalues, eigenvectors = np.linalg.eig(cov_matrix)

In [1018]:
for i, evalue in enumerate(eigenvalues):
    print(f'Eigenvalue {i} : {evalue}')

Eigenvalue 0 : 3.962555246044329
Eigenvalue 1 : 1.9313836333924168
Eigenvalue 2 : 1.0016757004389292
Eigenvalue 3 : 0.08987251698007105
Eigenvalue 4 : 0.012977028283058342
Eigenvalue 5 : 0.0021877923476341554
Eigenvalue 6 : 4.8152520561966555e-05


In [1019]:
for i, evector in enumerate(eigenvectors):
    print(f'Eigenvector {i} : {evector}')

Eigenvector 0 : [-1.87746638e-02  4.40511911e-01 -7.78663441e-01  4.26711629e-01 -1.29951283e-01  1.76016737e-02 -2.09904156e-04]
Eigenvector 1 : [-0.49830273  0.0429712   0.10233028  0.02356247 -0.20509283  0.83466831  0.00932382]
Eigenvector 2 : [-0.05831999 -0.54640456 -0.61876041 -0.5412365   0.1013905   0.10934857  0.00151438]
Eigenvector 3 : [-0.49423351  0.10572344 -0.01507457  0.10416016  0.84973702 -0.09219631 -0.05409287]
Eigenvector 4 : [ 0.02966641  0.70174398  0.00364165 -0.71149655  0.01716121  0.00555569 -0.01045489]
Eigenvector 5 : [-0.50099987 -0.03966721  0.00814796 -0.06162678 -0.36335184 -0.37795186 -0.68460881]
Eigenvector 6 : [-0.50178975 -0.01868791  0.00635735 -0.05958607 -0.27640096 -0.37374222  0.72676417]


In [1020]:
# 4. Eigenvalue가 큰 순으로 정렬

eigen_collection = []

for evalue, evector in zip(eigenvalues, eigenvectors):
    eigen_collection.append({'eigenvalue': evalue, 'eigenvector': evector})

# np.linalg.eig 함수 자체에서 이미 정렬을 해 주는 것으로 추정하지만, 확실히 하기 위함
eigen_collection.sort(key=lambda x: x['eigenvalue'], reverse=True)

In [1021]:
for i, info in enumerate(eigen_collection):
    print(f'Eigen value/vector {i} :\neigenvalue  = {info["eigenvalue"]}\neigenvector = {info["eigenvector"]}\n')

Eigen value/vector 0 :
eigenvalue  = 3.962555246044329
eigenvector = [-1.87746638e-02  4.40511911e-01 -7.78663441e-01  4.26711629e-01 -1.29951283e-01  1.76016737e-02 -2.09904156e-04]

Eigen value/vector 1 :
eigenvalue  = 1.9313836333924168
eigenvector = [-0.49830273  0.0429712   0.10233028  0.02356247 -0.20509283  0.83466831  0.00932382]

Eigen value/vector 2 :
eigenvalue  = 1.0016757004389292
eigenvector = [-0.05831999 -0.54640456 -0.61876041 -0.5412365   0.1013905   0.10934857  0.00151438]

Eigen value/vector 3 :
eigenvalue  = 0.08987251698007105
eigenvector = [-0.49423351  0.10572344 -0.01507457  0.10416016  0.84973702 -0.09219631 -0.05409287]

Eigen value/vector 4 :
eigenvalue  = 0.012977028283058342
eigenvector = [ 0.02966641  0.70174398  0.00364165 -0.71149655  0.01716121  0.00555569 -0.01045489]

Eigen value/vector 5 :
eigenvalue  = 0.0021877923476341554
eigenvector = [-0.50099987 -0.03966721  0.00814796 -0.06162678 -0.36335184 -0.37795186 -0.68460881]

Eigen value/vector 6 :
ei

In [1022]:
# 5. Eigenvalue가 가장 큰 K 개의 Eigenvector 선택

K = 3
PCA_eigenvectors = np.array([info['eigenvector'] for info in eigen_collection[:K]])

for i in range(K):
    print(f'Eigen value/vector {i} :\neigenvector = {PCA_eigenvectors[i]}\n')

Eigen value/vector 0 :
eigenvector = [-1.87746638e-02  4.40511911e-01 -7.78663441e-01  4.26711629e-01 -1.29951283e-01  1.76016737e-02 -2.09904156e-04]

Eigen value/vector 1 :
eigenvector = [-0.49830273  0.0429712   0.10233028  0.02356247 -0.20509283  0.83466831  0.00932382]

Eigen value/vector 2 :
eigenvector = [-0.05831999 -0.54640456 -0.61876041 -0.5412365   0.1013905   0.10934857  0.00151438]



In [1023]:
# NumPy.dot 을 이용하여 scaling 된 학습 데이터와 행렬곱 가능하게 변환
PCA_eigenvectors = PCA_eigenvectors.T

In [1024]:
PCA_eigenvectors

array([[-1.87746638e-02, -4.98302731e-01, -5.83199892e-02],
       [ 4.40511911e-01,  4.29711972e-02, -5.46404559e-01],
       [-7.78663441e-01,  1.02330279e-01, -6.18760414e-01],
       [ 4.26711629e-01,  2.35624684e-02, -5.41236501e-01],
       [-1.29951283e-01, -2.05092825e-01,  1.01390502e-01],
       [ 1.76016737e-02,  8.34668310e-01,  1.09348571e-01],
       [-2.09904156e-04,  9.32382063e-03,  1.51438429e-03]])

In [1025]:
# 특징 정보 보존 비율 계산

from itertools import accumulate

sum_eigenvalues = []  # a 번째 원소는 상위 a 위까지의 eigenvalue의 합
preserve_rates = []   # a 번째 원소는 정보 보존 비율 (상위 a 위까지의 eigenvalue가 전체 eigenvalue 중 차지하는 비율)

eigenvalues = [info['eigenvalue'] for info in eigen_collection]
sum_eigenvalues = list(accumulate(eigenvalues))
preserve_rates = [se / sum_eigenvalues[-1] for se in sum_eigenvalues]

In [1026]:
for i, (ev, cum, rate) in enumerate(zip(eigenvalues, sum_eigenvalues, preserve_rates), 1):
    print(f'{i} 번째 Eigenvalue: {ev:7.4f}, 누적 합: {cum:7.4f}, 정보 보존 비율: {rate*100:6.2f}%')

1 번째 Eigenvalue:  3.9626, 누적 합:  3.9626, 정보 보존 비율:  56.60%
2 번째 Eigenvalue:  1.9314, 누적 합:  5.8939, 정보 보존 비율:  84.19%
3 번째 Eigenvalue:  1.0017, 누적 합:  6.8956, 정보 보존 비율:  98.50%
4 번째 Eigenvalue:  0.0899, 누적 합:  6.9855, 정보 보존 비율:  99.78%
5 번째 Eigenvalue:  0.0130, 누적 합:  6.9985, 정보 보존 비율:  99.97%
6 번째 Eigenvalue:  0.0022, 누적 합:  7.0007, 정보 보존 비율: 100.00%
7 번째 Eigenvalue:  0.0000, 누적 합:  7.0007, 정보 보존 비율: 100.00%


In [1027]:
print(f'{K} 차원으로 PCA 시 정보 보존 비율 : {preserve_rates[K-1] * 100.0}%')

3 차원으로 PCA 시 정보 보존 비율 : 98.4989288345384%


In [1028]:
# 6. K 개의 Eigenvector 축을 선택하여 데이터를 K 차원으로 축소

column_names = [f'new axis {i}' for i in range(1, K + 1)]
X_PCA_applied = np.dot(np.array(X_scaled), PCA_eigenvectors)
X_PCA_applied = pd.DataFrame(X_PCA_applied, columns=column_names)

In [1029]:
X_PCA_applied

Unnamed: 0,new axis 1,new axis 2,new axis 3
0,-1.470156,0.021405,0.593808
1,1.348946,-0.022438,-0.846353
2,-0.017135,-1.747524,1.090006
3,0.955424,-0.695989,1.167544
4,-2.254229,-0.575661,0.369357
...,...,...,...
9995,-0.541179,0.671569,0.292639
9996,0.467547,1.699928,-2.650545
9997,-1.525645,-0.949431,1.575714
9998,0.845192,0.687034,-0.368805


In [1030]:
# Logistie Regression 을 이용하여 학습

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

use_PCA = True  # 학습 대상을 PCA 적용된 것으로 할지 선택

logistic_regression = LogisticRegression()

if use_PCA:
    X_PCA_applied_train, X_PCA_applied_test, Y_train, Y_test = \
        train_test_split(X_PCA_applied, Y, test_size=0.2, random_state=2025)
    logistic_regression.fit(X_PCA_applied_train, Y_train)

    Y_pred = logistic_regression.predict(X_PCA_applied_test)

else:
    X_scaled_train, X_scaled_test, Y_train, Y_test = \
        train_test_split(X_scaled, Y, test_size=0.2, random_state=2025)
    logistic_regression.fit(X_scaled_train, Y_train)

    Y_pred = logistic_regression.predict(X_scaled_test)

In [1031]:
# prediction result

print(f'prediction examples : {Y_pred[:20]}')
print(f'ground truth        : {Y_test[:20]}')

prediction examples : [0 0 0 0 0 1 0 1 1 0 1 1 0 0 1 1 1 1 0 1]
ground truth        : [0 0 0 0 0 1 0 1 1 0 1 1 0 0 1 1 1 1 0 1]


In [1032]:
# Accuracy, Recall, Precision, F1 Score 계산 함수

def compute_scores(TP, TN, FP, FN):
    accuracy = (TP + TN) / (TP + TN + FP + FN)
    recall = TP / (TP + FN)
    precision = TP / (TP + FP)
    f1_score = 2 * recall * precision / (recall + precision)

    print(f'Accuracy  : {accuracy}\nRecall    : {recall}\nPrecision : {precision}\nF1 Score  : {f1_score}')

In [1033]:
# 성능지표 계산 (TP, TN, FP, FN, Accuracy, Recall, Precision, F1 Score)
# TP, TN, FP, FN 개수 계산

TP = sum(pred == 1 and Y_true == 1 for pred, Y_true in zip(Y_pred, Y_test))
TN = sum(pred == 0 and Y_true == 0 for pred, Y_true in zip(Y_pred, Y_test))
FP = sum(pred == 1 and Y_true == 0 for pred, Y_true in zip(Y_pred, Y_test))
FN = sum(pred == 0 and Y_true == 1 for pred, Y_true in zip(Y_pred, Y_test))

print(f'True  Positive : {TP}\nTrue  Negative : {TN}\nFalse Positive : {FP}\nFalse Negative : {FN}')

True  Positive : 1011
True  Negative : 905
False Positive : 45
False Negative : 39


In [1034]:
compute_scores(TP, TN, FP, FN)

Accuracy  : 0.958
Recall    : 0.9628571428571429
Precision : 0.9573863636363636
F1 Score  : 0.9601139601139602
