# 10장. 차원 축소 (Dimension Reduction) 과제

In [None]:
!pip install seaborn

## 1. 데이터셋

In [None]:
import matplotlib.pyplot as plt
import os
from typing import List, Tuple
import csv
from scratch.linear_algebra import Vector, get_column

###  1.1 데이터셋 다운로드

In [None]:
import requests

# Download the WDBC dataset once and cache it locally; later cells read the
# cached copy through `dataset_path`.
dataset_path = os.path.join('data', 'wdbc.data')
if not os.path.exists(dataset_path):
    # open(..., "w") does not create missing parent directories, so make sure
    # the cache directory exists on a fresh checkout.
    os.makedirs(os.path.dirname(dataset_path), exist_ok=True)

    # A timeout keeps the notebook from hanging forever on a dead connection.
    data = requests.get(
        "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data",
        timeout=30)
    # Fail loudly on an HTTP error instead of caching an error page as data.
    data.raise_for_status()

    with open(dataset_path, "w") as f:
        f.write(data.text)

### 1.2 데이터 파싱

In [None]:
def parse_cancer_row(row: List[str]) -> Tuple[Vector, int]:
    """Split one raw CSV record into ``(measurements, label)``.

    Column 0 is ignored, column 1 holds the diagnosis code and every
    remaining column is converted to ``float``.

    Returns:
        A tuple of the float measurements and an integer label that is 1
        when the diagnosis code equals ``'M'`` and 0 for any other code.
    """
    features = list(map(float, row[2:]))
    is_m_code = row[1] == 'M'
    return features, int(is_m_code)

### 1.3 데이터 읽기
위스콘신 유방암 진단 데이터셋  (Wisconsin Breast Cancer Diagnostic dataset)
https://www.kaggle.com/uciml/breast-cancer-wisconsin-data

In [None]:
# Parse every record of the cached dataset into two parallel lists:
# X_cancer holds the feature vectors, y_cancer the matching 0/1 labels.
X_cancer : List[Vector] = []
y_cancer : List[int] = []
# newline="" is the mode the csv module documents for files given to csv.reader.
with open(dataset_path, newline="") as f:
    reader = csv.reader(f)
    for row in reader:
        # csv.reader yields an empty list for a blank line (for example a
        # trailing empty line); parse_cancer_row would fail on row[1] there.
        if not row:
            continue
        x, y = parse_cancer_row(row)
        X_cancer.append(x)
        y_cancer.append(y)

In [None]:
# Sanity check: show the first parsed feature vector and its label, one per line.
print(X_cancer[0], y_cancer[0], sep="\n")

### 1.4 데이터 컬럼명

In [None]:
# Display names for the 30 measurement columns, in file order: ten base
# feature names, each repeated with the suffixes _mean, _se and _worst.
columns = [
    f"{feature}_{suffix}"
    for suffix in ("mean", "se", "worst")
    for feature in (
        "radius", "texture", "perimeter", "area", "smoothness",
        "compactness", "concavity", "points", "symmetry", "dimension",
    )
]

## 2. 데이터 탐색

### 2.1 클래스 비율 확인

In [None]:
from collections import defaultdict
# Count the samples per diagnosis class: label 1 is tallied under 'M',
# every other label under 'B'.
label_type = defaultdict(int)
for label in ('M' if y == 1 else 'B' for y in y_cancer):
    label_type[label] += 1

In [None]:
# Show the class balance twice, side by side: a bar chart of the counts on the
# left and a pie chart of the same counts on the right.
plt.figure(figsize=(8,4))
# Left panel: one bar per diagnosis class, bar width 0.5.
plt.subplot(1, 2, 1)
plt.bar(label_type.keys(),
        label_type.values(),
        0.5,
        facecolor="#2E495E",
        edgecolor=(0, 0, 0))                # Black edges for each bar

# These labels apply to the left panel, which is still the current subplot.
plt.xlabel("Diagnosis")
plt.ylabel("# of diagnosis")
plt.title("Cancer diagnosis")

# Right panel: the same counts as a pie chart.
plt.subplot(1, 2, 2)
# `pies` keeps plt.pie's return value; nothing in this cell reads it afterwards.
pies = plt.pie(label_type.values(),
               labels=label_type.keys(),
               startangle=90)
plt.legend()
plt.show()

### 2.2 특징 별 히스토그램

In [None]:
from matplotlib import pyplot as plt
from typing import Dict

def draw_histogram(data: List[Vector], 
                   column_names: List[str], 
                   max_columns: int = 5):
    """Draw one 8-bin histogram per column of ``data`` on a grid of subplots.

    Args:
        data: Rows of equal length; the number of histograms is taken from
            the length of the first row.
        column_names: Title for each column, indexed like the columns of
            ``data``.
        max_columns: Maximum number of subplots per grid row.

    The grid is filled row by row. When the number of columns is not a
    multiple of ``max_columns`` the unused cells of the last row are hidden
    (previously those cells were indexed past the data and raised
    IndexError).
    """
    num_variables = len(data[0])
    num_rows = (num_variables - 1) // max_columns + 1
    num_cols = num_variables if num_rows == 1 else max_columns

    # squeeze=False always returns a 2-D array of axes, so 1x1 and 1xN grids
    # need no special handling.
    fig, axes = plt.subplots(num_rows,
                             num_cols,
                             figsize=(num_cols*4, num_rows*4),
                             squeeze=False)

    # axes.flat walks the grid row by row, matching data_index = num_cols * row + col.
    for data_index, current_ax in enumerate(axes.flat):
        if data_index >= num_variables:
            # The grid has more cells than there are variables to plot.
            current_ax.set_visible(False)
            continue

        current_ax.hist(get_column(data, data_index),
                        8,
                        facecolor="#2E495E",
                        edgecolor=(0, 0, 0))
        current_ax.set_title(column_names[data_index], fontsize=8)

    plt.show()

In [None]:
# One histogram per original feature column, titled with the names in `columns`.
draw_histogram(X_cancer, columns)

### 2.3 특징 쌍 별 산포도

In [None]:
from matplotlib import pyplot as plt
import seaborn as sns

def draw_scatter(points_by_class: Dict[str, List[Vector]], 
                 column_names: List[str], 
                 index_pairs: List[List], 
                 max_columns:int = 5):
    """Draw one scatter plot per feature-index pair, coloured by class.

    Args:
        points_by_class: Maps a class name to the points of that class.
        column_names: Name of each feature, used in the subplot titles.
        index_pairs: ``(i, j)`` pairs; feature ``i`` goes on the x axis and
            feature ``j`` on the y axis of the corresponding subplot.
        max_columns: Maximum number of subplots per grid row.

    The pairs are read from ``index_pairs``. Earlier the body read the
    notebook-level ``pairs`` variable instead, so the argument was ignored.
    Unused cells of a partially filled last row are hidden, and the legend
    is attached to the last subplot that was actually drawn.
    """
    num_pairs = len(index_pairs)
    num_rows = (num_pairs - 1) // max_columns + 1
    num_cols = num_pairs if num_rows == 1 else max_columns
    rgb_values = sns.color_palette("pastel", len(points_by_class))

    # squeeze=False always returns a 2-D array of axes, so 1x1 and 1xN grids
    # need no special handling.
    fig, axes = plt.subplots(num_rows,
                             num_cols,
                             figsize=(num_cols*5, num_rows*5),
                             squeeze=False)

    last_used_ax = None
    # axes.flat walks the grid row by row, matching num_cols * row + col.
    for pair_index, current_ax in enumerate(axes.flat):
        if pair_index >= num_pairs:
            # The grid has more cells than there are pairs to plot.
            current_ax.set_visible(False)
            continue

        i, j = index_pairs[pair_index]
        current_ax.set_title(f"{column_names[i]} vs {column_names[j]}",
                             fontsize=8)
        current_ax.set_xticks([])
        current_ax.set_yticks([])

        for k, (class_type, points) in enumerate(points_by_class.items()):
            xs = [point[i] for point in points]
            ys = [point[j] for point in points]
            current_ax.scatter(xs,
                               ys,
                               color=rgb_values[k],
                               s=10,
                               label=class_type)
        last_used_ax = current_ax

    # Every subplot uses the same classes, so a single legend is enough.
    if last_used_ax is not None:
        last_used_ax.legend(loc='lower right', prop={'size': 8})
    plt.show()

In [None]:
from typing import Dict
# Group the feature vectors by diagnosis: label 1 goes under 'M', anything
# else under 'B'.
points_by_diagnosis: Dict[str, List[Vector]] = defaultdict(list)
# X_cancer and y_cancer are filled in lockstep, so pair them directly instead
# of looking the label up by index.
for x, y in zip(X_cancer, y_cancer):
    label = 'M' if y == 1 else 'B'
    points_by_diagnosis[label].append(x)

In [None]:
from itertools import combinations

# Every unordered pair (i, j) with i < j among the first 10 feature indices.
start = 0
end = start + 10
# combinations() yields the pairs in the same order as the nested loops it
# replaces; the former `if i < j` filter was always true because j started
# at i + 1.
pairs = list(combinations(range(start, end), 2))
print(pairs)

In [None]:
# Scatter plot for every feature pair listed in `pairs`, coloured by diagnosis.
draw_scatter(points_by_diagnosis, columns, pairs)

## 3. 데이터 전처리

### 3.1 데이터 표준화 (Standardization)

In [None]:
from scratch.working_with_data import scale, rescale

def normalization(data: List[Vector],
                  means : Vector = None,
                  stdevs : Vector = None) -> Tuple[List[Vector], Vector, Vector]:
    """Standardize each column of ``data`` as ``(value - mean) / stdev``.

    Args:
        data: Rows to standardize. The input rows are not modified.
        means: Per-column means to subtract. When omitted, the statistics
            are obtained from ``scale(data)``.
        stdevs: Per-column standard deviations to divide by. When omitted,
            the statistics are obtained from ``scale(data)``.

    Returns:
        ``(rescaled, means, stdevs)``. Pass the returned statistics back in
        to apply the same transformation to another dataset. Columns whose
        stdev is not positive are left unchanged.
    """
    # Nothing to rescale; also avoids indexing data[0] on an empty list.
    if not data:
        return [], means, stdevs

    # Both statistics are needed together. If either one is missing, derive
    # both from the data rather than failing later on stdevs[i].
    if means is None or stdevs is None:
        means, stdevs = scale(data)

    dim = len(data[0])
    # Copy each row so the caller's data is left untouched.
    rescaled = [v[:] for v in data]

    for v in rescaled:
        for i in range(dim):
            # A constant column (stdev 0) would divide by zero; keep it as is.
            if stdevs[i] > 0:
                v[i] = (v[i] - means[i]) / stdevs[i]

    return rescaled, means, stdevs

## 4. 로지스틱 회귀

### 4.1 모델 훈련 

In [None]:
import random
import tqdm
import IPython.display as display
from scratch.linear_algebra import Vector, vector_mean, dot
from scratch.gradient_descent import gradient_step
from scratch.logistic_regression import logistic, negative_log_gradient
from scratch.logistic_regression import negative_log_likelihood

def logistic_regression(xs: List[Vector],
                        ys: List[float],
                        learning_rate: float = 0.001,
                        num_steps: int = 1000,
                        batch_size: int = 1) -> Vector:
    """Fit logistic-regression coefficients with mini-batch gradient descent.

    Args:
        xs: Input vectors; the coefficient vector gets one entry per element
            of ``xs[0]``.
        ys: Target value for each input vector.
        learning_rate: Step size for each gradient step.
        num_steps: Number of passes (epochs) over the data.
        batch_size: Number of consecutive samples per gradient step.

    Returns:
        The coefficient vector after the last epoch.

    The progress bar shows the loss of the final batch of each epoch.
    """
    # Start with a random guess
    beta = [random.random() for _ in range(len(xs[0]))]

    with tqdm.trange(num_steps) as t:
        for epoch in t:
            for start in range(0, len(xs), batch_size):
                batch_xs = xs[start:start+batch_size]
                batch_ys = ys[start:start+batch_size]

                gradient = negative_log_gradient(batch_xs, batch_ys, beta)
                # Step against the gradient to reduce the negative log likelihood.
                beta = gradient_step(beta, gradient, -learning_rate)

            # Only the final batch's loss was ever displayed, so compute it
            # once per epoch instead of after every batch. xs is non-empty
            # here (xs[0] was read above), so the batch variables are set.
            loss = negative_log_likelihood(batch_xs, batch_ys, beta)
            t.set_description(f"epoch {epoch} : loss - {loss:.3f}")

    return beta

### 4.2 모델 테스트

In [None]:
def test(inputs, labels, beta):
    """Build the confusion matrix of a fitted model on a labelled dataset.

    A sample is predicted positive when ``logistic(dot(beta, x))`` is at
    least 0.5, and it is actually positive when its label equals 1.

    Returns:
        ``[[TP, FP], [FN, TN]]`` as a nested list of counts.
    """
    true_pos = false_pos = false_neg = true_neg = 0

    for features, actual in zip(inputs, labels):
        predicted_positive = logistic(dot(beta, features)) >= 0.5

        if actual == 1:
            if predicted_positive:
                true_pos += 1       # positive sample, predicted positive
            else:
                false_neg += 1      # positive sample, predicted negative
        elif predicted_positive:
            false_pos += 1          # negative sample, predicted positive
        else:
            true_neg += 1           # negative sample, predicted negative

    return [[true_pos, false_pos], [false_neg, true_neg]]

## 5. 차원 축소 적용

### 5.1 차원 축소

In [None]:
from scratch.working_with_data import pca, transform
# Reduce the dataset to `num_components` dimensions: compute the components
# with pca() and map every sample through transform().
# NOTE(review): pca() receives the raw X_cancer here; no de-meaning or
# scaling happens in this cell. Confirm that matches what
# scratch.working_with_data.pca expects.
num_components = 2
components = pca(X_cancer, num_components)
X_cancer_dimension_reducted = transform(X_cancer, components)

In [None]:
# Display names for the reduced dimensions: "Dim 1", "Dim 2", ...
columns_dimension_reducted = [f"Dim {number}" for number in range(1, num_components + 1)]

### 5.2 차원 축소 후 특징 별 히스토그램

In [None]:
# One histogram per reduced dimension.
draw_histogram(X_cancer_dimension_reducted, columns_dimension_reducted)

### 5.3 차원 축소 후 특징 쌍 별 산포도

In [None]:
from typing import Dict
# Group the reduced vectors by diagnosis: label 1 goes under 'M', anything
# else under 'B'.
points_by_diagnosis_reducted: Dict[str, List[Vector]] = defaultdict(list)
# The reduced vectors keep the order of y_cancer, so pair them directly
# instead of looking the label up by index.
for x, y in zip(X_cancer_dimension_reducted, y_cancer):
    label = 'M' if y == 1 else 'B'
    points_by_diagnosis_reducted[label].append(x)

In [None]:
from itertools import combinations

# Every unordered pair (i, j) with i < j among the reduced dimensions.
start = 0
end = start + num_components
# combinations() yields the pairs in the same order as the nested loops it
# replaces; the former `if i < j` filter was always true because j started
# at i + 1.
reducted_pairs = list(combinations(range(start, end), 2))
print(reducted_pairs)

In [None]:
# Scatter plot for every pair of reduced dimensions, coloured by diagnosis.
draw_scatter(points_by_diagnosis_reducted, columns_dimension_reducted, reducted_pairs)

### 5.4 차원 축소 후 회귀 분석  (Q1)
차원 축소 후 회귀 분석을 하는 코드를 작성하시오.

In [None]:
import random
from scratch.machine_learning import train_test_split
from typing import Tuple

def logistic_regression_dimension_reduction(
                        xs: List[Vector],
                        ys: List[float],
                        num_components: int) -> Tuple[List[Vector], Vector, List[List]]:
    """Exercise template (Q1): logistic regression on dimension-reduced data.

    The numbered placeholders below outline the intended steps: reduce
    ``xs`` to ``num_components`` dimensions, split the data, standardize
    it, then fit and test a logistic-regression model.

    Returns:
        ``(xs_dimension_reducted, beta, confusion_matrix)``.

    NOTE: none of the three returned names is assigned in this body yet, so
    calling the function raises NameError until the placeholders are filled in.
    """
    # 1. Dimension reduction
    # your code

    # 2. Data split
    # The seed is fixed so the split is reproducible.
    random.seed(12)
    # your code

    # 3. Data standardization
    # your code

    # 4. Regression analysis and testing
    # your code
    
    return xs_dimension_reducted, beta, confusion_matrix

2차원으로 축소해 테스트했을 때 결과가 다음과 같이 나오는지 확인해 보세요.
* [[36, 0], [15, 92]]
* accuracy : 0.8951048951048951
* precision : 1.0
* recall : 0.7058823529411765
* f1_score : 0.8275862068965517

In [None]:
from scratch.machine_learning import accuracy, precision, recall, f1_score
# Train and test on the 2-dimensional reduction of the dataset.
num_components = 2
(X_cancer_dimension_reducted,
 beta,
 confusion_matrix) = logistic_regression_dimension_reduction(
    X_cancer, y_cancer, num_components)

# Performance analysis: print the confusion matrix, then each metric derived from it.
print(confusion_matrix)
(TP, FP), (FN, TN) = confusion_matrix
for metric_name, metric_fn in (("accuracy", accuracy),
                               ("precision", precision),
                               ("recall", recall),
                               ("f1_score", f1_score)):
    print(f"{metric_name} :", metric_fn(TP, FP, FN, TN))

## 6. 최적의 차원 찾기 (Q2)
1차원에서 15차원까지 각 차원 별로 성능을 확인하고 성능 그래프를 그려보시오.

In [None]:
from scratch.machine_learning import accuracy, precision, recall, f1_score

# Bounds for the component-count sweep of exercise Q2.
# NOTE(review): the task text above asks for 1 to 15 dimensions, while 31
# used as an exclusive upper bound would cover 1 to 30. Confirm which is intended.
start_num_components = 1
end_num_components = 31

# your code