# Tobig's 14기 2주차 Optimization 과제
### Made by 이지용

# Gradient Descent 구현하기

### 1) "..." 표시되어 있는 빈 칸을 채워주세요  
### 2) 강의내용과 코드에 대해 공부한 내용을 적어서 과제를 채워주세요

In [1]:
import pandas as pd
import numpy as np
import random

In [2]:
# Load the assignment dataset from the working directory and preview its
# first rows.
data = pd.read_csv('assignment_2.csv')
data.head()

Unnamed: 0,Label,bias,experience,salary
0,1,1,0.7,48000
1,0,1,1.9,48000
2,1,1,2.5,60000
3,0,1,4.2,63000
4,0,1,6.0,76000


## Train Test 데이터 나누기
### 데이터셋을 train/test로 나눠주는 메소드  
https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html

In [3]:
from sklearn.model_selection import train_test_split

In [4]:
# Use the first column as the target and every remaining column as features.
# 25% of the rows are held out for testing; random_state is fixed so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(data.iloc[:, 1:], data.iloc[:, 0], test_size=0.25, random_state = 0)

In [5]:
# Display the shapes of the four splits as a quick sanity check.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((150, 3), (50, 3), (150,), (50,))

## Scaling  

experience와 salary의 단위, 평균, 분산이 크게 차이나므로 scaler를 사용해 단위를 맞춰줍니다. 

In [6]:
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training features and standardize them.
scaler = StandardScaler()
# Save the "bias" column before scaling so it can be restored afterwards;
# reset_index gives it a fresh 0..n-1 index that lines up with the new
# DataFrame built from the scaler output below.
bias_train = X_train["bias"]
bias_train = bias_train.reset_index()["bias"]
# fit_transform is applied to every column (including "bias"); the result is
# wrapped back into a DataFrame with the original column names.
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns = X_train.columns)
# Put the unscaled bias values back in place of the scaled ones.
X_train["bias"] = bias_train
X_train.head()

Unnamed: 0,bias,experience,salary
0,1,0.187893,-1.143335
1,1,1.185555,0.043974
2,1,-0.310938,-0.351795
3,1,-1.629277,-1.34122
4,1,-1.3086,0.043974


이때 scaler는 X_train에만 fit하고, 그렇게 fit한 scaler를 X_test에는 transform으로만 적용합니다.  
X_test에 다시 fit하면 안 됩니다! (test 데이터의 평균·분산이 전처리에 반영되어 평가가 왜곡되기 때문입니다.)

In [7]:
# Save the "bias" column of the test split with a fresh 0..n-1 index so it
# can be restored after scaling.
bias_test = X_test["bias"]
bias_test = bias_test.reset_index()["bias"]
# Only transform here: the scaler was already fitted on the training split.
X_test = pd.DataFrame(scaler.transform(X_test), columns = X_test.columns)
# Put the unscaled bias values back in place of the scaled ones.
X_test["bias"] = bias_test
X_test.head()

Unnamed: 0,bias,experience,salary
0,1,-1.344231,-0.615642
1,1,0.50857,0.307821
2,1,-0.310938,0.571667
3,1,1.363709,1.956862
4,1,-0.987923,-0.747565


In [8]:
# Number of parameters: one per entry of the X_train row labelled 0,
# i.e. one per feature column.
N = len(X_train.loc[0])

In [9]:
# Initialise the N parameters with random values drawn from [0, 1).
parameters = np.array([random.random() for i in range(N)])
parameters

array([0.34220622, 0.50740111, 0.37408698])

### * LaTeX   

Jupyter Notebook은 LaTeX 문법으로 수식 입력을 지원하고 있습니다.  
http://triki.net/apps/3466  
https://jjycjnmath.tistory.com/117

## Logistic Function

## $p = \frac{1}{1+e^{-X\theta}}$

In [10]:
def logistic(X, parameters):
    """Return the logistic (sigmoid) probability for a single sample.

    Computes ``p = 1 / (1 + exp(-z))`` where ``z`` is the dot product of
    ``parameters`` with the first ``len(parameters)`` entries of ``X``.

    Args:
        X: Feature values of one sample (sequence, NumPy array or pandas
            Series), ordered like ``parameters``.
        parameters: Model coefficients.

    Returns:
        The predicted probability as a NumPy float.
    """
    theta = np.asarray(parameters, dtype=float)
    # Convert to a plain array so values are read strictly by position.
    # Indexing a label-indexed pandas Series with integers relies on a
    # deprecated positional fallback.
    features = np.asarray(X, dtype=float)[: len(theta)]
    z = np.dot(theta, features)
    return 1 / (1 + np.exp(-z))

In [11]:
# Try the logistic function on the second training row with the initial
# parameters.
logistic(X_train.iloc[1], parameters)

0.7231635494151939

## Objective Function

Objective Function : 목적함수는 Gradient Descent를 통해 최적화하고자 하는 함수입니다.  
로지스틱 회귀의 목적함수를 작성해주세요
## $l(p) = -\sum_i \{y_i \log p_i + (1-y_i)\log(1-p_i)\}$

In [12]:
def cross_entropy_i(X, y, parameters):
    """Return the log-likelihood contribution of a single sample.

    The value is ``y*log(p) + (1-y)*log(1-p)`` and is NOT negated here;
    ``cross_entropy`` negates the accumulated sum to obtain the loss.

    Args:
        X: Feature values of one sample.
        y: Label of the sample (0 or 1).
        parameters: Model coefficients passed on to ``logistic``.

    Returns:
        The (non-positive) log-likelihood of the sample.
    """
    # Keep p strictly inside (0, 1): a saturated probability of exactly 0 or
    # 1 would otherwise produce log(0) = -inf and 0 * -inf = NaN.
    eps = 1e-15
    p = np.clip(logistic(X, parameters), eps, 1 - eps)
    return y * np.log(p) + (1 - y) * np.log(1 - p)

In [13]:
def cross_entropy(X_set, y_set, parameters):
    """Return the total cross-entropy loss over a data set.

    Args:
        X_set: Feature table; rows are read positionally with ``.iloc``.
        y_set: Labels, read positionally with ``.iloc``.
        parameters: Model coefficients.

    Returns:
        The negated sum of the per-sample values from ``cross_entropy_i``.
    """
    log_likelihood = 0
    for row in range(X_set.shape[0]):
        log_likelihood = log_likelihood + cross_entropy_i(
            X_set.iloc[row, :], y_set.iloc[row], parameters
        )
    # The per-sample terms are log-likelihoods, so flip the sign once here.
    return -log_likelihood

In [14]:
# Loss on the test split with the initial (untrained) parameters.
cross_entropy(X_test, y_test, parameters)

49.66376517324346

## Gradient of Cross Entropy

## $\frac{\partial}{\partial \theta_j} l(p) = -\sum_i (y_i - p_i)\, x_{ij}$

In [15]:
# Partial derivative of the cross-entropy with respect to theta_j for one sample.
def get_gradient_ij_cross_entropy(X, y, parameters, j):
    """Return ``-(y - p) * x_j`` for a single sample.

    Args:
        X: Feature values of one sample.
        y: Label of the sample (0 or 1).
        parameters: Model coefficients passed on to ``logistic``.
        j: Position of the parameter / feature to differentiate against.

    Returns:
        The gradient contribution of this sample for ``theta_j``.
    """
    p = logistic(X, parameters)
    # Read the j-th feature by position. x_j is a scalar, so no summation is
    # needed here; summing over samples is done by the callers.
    x_j = np.asarray(X)[j]
    return -((y - p) * x_j)

In [16]:
# Gradient contribution of the first training row for the parameter at
# position 1.
get_gradient_ij_cross_entropy(X_train.iloc[0, :], y_train.iloc[0], parameters, 1)

-0.09348446515431369

## Batch Gradient Descent  

Batch Gradient Descent : 학습 한 번에 모든 데이터셋에 대해 기울기를 구한다.

In [17]:
def get_gradients_bgd(X_train, y_train, parameters):
    """Return the batch gradient: per-parameter sums over every row.

    Args:
        X_train: Feature table; rows are read positionally with ``.iloc``.
        y_train: Labels, read positionally with ``.iloc``.
        parameters: Current model coefficients.

    Returns:
        A list with one accumulated gradient per parameter.
    """
    totals = [0] * len(parameters)

    # Batch mode uses the whole data set, so walk every row once.
    for row in range(len(X_train)):
        features = X_train.iloc[row, :]
        label = y_train.iloc[row]
        totals = [
            acc + get_gradient_ij_cross_entropy(features, label, parameters, j)
            for j, acc in enumerate(totals)
        ]

    return totals

In [18]:
# Batch gradients of the training split at the initial parameters.
gradients_bgd = get_gradients_bgd(X_train, y_train, parameters)
gradients_bgd

[43.74985691648572, 7.7011055539137185, 38.315262419862115]

## Stochastic Gradient Descent  

Stochastic Gradient Descent : 학습 한 번에 임의의 데이터에 대해서만 기울기를 구한다.

In [19]:
def get_gradients_sgd(X_train, y, parameters):
    """Return the gradient computed from one randomly chosen row.

    Args:
        X_train: Feature table; the row is read positionally with ``.iloc``.
        y: Labels aligned positionally with ``X_train``.
        parameters: Current model coefficients.

    Returns:
        A list with one gradient per parameter for the sampled row.
    """
    # Scale a uniform draw from [0, 1) by the row count to pick one row
    # position in 0..n-1.
    r = int(random.random() * len(X_train))
    features = X_train.iloc[r, :]
    # Take the label from the ``y`` argument, not from a global ``y_train``.
    label = y.iloc[r]

    return [
        get_gradient_ij_cross_entropy(features, label, parameters, j)
        for j in range(len(parameters))
    ]

In [20]:
# Stochastic gradients from one randomly chosen training row at the initial
# parameters.
gradients_sgd = get_gradients_sgd(X_train, y_train, parameters)
gradients_sgd

[-0.14562106039248757, -0.23490492984713987, -0.23693326914776291]

## Update Parameters  

In [21]:
def update_parameters(parameters, gradients, learning_rate):
    """Return the parameters after one gradient-descent step.

    Computes ``parameters - learning_rate * gradients`` element-wise and
    returns it as a new array; neither input is modified.

    Args:
        parameters: Current model coefficients.
        gradients: One gradient per parameter (list or array).
        learning_rate: Step size applied to every gradient.

    Returns:
        A new NumPy array holding the updated parameters.
    """
    step = learning_rate * np.asarray(gradients, dtype=float)
    return np.asarray(parameters, dtype=float) - step

In [22]:
# Apply one update step using the batch gradients and a learning rate of 0.01.
update_parameters(parameters, gradients_bgd, 0.01)

array([-0.09529235,  0.43039005, -0.00906564])

## Gradient Descent  

위에서 작성한 함수들을 조합해서 Gradient Descent를 진행하는 함수를 완성해주세요

learning_rate = 학습률  
max_iter = 최대 반복 횟수  
tolerance = step 이동이 무의미할 시 중단 조건

In [23]:
def gradient_descent(X_train, y_train, learning_rate=0.01, max_iter=100000, tolerance=0.0001, optimizer="bgd") :
    """Fit logistic-regression parameters by gradient descent.

    Parameters start from random values in [0, 1). Every ``point`` iterations
    (100 for "bgd", 10000 for "sgd") the training loss is computed and
    printed, and training stops once the loss changed by less than
    ``tolerance * len(y_train)`` since the previous check.

    Args:
        X_train: Feature table; rows are read positionally with ``.iloc``.
        y_train: Labels aligned positionally with ``X_train``.
        learning_rate: Step size of each update.
        max_iter: Iteration bound; the loop runs while ``count < max_iter``
            with ``count`` starting at 1, so at most ``max_iter - 1`` updates
            are made.
        tolerance: Per-sample loss change below which training stops.
        optimizer: "bgd" (batch) or "sgd" (stochastic).

    Returns:
        The fitted parameters as a NumPy array.

    Raises:
        ValueError: If ``optimizer`` is neither "bgd" nor "sgd".
    """
    # Reject unknown optimizers up front; otherwise no gradient would ever be
    # computed and the random initial parameters would be returned silently.
    if optimizer not in ("bgd", "sgd"):
        raise ValueError(f"optimizer must be 'bgd' or 'sgd', got {optimizer!r}")

    count = 1
    # Interval (in iterations) at which the stopping condition is checked.
    point = 100 if optimizer == "bgd" else 10000
    N = len(X_train.iloc[0])
    parameters = np.array([random.random() for i in range(N)])
    loss = 0

    while count < max_iter :

        if optimizer == "bgd" :
            gradients = get_gradients_bgd(X_train, y_train, parameters)
        else :
            gradients = get_gradients_sgd(X_train, y_train, parameters)

        # Periodic loss report and convergence check (done before the update).
        if count%point == 0 :
            new_loss = cross_entropy(X_train, y_train, parameters)
            print(count, "loss: ",new_loss, "params: ", parameters, "gradients: ", gradients)

            # Stop when further steps no longer change the loss meaningfully.
            if abs(new_loss-loss) < tolerance*len(y_train) :
                break
            loss = new_loss

        parameters = update_parameters(parameters, gradients, learning_rate)
        count += 1
    return parameters

In [24]:
# Train with the default settings (batch gradient descent) and show the
# fitted parameters.
new_param_bgd = gradient_descent(X_train, y_train)
new_param_bgd

100 loss:  45.39258159510335 params:  [-1.62241866  3.46955178 -3.29616911] gradients:  [0.2791665642067376, -0.9187796994117925, 0.8589098074147352]
200 loss:  44.80125103070706 params:  [-1.78150028  3.98957335 -3.77969874] gradients:  [0.08387083233186031, -0.2725490886178115, 0.2520653554990538]
300 loss:  44.74034959814228 params:  [-1.83391516  4.15963553 -3.93671985] gradients:  [0.030178713586985317, -0.09776525432508379, 0.09011721826699808]
400 loss:  44.732141048421326 params:  [-1.85329846  4.22239531 -3.99453556] gradients:  [0.011469097203333514, -0.037115131055586593, 0.03417055280863808]


array([-1.85329846,  4.22239531, -3.99453556])

## Hyper Parameter Tuning

Hyper Parameter들을 매번 다르게 해서 학습을 진행해 보세요. 다른 점들을 발견할 수 있습니다.

In [25]:
# Train again with the same hyperparameter values written out explicitly,
# this time using stochastic gradient descent.
new_param_sgd = gradient_descent(X_train, y_train, learning_rate=0.01, max_iter=100000, tolerance=0.0001, optimizer="sgd")
new_param_sgd

10000 loss:  46.56654133021974 params:  [-1.49944534  3.01488961 -2.84906667] gradients:  [0.00803516231230049, 0.009526128499131264, 0.019433788727757133]
20000 loss:  45.20372482617679 params:  [-1.60820482  3.64660253 -3.56619069] gradients:  [0.10853902594093265, -0.08015688901885991, -0.06682117205303673]
30000 loss:  44.84263390766603 params:  [-1.7211349   3.9869904  -3.82098193] gradients:  [-0.6412789397974888, -0.7374223084380066, -0.5780973413372541]
40000 loss:  44.85737417967695 params:  [-1.73579805  4.10272864 -3.79731188] gradients:  [0.16888327751131466, -0.17286113266207195, -0.19309015456256684]


array([-1.73579805,  4.10272864, -3.79731188])

## Predict Label

In [26]:
# Classify every test row with the BGD-fitted parameters: label 1 when the
# predicted probability exceeds 0.5, otherwise 0.
y_predict = []
for i in range(len(y_test)):
    p = logistic(X_test.iloc[i, :], new_param_bgd)
    y_predict.append(1 if p > 0.5 else 0)

## Confusion Matrix

In [27]:
# Import only the metrics this notebook uses instead of a wildcard import,
# so the names in scope are explicit.
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
# Unpack the 2x2 confusion matrix into its four counts, then display it.
tn, fp, fn, tp = confusion_matrix(y_test, y_predict).ravel()
confusion_matrix(y_test, y_predict)

array([[38,  2],
       [ 1,  9]], dtype=int64)

In [28]:
# Accuracy of the predictions on the test split.
accuracy_score(y_test, y_predict)

0.94

In [29]:
# F1 score of the predictions on the test split.
f1_score(y_test, y_predict)

0.8571428571428572

정확도는 0.94, f1 score는 0.857로, TN의 비중이 높아 imbalance로 인해 f1 score가 조금 낮게 나오긴 했으나, 전반적으로 잘 예측했다고 평가할 수 있음.