# Logistic Regression : Single-variable

In [1]:
import numpy as np

In [26]:
x_data = np.array([2,4,6,8,10,12,14,16,18,20]).reshape(10,1)
t_data = np.array([0,0,0,0,0,0,1,1,1,1]).reshape(10,1)

print("X_data = ", x_data.shape, ", t_data.shape = ", t_data.shape)

X_data =  (10, 1) , t_data.shape =  (10, 1)


In [27]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [28]:
W = np.random.rand(1, 1)
b = np.random.rand(1)

print('W = ', W, ", W.shape = ", W.shape, ", b = ", b, ", b.shape = ", b.shape)

W =  [[0.25975025]] , W.shape =  (1, 1) , b =  [0.45997823] , b.shape =  (1,)


In [29]:
def sigmoid(x):
  return 1 / (1 + np.exp(-x))

In [30]:
def loss_func(x, t):
  delta = 1e-7

  z = np.dot(x, W) + b
  y = sigmoid(z)

  return -np.sum(t * np.log(y + delta) + (1 - t) * np.log((1 - y) + delta))

In [31]:
def numerical_derivative(f, x):
  delta_x = 1e-4
  grad = np.zeros_like(x)

  it = np.nditer(x, flags = ['multi_index'], op_flags = ['readwrite'])

  while not it.finished:
    idx = it.multi_index
    tmp_val = x[idx]
    x[idx] = float(tmp_val) + delta_x
    fx1 = f(x)

    x[idx] = tmp_val - delta_x
    fx2 = f(x)
    grad[idx] = (fx1 - fx2) / (2 * delta_x)

    x[idx] = tmp_val
    it.iternext()
  return grad

In [32]:
def error_val(x, t):
  delta = 1e-7

  z = np.dot(x, W) + b
  y = sigmoid(z)  
  return -np.sum(t * np.log(y + delta) + (1 - t) * np.log((1 - y) + delta))

def predict(x):
  z = np.dot(x, W) + b
  y = sigmoid(z)  

  if y >= 0.5:
    result = 1
  else:
    result = 0
  return y, result

In [75]:
learning_rate = 1e-5
f = lambda x : loss_func(x_data, t_data)

print('Initial error value = ', error_val(x_data, t_data))
print('Initial W = ', W, ', b = ', b)

for step in range(50001):
  W -= learning_rate * numerical_derivative(f, W)
  b -= learning_rate * numerical_derivative(f, b)

  if (step % 400 == 0):
    print("step = ", step, "error value = ", error_val(x_data, t_data), "W = ", W, ", b = ", b)

Initial error value =  4.21995990113905
Initial W =  [[ 0.21518494]
 [-0.13663429]] , b =  [-0.81417466]
step =  0 error value =  4.219950279024985 W =  [[ 0.2151854 ]
 [-0.13663374]] , b =  [-0.81418444]
step =  400 error value =  4.216104935013846 W =  [[ 0.21537034]
 [-0.13641407]] , b =  [-0.81809586]
step =  800 error value =  4.212266566236059 W =  [[ 0.21555519]
 [-0.13619471]] , b =  [-0.82200372]
step =  1200 error value =  4.208435156938768 W =  [[ 0.21573993]
 [-0.13597566]] , b =  [-0.82590805]
step =  1600 error value =  4.2046106914006485 W =  [[ 0.21592458]
 [-0.13575692]] , b =  [-0.82980884]
step =  2000 error value =  4.20079315393191 W =  [[ 0.21610913]
 [-0.13553849]] , b =  [-0.8337061]
step =  2400 error value =  4.196982528874318 W =  [[ 0.21629358]
 [-0.13532037]] , b =  [-0.83759983]
step =  2800 error value =  4.1931788006012 W =  [[ 0.21647794]
 [-0.13510255]] , b =  [-0.84149004]
step =  3200 error value =  4.18938195351745 W =  [[ 0.21666219]
 [-0.13488505]

In [58]:
(real_val, logical_val) = predict(2)
print(real_val, logical_val)

ValueError: ignored

# Logistic Regression : Multi-variable

In [59]:
x_data = np.array([[2,4],[4,11],[6,6],[8,5],[10,7],[12,16],[14,8],[16,3],[18,7]])
t_data = np.array([0,0,0,0,0,1,1,1,1]).reshape(9,1)

print("X_data = ", x_data.shape, ", t_data.shape = ", t_data.shape)

X_data =  (9, 2) , t_data.shape =  (9, 1)


In [60]:
W = np.random.rand(2, 1)
b = np.random.rand(1)

print('W = ', W, ", W.shape = ", W.shape, ", b = ", b, ", b.shape = ", b.shape)

W =  [[0.02053117]
 [0.33184857]] , W.shape =  (2, 1) , b =  [0.31552567] , b.shape =  (1,)


In [62]:
test_data = np.array([3, 11])
predict(test_data)

(array([0.98249233]), 1)

>> ### 실습. 아래의 망막 데이터의 질병 유무를 분류하라.

In [71]:
cd /content/drive/MyDrive/Colab Notebooks/2021_2학기_데이터분석과머신러닝/LinearRegression_LogisticRegression

/content/drive/MyDrive/Colab Notebooks/2021_2학기_데이터분석과머신러닝/LinearRegression_LogisticRegression


In [72]:
import pandas as pd

In [73]:
train_df = pd.read_csv('RFMiD_Train_SET.csv')
print(train_df.columns)
train_df.head()

Index(['ID', 'Disease_Risk', 'DR', 'ARMD', 'MH', 'DN', 'MYA', 'BRVO', 'TSLN',
       'ERM', 'LS', 'MS', 'CSR', 'ODC', 'CRVO', 'TV', 'AH', 'ODP', 'ODE', 'ST',
       'AION', 'PT', 'RT', 'RS', 'CRS', 'EDN', 'RPEC', 'MHL', 'RP', 'CWS',
       'CB', 'ODPM', 'PRH', 'MNF', 'HR', 'CRAO', 'TD', 'CME', 'PTCR', 'CF',
       'VH', 'MCA', 'VS', 'BRAO', 'PLQ', 'HPED', 'CL'],
      dtype='object')


Unnamed: 0,ID,Disease_Risk,DR,ARMD,MH,DN,MYA,BRVO,TSLN,ERM,LS,MS,CSR,ODC,CRVO,TV,AH,ODP,ODE,ST,AION,PT,RT,RS,CRS,EDN,RPEC,MHL,RP,CWS,CB,ODPM,PRH,MNF,HR,CRAO,TD,CME,PTCR,CF,VH,MCA,VS,BRAO,PLQ,HPED,CL
0,1,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,3,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,4,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,5,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [74]:
train_x_data = train_df.iloc[:, 2:].values
train_t_data = train_df.iloc[:, [1]].values
print("x_data.ndim = ", train_x_data.ndim, ", x_data.shape = ", train_x_data.shape)
print("t_data.ndim = ", train_t_data.ndim, ", t_data.shape = ", train_t_data.shape)

x_data.ndim =  2 , x_data.shape =  (599, 45)
t_data.ndim =  2 , t_data.shape =  (599, 1)


In [78]:
W = np.random.rand(45, 1)
b = np.random.rand(1)

print('W = ', W, ", W.shape = ", W.shape, ", b = ", b, ", b.shape = ", b.shape)

W =  [[0.61016245]
 [0.02653432]
 [0.61455895]
 [0.49900152]
 [0.47295468]
 [0.20047102]
 [0.49943286]
 [0.44074099]
 [0.22267961]
 [0.21856224]
 [0.58748785]
 [0.79071085]
 [0.410579  ]
 [0.81762721]
 [0.16686084]
 [0.19969229]
 [0.38198562]
 [0.29905177]
 [0.75813789]
 [0.96133428]
 [0.84202619]
 [0.60755579]
 [0.55758183]
 [0.60285672]
 [0.99369823]
 [0.18110793]
 [0.157464  ]
 [0.56351227]
 [0.46589928]
 [0.57141424]
 [0.00649595]
 [0.91206188]
 [0.81047039]
 [0.48261881]
 [0.54305401]
 [0.55621432]
 [0.95063778]
 [0.32894386]
 [0.65576351]
 [0.13414942]
 [0.86690774]
 [0.20472077]
 [0.58457699]
 [0.243757  ]
 [0.29759104]] , W.shape =  (45, 1) , b =  [0.07344975] , b.shape =  (1,)


In [83]:
learning_rate = 1e-2
f = lambda x : loss_func(train_x_data, train_t_data)

print('Initial error value = ', error_val(train_x_data, train_t_data))
print('Initial W = ', W, ', b = ', b)

for step in range(10001):
  W -= learning_rate * numerical_derivative(f, W)
  b -= learning_rate * numerical_derivative(f, b)

  if (step % 400 == 0):
    print("step = ", step, "error value = ", error_val(train_x_data, train_t_data))

Initial error value =  204.23949786731302
Initial W =  [[1.16419764]
 [0.26596418]
 [1.05086259]
 [0.7486648 ]
 [0.64161828]
 [0.33033008]
 [0.72323104]
 [0.48222853]
 [0.32061719]
 [0.25798517]
 [0.65402344]
 [1.01551465]
 [0.46960249]
 [0.82522304]
 [0.1881699 ]
 [0.32158652]
 [0.51247671]
 [0.31256008]
 [0.78678638]
 [0.96706397]
 [0.86310189]
 [0.69564033]
 [0.61575075]
 [0.62136996]
 [1.02015473]
 [0.20423692]
 [0.16654183]
 [0.5811055 ]
 [0.46589928]
 [0.58202102]
 [0.0164081 ]
 [0.91206188]
 [0.81047039]
 [0.49685295]
 [0.55421543]
 [0.56296984]
 [0.95559311]
 [0.33751945]
 [0.66131682]
 [0.13414942]
 [0.88170875]
 [0.21900839]
 [0.59018703]
 [0.243757  ]
 [0.29759104]] , b =  [0.74244104]
step =  0 error value =  198.7401104003527
step =  400 error value =  31.101066308684285
step =  800 error value =  17.006788531488
step =  1200 error value =  11.537889725411318
step =  1600 error value =  8.682877690993513
step =  2000 error value =  6.943263588017326
step =  2400 error valu

In [84]:
test_data = np.array([1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0])
predict(test_data)

(array([0.99999999]), 1)

>> ### 실습. RFMiD_Test_SET.csv에 있는 데이터를 Pandas로 로드한 후, 위 모델에 검증 해보기.

1) 정확도 구하기 : RFMiD_Test_SET.csv에서 정답 값들과 예측을 통해 나온 분류 결과를 비교해서 구할것.   
(예측값 == 실제값) good_count++   
good_count / len(실제값)   

In [85]:
test_df = pd.read_csv('RFMiD_Test_SET.csv')
print(test_df.columns)
test_df.head()

Index(['ID', 'Disease_Risk', 'DR', 'ARMD', 'MH', 'DN', 'MYA', 'BRVO', 'TSLN',
       'ERM', 'LS', 'MS', 'CSR', 'ODC', 'CRVO', 'TV', 'AH', 'ODP', 'ODE', 'ST',
       'AION', 'PT', 'RT', 'RS', 'CRS', 'EDN', 'RPEC', 'MHL', 'RP', 'CWS',
       'CB', 'ODPM', 'PRH', 'MNF', 'HR', 'CRAO', 'TD', 'CME', 'PTCR', 'CF',
       'VH', 'MCA', 'VS', 'BRAO', 'PLQ', 'HPED', 'CL'],
      dtype='object')


Unnamed: 0,ID,Disease_Risk,DR,ARMD,MH,DN,MYA,BRVO,TSLN,ERM,LS,MS,CSR,ODC,CRVO,TV,AH,ODP,ODE,ST,AION,PT,RT,RS,CRS,EDN,RPEC,MHL,RP,CWS,CB,ODPM,PRH,MNF,HR,CRAO,TD,CME,PTCR,CF,VH,MCA,VS,BRAO,PLQ,HPED,CL
0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,4,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,5,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [90]:
test_x_data = train_df.iloc[:, 2:].values
test_t_data = train_df.iloc[:, [1]].values
print("x_data.ndim = ", test_x_data.ndim, ", x_data.shape = ", test_x_data.shape)
print("t_data.ndim = ", test_t_data.ndim, ", t_data.shape = ", test_t_data.shape)

x_data.ndim =  2 , x_data.shape =  (599, 45)
t_data.ndim =  2 , t_data.shape =  (599, 1)


In [93]:
good_count = 0
for idx, vector_data in enumerate(test_x_data):
  y, result = predict(vector_data)
  if result == test_t_data[idx]:
    good_count += 1

good_count / len(test_t_data)

1.0