## Урок 3. Логистическая регрессия. Log Loss
___

In [1]:
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Design matrix: 10 samples x 4 columns, stored as float64.
# Column 0 is 1 in every row (presumably the intercept term — TODO confirm);
# the meaning of the remaining three feature columns is not stated here.
X = np.array([ [   1,    1,  500,    1],
               [   1,    1,  700,    1],
               [   1,    2,  750,    2],
               [   1,    5,  600,    1],
               [   1,    3, 1450,    2],
               [   1,    0,  800,    1],
               [   1,    5, 1500,    3],
               [   1,   10, 2000,    3],
               [   1,    1,  450,    1],
               [   1,    2, 1000,    2]], dtype = np.float64)

# Binary class labels (0 or 1), one per row of X.
y = np.array([0, 0, 1, 0, 1, 0, 1, 0, 1, 1])

In [3]:
def standard_scale(X):
    """Standardize each column of ``X`` to zero mean and unit variance.

    Parameters
    ----------
    X : array-like
        Input values; statistics are computed along axis 0 (per column).

    Returns
    -------
    numpy.ndarray
        ``(X - mean) / std`` computed column-wise. Columns whose standard
        deviation is exactly 0 (constant columns) are only centered, so they
        come out as zeros instead of NaN/inf from a division by zero.
    """
    X = np.asarray(X, dtype=np.float64)
    mean = X.mean(axis=0)
    std = X.std(axis=0)
    # A constant column has std == 0; dividing by 1 instead keeps it finite.
    safe_std = np.where(std == 0, 1.0, std)
    return (X - mean) / safe_std

In [4]:
# Work on a float64 copy so the raw matrix X is left untouched.
X_st = X.copy().astype(np.float64)
# Standardize columns 1..3 only; column 0 (all ones in X) is kept as is.
X_st[:, 1:4] = standard_scale(X_st[:, 1:4])
# Bare last expression: display the scaled matrix.
X_st

array([[ 1.        , -0.70710678, -0.97958969, -0.89625816],
       [ 1.        , -0.70710678, -0.56713087, -0.89625816],
       [ 1.        , -0.35355339, -0.46401617,  0.38411064],
       [ 1.        ,  0.70710678, -0.77336028, -0.89625816],
       [ 1.        ,  0.        ,  0.97958969,  0.38411064],
       [ 1.        , -1.06066017, -0.36090146, -0.89625816],
       [ 1.        ,  0.70710678,  1.08270439,  1.66447944],
       [ 1.        ,  2.47487373,  2.11385144,  1.66447944],
       [ 1.        , -0.70710678, -1.08270439, -0.89625816],
       [ 1.        , -0.35355339,  0.05155735,  0.38411064]])

### 1. Измените функцию calc_logloss так, чтобы нули по возможности не попадали в np.log.
___

In [5]:
def calc_logloss(y, y_pred, eps=1e-15):
    """Mean binary cross-entropy (log loss) that never evaluates ``log(0)``.

    Predicted probabilities are clipped to ``[eps, 1 - eps]`` before the
    logarithm is taken. Unlike special-casing ``y_pred == 0`` / ``y_pred == 1``
    and dropping the offending term, clipping keeps the penalty for confidently
    wrong predictions (e.g. ``y = 1`` with ``y_pred = 0``) large and finite
    instead of silently counting them as zero loss.

    Parameters
    ----------
    y : array-like
        True labels (0 or 1).
    y_pred : array-like
        Predicted probabilities of class 1, same length as ``y``.
    eps : float, optional
        Clipping margin that keeps probabilities away from 0 and 1.

    Returns
    -------
    numpy.float64
        Average log loss over all samples.

    Raises
    ------
    ValueError
        If ``y`` is empty (the mean would be undefined).
    """
    y = np.asarray(y, dtype=np.float64)
    if y.size == 0:
        raise ValueError("calc_logloss requires at least one sample")
    # Clip so that both log(p) and log(1 - p) stay finite.
    p = np.clip(np.asarray(y_pred, dtype=np.float64), eps, 1.0 - eps)
    return -np.mean(y * np.log(p) + (1.0 - y) * np.log(1.0 - p))

In [6]:
# Edge case: the second prediction is exactly 0 for a sample labelled 0.
y1 = np.array([1, 0])
y_pred1 = np.array([0.9, 0])
calc_logloss(y1, y_pred1)

0.05268025782891314

In [7]:
# Edge case: the first prediction is exactly 1 for a sample labelled 1.
y1 = np.array([1, 0])
y_pred1 = np.array([1, 0.1])
calc_logloss(y1, y_pred1)

0.05268025782891314

In [8]:
# Regular case: both predictions lie strictly inside (0, 1).
y1 = np.array([1, 0])
y_pred1 = np.array([0.9, 0.1])
calc_logloss(y1, y_pred1)

0.10536051565782628

### 2. Подберите аргументы функции eval_model для логистической регрессии таким образом, чтобы log loss был минимальным.
___

In [9]:
def sigmoid(z):
    """Logistic function ``1 / (1 + exp(-z))``, evaluated without overflow.

    ``exp`` is only ever applied to ``-|z|``, which lies in ``(-inf, 0]``, so
    the intermediate value is always in ``[0, 1]`` and large-magnitude inputs
    cannot overflow (the naive form overflows in ``exp(-z)`` for very
    negative ``z``).

    Parameters
    ----------
    z : scalar or array-like
        Input value(s).

    Returns
    -------
    numpy scalar or numpy.ndarray
        Element-wise sigmoid of ``z``; a scalar input gives a scalar result.
    """
    z = np.asarray(z)
    e = np.exp(-np.abs(z))
    # z >= 0: 1 / (1 + exp(-z));  z < 0: exp(z) / (1 + exp(z)) — same function.
    res = np.where(z >= 0, 1.0 / (1.0 + e), e / (1.0 + e))
    # Indexing with () turns a 0-d array back into a scalar; arrays pass through.
    return res[()]

In [10]:
def eval_model(X, y, iterations, eta=1e-4):
    """Fit logistic-regression weights with batch gradient descent.

    Weights are initialised from ``np.random.randn`` after resetting the
    global NumPy seed to 42, so repeated calls start from the same point.
    Progress is printed roughly ten times over the run.

    Parameters
    ----------
    X : numpy.ndarray
        Feature matrix; rows are samples, columns are features.
    y : numpy.ndarray
        Labels (0 or 1), one per row of ``X``.
    iterations : int
        Number of gradient-descent steps; 0 returns the initial weights.
    eta : float, optional
        Learning rate.

    Returns
    -------
    tuple
        ``(W, final_error)`` where ``final_error`` is the log loss evaluated
        with the returned weights ``W``.
    """
    np.random.seed(42)
    W = np.random.randn(X.shape[1])
    n = X.shape[0]
    # Integer logging period (ceil(iterations / 10), at least 1) instead of a
    # float modulus, so the cadence is regular for any iteration count.
    log_every = max(1, (iterations + 9) // 10)

    for i in range(iterations):
        y_pred = sigmoid(np.dot(X, W))
        should_log = i % log_every == 0
        if should_log:
            # The loss is only needed for the progress line; skip it otherwise.
            err = calc_logloss(y, y_pred)

        dQ = 1/n * X.T @ (y_pred - y)
        W -= eta * dQ
        if should_log:
            # Printed pair: weights after this step, loss measured before it.
            print(i, W, err)

    # Re-evaluate with the final weights: the loop's y_pred predates the last
    # update, and does not exist at all when iterations == 0.
    final_error = calc_logloss(y, sigmoid(np.dot(X, W)))
    return W, final_error

In [11]:
# Train on the standardized features for 10000 steps with learning rate 500;
# unpack the learned weights and the loss reported by eval_model.
W, error = eval_model(X_st, y, iterations=10000, eta=500)

0 [ -17.25286897 -135.54802183 -116.06566704  -60.71685833] 0.760958797591889
1000 [  39.48816055 -385.13208099 -216.94352637  598.62406372] 7.855242570540186e-06
2000 [  39.37104944 -384.56058391 -217.84492807  598.72902551] 5.480659849428925e-06
3000 [  39.28138516 -384.17601504 -218.44076795  598.80938785] 4.435345127916754e-06
4000 [  39.20670362 -383.88915509 -218.87962865  598.87632179] 3.8619676650579484e-06
5000 [  39.14190225 -383.66220785 -219.22438527  598.93440055] 3.504650407743363e-06
6000 [  39.08426669 -383.47554462 -219.50731543  598.98605689] 3.262189382592807e-06
7000 [  39.03211265 -383.31767179 -219.74699157  599.03280038] 3.0872719010359613e-06
8000 [  38.98429631 -383.18130022 -219.95496407  599.07565616] 2.9551125819269544e-06
9000 [  38.93999676 -383.06153359 -220.13884295  599.11535999] 2.8515926497519817e-06


### 3. Создайте функцию calc_pred_proba, возвращающую предсказанную вероятность класса 1 (на вход подаются W, который уже посчитан функцией eval_model, и X; на выходе — массив y_pred_proba).
___

In [12]:
def calc_pred_proba(W, X):
    """Return the predicted probability of class 1 for every row of ``X``.

    The linear scores ``np.dot(X, W)`` are passed through ``sigmoid``.
    """
    return sigmoid(np.dot(X, W))

In [13]:
# Class-1 probabilities for the training samples under the learned weights.
calc_pred_proba(W, X_st)

array([1.01375549e-005, 3.49336464e-045, 1.00000000e+000, 1.17299369e-260,
       1.00000000e+000, 4.10411064e-006, 1.00000000e+000, 1.41170746e-164,
       9.99986560e-001, 1.00000000e+000])

### 4. Создайте функцию calc_pred, возвращающую предсказанный класс (на вход подаются W, который уже посчитан функцией eval_model, и X; на выходе — массив y_pred).
___

In [14]:
def calc_pred(W, X, threshold=0.5):
    """Return the predicted class (0 or 1) for every row of ``X``.

    Parameters
    ----------
    W : numpy.ndarray
        Model weights, e.g. as returned by ``eval_model``.
    X : numpy.ndarray
        Feature matrix.
    threshold : float, optional
        Decision threshold; a sample is assigned class 1 when its predicted
        probability is strictly greater than this value. The default of 0.5
        reproduces the previous fixed behaviour.

    Returns
    -------
    numpy.ndarray
        Integer array of 0/1 class labels.
    """
    y_pred_proba = calc_pred_proba(W, X)
    y_pred = np.where(y_pred_proba > threshold, 1, 0)
    return y_pred

In [15]:
# Predicted class labels for the training samples; kept in `pred` for the metric cells.
pred = calc_pred(W, X_st)
pred

array([0, 0, 1, 0, 1, 0, 1, 0, 1, 1])

### 5. Реализуйте функции для подсчета Accuracy, матрицы ошибок, точности и полноты, а также F1 score.
___

In [16]:
def accuracy(y, y_pred):
    """Return the share of positions where ``y`` and ``y_pred`` are equal."""
    matches = (y == y_pred)
    return np.mean(matches)

In [17]:
# Show true and predicted labels on adjacent lines for a visual comparison.
print(f'True \t  {y}')
print(f'Predicted {pred}')

True 	  [0 0 1 0 1 0 1 0 1 1]
Predicted [0 0 1 0 1 0 1 0 1 1]


In [18]:
# Accuracy of the predictions on the training samples.
accuracy(y, pred)

1.0

In [19]:
def confusion_matrix(y, y_pred):
    """Count binary classification outcomes into a 2x2 float matrix.

    Layout of the result::

        [[TP, FP],
         [FN, TN]]

    Samples whose true label is neither 1 nor 0 are not counted.
    """
    counts = np.zeros((2, 2))
    for idx, actual in enumerate(y):
        predicted = y_pred[idx]
        if actual == 1:
            if predicted == 1:
                counts[0][0] += 1  # TP
            else:
                counts[1][0] += 1  # FN
        elif actual == 0:
            if predicted == 0:
                counts[1][1] += 1  # TN
            else:
                counts[0][1] += 1  # FP
    return counts

In [20]:
# Confusion matrix of the training predictions, laid out as [[TP, FP], [FN, TN]].
cm = confusion_matrix(y, pred)
cm

array([[5., 0.],
       [0., 5.]])

In [21]:
def precision(y, y_pred):
    """Precision: ``TP / (TP + FP)``.

    Relies on the ``[[TP, FP], [FN, TN]]`` layout of the ``confusion_matrix``
    helper defined in this notebook; a matrix with a different layout (for
    example scikit-learn's) would give wrong results.

    Returns
    -------
    float
        Precision, or 0.0 when no sample was predicted positive (the ratio is
        undefined there and would otherwise evaluate to NaN).
    """
    cm = confusion_matrix(y, y_pred)
    TP = cm[0][0]
    FP = cm[0][1]
    predicted_positive = TP + FP
    if predicted_positive == 0:
        return 0.0
    return TP / predicted_positive

In [22]:
# Precision of the training predictions.
precision(y, pred)

1.0

In [23]:
def recall(y, y_pred):
    """Recall: ``TP / (TP + FN)``.

    Relies on the ``[[TP, FP], [FN, TN]]`` layout of the ``confusion_matrix``
    helper defined in this notebook; a matrix with a different layout (for
    example scikit-learn's) would give wrong results.

    Returns
    -------
    float
        Recall, or 0.0 when there are no actual positive samples (the ratio
        is undefined there and would otherwise evaluate to NaN).
    """
    cm = confusion_matrix(y, y_pred)
    TP = cm[0][0]
    FN = cm[1][0]
    actual_positive = TP + FN
    if actual_positive == 0:
        return 0.0
    return TP / actual_positive

In [24]:
# Recall of the training predictions.
recall(y, pred)

1.0

In [25]:
def f_score(y, y_pred):
    """F1 score: harmonic mean of precision and recall.

    Returns
    -------
    float
        ``2 * precision * recall / (precision + recall)``, or 0.0 when that
        sum is zero or NaN (the ratio is undefined there).
    """
    pr = precision(y, y_pred)
    rec = recall(y, y_pred)
    denom = pr + rec
    # `not denom > 0` is true for 0 and also for NaN, covering both bad cases.
    if not denom > 0:
        return 0.0
    return 2 * pr * rec / denom

In [26]:
# F1 score of the training predictions.
f_score(y, pred)

1.0

In [27]:
# Cross-check the hand-written metrics against scikit-learn.
# scikit-learn's confusion_matrix is imported under an alias so it does not
# rebind the notebook's own confusion_matrix: the two use different layouts
# (scikit-learn returns [[TN, FP], [FN, TP]] for binary labels), and the
# precision/recall/f_score helpers index into the notebook's layout.
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix as sk_confusion_matrix


print(f'accuracy {accuracy_score(y, pred)}')
print(f'precision {precision_score(y, pred)}')
print(f'recall {recall_score(y, pred)}')
print(f'f-score {f1_score(y, pred)}')
print(f' {sk_confusion_matrix(y, pred)}')

accuracy 1.0
precision 1.0
recall 1.0
f-score 1.0
 [[5 0]
 [0 5]]
