## Создание датасета

In [None]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

X, y = load_breast_cancer(return_X_y=True)

# Build five stratified train/test splits of the same data.
# Each split gets its own fixed seed: 47, 159, 271, 383, 495.
partitions = []
for split_idx in range(5):
    seed = 47 + 112 * split_idx
    split = train_test_split(X, y, stratify=y, random_state=seed)
    # Stored as (X_train, X_test, y_train, y_test).
    partitions.append(tuple(split))

## Логистическая регрессия на numpy

In [None]:
import numpy as np

class BinaryLogisticRegression:
    """Binary logistic regression trained by per-sample stochastic gradient steps.

    The bias is modelled as the weight of an extra constant feature equal to
    -1 that is prepended to every sample, so ``weight_[0]`` is the (negated)
    intercept and ``weight_[1:]`` are the feature weights.

    Args:
        eta: learning rate used for every single-sample update.
        random_state: seed passed to ``np.random.seed``. Note that this seeds
            NumPy's *global* generator, which ``fit_epoch`` uses to shuffle.

    Attributes:
        weight_: float weight vector of length ``n_features + 1``; created on
            the first call to ``fit_epoch``.
    """

    def __init__(self, eta=1e-4, random_state=176):
        self.eta = eta
        np.random.seed(random_state)

    @staticmethod
    def _sigmoid(z):
        """Return the logistic function 1 / (1 + exp(-z)) in a stable form.

        ``log(1 + exp(-z))`` is evaluated with ``np.logaddexp``, so large
        positive ``z`` does not overflow inside ``exp`` and very negative
        ``z`` yields a tiny positive value instead of cancelling to 0.
        """
        return np.exp(-np.logaddexp(0, -z))

    def fit_epoch(self, X, y):
        """Run one pass over the data in random order, updating ``weight_``.

        Args:
            X: 2-D array of samples (rows) by features (columns).
            y: array of labels aligned with the rows of ``X``; the update
                rule treats them as 0/1 targets.

        Returns:
            self, to allow chaining.
        """
        # Prepend the constant feature -1 that carries the bias weight.
        X = np.concatenate([-np.ones_like(X[:, :1]), X], axis=1)
        if not hasattr(self, "weight_"):
            self.weight_ = np.zeros_like(X[0], dtype=float)
        order = np.arange(len(X))
        np.random.shuffle(order)
        X, y = X[order], y[order]
        for elem, label in zip(X, y):
            # Probability of the positive class for this sample.
            prob = self._sigmoid(np.dot(self.weight_, elem))
            # w <- w + eta * (y - p) * x
            self.weight_ += self.eta * ((label - prob) * elem)
        return self

    def predict(self, X):
        """Return 0/1 predictions: 1 where the linear score is non-negative."""
        return (self._score(X) >= 0).astype("int")

    def _score(self, X):
        """Return the linear score w[1:]·x - w[0] for each row of ``X``."""
        return np.dot(X, self.weight_[1:]) - self.weight_[0]

    def predict_proba(self, X):
        """Return the positive-class probability for each row of ``X``."""
        return self._sigmoid(self._score(X))

In [None]:
from sklearn.metrics import accuracy_score

# Train the numpy model on every partition with early stopping on the
# training accuracy, then report the test accuracy of the best weights.
scores = []
for X_train, X_test, y_train, y_test in partitions:
    cls = BinaryLogisticRegression(eta=1e-3)
    best_train_score, best_weight = 0.0, None
    patience = 0
    for i in range(100):
        cls.fit_epoch(X_train, y_train)
        y_train_pred = cls.predict(X_train)
        train_score = accuracy_score(y_train, y_train_pred)
        if train_score > best_train_score:
            best_train_score, best_weight = train_score, np.copy(cls.weight_)
            patience = 0
        else:
            patience += 1
            # Stop after 10 consecutive epochs without improvement.
            if patience >= 10:
                break
    # Restore the best weights whether the loop stopped early or ran all
    # 100 epochs; otherwise the last-epoch weights would be evaluated.
    if best_weight is not None:
        cls.weight_ = best_weight
    y_test_pred = cls.predict(X_test)
    test_score = accuracy_score(y_test, y_test_pred)
    scores.append(test_score)
print(*("{:.2f}".format(100*x) for x in scores))
print("{:.2f}".format(100*np.mean(scores)))



93.01 92.31 90.91 93.01 93.01
92.45


## Логистическая регрессия из sklearn

In [None]:
from sklearn.linear_model import LogisticRegression, SGDClassifier

# Fit scikit-learn's LogisticRegression on every partition and report the
# test accuracy per partition and on average.
sklearn_scores = []
for X_train, X_test, y_train, y_test in partitions:
    # The features are fed in unscaled, and with the default iteration limit
    # the solver stops before converging (ConvergenceWarning). Give it a much
    # larger budget; scaling the features is the alternative remedy.
    cls = LogisticRegression(max_iter=10000)
    cls.fit(X_train, y_train)
    y_test_pred = cls.predict(X_test)
    test_score = accuracy_score(y_test, y_test_pred)
    sklearn_scores.append(test_score)
print(*("{:.2f}".format(100*x) for x in sklearn_scores))
print("{:.2f}".format(100*np.mean(sklearn_scores)))

92.31 95.80 95.80 98.60 97.90
96.08


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

## Логистическая регрессия на pytorch

In [None]:
import torch

class PytorchLogisticRegression(torch.nn.Module):
    """Logistic regression as one linear layer followed by a sigmoid.

    The model is trained one sample at a time with Adam on binary
    cross-entropy. Both the weight and the bias start at zero, so the
    initial state is deterministic.

    Args:
        n_features: number of input features.
        lr: learning rate passed to the Adam optimizer.
    """

    def __init__(self, n_features, lr=1e-3):
        super().__init__()
        self.linear = torch.nn.Linear(n_features, 1)
        self.sigmoid = torch.nn.Sigmoid()
        self.criterion = torch.nn.BCELoss()
        self.optimizer = torch.optim.Adam(self.parameters(), lr=lr)
        # Zero-initialise every parameter. Leaving the bias at its random
        # default would make runs irreproducible, since no torch seed is set.
        torch.nn.init.zeros_(self.linear.weight)
        torch.nn.init.zeros_(self.linear.bias)

    def forward(self, X):
        """Return positive-class probabilities, one per row of ``X``."""
        logits = self.linear(X)
        probs = self.sigmoid(logits)
        # Drop the trailing singleton dimension: (batch, 1) -> (batch,).
        return probs[:, 0]

    def predict_proba(self, X):
        """Return probabilities as a NumPy array, computed without gradients."""
        with torch.no_grad():
            y = self(X)
        return y.numpy()

    def predict(self, X):
        """Return 0/1 predictions using a 0.5 probability threshold."""
        probs = self.predict_proba(X)
        return (probs >= 0.5).astype("int")

    def fit_epoch(self, X, y):
        """Run one pass over the data in random order, one Adam step per sample.

        The shuffle uses NumPy's global random generator.

        Args:
            X: 2-D tensor of samples (rows) by features (columns).
            y: 1-D float tensor of targets aligned with the rows of ``X``.

        Returns:
            self, to allow chaining.
        """
        order = np.arange(len(X))
        np.random.shuffle(order)
        X, y = X[order], y[order]
        for elem, label in zip(X, y):
            # Scalar target -> shape (1,) to match the model output.
            label = torch.unsqueeze(label, dim=0)
            self.optimizer.zero_grad()
            prob = self(elem[None, :])  # (n,) -> (1, n)
            loss = self.criterion(prob, label)
            loss.backward()
            self.optimizer.step()
        return self

In [None]:
from torch import Tensor, LongTensor
import copy

# Train the PyTorch model on every partition with early stopping on the
# training accuracy, then report the test accuracy of the best checkpoint.
scores = []
for X_train, X_test, y_train, y_test in partitions:
    X_train, X_test = Tensor(X_train), Tensor(X_test)
    y_train, y_test = Tensor(y_train), Tensor(y_test)
    cls = PytorchLogisticRegression(n_features=X_train.shape[1], lr=1e-3)
    best_train_score, best_weights = 0.0, None
    patience = 0
    for i in range(100):
        cls.fit_epoch(X_train, y_train)
        y_train_pred = cls.predict(X_train)
        train_score = accuracy_score(y_train, y_train_pred)
        if train_score > best_train_score:
            best_train_score = train_score
            # Deep copy: state_dict() returns references to the live tensors.
            best_weights = copy.deepcopy(cls.state_dict())
            patience = 0
        else:
            patience += 1
            # Stop after 10 consecutive epochs without improvement.
            if patience >= 10:
                break
    # Evaluate the best checkpoint, not the last epoch.
    if best_weights is not None:
        cls.load_state_dict(best_weights)
    y_test_pred = cls.predict(X_test)
    test_score = accuracy_score(y_test, y_test_pred)
    scores.append(test_score)
print(*("{:.2f}".format(100*x) for x in scores))
print("{:.2f}".format(100*np.mean(scores)))

92.31 91.61 93.71 95.10 96.50
93.85
