In [1]:
import numpy as np
from sklearn.datasets import make_regression
import random


In [2]:
# Synthetic regression dataset: 1000 samples, 14 features (10 of them informative),
# noise=15, fixed random_state so the notebook is reproducible.
# make_regression already returns NumPy arrays, so wrapping the results in
# np.array() again would only create needless copies.
X, y = make_regression(n_samples=1000, n_features=14, n_informative=10, noise=15, random_state=42)

In [15]:
# Linear regression trained with (mini-batch) gradient descent
class MyLineReg():
    """Linear regression fitted by full-batch or mini-batch gradient descent.

    The model minimises the mean squared error, optionally with an L1, L2 or
    elastic-net penalty added to the gradient.

    Parameters
    ----------
    weights :
        Initial value stored in ``self.weights``. ``fit`` always replaces it
        with a vector of ones, so it only matters before the first ``fit``.
    n_iter : int
        Number of gradient-descent steps.
    learning_rate : float or callable
        Constant step size, or a callable ``f(iteration) -> step size`` that
        is called with the 1-based iteration number.
    metric : str or None
        One of ``'mae'``, ``'mse'``, ``'rmse'``, ``'mape'``, ``'r2'``.
        Any other value disables metric computation (the score is ``None``).
    reg : str or None
        ``'l1'``, ``'l2'`` or ``'elasticnet'``; anything else disables
        regularisation.
    l1_coef, l2_coef : float
        Penalty coefficients for the L1 and L2 terms.
    sgd_sample : int, float or None
        Mini-batch size. An integer is a number of rows, a float is a
        fraction of the training set, and ``None`` / ``0`` / negative values
        mean "use all rows on every step".
    random_state : int
        Seed for the mini-batch sampler.
    """

    # Regularisation modes that actually switch the penalty on.
    _REG_MODES = ('l1', 'l2', 'elasticnet')

    def __init__(self, weights=None, 
                 n_iter: int=100, 
                 learning_rate: float=0.1, 
                 metric: str=None,
                 reg: str=None,
                 l1_coef: float=0,
                 l2_coef: float=0,
                 sgd_sample: float=None,
                 random_state: int=42,) -> None:
        self.weights = weights
        self.n_iter = n_iter
        self.learning_rate = learning_rate
        self.metric = metric
        self.reg = reg
        self.regflag = self.reg in self._REG_MODES
        self.l1_coef = l1_coef
        self.l2_coef = l2_coef
        self.sgd_sample = sgd_sample
        self.random_state = random_state
        self.best_score = 0

    def __str__(self) -> str:
        """Return a short description of the main hyper-parameters."""
        return f"MyLineReg class: n_iter={self.n_iter}, learning_rate={self.learning_rate}"

    def _batch_indices(self, rng: random.Random, n_rows: int):
        """Pick the row indices used for one gradient step.

        Returns all rows when ``sgd_sample`` is falsy or non-positive,
        otherwise a sample without replacement drawn from ``rng``.
        """
        size = self.sgd_sample
        if not size or size <= 0:
            return np.arange(0, n_rows)
        # bool is a subclass of int; keep treating True as the fraction 1.0.
        if isinstance(size, (int, np.integer)) and not isinstance(size, bool):
            # An integer is always a row count (including 1).
            batch_size = int(size)
        else:
            # A float is a fraction of the data; never let it round to an
            # empty batch, which would make the gradient divide by zero.
            batch_size = max(1, int(size * n_rows))
        return rng.sample(range(n_rows), batch_size)

    def fit(self, X: np.ndarray, y: np.ndarray, verbose: int=0) -> None:
        """Train the model with gradient descent.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,)
        verbose : int
            When non-zero, print the MSE loss and the chosen metric on the
            full training set every ``verbose`` iterations.

        After training, ``self.weights`` holds the bias followed by the
        feature coefficients and ``self.best_score`` holds the metric on the
        full training set.
        """
        # A private generator yields the same sample sequence as
        # random.seed(...) followed by random.sample(...), but does not
        # reseed the global `random` module behind the caller's back.
        rng = random.Random(self.random_state)
        # Prepend a column of ones so that weights[0] acts as the bias w0.
        X = np.concatenate((np.ones(len(X))[:, np.newaxis], X), axis=1)
        y = np.array(y)
        self.weights = np.ones(X.shape[1])
        # Re-evaluate here so that changing `reg` after construction works.
        self.regflag = self.reg in self._REG_MODES
        n_rows = X.shape[0]

        for i in range(1, self.n_iter + 1):
            # Build the mini-batch and predict on it.
            rows = self._batch_indices(rng, n_rows)
            mini_X = X[rows]
            mini_y = y[rows]
            pred = np.matmul(mini_X, self.weights)

            # Logging uses the full training set and the weights *before*
            # this iteration's update.
            if verbose and i % verbose == 0:
                pred_for_metric = np.matmul(X, self.weights)
                MSE = np.sum(np.power(pred_for_metric - y, 2)) / len(pred_for_metric)
                print(f"Iteration {i}, loss: {MSE}, {self.metric}: {self.__calc_metrics(pred_for_metric, y)}")

            # Gradient of the MSE on the mini-batch.
            gradient = (2 / len(pred)) * np.matmul((pred - mini_y).T, mini_X)
            if self.regflag:
                # NOTE: the penalty is applied to every weight, bias included.
                gradient = gradient + self.__calc_reg(gradient=True)

            # The learning rate may be a schedule: a callable of the iteration.
            if callable(self.learning_rate):
                lr = self.learning_rate(i)
            else:
                lr = self.learning_rate
            self.weights = self.weights - lr * gradient

        pred = np.matmul(X, self.weights)
        self.best_score = self.__calc_metrics(pred, y)

    def get_coef(self) -> np.ndarray:
        """Return the feature coefficients (all weights except the bias)."""
        return self.weights[1:]

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Return predictions for ``X`` of shape (n_samples, n_features)."""
        X = np.concatenate((np.ones(len(X))[:, np.newaxis], X), axis=1)
        return np.matmul(X, self.weights)

    def __calc_metrics(self, pred: np.ndarray, y: np.ndarray) -> np.float64:
        """Compute ``self.metric`` for predictions ``pred`` against ``y``.

        Returns ``None`` when ``self.metric`` is not a supported name.
        """
        if self.metric not in ['mae', 'mse', 'rmse', 'mape', 'r2']:
            return None

        elif self.metric == 'mae':
            return np.sum(np.abs(pred - y)) / len(pred)
        elif self.metric == 'mse':
            return np.sum(np.power(pred - y, 2)) / len(pred)
        elif self.metric == 'rmse':
            return np.sqrt(np.sum(np.power(pred - y, 2)) / len(pred))
        elif self.metric == 'r2':
            return 1 - (np.sum(np.power(y - pred, 2)) / np.sum(np.power(y - y.mean(), 2)))
        elif self.metric == 'mape':
            # Divides by y element-wise, so targets equal to zero yield inf/nan.
            return np.sum(np.abs((y - pred) / y)) / len(pred) * 100

    def get_best_score(self) -> np.float64:
        """Return the metric computed on the training set at the end of ``fit``."""
        return self.best_score

    def __calc_reg(self, gradient: bool=False) -> np.ndarray:
        """Return the regularisation penalty or its gradient.

        With ``gradient=False`` the scalar penalty value is returned, with
        ``gradient=True`` its derivative with respect to the weights.
        ``reg == 'l1'`` / ``'l2'`` select a single term; any other value
        returns the sum of both terms.
        """
        if gradient:
            l1_term = np.sign(self.weights) * self.l1_coef
            l2_term = 2 * self.weights * self.l2_coef
        else:
            # The L1 penalty must be scaled by its coefficient, like L2.
            l1_term = np.sum(np.abs(self.weights)) * self.l1_coef
            l2_term = np.sum(np.power(self.weights, 2)) * self.l2_coef

        if self.reg == 'l1':
            return l1_term
        elif self.reg == 'l2':
            return l2_term
        return l1_term + l2_term
            
    

In [21]:
# Train with mini-batches of 10% of the rows, logging loss and MAE every 5 iterations.
linear = MyLineReg(
    n_iter=50,
    learning_rate=0.1,
    metric='mae',
    sgd_sample=0.1,
    random_state=42,
)
linear.fit(X, y, verbose=5)
# Cell output: mean of the learned feature coefficients (bias excluded).
np.mean(linear.get_coef())

Iteration 5, loss: 4577.424689371495, mae: 53.852504812448274
Iteration 10, loss: 644.8204293770057, mae: 20.129234243885367
Iteration 15, loss: 275.18165467946324, mae: 13.324546732719392
Iteration 20, loss: 230.72956743654993, mae: 12.17361663404447
Iteration 25, loss: 228.54338146051754, mae: 12.083618424347463
Iteration 30, loss: 226.3324916046762, mae: 12.046104339332379
Iteration 35, loss: 225.41862681182985, mae: 12.079855223693361
Iteration 40, loss: 225.51771346164134, mae: 12.056696339206987
Iteration 45, loss: 229.74278066621216, mae: 12.117626896790489
Iteration 50, loss: 226.70960486645845, mae: 12.07443502691104


30.377720712645836