# Machine Learning Algorithms - Linear Models

## Imports and Config

In [1]:
# Libraries imported for the rest of the notebook.
import numpy as np
import pandas as pd
import random
import sklearn.linear_model

# Visible marker that the import cell ran to completion.
print("Imports Done!")

Imports Done!


In [10]:
class Config:
    """Notebook-wide constants, read as class attributes (never instantiated here)."""

    random_seed = 42
    train_size = 0.75


# Echo the configured values, one per line, in declaration order.
for _setting in (Config.random_seed, Config.train_size):
    print(_setting)

42
0.75


## Regression

In [3]:
from sklearn.datasets import load_diabetes

# Load the dataset once with as_frame=True, then pull the feature table and
# the target out of the returned container under the names later cells use.
data = load_diabetes(as_frame=True)
X = data['data']
y = data['target']

In [4]:
# Bare expression: the notebook renders the feature table as this cell's output.
X

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
0,0.038076,0.050680,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068332,-0.092204
2,0.085299,0.050680,0.044451,-0.005670,-0.045599,-0.034194,-0.032356,-0.002592,0.002861,-0.025930
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022688,-0.009362
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031988,-0.046641
...,...,...,...,...,...,...,...,...,...,...
437,0.041708,0.050680,0.019662,0.059744,-0.005697,-0.002566,-0.028674,-0.002592,0.031193,0.007207
438,-0.005515,0.050680,-0.015906,-0.067642,0.049341,0.079165,-0.028674,0.034309,-0.018114,0.044485
439,0.041708,0.050680,-0.015906,0.017293,-0.037344,-0.013840,-0.024993,-0.011080,-0.046883,0.015491
440,-0.045472,-0.044642,0.039062,0.001215,0.016318,0.015283,-0.028674,0.026560,0.044529,-0.025930


In [11]:
class MyLineReg:
    """Linear regression fitted by full-batch gradient descent.

    A column of ones is prepended to the features, so ``weights[0]`` is the
    intercept and ``weights[1:]`` are the per-feature coefficients.

    Parameters
    ----------
    metric : {"mse", "mae", "rmse", "mape", "r2"} or None
        Metric reported during training and returned by ``get_best_score``.
    reg : {"l1", "l2", "elasticnet"} or None
        Penalty added to the MSE loss; any other value means no penalty.
    sgd_sample : optional
        Stored on the instance only; ``fit`` does not use it yet.
    random_state : int
        Stored on the instance only; ``fit`` does not use it yet.
    l1_coef, l2_coef : float
        Strength of the L1 / L2 penalty terms.
    n_iter : int
        Number of gradient steps performed by ``fit``.
    learning_rate : float or callable
        Constant step size, or a function that maps the 1-based iteration
        number to the step size for that iteration.
    weights : optional
        Initial value of ``self.weights``. ``fit`` always re-initialises the
        weights to ones, so this only matters before the first ``fit``.
    """

    def __init__(self, metric=None, reg=None, sgd_sample=None, random_state=Config.random_seed, l1_coef=0, l2_coef=0, n_iter=100, learning_rate=0.1, weights=None):
        self.n_iter = n_iter
        self.learning_rate = learning_rate
        # A fresh list per instance: a mutable default in the signature would
        # be one object shared by every model created without `weights`.
        self.weights = [] if weights is None else weights
        self.metric = metric
        self.reg = reg
        self.l1_coef = l1_coef
        self.l2_coef = l2_coef
        self.sgd_sample = sgd_sample
        self.random_state = random_state
        # Metric value on the training data after the last fit (None until then).
        self._best_score = None

    def __str__(self):
        return f'MyLineReg class: n_iter={self.n_iter}, learning_rate={self.learning_rate}'

    def fit(self, X, y, verbose=False):
        """Fit the weights on ``X`` / ``y`` with ``n_iter`` gradient steps.

        Parameters
        ----------
        X : 2-D array-like (anything with ``.shape`` that ``np.hstack`` accepts)
            Feature matrix, one row per sample.
        y : array-like
            Targets, one per row of ``X``; flattened to 1-D.
        verbose : int or bool
            When truthy *and* a metric is configured, print the loss and
            metric on the first iteration and on every iteration whose
            number is a multiple of ``verbose``.
        """
        n_samples, n_features = X.shape
        X = np.hstack((np.ones((n_samples, 1)), X))
        # A 1-D float vector avoids accidental (n, n) broadcasting when a
        # column-shaped target is passed in.
        y = np.asarray(y, dtype=float).ravel()
        self.weights = np.ones(n_features + 1)

        # TODO: mini-batch sampling driven by sgd_sample / random_state is not implemented.
        for num_iter in range(1, self.n_iter + 1):
            # Loss and metric describe the weights *before* this step's update.
            y_pred = np.dot(X, self.weights)
            loss, grad = self._loss_and_gradient(X, y, y_pred)
            self.weights = self.weights - self._step_size(num_iter) * grad

            if verbose and self.metric:
                is_first = num_iter == 1
                is_checkpoint = num_iter % verbose == 0
                if is_first or is_checkpoint:
                    score = self.compute_metric(self.metric, y, y_pred)
                    if is_first:
                        print(f"start | loss: {loss} | {self.metric}: {score}")
                    if is_checkpoint:
                        # Report the current iteration, not the total count.
                        print(f"{num_iter} | loss: {loss} | {self.metric}: {score}")

        # Remember the training-set score of the final weights so that
        # get_best_score() does not depend on variables outside the model.
        self._best_score = self.compute_metric(self.metric, y, np.dot(X, self.weights))

    def _step_size(self, num_iter):
        """Return the learning rate for the given 1-based iteration."""
        if callable(self.learning_rate):
            return self.learning_rate(num_iter)
        return self.learning_rate

    def _loss_and_gradient(self, X, y, y_pred):
        """Return (penalised MSE loss, its gradient) at the current weights."""
        n_samples = X.shape[0]
        loss = np.mean((y - y_pred) ** 2)
        grad = (2 / n_samples) * np.dot(X.T, y_pred - y)
        if self.reg in ("l1", "elasticnet"):
            loss = loss + self.l1_coef * np.sum(np.abs(self.weights))
            grad = grad + self.l1_coef * np.sign(self.weights)
        if self.reg in ("l2", "elasticnet"):
            loss = loss + self.l2_coef * np.sum(self.weights ** 2)
            grad = grad + self.l2_coef * 2 * self.weights
        return loss, grad

    def get_coef(self):
        """Return the feature coefficients (weights without the intercept)."""
        return self.weights[1:]

    def predict(self, X):
        """Return predictions for ``X`` using the current weights."""
        X = np.hstack((np.ones((X.shape[0], 1)), X))
        return np.dot(X, self.weights)

    def compute_metric(self, metric_name, y, y_pred):
        """Return the named metric for ``y`` vs ``y_pred``.

        Returns None when ``metric_name`` is None or not one of
        "mse", "mae", "rmse", "mape", "r2".
        """
        if metric_name is None:
            return None
        residual = y - y_pred
        if metric_name == "mse":
            return np.mean(residual ** 2)
        if metric_name == "mae":
            return np.mean(abs(residual))
        if metric_name == "rmse":
            return np.sqrt(np.mean(residual ** 2))
        if metric_name == "mape":
            return 100 * np.mean(abs(residual / y))
        if metric_name == "r2":
            return 1 - np.mean(residual ** 2) / np.mean((y - np.mean(y)) ** 2)
        return None

    def get_best_score(self):
        """Return the configured metric on the training data after the last fit.

        None if the model has not been fitted or no metric is configured.
        """
        return self._best_score


In [22]:
# "mae" is the metric; the learning rate is a schedule that decays
# geometrically with the iteration number passed in by fit.
test = MyLineReg("mae", learning_rate=lambda iter: 0.5 * (0.85 ** iter))
# verbose=100: log on the first iteration and on every 100th one.
test.fit(X, y, verbose=100)
# Last expression of the cell: the coefficients are shown as the cell output.
test.get_coef()

start | loss: 28752.020614660887 | mae: 151.13348416289594
100 | loss: 12825.625411522504 | mae: 87.55436511861369


array([1.40765193, 1.09087324, 2.2822899 , 1.96377428, 1.45846028,
       1.3754835 , 0.13653466, 1.93822301, 2.23582123, 1.83357799])

In [29]:
# Long run (10**5 steps) with a constant learning rate, scored with "r2".
test = MyLineReg("r2", n_iter=10**5, learning_rate=0.75)
test.fit(X, y)
# Last expression of the cell: the model's score is shown as the cell output.
test.get_best_score()

0.5177371239918096

In [14]:
from sklearn.linear_model import LinearRegression

In [15]:
# Reference model from scikit-learn, fitted on the same X / y for comparison.
model = LinearRegression()
model.fit(X, y)

# Last expression of the cell: the reference score is shown as the cell output.
model.score(X, y)

0.5177484222203499

In [30]:
# Gap between the hand-written model's score and the scikit-learn reference score.
test.get_best_score() - model.score(X, y)

-1.1298228540290722e-05

---

Clean code from a colleague — worth reading through and learning from.

In [None]:
# class MyLineReg:
#     def __init__(self, n_iter: int = 100, learning_rate: float = 0.1, metric: str = None) -> None:
#         self.n_iter = n_iter
#         self.learning_rate = learning_rate
#         self._weights = None
#         self.metric = metric

#     def __str__(self) -> str:
#         params = [f'{key}={value}' for key, value in self.__dict__.items()]
#         return 'MyLineReg class: ' + ', '.join(params)

#     @staticmethod
#     def _mae(y_true: np.array, y_pred: np.array):
#         return (y_true - y_pred).abs().mean()

#     @staticmethod
#     def _mse(y_true: np.array, y_pred: np.array):
#         return (y_true - y_pred).pow(2).mean()

#     @staticmethod
#     def _rmse(y_true: np.array, y_pred: np.array):
#         return np.sqrt((y_true - y_pred).pow(2).mean())

#     @staticmethod
#     def _mape(y_true: np.array, y_pred: np.array):
#         return 100 * ((y_true - y_pred) / y_true).abs().mean()

#     @staticmethod
#     def _r2(y_true: np.array, y_pred: np.array):
#         return 1 - (y_true - y_pred).pow(2).sum() / ((y_true - y_true.mean()).pow(2).sum())

#     def get_best_score(self):
#         return self.score

#     def fit(self, x: pd.DataFrame, y: pd.Series, verbose: int) -> None:
#         x = pd.concat([pd.Series([1] * x.shape[0], index=x.index), x], axis=1).values
#         self._weights = pd.Series([1.] * x.shape[1]).values
#         for i in range(self.n_iter):
#             y_hat = self._weights @ x.T
#             mse = (y_hat - y).pow(2).mean()
#             grad = (2 / y.shape[0]) * (y_hat - y) @ x
#             self._weights -= self.learning_rate * grad
#             if self.metric:
#                 self.score = getattr(self, '_' + self.metric)(y, x @ self._weights)
#             if verbose and i % verbose == 0:
#                 if self.metric:
#                     print(f'{i} | loss: {mse} | {self.metric}: {self.score}')
#                 else:
#                     print(f'{i} | loss: {mse}')

#     def get_coef(self):
#         return self._weights[1:]

#     def predict(self, x: pd.DataFrame):
#         x = pd.concat([pd.Series([1] * x.shape[0], index=x.index), x], axis=1).values
#         return x @ self._weights

---

## Classification

In [None]:
from sklearn.datasets import make_classification

# Synthetic binary-classification data, wrapped in pandas containers.
X, y = make_classification(n_samples=1000, n_features=14, n_informative=10, random_state=42)
# Name the columns col_0 ... col_{n-1} directly while building the frame.
X = pd.DataFrame(X, columns=[f'col_{col}' for col in range(X.shape[1])])
y = pd.Series(y)

In [None]:
class MyLogReg:
    """Binary logistic regression fitted by full-batch gradient descent.

    A column of ones is prepended to the features, so ``weights[0]`` is the
    intercept and ``weights[1:]`` are the per-feature coefficients.

    Parameters
    ----------
    n_iter : int
        Number of gradient steps performed by ``fit``.
    learning_rate : float
        Constant step size.
    eps : float
        Small constant added inside the logarithms of the loss so that
        ``log(0)`` is never evaluated.
    weights : optional
        Initial value of ``self.weights``. ``fit`` always re-initialises the
        weights to ones, so this only matters before the first ``fit``.
    """

    def __init__(self, n_iter=10, learning_rate=0.1, eps = 1e-15, weights=None):
        self.n_iter = n_iter
        self.learning_rate = learning_rate
        # A fresh list per instance: a mutable default in the signature would
        # be one object shared by every model created without `weights`.
        self.weights = [] if weights is None else weights
        self.eps = eps

    def __str__(self):
        return f'MyLogReg class: n_iter={self.n_iter}, learning_rate={self.learning_rate}'

    def fit(self, X, y, verbose=False):
        """Fit the weights on ``X`` / ``y`` with ``n_iter`` gradient steps.

        Parameters
        ----------
        X : 2-D array-like (anything with ``.shape`` that ``np.hstack`` accepts)
            Feature matrix, one row per sample.
        y : array-like
            Labels, one per row of ``X``; converted to a 1-D float vector.
        verbose : int or bool
            When truthy, print the log-loss on the first iteration and on
            every iteration whose number is a multiple of ``verbose``.
        """
        n_samples, n_features = X.shape
        X = np.hstack((np.ones((n_samples, 1)), X))
        # Float labels keep `1 - y` valid for boolean input, and a 1-D shape
        # avoids accidental (n, n) broadcasting against the predictions.
        y = np.asarray(y, dtype=float).ravel()
        self.weights = np.ones(n_features + 1)

        for num_iter in range(1, self.n_iter + 1):
            # The loss describes the weights *before* this step's update.
            y_pred = 1 / (1 + np.exp(-np.dot(X, self.weights)))
            loss = -np.mean(y * np.log(y_pred + self.eps) + (1 - y) * np.log(1 - y_pred + self.eps))
            self.weights = self.weights - self.learning_rate * (1 / n_samples) * np.dot(X.T, y_pred - y)

            if verbose:
                if num_iter == 1:
                    print(f"start | loss: {loss}")
                if num_iter % verbose == 0:
                    # Report the current iteration, not the total count.
                    print(f"{num_iter} | loss: {loss}")

    def predict(self):
        """Placeholder: not implemented yet, returns None."""
        pass

    def predict_proba(self):
        """Placeholder: not implemented yet, returns None."""
        pass

    def get_coef(self):
        """Return the feature coefficients (weights without the intercept)."""
        return self.weights[1:]

In [None]:
# Default hyper-parameters (n_iter=10, learning_rate=0.1).
test = MyLogReg()
# verbose=1: the loss is logged on every iteration.
test.fit(X, y, verbose=1)