In [398]:
import numpy as np
import pandas as pd
import random

In [400]:
from sklearn.datasets import make_regression

X, y = make_regression(n_samples=900, n_features=14, n_informative=5, noise=5, random_state=42)
X = pd.DataFrame(X)
y = pd.Series(y)

In [408]:
from sklearn.utils import shuffle

In [410]:
print(X)
X = shuffle(X)
print(X)
X_1 = pd.concat([pd.DataFrame(np.ones(X.shape[0])), X], axis = 1)
X_1.columns = [i for i in range(X_1.shape[1])]
print(X_1)
X_1.reset_index(inplace=True, drop=True)
print(X_1)
weights = np.ones((X_1.shape[1]))
X_1.shape, weights.shape

           0         1         2         3         4         5         6   \
0    0.225762 -0.692250  0.103438 -0.011092 -0.022279  0.277358  0.017613   
1    0.122298 -0.601368 -1.592994  1.364140  0.552490 -0.019638  0.543298   
2    0.431632 -0.190241  1.363229 -1.125587  1.588940 -0.488886  0.739675   
3   -0.832477  0.779696 -0.872785  0.177830 -0.510291 -1.477164  0.095350   
4   -0.585928 -0.714681  0.321679  0.178477 -0.181503 -0.995148  0.329107   
..        ...       ...       ...       ...       ...       ...       ...   
895 -1.133421 -0.162704  0.072153 -0.718065  0.087474 -1.143339  0.349924   
896 -1.803140  0.213197 -0.319054 -0.060661  0.021312  1.076007 -1.584136   
897  1.310309  1.304340  1.032546 -0.214921 -0.410814 -1.090966  1.395684   
898  1.110911 -2.161304  0.660343  1.413770  0.309047 -0.395047 -1.500802   
899 -0.182585  0.521216 -0.558230 -0.706476  1.272199 -0.421483 -0.554593   

           7         8         9         10        11        12        13  

((900, 15), (15,))

In [342]:
class MyLineReg:
    def __init__(self, n_iter=100, learning_rate=0.5, weights=None, metric=None, reg=None, l1_coef=0.1, l2_coef=0.1, sgd_sample=None, random_state=42):
        self.learning_rate = learning_rate
        self.n_iter = n_iter
        self.weights = weights
        self.metric = metric
        self.metric_value = None
        self.reg = reg
        self.l1_coef = l1_coef
        self.l2_coef = l2_coef
        self.learning_rate = learning_rate
        self.sgd_sample = sgd_sample
        self.random_state = random_state
        self.metric_formulas = {'mae': '1/X.shape[0] * sum(abs(y_pred_ - y))',
                                'mse': '1/X.shape[0] * np.dot((y_pred - y).T, (y_pred - y))',
                                'rmse': '(1/X.shape[0] * np.dot((y_pred - y).T, (y_pred - y)))**(1/2)',
                                'mape': '100/X.shape[0] * sum(abs(y_pred - y)/abs(y))',
                                'r2': '1 - np.dot((y_pred - y).T, (y_pred - y))/np.dot((np.mean(y) - y).T, (np.mean(y) - y))'}
        self.reg_formulas = {'l1': 'self.l1_coef * sum(abs(self.weights))',
                             'l2': 'self.l2_coef * sum(np.power(self.weights, 2))',
                             'elasticnet': 'self.l1_coef * sum(abs(self.weights)) + self.l2_coef * sum(np.power(self.weights, 2))'}
        self.reg_grad_formulas = {'l1': 'self.l1_coef * np.sign(self.weights)',
                                 'l2': 'self.l2_coef * 2 * self.weights',
                                 'elasticnet': 'self.l1_coef * np.sign(self.weights) + self.l2_coef * 2 * self.weights'}
    def __str__(self):
        return f'MyLineReg class: n_iter={self.n_iter}, learning_rate={self.learning_rate}'
        
    def fit(self, X, y, verbose=False):
        random.seed(self.random_state)
        flag = True
        X.reset_index(inplace=True, drop=True)
        X_1 = pd.concat([pd.DataFrame(np.ones(X.shape[0])), X], axis = 1)
        X_1.columns = [i for i in range(X_1.shape[1])]
        self.weights = np.ones((X_1.shape[1]))
        if not self.sgd_sample:
            self.sgd_sample = X_1.shape[0]
        if self.metric:
            formula = self.metric_formulas[self.metric]
        for i in range(1, self.n_iter + 1):
            sample_rows_idx = random.sample(range(X_1.shape[0]), self.sgd_sample if isinstance(self.sgd_sample, int) else round(self.sgd_sample * X_1.shape[0]))
            X_1.reset_index(inplace=True, drop=True)
            y.reset_index(inplace=True, drop=True)
            X_b = X_1.reindex(sample_rows_idx)
            y_batch = y.reindex(sample_rows_idx)
            y_pred = np.dot(X_1, self.weights)
            y_batch_pred = np.dot(X_b, self.weights)
            if self.reg:
                reg_value = eval(self.reg_formulas[self.reg])
                reg_grad_value = eval(self.reg_grad_formulas[self.reg])
            else:
                reg_value = 0
                reg_grad_value = 0
            loss = 1 / X_1.shape[0] * np.dot((y_pred - y).T, (y_pred - y)) + reg_value
            grad = 2 / X_b.shape[0] * np.dot(X_b.T, (y_batch_pred - y_batch)) + reg_grad_value
            if callable(self.learning_rate):
                self.weights = self.weights - self.learning_rate(i) * grad
            else:
                self.weights = self.weights - self.learning_rate * grad
            if self.metric:
                self.metric_value = eval(formula)
            if verbose and (i + 1) % verbose == 0:
                if flag:
                    if self.metric:
                        print(f'start| loss: {loss}| {self.metric}: {self.metric_value}')
                    else: 
                        print(f'start| loss: {loss}')
                    flag = False
                else:
                    if self.metric:
                        print(f'{i} | loss: {loss}| {self.metric}: {self.metric_value}')
                    else: 
                        print(f'{i} | loss: {loss}')
                
    def get_coef(self):
        return self.weights[1:]
        
    def predict(self, X):
        return np.dot(pd.concat([pd.DataFrame(np.ones(X.shape[0])), X], axis = 1), self.weights)

    def get_best_score(self):
        return self.metric_value

In [344]:
model = MyLineReg(n_iter=100, learning_rate=lambda i: 0.5 * (0.85 ** i), metric='mse', reg='l1', sgd_sample=800)
model.fit(X, y)
print(model.get_best_score())

25.353008072233013


In [264]:
for reg in [None, 'l1', 'l2', 'elasticnet', None]:
    print(reg)
    for metric in ['mae', 'mse', 'rmse', 'mape', 'r2', None]:
        print(metric)
        for lr in [lambda i: 0.5 * (0.85 ** i), 0.1]:
            print(lr)
            for sgd_sample in [50, 0.7, None]:
                print(sgd_sample)
                model = MyLineReg(n_iter=100, learning_rate=lr, metric=metric, reg=reg, sgd_sample=sgd_sample)
                model.fit(X, y)
                print(model.get_best_score())

None
mae
<function <lambda> at 0x000001F3B10F4720>
50
      0         1         2         3         4         5         6   \
0    1.0  0.225762 -0.692250  0.103438 -0.011092 -0.022279  0.277358   
1    1.0  0.122298 -0.601368 -1.592994  1.364140  0.552490 -0.019638   
2    1.0  0.431632 -0.190241  1.363229 -1.125587  1.588940 -0.488886   
3    1.0 -0.832477  0.779696 -0.872785  0.177830 -0.510291 -1.477164   
4    1.0 -0.585928 -0.714681  0.321679  0.178477 -0.181503 -0.995148   
..   ...       ...       ...       ...       ...       ...       ...   
895  1.0 -1.133421 -0.162704  0.072153 -0.718065  0.087474 -1.143339   
896  1.0 -1.803140  0.213197 -0.319054 -0.060661  0.021312  1.076007   
897  1.0  1.310309  1.304340  1.032546 -0.214921 -0.410814 -1.090966   
898  1.0  1.110911 -2.161304  0.660343  1.413770  0.309047 -0.395047   
899  1.0 -0.182585  0.521216 -0.558230 -0.706476  1.272199 -0.421483   

           7         8         9         10        11        12        13  \
0   

KeyboardInterrupt: 