In [2]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression, Lasso, Ridge
import matplotlib.pyplot as plt

In [26]:
# Seed NumPy's global generator so the synthetic features (and anything drawn
# later through np.random) are identical on every Restart & Run All.
np.random.seed(42)
X = np.random.randint(10, 10000, size=(50, 4))

X.shape

(50, 4)

In [27]:
# Four equal weights of 1/4, kept both as a flat array and as a column vector.
arr = np.full(4, 1/4)

a = arr.reshape(-1, 1)

a.shape

(4, 1)

In [6]:
# Weighted sum of the features for every row (matrix product of X and a).
t = X @ a

t.shape

(50, 1)

In [28]:
def y_generator(X, a, b, c):
    """
    Generates noisy targets from a scaled linear combination of the features.
    Parameters
    ----------
    X : numpy.array
        Feature matrix with one row per observation.
    a : numpy.array
        Column vector of weights, one row per feature.
    b : float
        Multiplier applied to the product of X and a.
    c : float
        Noise level: the noise is drawn from a normal distribution with zero mean
        and standard deviation sqrt(c / (n_rows + n_columns - 1)).
    Returns
    -------
    numpy.array
        One-dimensional array with one target per row of X.
    """
    n_rows, n_cols = X.shape[0], X.shape[1]
    signal = np.dot(X, a) * b
    sigma = (c / (n_rows + n_cols - 1)) ** 0.5
    noise = np.random.normal(0, sigma, size=n_rows).reshape(n_rows, 1)
    # Put signal and noise side by side and add them up row by row.
    return np.hstack((signal, noise)).sum(axis=1)

In [32]:
# Targets: the weighted feature sum scaled by b=5, plus noise with level c=9.
y = y_generator(X, a, b=5, c=9)

y.shape

(50,)

In [30]:
# Display the generated targets.
y

array([14334.91031282, 24028.85540358, 29447.28909535, 10147.42572274,
       21622.06652324, 21098.40471394, 40207.88689147, 32082.70512308,
       17443.89233557, 22169.73673145, 20203.3702562 , 23702.5228245 ,
       31682.74011728, 25704.99711741, 32407.44868878, 29727.72898301,
       21030.46570128, 25592.1709964 , 11585.44773144, 30604.6774167 ,
       29638.36720541, 15197.32500082, 13647.53357248, 34210.57576115,
       35541.36474919, 14954.78794358, 14614.83033878, 30659.88348815,
        9783.83781844, 33271.21308156, 25598.46343594, 35380.81243801,
       22741.3306047 , 25895.58373136, 28984.46218625, 21367.3149789 ,
       30043.21206956, 25573.08311483, 24899.67453562, 39804.93417626,
       22897.52751152, 15153.65564362, 18147.90088136, 15073.87812932,
       24350.68370739, 19637.22641526, 25565.50868781, 24840.52785963,
       23055.74899526, 32714.69144137])

# Лассо

In [8]:
class InsufficientData(Exception):
    """
    Raised when there is not enough data to build a model.
    """
    # Fixed user-facing message; arguments passed to the exception are not shown.
    _MESSAGE = (
        'Данных недостаточно. Их должно быть не менее 2^k строк, '
        'где k – количество признаков. Если признак 1, то хотя бы 10 строк.'
    )

    def __str__(self):
        return self._MESSAGE


class LinearlyDependent(Exception):
    """
    Raised when there is a linear relationship between the features, which makes it
    impossible to apply the method of least squares.
    """
    # Fixed user-facing message; arguments passed to the exception are not shown.
    _MESSAGE = 'Присутствуют линейно зависимые признаки. Мы не можем применить МНК.'

    def __str__(self):
        return self._MESSAGE


class DegreeError(Exception):
    """
    Raised when an incorrect degree is entered to construct a polynomial regression.
    """
    # Fixed user-facing message; arguments passed to the exception are not shown.
    _MESSAGE = 'Степень полинома должна быть целым неотрицательным числом.'

    def __str__(self):
        return self._MESSAGE


class NegativeValue(Exception):
    """
    Raised when negative values of y have been passed to the exponential regression.
    """
    # Fixed user-facing message; arguments passed to the exception are not shown.
    _MESSAGE = 'Значения y должны быть положительными'

    def __str__(self):
        return self._MESSAGE


class VeryBig(Exception):
    """
    Raised when the free term in the exponential regression is too large and further
    calculations are impossible.
    """
    # Fixed user-facing message; arguments passed to the exception are not shown.
    _MESSAGE = 'Свободный член получился слишком большим, чтобы произвести вычисления'

    def __str__(self):
        return self._MESSAGE
    
class RegularizationError(Exception):
    """
    Raised when one tries to apply regularization to polynomial regression.
    """
    # Fixed user-facing message; arguments passed to the exception are not shown.
    _MESSAGE = 'К сожалению, мы не можем построить полиномиальную регрессию с регулязацией'

    def __str__(self):
        return self._MESSAGE

In [9]:
def check_data(X):
    """
    Verifies that the dataset has enough rows to build an adequate model.
    Parameters
    ----------
    X : pandas.DataFrame or numpy.array
        A dataset of features.
    Raises
    ------
    InsufficientData
        If there are fewer than 10 rows, or, for two-dimensional data with k > 1
        columns, fewer than 2^k rows.
    """
    has_many_features = len(X.shape) == 2 and X.shape[1] > 1
    # The 2^k requirement applies only when there is more than one feature column.
    if has_many_features and X.shape[0] < 2 ** X.shape[1]:
        raise InsufficientData
    # Ten rows are the minimum in every case.
    if len(X) < 10:
        raise InsufficientData


def plot_3d_regression(X, y, coef, a0, reg_type, grid_size=50):
    """
    Plots the surface of the fitted function together with the points of the dataset.
    Works if there are two features.

    The surface is evaluated on a grid_size x grid_size mesh, so memory use does not
    depend on the magnitude of the feature values (a unit-step grid over the data
    range cannot be allocated for large-valued features).
    Parameters
    ----------
    X : pandas.DataFrame
        A dataset of features (columns are accessed through .iloc).
    y : numpy.array or pandas.DataFrame
        A dataset of targets.
    coef : numpy.array
        The coefficients at the features in the resulting function.
    a0: float or numpy.float64
        The free term in the resulting function.
    reg_type : string
        Type of function:
        'lin'   - a0 + coef[0]*x0 + coef[1]*x1 over columns 0 and 1 of X;
        'exp'   - a0 * exp(coef[0]*x0) * exp(coef[1]*x1) over columns 0 and 1 of X;
        'poly1' - a0 + coef[0]*x + coef[1]*z over columns 1 and 2 of X;
        any other value - a0 + coef[0]*x + coef[1]*x^2 over the range of column 1,
        with the second axis showing the squared grid values
        (presumably a second-degree polynomial in one feature - TODO confirm).
    grid_size : int, default=50
        Number of grid points along each axis of the surface.
    """
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    ax.set_xlabel("X0")
    ax.set_ylabel("X1")
    ax.set_zlabel("y")

    # Pick the two plotted columns and the range covered by the surface.
    if reg_type in ('lin', 'exp'):
        first, second = X.iloc[:, 0], X.iloc[:, 1]
        low, high = X.min().min() - 1, X.max().max() + 1
    elif reg_type == 'poly1':
        first, second = X.iloc[:, 1], X.iloc[:, 2]
        low = X.iloc[:, 1:].min().min() - 1
        high = X.iloc[:, 1:].max().max() + 1
    else:
        first, second = X.iloc[:, 1], X.iloc[:, 2]
        low, high = X.iloc[:, 1].min(), X.iloc[:, 1].max()
    ax.scatter(first, second, y, marker='.', color='red')

    # xs varies along columns and ys along rows, on a fixed-resolution mesh.
    axis = np.linspace(low, high, grid_size)
    xs, ys = np.meshgrid(axis, axis)

    if reg_type in ('lin', 'poly1'):
        zs = a0 + coef[0] * xs + coef[1] * ys
    elif reg_type == 'exp':
        zs = a0 * np.exp(coef[0] * xs) * np.exp(coef[1] * ys)
    else:
        # The second axis holds squared values, matching the scattered column 2.
        ys = np.square(ys)
        zs = a0 + coef[0] * xs + coef[1] * xs ** 2

    ax.plot_surface(xs, ys, zs, alpha=0.5)
    plt.show()


def plot_2d_regression(X, y, coef, a0, reg_type):
    """
    Draws the fitted curve together with the points of the dataset.
    Works if there is one feature.
    Parameters
    ----------
    X : pandas.DataFrame
        A dataset of features.
    y : numpy.array or pandas.DataFrame
        A dataset of targets.
    coef : numpy.array
        The coefficients at the features in the resulting function.
    a0: float or numpy.float64
        The free term in the resulting function.
    reg_type : string
        Type of function: 'lin' draws a0 + coef*x, any other value draws
        a0 * exp(coef*x).
    """
    # Evaluate the curve slightly beyond the observed range of the feature.
    grid = np.linspace(X.min() - 1, X.max() + 1)
    if reg_type != 'lin':
        fitted = a0 * np.exp(coef * grid)
    else:
        fitted = a0 + grid * coef

    plt.plot(grid, fitted, color="blue", linewidth=3, label='Прогноз')
    plt.scatter(X, y, marker='.', color='red', label='Исходные')
    plt.legend()
    plt.show()

In [10]:
def _format_linear_function(weights, bias, tol):
    """Builds the analytic form 'b + w1x1 -w2x2 ...' with values rounded to tol decimals."""
    terms = [str(round(bias, tol))]
    for idx, weight in enumerate(weights, start=1):
        text = str(round(weight, tol))
        # Negative values already carry their sign; everything else gets an explicit '+ '.
        prefix = '' if text[0] == '-' else '+ '
        terms.append(prefix + text + 'x' + str(idx))
    return ' '.join(terms)


def lin_regression(X, y, tol = 5, regularization = None, alpha=1.0, draw = False):
    """
    Ordinary least squares linear regression. Fits the model to minimize the residual 
    sum of squares between the observed targets of the data set and the targets predicted 
    by the approximation.
    Parameters
    ----------
    X : pandas.DataFrame
        A dataset of features.
    y : pandas.DataFrame, pandas.Series or numpy.array
        A dataset of targets. A single target is expected: either one-dimensional
        data or a single column.
    tol : int, default= 5
        The number of decimal places to round the coefficient when writing the function 
        in analytic form.
    regularization: string, optional
        Type of regularization: None (no regularization), 'L1' (Lasso) or 'L2' (Ridge).
    alpha : float, default=1.0
        Constant for regularization.
    draw : bool, optional
        Flag for the chart. If the value is True, the graph is drawn. Works only for 
        two- and three-dimensional cases.
    Returns
    -------
    dict
        'func'    - the fitted function in analytic form;
        'weights' - the coefficients at the features;
        'bias'    - the free term;
        'y_pred'  - predictions of the fitted model for X;
        'c'       - the residual sum of squares (a scalar for one-dimensional y,
                    an array with one element per column for two-dimensional y).
    Raises
    ------
    InsufficientData
        If check_data finds too few rows in X.
    ValueError
        If regularization is not None, 'L1' or 'L2'.
    Examples
    --------
    >>> from HW4.regression import *
    >>> import pandas as pd
    >>> import yfinance as yf
    >>> aapl = yf.download('AAPL', '2021-01-01', '2022-01-01')
    >>> aapl = aapl.reset_index(level=0)
    >>> res = lin_regression(aapl[['Open', 'Volume']], aapl['Close'], regularization='L2')
    >>> res['func']
    '0.33756 + 1.00283x1 -0.0x2'
    """
    y_new = np.asarray(y)
    check_data(X)

    if len(X.shape) < 2:
        X = np.asarray(X).reshape(-1, 1)

    if regularization is None:
        model = LinearRegression()
    elif regularization == 'L1':
        model = Lasso(alpha=alpha)
    elif regularization == 'L2':
        model = Ridge(alpha=alpha)
    else:
        raise ValueError(
            "regularization must be None, 'L1' or 'L2', got %r" % (regularization,)
        )

    # Fit on a flat target when y is a single column: the estimators then agree on
    # the shapes of coef_ (one value per feature) and intercept_.
    single_column = y_new.ndim == 2 and y_new.shape[1] == 1
    y_fit = y_new.ravel() if single_column else y_new
    reg = model.fit(X, y_fit)

    # intercept_ is a scalar for a flat target and an array otherwise; handle both.
    weights, bias = reg.coef_, np.ravel(reg.intercept_)[0]

    func = _format_linear_function(weights, bias, tol)

    if draw:
        n_features = X.shape[1]
        if n_features > 2:
            print('К сожалению, мы не можем построить график, так как размерность пространства признаков велика.')
        elif n_features == 2:
            # The 3D plot addresses columns through .iloc, so hand it a DataFrame.
            frame = X if hasattr(X, 'iloc') else pd.DataFrame(X)
            plot_3d_regression(frame, y, weights, bias, reg_type='lin')
        elif n_features == 1:
            plot_2d_regression(X, y, weights, bias, reg_type='lin')

    y_pred = reg.predict(X)
    # Residual sum of squares, reduced over rows so that c keeps one value per
    # target column of y.
    residuals = y_new - y_pred.reshape(y_new.shape)
    c = np.sum(residuals ** 2, axis=0)

    return {'func': func,
            'weights': weights,
            'bias': bias,
            'y_pred': y_pred,
            'c': c}

In [40]:
lin_regression(
    pd.DataFrame(X),
    pd.DataFrame(y),
    regularization='L1',
    alpha=300,
)

{'bias': 1.171269353475509,
 'c': array([8.08334532]),
 'func': '1.17127 + 1.24996x1 + 1.24991x2 + 1.24996x3 + 1.24994x4',
 'weights': array([1.24996048, 1.24991214, 1.24996475, 1.24994204]),
 'y_pred': array([14334.28890627, 24022.5639816 , 29444.91372289, 10132.04419088,
        21631.50449323, 21077.61685164, 40218.17522174, 32072.36204968,
        17447.82257985, 22163.92641522, 20198.9840367 , 23715.19519342,
        31691.24592747, 25707.48883018, 32410.9704841 , 29726.33682446,
        21022.80860228, 25593.92170493, 11584.36257258, 30605.92723848,
        29646.32139152, 15192.93052516, 13649.38517681, 34213.60852457,
        35544.8412241 , 14950.51989907, 14610.3366389 , 30676.28709606,
         9788.22668548, 33278.52593615, 25593.90626203, 35377.00239595,
        22741.34762621, 25883.62591659, 28979.90128225, 21357.56137735,
        30054.97563713, 25565.02030163, 24907.75407142, 39808.28623176,
        22897.61342108, 15157.75589018, 18139.18810213, 15063.06633773,
      