# HSE 2022: Mathematical Methods for Data Analysis

## Homework 2

In [518]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn import datasets
from sklearn.datasets import load_boston
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_squared_error, r2_score
import statsmodels.api as sm
from statsmodels.regression.linear_model import OLSResults
from math import sqrt
import random
import sys

import warnings
warnings.filterwarnings("ignore")

%matplotlib inline

sns.set(style="darkgrid")

### Data

For this homework we use Dataset from seaborn on diamonds prices.

In [519]:
data = sns.load_dataset('diamonds')

y = data.price
x = data.drop(['price'], axis=1)
columns = data.drop(['price'], axis=1).columns

## Linear regression

#### 0. [0.25 points] Encode categorical variables.

In [520]:
data_ohe = pd.get_dummies(data, drop_first=True)
x = data_ohe.drop('price', axis=1)
y = data_ohe['price']

#### 1. [0.25 points] Split the data into train and test sets with ratio 80:20 with random_state=17.

In [521]:
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=17, test_size=0.2)

#### 2. [1 point] Train models on train data using StatsModels library and apply it to the test set; use $RMSE$ and $R^2$ as the quality measure.

* [`LinearRegression`](http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html);
* [`Ridge`](http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html) with $\alpha = 0.01$;
* [`Lasso`](http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Lasso.html) with $\alpha = 0.01$
* [`ElasticNet`](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.ElasticNet.html) with $\alpha = 0.01$, $l_{1}$_$ratio = 0.6$

Don't forget to scale the data before training the models with StandardScaler!

In [522]:
scaler = StandardScaler()
columns_to_scalar = ['carat', 'depth', 'table', 'x', 'y', 'z']
scaler.fit(x_train[columns_to_scalar])
x_train[columns_to_scalar] = scaler.transform(x_train[columns_to_scalar])
x_test[columns_to_scalar] = scaler.transform(x_test[columns_to_scalar])

In [523]:
features_train_constant = sm.add_constant(x_train)
features_test_constant = sm.add_constant(x_test)

In [524]:
model_ols = sm.OLS(y_train, features_train_constant)
linear_for_summary = model_ols.fit()
predicted_linear = linear_for_summary.predict(features_test_constant)
mse = mean_squared_error(y_test, predicted_linear)
sqrt_mse = mse ** .5
r2 = r2_score(y_test, predicted_linear)
print("MSE =", mse)
print("RMSE =", sqrt_mse)
print("R2 = ", r2)

MSE = 1322478.130934603
RMSE = 1149.990491671389
R2 =  0.9183333802164863


In [525]:
ridge_model = model_ols.fit_regularized(L1_wt=0, alpha=0.01)
ridge_for_summary = sm.regression.linear_model.OLSResults(model_ols, ridge_model.params,
                                                          model_ols.normalized_cov_params)
predicted_ridge = ridge_model.predict(features_test_constant)
mse = mean_squared_error(y_true=y_test, y_pred=predicted_ridge)
sqrt_mse = mse ** .5
r2 = r2_score(y_test, predicted_ridge)
print("MSE =", mse)
print("RMSE =", sqrt_mse)
print('R2 = ', r2)

MSE = 1499815.5963935482
RMSE = 1224.6695866206314
R2 =  0.9073823096269312


In [526]:
lasso_model = model_ols.fit_regularized(L1_wt=1, alpha=0.01)
lasso_for_summary = sm.regression.linear_model.OLSResults(model_ols, lasso_model.params,
                                                          model_ols.normalized_cov_params)
predicted_lasso = ridge_model.predict(features_test_constant)
mse = mean_squared_error(y_test, predicted_lasso)
sqrt_mse = mse ** .5
r2 = r2_score(y_test, predicted_lasso)
print("MSE =", mse)
print("RMSE =", sqrt_mse)
print('R2 = ', r2)

MSE = 1499815.5963935482
RMSE = 1224.6695866206314
R2 =  0.9073823096269312


In [527]:
elastic_model = model_ols.fit_regularized(L1_wt=0.6, alpha=0.01)
elastic_for_summary = sm.regression.linear_model.OLSResults(model_ols, elastic_model.params,
                                                            model_ols.normalized_cov_params)
predicted_elastic = elastic_model.predict(features_test_constant)
mse = mean_squared_error(y_test, predicted_elastic)
sqrt_mse = mse ** .5
r2 = r2_score(y_test, predicted_elastic)
print("MSE =", mse)
print("RMSE =", sqrt_mse)
print('R2 = ', r2)

MSE = 1385150.6626689616
RMSE = 1176.9242382876484
R2 =  0.9144631810046453


#### 3. [1 point] Explore the values of the parameters of the resulting models and compare the number of zero weights in them. Comment on the significance of the coefficients, overal model significance and other related factors from the results table

In [528]:
display(linear_for_summary.summary())
print(linear_for_summary.pvalues[linear_for_summary.pvalues > 0.05].index.values)

0,1,2,3
Dep. Variable:,price,R-squared:,0.92
Model:,OLS,Adj. R-squared:,0.92
Method:,Least Squares,F-statistic:,21600.0
Date:,"Sun, 16 Oct 2022",Prob (F-statistic):,0.0
Time:,23:49:23,Log-Likelihood:,-364390.0
No. Observations:,43152,AIC:,728800.0
Df Residuals:,43128,BIC:,729000.0
Df Model:,23,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5998.1755,34.906,171.839,0.000,5929.759,6066.592
carat,5386.7101,26.223,205.422,0.000,5335.313,5438.107
depth,-97.9285,7.156,-13.685,0.000,-111.954,-83.903
table,-61.6945,7.248,-8.512,0.000,-75.900,-47.489
x,-1186.2280,40.719,-29.132,0.000,-1266.038,-1106.418
y,2.8156,24.711,0.114,0.909,-45.618,51.249
z,-31.7705,24.767,-1.283,0.200,-80.314,16.773
cut_Premium,-75.6406,16.251,-4.654,0.000,-107.493,-43.788
cut_Very Good,-97.8485,15.842,-6.177,0.000,-128.899,-66.798

0,1,2,3
Omnibus:,11910.55,Durbin-Watson:,2.0
Prob(Omnibus):,0.0,Jarque-Bera (JB):,377007.939
Skew:,0.693,Prob(JB):,0.0
Kurtosis:,17.414,Cond. No.,32.8


['y' 'z']


На уровне значимости 0.05 есть два признака, которые имеют нулевые веса. Это признаки ['y'], ['z'].

In [529]:
display(ridge_for_summary.summary())
print(features_train_constant.columns[ridge_for_summary.pvalues > 0.05].values)

0,1,2,3
Dep. Variable:,price,R-squared:,0.907
Model:,OLS,Adj. R-squared:,0.907
Method:,Least Squares,F-statistic:,18380.0
Date:,"Sun, 16 Oct 2022",Prob (F-statistic):,0.0
Time:,23:49:23,Log-Likelihood:,-367580.0
No. Observations:,43152,AIC:,735200.0
Df Residuals:,43128,BIC:,735400.0
Df Model:,23,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,3943.1458,37.585,104.914,0.000,3869.479,4016.812
carat,4297.5148,28.235,152.205,0.000,4242.173,4352.856
depth,-104.5765,7.705,-13.573,0.000,-119.678,-89.475
table,-130.3229,7.804,-16.700,0.000,-145.619,-115.027
x,-258.1331,43.844,-5.888,0.000,-344.068,-172.198
y,15.4047,26.607,0.579,0.563,-36.746,67.556
z,-42.8857,26.668,-1.608,0.108,-95.155,9.384
cut_Premium,43.2635,17.498,2.472,0.013,8.967,77.561
cut_Very Good,47.3255,17.058,2.774,0.006,13.892,80.759

0,1,2,3
Omnibus:,15835.711,Durbin-Watson:,2.002
Prob(Omnibus):,0.0,Jarque-Bera (JB):,297019.622
Skew:,1.29,Prob(JB):,0.0
Kurtosis:,15.591,Cond. No.,32.8


['y' 'z']


На уровне значимости 0.05 есть два признака, которые имеют нулевые веса. Это признаки ['y'], ['z'].


In [530]:
display(lasso_for_summary.summary())
print(features_train_constant.columns[lasso_for_summary.pvalues > 0.05].values)

0,1,2,3
Dep. Variable:,price,R-squared:,0.92
Model:,OLS,Adj. R-squared:,0.92
Method:,Least Squares,F-statistic:,21470.0
Date:,"Sun, 16 Oct 2022",Prob (F-statistic):,0.0
Time:,23:49:23,Log-Likelihood:,-364520.0
No. Observations:,43152,AIC:,729100.0
Df Residuals:,43128,BIC:,729300.0
Df Model:,23,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5516.7760,35.010,157.578,0.000,5448.156,5585.396
carat,5290.9319,26.301,201.170,0.000,5239.382,5342.482
depth,-98.7304,7.177,-13.756,0.000,-112.798,-84.663
table,-64.5031,7.269,-8.873,0.000,-78.751,-50.255
x,-1128.8524,40.840,-27.641,0.000,-1208.900,-1048.805
y,15.2731,24.784,0.616,0.538,-33.305,63.851
z,-22.4373,24.841,-0.903,0.366,-71.126,26.251
cut_Premium,-89.3686,16.300,-5.483,0.000,-121.316,-57.421
cut_Very Good,-108.6368,15.889,-6.837,0.000,-139.780,-77.494

0,1,2,3
Omnibus:,12978.589,Durbin-Watson:,1.999
Prob(Omnibus):,0.0,Jarque-Bera (JB):,376787.537
Skew:,0.843,Prob(JB):,0.0
Kurtosis:,17.378,Cond. No.,32.8


['y' 'z' 'clarity_VVS2']


На уровне значимости 0.05 есть три признака, которые имеют нулевые веса. Это признаки ['y'], ['z'], ['claruty_VVS2'].

In [531]:
display(elastic_for_summary.summary())
print(features_train_constant.columns[elastic_for_summary.pvalues > 0.05].values)

0,1,2,3
Dep. Variable:,price,R-squared:,0.915
Model:,OLS,Adj. R-squared:,0.915
Method:,Least Squares,F-statistic:,20220.0
Date:,"Sun, 16 Oct 2022",Prob (F-statistic):,0.0
Time:,23:49:23,Log-Likelihood:,-365710.0
No. Observations:,43152,AIC:,731500.0
Df Residuals:,43128,BIC:,731700.0
Df Model:,23,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4559.9255,35.984,126.720,0.000,4489.396,4630.455
carat,4824.8898,27.033,178.483,0.000,4771.905,4877.875
depth,-104.5162,7.377,-14.168,0.000,-118.975,-90.058
table,-99.3859,7.472,-13.302,0.000,-114.030,-84.741
x,-697.7107,41.977,-16.621,0.000,-779.986,-615.435
y,-4.5791,25.474,-0.180,0.857,-54.509,45.351
z,-43.1341,25.532,-1.689,0.091,-93.178,6.909
cut_Premium,-27.3443,16.753,-1.632,0.103,-60.181,5.492
cut_Very Good,-31.8043,16.331,-1.947,0.051,-63.814,0.205

0,1,2,3
Omnibus:,14825.528,Durbin-Watson:,2.002
Prob(Omnibus):,0.0,Jarque-Bera (JB):,377176.717
Skew:,1.088,Prob(JB):,0.0
Kurtosis:,17.319,Cond. No.,32.8


['y' 'z' 'cut_Premium' 'cut_Very Good' 'color_F' 'clarity_VS2']


На уровне значимости 0.05 есть шесть признака, которые имеют нулевые веса. Это признаки ['y'], ['z'], ['cut_Premium'], ['cut_Very Good'], ['color_F'], ['clarity_VS2'].


#### 4. [1 point] Implement one of the elimination algorithms that were described in the Seminar_4 (Elimination by P-value, Forward elimination, Backward elimination), make conclusions.

In [532]:
class GroupOfOSLModels(Exception):
    pass

In [533]:
from pandas import DataFrame


def eliminate_pval(model_ols, alpha):
    if not isinstance(model_ols, sm.OLS):
        raise GroupOfOSLModels('Only OLS eliminate')

    x, y = crutch(model_ols)
    iterations = model_ols.exog.shape[1]

    x = columns_to_drop(alpha, iterations, x, y)

    return sm.OLS(y, x)


def columns_to_drop(alpha, iterations, x, y):
    for i in range(iterations):
        res = sm.OLS(y, x).fit()
        max_pvalue = res.pvalues.max()
        if max_pvalue < alpha:
            break
        columns_dropping = res.pvalues.index.values[res.pvalues.argmax()]
        x = x.drop(labels=columns_dropping, axis=1, level=0)
    return x


def crutch(model):
    x = DataFrame(data=model.exog, columns=[model.exog_names])
    y = DataFrame(data=model.endog, columns=[model.endog_names])
    return x, y

In [534]:
threshold = 0.05
elimination_model = eliminate_pval(model_ols, threshold)
elim_results = elimination_model.fit()
elim_results.summary2()

0,1,2,3
Model:,OLS,Adj. R-squared:,0.92
Dependent Variable:,price,AIC:,728832.0274
Date:,2022-10-16 23:49,BIC:,729022.8221
No. Observations:,43152,Log-Likelihood:,-364390.0
Df Model:,21,F-statistic:,23660.0
Df Residuals:,43130,Prob (F-statistic):,0.0
R-squared:,0.920,Scale:,1266200.0

0,1,2,3,4,5,6
,Coef.,Std.Err.,t,P>|t|,[0.025,0.975]
const,5997.9955,34.9049,171.8381,0.0000,5929.5812,6066.4098
carat,5386.4129,26.1997,205.5905,0.0000,5335.0609,5437.7648
depth,-101.6578,6.5305,-15.5665,0.0000,-114.4577,-88.8578
table,-61.5921,7.2458,-8.5004,0.0000,-75.7940,-47.3901
x,-1214.1301,26.2216,-46.3026,0.0000,-1265.5250,-1162.7352
cut_Premium,-75.2719,16.2344,-4.6366,0.0000,-107.0915,-43.4522
cut_Very Good,-98.3208,15.8288,-6.2115,0.0000,-129.3456,-67.2961
cut_Good,-250.7595,22.4646,-11.1624,0.0000,-294.7906,-206.7284
cut_Fair,-824.8827,36.9688,-22.3129,0.0000,-897.3423,-752.4231

0,1,2,3
Omnibus:,11910.897,Durbin-Watson:,2.0
Prob(Omnibus):,0.0,Jarque-Bera (JB):,377224.072
Skew:,0.693,Prob(JB):,0.0
Kurtosis:,17.418,Condition No.:,24.0


Сработало! Порогов выше 0.05 больше нет!

#### 5. [1 point] Find the best (in terms of RMSE) $\alpha$ for Lasso regression using cross-validation with 4 folds. You must select values from range $[10^{-4}, 10^{3}]$.

In [535]:
fold = 4
alphas = np.logspace(-4, 3, 1000)
search = GridSearchCV(Ridge(), [{"alpha": alphas}], scoring="neg_mean_squared_error", cv=fold)
search.fit(features_train_constant, y_train)
print("Best alpha = ", search.best_params_['alpha'])

Best alpha =  13.681576279674704


## Gradient descent

#### 6. [3.5 points] Implement a Ridge regression model for the MSE loss function, trained by gradient descent.

All calculations must be vectorized, and python loops can only be used for gradient descent iterations. As a stop criterion, you must use (simultaneously):

* checking for the Absolute-value norm of the weight difference on two adjacent iterations (for example, less than some small number of the order of $10^{-6}$, set by the `tolerance` parameter);
* reaching the maximum number of iterations (for example, 10000, set by the `max_iter` parameter).

You need to implement:

* Full gradient descent:

$$
w_{k + 1} = w_{k} - \eta_{k} \nabla_{w} Q(w_{k}).
$$

* Stochastic Gradient Descent:

$$
w_{k + 1} = w_{k} - \eta_{k} \nabla_{w} q_{i_{k}}(w_{k}).
$$

$\nabla_{w} q_{i_{k}}(w_{k}) \, $ is the estimate of the gradient over the batch of objects selected randomly.

* Momentum method:

$$
h_0 = 0, \\
h_{k + 1} = \alpha h_{k} + \eta_k \nabla_{w} Q(w_{k}), \\
w_{k + 1} = w_{k} - h_{k + 1}.
$$

* Adagrad method:

$$
G_0 = 0, \\
G_{k + 1} = G_{k} + (\nabla_{w} Q(w_{k+1}))^2, \\
w_{k + 1} = w_{k} - \eta * \frac{\nabla_{w} Q(w_{k+1})}{\sqrt{G_{k+1} + \epsilon}}.
$$



To make sure that the optimization process really converges, we will use the `loss_history` class attribute. After calling the `fit` method, it should contain the values of the loss function for all iterations, starting from the first one (before the first step on the anti-gradient).

You need to initialize the weights with a random vector from normal distribution. The following is a template class that needs to contain the code implementing all variations of the models.

In [536]:
from sklearn.base import BaseEstimator


class LinReg(BaseEstimator):
    def __init__(self, delta=1.0, gd_type='Momentum',
                 tolerance=1e-4, max_iter=1000, w0=None, eta=1e-2, alpha=1e-3):
        self.gd_type = gd_type
        self.delta = delta
        self.tolerance = tolerance
        self.max_iter = max_iter
        self.w0 = w0
        self.alpha = alpha
        self.w = None
        self.eta = eta
        self.loss_history = None

    support = ['GradientDescent', 'StochasticDescent', 'Momentum']

    def fit(self, X, y):
        w_tmp, h_tmp = self.filling_missed_values(X)

        for i in range(0, self.max_iter):
            if self.gd_type == self.support[0]:
                self.gradient(X, y)
            elif self.gd_type == self.support[1]:
                self.stoch(X, y)
            else:
                self.momentum(X, h_tmp, y)
            self.loss_history.append(self.calc_loss(X, y))
            if np.linalg.norm(self.w - w_tmp) < self.tolerance:
                break
            w_tmp = np.array(self.w)

        return self

    def momentum(self, X, h, y):
        indexes = np.random.choice(X.shape[0], int(X.shape[0] * self.delta))
        h = self.alpha * h + self.eta * self.calc_gradient(np.take(X, indexes, axis=0), np.take(y, indexes))
        self.w -= h

    def stoch(self, X, y):
        indexes = np.random.choice(X.shape[0], int(X.shape[0] * self.delta))
        self.gradient(np.take(X, indexes, axis=0), np.take(y, indexes))

    def gradient(self, X, y):
        self.w -= self.eta * self.calc_gradient(X, y)

    def filling_missed_values(self, X):
        if self.gd_type not in self.support:
            raise GroupOfOSLModels('Only OLS model')
        np.random.seed(0)
        self.loss_history = []
        if self.w0 is None:
            self.w0 = np.zeros(X.shape[1])
        self.w = np.array(self.w0)
        cur_w = np.array(self.w)
        h = np.zeros(X.shape[1])
        return cur_w, h

    def predict(self, X):
        if self.w is None:
            raise Exception('Not trained yet')
        return self.deep_calc_grad(X)

    def calc_gradient(self, X, y):
        return 2 * np.dot(X.T, self.deep_calc_grad(X) - y) / y.shape[0]

    def deep_calc_grad(self, X):
        return np.dot(X, self.w)

    def calc_loss(self, X, y):

        return np.mean(self.deep_calc(X, y))

    def deep_calc(self, X, y):
        return (self.predict(X) - y) ** 2

#### 7. [1 points] Train and validate "hand-written" models on the same data, and compare the quality with the Sklearn or StatsModels methods. Investigate the effect of the `max_iter` and `alpha` parameters on the optimization process. Is it consistent with your expectations?

#### 8. [1 points] Plot graphs (on the same picture) of the dependence of the loss function value on the iteration number for Full GD, SGD, Momentum and Adagrad. Draw conclusions about the rate of convergence of various modifications of gradient descent.

Don't forget about what *beautiful* graphics should look like!

In [537]:
# your code here 
#╰( ͡° ͜ʖ ͡° )つ──☆*:・ﾟ