In [10]:
import numpy as np
from source.coordinate_descent import CoordinateDescent
from source.utils.load_data import get_high_dimensional_artificial_ds
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.linear_model import Lasso

# Artificially generated high-dimensional dataset
## Loading Data

In [2]:
X, y = get_high_dimensional_artificial_ds(n_samples=10_000, n_features=5_000, n_informative=1_000)

In [3]:
X.shape

(10000, 5000)

## Splitting data

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

## Standardization

In [5]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train, y_train)
X_test = scaler.fit_transform(X_test, y_test)

## Lasso Regression with our implementation of Coordinate Descent algorithm

In [None]:
l = 2
cd = CoordinateDescent(lambda_=l)
cd.fit(X_train, y_train)
y_predict = cd.predict(X_test)
mse = mean_squared_error(y_test, y_predict)
msa = mean_absolute_error(y_test, y_predict)
print(f'Mean Squared Error: {mse}, Mean Absolute Error: {msa}')

  0%|          | 0/100 [02:33<?, ?it/s]


KeyboardInterrupt: 

## Comparing with sklearn Lasso Linear Regression

In [13]:
lr = Lasso(alpha=l)
lr.fit(X_train, y_train)
y_predict = lr.predict(X_test)
mse = mean_squared_error(y_test, y_predict)
msa = mean_absolute_error(y_test, y_predict)
print(f'Mean Squared Error: {mse}, Mean Absolute Error: {msa}')

Mean Squared Error: 7394.37300329507, Mean Absolute Error: 68.18182996381589


In [17]:
# How many features were indicated to be important by Lasso (coeff > 0.0)
np.count_nonzero(lr.coef_ != 0.0)

1018