# Coordinate descent
The cuML library implements the lasso and elastic net algorithms. The lasso model extends LinearRegression with L1 regularization, and elastic net extends LinearRegression with a combination of L1 and L2 regularization.
We see a tremendous speed-up for datasets with a large number of rows and a small number of columns. Furthermore, the MSE value for the cuML implementation is much smaller than that of the scikit-learn implementation for very small datasets.

In [None]:
# Select a particular GPU to run the notebook  (if needed)
# %env CUDA_VISIBLE_DEVICES=2
# Import the required libraries
import numpy as np
import pandas as pd
import cudf
import os
from cuml import Lasso as cuLasso
from sklearn.linear_model import Lasso
from sklearn.datasets import make_regression
from sklearn.metrics import mean_squared_error
from cuml.linear_model import ElasticNet as cuElasticNet
from sklearn.linear_model import ElasticNet

# Helper Functions

In [None]:
# Check if the mortgage dataset is present and then extract the data from it, else just create a random dataset for regression 
import gzip
def _split_frames(values, train_rows):
    """Turn a 2-D array into (train, test) DataFrames with columns fea0, fea1, ..."""
    train = pd.DataFrame({'fea%d'%i:values[0:train_rows,i] for i in range(values.shape[1])})
    test = pd.DataFrame({'fea%d'%i:values[train_rows:,i] for i in range(values.shape[1])})
    return train, test


def load_data(nrows, ncols, cached = 'data/mortgage.npy.gz'):
    """Return an 80:20 train/test regression dataset as pandas DataFrames.

    If ``cached`` exists, it is read as a gzip-compressed ``.npy`` matrix:
    column index 4 becomes the label, the remaining columns are the features,
    ``nrows`` rows are drawn at random (with replacement) and the first
    ``ncols`` feature columns are kept. Otherwise a synthetic dataset is
    generated with ``make_regression`` (fixed ``random_state=0``).

    Parameters
    ----------
    nrows : int
        Total number of samples to return (train + test).
    ncols : int
        Number of feature columns to keep / generate.
    cached : str
        Path of the gzip-compressed numpy file to look for.

    Returns
    -------
    (df_X_train, df_X_test, df_y_train, df_y_test)
        Features and labels; the first ``int(nrows*0.8)`` rows form the
        training split. Columns are named ``fea0``, ``fea1``, ...
    """
    # Split the dataset in a 80:20 split
    train_rows = int(nrows*0.8)
    if os.path.exists(cached):
        print('use mortgage data')

        with gzip.open(cached) as f:
            data = np.load(f)
        # Column index 4 is 'adj_remaining_months_to_maturity', used as the label.
        label_col = 4
        # Take the label BEFORE dropping it from the features; slicing after
        # the drop would pick up the neighbouring column instead.
        y = data[:,label_col:label_col+1]
        X = data[:,[i for i in range(data.shape[1]) if i!=label_col]]
        # randint's upper bound is exclusive, so use the full row count to
        # make every row (including the last one) eligible for sampling.
        rindices = np.random.randint(0,X.shape[0],nrows)
        X = X[rindices,:ncols]
        y = y[rindices]
    else:
        print('use random data')
        X,y = make_regression(n_samples=nrows,n_features=ncols,n_informative=ncols, random_state=0)
        # make_regression returns a 1-D target; make it a single column so
        # both branches share the same frame-building code.
        y = y.reshape(-1,1)

    df_X_train, df_X_test = _split_frames(X, train_rows)
    df_y_train, df_y_test = _split_frames(y, train_rows)

    return df_X_train, df_X_test, df_y_train, df_y_test

# Obtain and convert the dataset

In [None]:
%%time
# Problem size: nrows samples, each described by ncols features
nrows, ncols = 2**21, 500

# load_data hands back features and labels already split 80:20 into
# training and testing sets
X_train, X_test, y_train, y_test = load_data(nrows,ncols)

# Report the shape of every split
for caption, frame in (
    ('training data', X_train),
    ('training label', y_train),
    ('testing data', X_test),
    ('testing label', y_test),
    ('label', y_test),
):
    print(caption, frame.shape)

In [None]:
%%time
# Convert the pandas feature frames to cudf DataFrames
X_cudf = cudf.DataFrame.from_pandas(X_train)
X_cudf_test = cudf.DataFrame.from_pandas(X_test)
# The training label is passed as a cudf Series built from the first
# column of the label frame
y_cudf = cudf.Series(y_train.values[:,0])

### Define the model parameters

In [None]:
# lr   : value handed to the models as `alpha` (the regularization constant)
# algo : value handed to the models as `selection` ('cyclic' or 'random')
lr, algo = 0.001, 'cyclic'

### Lasso
The lasso model implemented in cuml allows the user to change the following parameter values:
1. alpha: regularizing constant that is multiplied with L1 to control the extent of regularization. (default = 1)
2. normalize: variable decides if the predictors in X will be normalized or not. (default = False)
3. fit_intercept: if set to True the model tries to center the data. (default = True)
4. max_iter: maximum number of iterations for training (fitting) the data to the model. (default = 1000)
5. tol: the tolerance for optimization. (default = 1e-3)
6. selection: the coordinate-selection algorithm; the user can set it to 'cyclic' or 'random'

The model accepts only numpy arrays or cudf dataframes as the input. In order to convert your dataset to cudf format please read the cudf documentation on https://rapidsai.github.io/projects/cudf/en/latest/. For additional information on the lasso model please refer to the documentation on https://rapidsai.github.io/projects/cuml/en/latest/index.html

#### Scikit-learn model for lasso

In [None]:
%%time
# Fit scikit-learn's Lasso on the pandas training split
skols = Lasso(
    alpha=np.array([lr]),
    selection=algo,
    fit_intercept=True,
    normalize=False,
    max_iter=1000,
    tol=1e-10,
)
skols.fit(X_train, y_train)

In [None]:
%%time
# Predict on the testing split with the fitted sklearn lasso model and
# score the predictions with the mean squared error
sk_predict = skols.predict(X_test)
error_sk = mean_squared_error(y_true=y_test, y_pred=sk_predict)

#### CuML model for lasso

In [None]:
%%time
# Fit the cuml Lasso model on the cudf training data, using the same
# settings as the sklearn model
cuols = cuLasso(
    alpha=np.array([lr]),
    selection=algo,
    fit_intercept=True,
    normalize=False,
    max_iter=1000,
    tol=1e-10,
)
cuols.fit(X_cudf, y_cudf)

In [None]:
%%time
# Predict on the cudf testing data with the fitted cuml lasso model,
# convert the result to an array and compute the mean squared error
cu_predict = cuols.predict(X_cudf_test).to_array()
error_cu = mean_squared_error(y_true=y_test, y_pred=cu_predict)

In [None]:
# Show the lasso mean squared errors of sklearn and cuml for comparison
# (label on one line, value on the next)
print("SKL MSE(y):", error_sk, sep="\n")
print("CUML MSE(y):", error_cu, sep="\n")

### Elastic Net
The elastic net model implemented in cuml contains the same parameters as the lasso model.
In addition to the parameters that can be altered in lasso, elastic net has another parameter whose value can be changed:
- l1_ratio: decides the ratio of L1 to L2 regularization applied to the model. When l1_ratio = 0, only L2 regularization is applied to the model. (default = 0.5)

The model accepts only numpy arrays or cudf dataframes as the input. In order to convert your dataset to cudf format please read the cudf documentation on https://rapidsai.github.io/projects/cudf/en/latest/. For additional information on the elastic net model please refer to the documentation on https://rapidsai.github.io/projects/cuml/en/latest/index.html

#### Scikit-learn model for elastic net

In [None]:
%%time
# Fit scikit-learn's ElasticNet on the pandas training split
# (l1_ratio is not passed, so the library default applies)
elastic_sk = ElasticNet(
    alpha=np.array([lr]),
    selection=algo,
    fit_intercept=True,
    normalize=False,
    max_iter=1000,
    tol=1e-10,
)
elastic_sk.fit(X_train, y_train)

In [None]:
%%time
# Predict on the testing split with the fitted sklearn elastic net model
# and score the predictions with the mean squared error
sk_predict_elas = elastic_sk.predict(X_test)
error_sk_elas = mean_squared_error(y_true=y_test, y_pred=sk_predict_elas)

#### CuML model for elastic net

In [None]:
%%time
# Fit the cuml ElasticNet model on the cudf training data, using the same
# settings as the sklearn model
elastic_cu = cuElasticNet(
    alpha=np.array([lr]),
    selection=algo,
    fit_intercept=True,
    normalize=False,
    max_iter=1000,
    tol=1e-10,
)
elastic_cu.fit(X_cudf, y_cudf)

In [None]:
%%time
# Predict on the cudf testing data with the fitted cuml elastic net model,
# convert the result to an array and compute the mean squared error
cu_predict_elas = elastic_cu.predict(X_cudf_test).to_array()
error_cu_elas = mean_squared_error(y_true=y_test, y_pred=cu_predict_elas)

In [None]:
# Show the elastic net mean squared errors of sklearn and cuml for
# comparison (label on one line, value on the next)
print("SKL MSE(y):", error_sk_elas, sep="\n")
print("CUML MSE(y):", error_cu_elas, sep="\n")