## Import modules

In [1]:
import numpy as np
from sklearn.linear_model import SGDRegressor, LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error
import os
import time

## Utils

In [2]:
def load_matrix(path: str, verbose: bool = True, num_to_print: int = 3) -> np.ndarray:
    """Read a comma-separated numeric file into a NumPy array.

    Args:
        path: Location of the CSV file to parse.
        verbose: When true, print the file name, the array shape and a short
            preview of the loaded values.
        num_to_print: Number of leading entries included in the preview.

    Returns:
        The array produced by ``np.loadtxt`` for the given file.
    """
    data = np.loadtxt(path, delimiter=",")
    if not verbose:
        return data

    file_name = os.path.basename(path)
    print("Loaded matrix from", file_name)
    print("Shape:", data.shape)
    # Show only the leading entries so large files do not flood the output.
    preview = data[:num_to_print]
    print("Example:\n", preview, end="\n\n")
    return data

In [3]:
# Move the working directory two levels up so that the relative paths used in
# the cells below (demo/, data/, build/, examples/, output/) resolve.
# NOTE(review): the target is relative, so re-running this cell without
# restarting the kernel climbs two more levels — run it once per session.
%cd ../../

/home/shkarupa/Documents/Semester6/Coursework/parallel-gradient-descent


## Generate data

In [4]:
!python demo/gen_data.py --help

Generate train and test data for linear regression model

Tool arguments:
  -h, --help          show this help message and exit
  -t , --n-train      number of train samples (default: 1000)
  -e , --n-eval       number of evaluation samples (default: 100)
  -f , --n-features   number of per sample features (default: 10)
  -s , --seed         seed that make data generation deterministic (default:
                      None)
  -o , --out-dir      path to directory where generated data will be stored
                      (default: ./data)


In [5]:
# Generate the dataset with a fixed seed (42): 10000 training samples and
# 1000 evaluation samples with 50 features each, stored in the data directory.
!python demo/gen_data.py --n-train 10000 --n-eval 1000 --n-features 50 --seed 42 --out-dir data

## Look at data

In [6]:
!ls data

x_eval.csv  x_train.csv  y_eval.csv  y_train.csv


In [7]:
# Training split: feature matrix first, then its targets (same load order as
# the files are reported in the output below).
x_train, y_train = load_matrix("data/x_train.csv"), load_matrix("data/y_train.csv")

# Evaluation split used to score every model further down.
x_eval, y_eval = load_matrix("data/x_eval.csv"), load_matrix("data/y_eval.csv")

Loaded matrix from x_train.csv
Shape: (10000, 50)
Example:
 [[ 0.00837893 -0.10885068  1.56037392 -0.12066917  0.72925308 -1.64875532
  -2.46482282 -1.6193807  -1.92134435 -1.01536413  0.00732488 -0.2737064
  -0.41135527 -1.01846696  0.06612678 -0.89081492  0.1507947  -0.69712409
   1.69730467 -0.5842055  -1.08315327  0.55754335  0.94753562 -0.48650709
   0.46183739  0.41235071  1.13286166  0.49435865 -1.37962067  1.27530329
  -0.21265261  0.90224016  0.74432705  0.44751107  1.41371174 -1.55982878
  -0.72163126  0.11735925  0.51293586  0.69831565 -0.6681979  -0.63091615
  -0.06307134 -0.91857534 -0.16397104 -1.50636156  0.29101263  1.05506763
  -0.26739796  2.13032334]
 [ 0.63357274  0.86563418 -1.74369451  0.27489301  0.68030289  0.59951124
  -0.3265366   1.192769   -1.31840945 -0.61239986 -0.18750388 -0.5810673
   1.32719122  0.48778602  0.74382963  1.24113432 -1.40967929 -1.17732104
  -1.75110861 -0.74742425  0.08530452 -0.96818675  0.56041463 -0.42524835
   0.05463944 -0.13710052  

## Run linear regression

In [8]:
!./build/bin/gradient_descent -h

Linear regression config:

CLI options:
  -h [ --help ]                        produce help message
  -c [ --config ] arg                  path to config file

Algorithm options:
  -i [ --input-path ] arg              path to input CSV file
  -t [ --target-path ] arg             path to target CSV file
  -e [ --eval-path ] arg               path to evaluation CSV file
  -o [ --out-path ] arg (=output.csv)  path to output CSV file
  -p [ --parallel ]                    wether to use parallel or serial SGD
  -n [ --num-epochs ] arg (=1000)      number of training epochs
  -l [ --lr ] arg (=0.001)             learning rate
  -w [ --weight-decay ] arg (=0.01)    L2 regularization lambda term
  --normalize                          wether to normalize input
  --num-threads arg (=11)              number of threads to use for parallel 
                                       SGD
  --num-step-epochs arg (=1)           number of epochs to compute in each 
                                       th

### Serial SGD

In [9]:
!cat examples/serial_config.cfg

input-path = data/x_train.csv
target-path = data/y_train.csv
eval-path = data/x_eval.csv
out-path = output/serial_pred.csv
parallel = false
num-epochs = 10000
lr = 0.001
weight-decay = 0.01
normalize = true


In [10]:
# Train with the serial configuration printed above; the binary prints a
# timing line for the run.
!./build/bin/gradient_descent --config examples/serial_config.cfg

SGD: 1867 ms


In [11]:
# Predictions written by the serial run (the out-path entry of
# examples/serial_config.cfg); loaded quietly, without the preview printout.
y_pred_sgd = load_matrix("output/serial_pred.csv", verbose=False)

In [12]:
# scikit-learn's signature is mean_squared_error(y_true, y_pred): the ground
# truth goes first. squeeze() drops any singleton axis from the predictions so
# they line up with the evaluation targets.
print("MSE:", mean_squared_error(y_eval, y_pred_sgd.squeeze()))

MSE: 3.643120203281636


### Parallel SGD

In [13]:
!cat examples/parallel_config.cfg

input-path = data/x_train.csv
target-path = data/y_train.csv
eval-path = data/x_eval.csv
out-path = output/parallel_pred.csv
parallel = true
num-epochs = 10000
lr = 0.001
weight-decay = 0.01
normalize = true
num-threads = 11
num-step-epochs = 100


In [14]:
# Train with the parallel configuration printed above (11 threads, 100 epochs
# per step); the binary prints a timing line for the run.
!./build/bin/gradient_descent --config examples/parallel_config.cfg

SGD: 500 ms


In [15]:
# Predictions written by the parallel run (the out-path entry of
# examples/parallel_config.cfg); loaded quietly, without the preview printout.
y_pred_parallel_sgd = load_matrix("output/parallel_pred.csv", verbose=False)

In [16]:
# scikit-learn's signature is mean_squared_error(y_true, y_pred): the ground
# truth goes first. squeeze() mirrors the serial-SGD evaluation so both runs
# are scored on identically shaped predictions.
print("MSE:", mean_squared_error(y_eval, y_pred_parallel_sgd.squeeze()))

MSE: 3.692160818550913


### Sklearn

In [17]:
# Reference model: standardize the features, then fit scikit-learn's
# LinearRegression on them.
regressor = make_pipeline(StandardScaler(), LinearRegression())
# Disabled alternative: scikit-learn's SGD regressor with a constant learning
# rate; alpha and eta0 match the weight-decay and lr values of the example
# configs shown earlier.
# regressor = make_pipeline(
#     StandardScaler(), 
#     SGDRegressor(alpha=0.01, learning_rate="constant", eta0=0.001),
# )

In [18]:
# Fit the reference pipeline; squeeze() drops any singleton axis from the
# training targets before they are handed to scikit-learn.
# The disabled lines are optional wall-clock timing, printed in the same
# "SGD: ... ms" form as the gradient_descent binary's timing line.
# start = time.perf_counter_ns()
regressor.fit(x_train, y_train.squeeze())
# end = time.perf_counter_ns()
# print("SGD:", (end - start) / 1e6, "ms")

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('linearregression', LinearRegression())])

In [19]:
# Score the evaluation features with the fitted scikit-learn pipeline.
y_pred = regressor.predict(x_eval)

In [20]:
# scikit-learn's signature is mean_squared_error(y_true, y_pred): the ground
# truth goes first, the model output second.
print("MSE:", mean_squared_error(y_eval, y_pred))

MSE: 3.6441656695476525
