# Imports 

In [1]:
import numpy as np
import pandas as pd
from matplotlib.pyplot import subplots
from sklearn.linear_model import LinearRegression, LogisticRegression, Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from ISLP import load_data
from ISLP.models import ModelSpec as MS
from sklearn.model_selection import train_test_split, GridSearchCV

In [3]:
# torch specific imports
import torch
from torch import nn
from torch.optim import RMSprop
from torch.utils.data import TensorDataset

# utils
from torchmetrics import MeanAbsoluteError, R2Score
from torchinfo import summary
from torchvision.io import read_image

In [4]:
# pytorch lightning is more highlevel and allows us to skip boilerplate code
from pytorch_lightning import Trainer
from pytorch_lightning.loggers import CSVLogger

In [6]:
# to make results consistent we seed everything
from pytorch_lightning import seed_everything
seed_everything(0, workers=True)
torch.use_deterministic_algorithms(True, warn_only=True)

Global seed set to 0


In [7]:
# importing datasets
from torchvision.datasets import MNIST, CIFAR100
from torchvision.models import resnet50, ResNet50_Weights
from torchvision.transforms import Resize, Normalize, CenterCrop, ToTensor

# simpledatamodule and simplemodule are simpler versions of objects in pytorch_lightning
# errortracker handles collections of targets and 
#   predictions over each mini-batch in the validation/test stage
from ISLP.torch import SimpleDataModule, SimpleModule, ErrorTracker, rec_num_workers

# utils to pull from the IMDb database 
from ISLP.torch.imdb import load_lookup, load_tensor, load_sparse, load_sequential

In [8]:
# glob finds all matching wildcard characters
from glob import glob
import json

# (10.9.1) Single Layer Network on Hitters Data

In [10]:
Hitters = load_data('Hitters').dropna()
n = Hitters.shape[0]

to_numpy converts pandas df to numpy arrays. We do this because we will use sklearn to fit lasso, and lasso needs this conversion. 

We also use LinearRegression from sklearn to facilitate the comparison

In [14]:
model = MS(Hitters.columns.drop('Salary'), intercept=False)
X = model.fit_transform(Hitters).to_numpy()
Y = Hitters['Salary'].to_numpy()

In [13]:
# we split into training and test data
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=1/3,random_state=1)

hit_lm = LinearRegression().fit(X_train,Y_train)
Yhat_test = hit_lm.predict(X_test) 
np.abs(Yhat_test - Y_test).mean()

259.71528833146317

We now fit the Lasso using sklearn. We use MAE rather than MSE to fit the model in this case. 

We create a cross-balidation grid and perform the cross-validation directly. 

We encode the pipeline in two steps: 
1) We normalize the features using StandardScaler() transform
2) Fit the lasso without further normalization

In [15]:
scaler = StandardScaler(with_mean=True, with_std=True)
lasso = Lasso(warm_start=True, max_iter=30000)
standard_lasso = Pipeline(steps=[('scaler', scaler), ('lasso', lasso)])

We need to create a grid of values for $\lambda$. We choose a grid of 100 values, uniform on the log scale from lam_max 
down to $0.01\cdot lam_max$. 

Here lam_max is the smallest value of $\lambda$ with an all-zero solution. 
This value equals the largest absolute inner product between any predictor and the (centered) response. 

In [16]:
X_s = scaler.fit_transform(X_train)
n = X_s.shape[0]
lam_max = np.fabs(X_s.T.dot(Y_train - Y_train.mean())).max() / n
param_grid = {'alpha': np.exp(np.linspace(0,np.log(0.01), 100) ) * lam_max}

In [17]:
# now we perform cross validation
cv = KFold(10, shuffle=True, random_state=1)
grid = GridSearchCV(lasso, param_grid, cv=cv, scoring='neg_mean_absolute_error')
grid.fit(X_train, Y_train)

In [18]:
trained_lasso = grid.best_estimator_
Yhat_test = trained_lasso.predict(X_test)
np.fabs(Yhat_test - Y_test).mean()

257.23820107995016

We now specify the NN Model. Doing so requires us to specify classes specific to the model we want to fit. 

The way to do it in pytorch is to sub-class a generic representation of a network. 

* nn.Module is ubiquitous in pytorch and represents the mappings in the NN
* flatten and sequential are used in forward to describe the map that this Module implements
* sequential is a composition of 4 maps: 
    1) The input features are mapped to 50 dimensions, introducing in this case 19*50 
     50 parameters for weights and intercepts of the map
    2) This layer is mapped to a ReLU layer 
    3) A 40% dropout layer follows
    4) Linear map down to 1 dimension again with a bias

The total number of trainable parameters is $(19*50+50)+(50+1)=1051$.

The package torchinfo provides a function summary() that summarizes this information. 

In [19]:
class HittersModel(nn.Module): # nn.Model is ubiquitous in pytorch and represents the mappings in the NN. 
    def __init__(self, input_size) -> None:
        super(HittersModel, self).__init__()
        
        # flatten and sequential are used in forward to describe the map that this module implements
        self.flatten = nn.Flatten()
        self.sequential = nn.Sequential(nn.Linear(input_size, 50), nn.ReLU(), nn.Dropout(0.4), nn.Linear(50, 1))

    def forward(self, x): 
        x = self.flatten(x)
        return torch.flatten(self.sequential(x))

In [23]:
hit_model = HittersModel(X.shape[1])

summary(hit_model, input_size=X_train.shape, col_names=["input_size", "output_size", "num_params"])

Layer (type:depth-idx)                   Input Shape               Output Shape              Param #
HittersModel                             [175, 19]                 [175]                     --
├─Flatten: 1-1                           [175, 19]                 [175, 19]                 --
├─Sequential: 1-2                        [175, 19]                 [175, 1]                  --
│    └─Linear: 2-1                       [175, 19]                 [175, 50]                 1,000
│    └─ReLU: 2-2                         [175, 50]                 [175, 50]                 --
│    └─Dropout: 2-3                      [175, 50]                 [175, 50]                 --
│    └─Linear: 2-4                       [175, 50]                 [175, 1]                  51
Total params: 1,051
Trainable params: 1,051
Non-trainable params: 0
Total mult-adds (Units.MEGABYTES): 0.18
Input size (MB): 0.01
Forward/backward pass size (MB): 0.07
Params size (MB): 0.00
Estimated Total Size (MB): 0.09