In [37]:
import numpy as np, pandas as pd
from matplotlib.pyplot import subplots
from sklearn.linear_model import \
     (LinearRegression,
      LogisticRegression,
      Lasso)
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from ISLP import load_data
from ISLP.models import ModelSpec as MS
from sklearn.model_selection import \
     (train_test_split,
      GridSearchCV)

In [38]:
import numpy as np, pandas as pd, random, torch
from pytorch_lightning import seed_everything

# Fix the random seed through Lightning's helper (workers=True is passed so the
# seeding is meant to extend to dataloader workers), then ask torch to prefer
# deterministic algorithms; warn_only=True requests a warning rather than an
# error for operations that have no deterministic implementation.
seed_everything(seed=57, workers=True)
torch.use_deterministic_algorithms(mode=True, warn_only=True)

Seed set to 57


In [39]:
import torch
from torch import nn
from torch.optim import RMSprop
from torch.utils.data import TensorDataset

In [40]:
from torchmetrics import (MeanAbsoluteError,
                          R2Score)
from torchinfo import summary
from pytorch_lightning import Trainer
from pytorch_lightning.loggers import CSVLogger

In [41]:
from ISLP.torch import (SimpleDataModule,
                        SimpleModule,
                        ErrorTracker,
                        rec_num_workers)
from ISLP.torch.imdb import (load_lookup,
                             load_tensor,
                             load_sparse,
                             load_sequential)

## Question 1

### Part A: prepare the Wage data

In [42]:
# Fetch the Wage data set shipped with ISLP and preview its first five rows.
wage = load_data("Wage")
wage.head(n=5)

Unnamed: 0,year,age,maritl,race,education,region,jobclass,health,health_ins,logwage,wage
0,2006,18,1. Never Married,1. White,1. < HS Grad,2. Middle Atlantic,1. Industrial,1. <=Good,2. No,4.318063,75.043154
1,2004,24,1. Never Married,1. White,4. College Grad,2. Middle Atlantic,2. Information,2. >=Very Good,2. No,4.255273,70.47602
2,2003,45,2. Married,1. White,3. Some College,2. Middle Atlantic,1. Industrial,1. <=Good,1. Yes,4.875061,130.982177
3,2003,43,2. Married,3. Asian,4. College Grad,2. Middle Atlantic,2. Information,2. >=Very Good,1. Yes,5.041393,154.685293
4,2005,50,4. Divorced,1. White,2. HS Grad,2. Middle Atlantic,2. Information,1. <=Good,1. Yes,4.318063,75.043154


In [43]:

# Predictors: two numeric columns (year, age) followed by two label-valued
# columns (education, jobclass). The response is the raw `wage` column.
predictor_names = ['year', 'age', 'education', 'jobclass']
D = wage[predictor_names].copy()
X = np.asarray(D)
y = np.array(wage['wage'])

# Hold out 25% of the rows for testing; random_state pins the shuffle.
split = train_test_split(X, y, test_size=0.25, random_state=57)
X_train, X_test, y_train, y_test = split

In [44]:
# Standardize the numeric predictors (columns 0 and 1 of X_train); the scaler
# is fitted on the training split only so it can be reused on the test split.
numeric_idx = [0, 1]
scaler = StandardScaler()
train_numeric = X_train[:, numeric_idx].astype(float)
X_train_scaled = X_train.copy()
X_train_scaled[:, numeric_idx] = scaler.fit_transform(train_numeric)

# Standardize the response too. The scaler is given a single-column 2-D array,
# so reshape on the way in and flatten back to 1-D on the way out.
y_scaler = StandardScaler()
y_train_column = y_train.reshape(-1, 1)
y_train_scaled = y_scaler.fit_transform(y_train_column).flatten()


In [45]:
# Encode the categorical columns as integers, apply the training-set scalers
# to the test split, and wrap both splits in TensorDatasets.
from sklearn.preprocessing import LabelEncoder

# Work on copies so the arrays produced by earlier cells stay untouched.
X_train_encoded = X_train_scaled.copy()
X_test_encoded = X_test.copy()

# One encoder per categorical column (column 2 = education, column 3 = jobclass).
le_education = LabelEncoder()
le_jobclass = LabelEncoder()

# Fit the encoders on the training split only ...
X_train_encoded[:, 2] = le_education.fit_transform(X_train_scaled[:, 2])
X_train_encoded[:, 3] = le_jobclass.fit_transform(X_train_scaled[:, 3])

# ... and reuse the fitted mappings on the test split.
X_test_encoded[:, 2] = le_education.transform(X_test[:, 2])
X_test_encoded[:, 3] = le_jobclass.transform(X_test[:, 3])

# Scale the continuous test features with the scaler fitted on the training data.
X_test_encoded[:, [0, 1]] = scaler.transform(X_test[:, [0, 1]].astype(float))

# Scale the test targets with the scaler fitted on y_train. The network is
# trained on standardized wages, so the test targets must be on the same scale
# for test loss/metrics to be meaningful. Use y_scaler.inverse_transform on
# predictions to report errors in the original wage units.
y_test_scaled = y_scaler.transform(y_test.reshape(-1, 1)).flatten()

# Convert to float32 tensors and pair features with targets.
X_train_t = torch.tensor(X_train_encoded.astype(np.float32))
Y_train_t = torch.tensor(y_train_scaled.astype(np.float32))
TensorDataset_train = TensorDataset(X_train_t, Y_train_t)

X_test_t = torch.tensor(X_test_encoded.astype(np.float32))
Y_test_t = torch.tensor(y_test_scaled.astype(np.float32))
TensorDataset_test = TensorDataset(X_test_t, Y_test_t)

In [46]:
# Ask ISLP for its recommended worker count; it is used below as the upper
# bound on the number of data-loader workers.
# NOTE(review): the exact heuristic lives in ISLP.torch and is not shown here.
max_num_workers = rec_num_workers()

In [47]:
# Build the data module from the train/test TensorDatasets: mini-batches of 64
# and never more than 4 loader workers. validation=0.25 is presumably the
# fraction of the training data held out for validation (confirm in ISLP docs).
n_workers = min(4, max_num_workers)
wage_dm = SimpleDataModule(
    TensorDataset_train,
    TensorDataset_test,
    batch_size=64,
    validation=0.25,
    num_workers=n_workers,
)


In [48]:
# Sanity check: set up the data module for fitting and report the tensor
# shapes of the first training mini-batch.
wage_dm.setup('fit')
train_loader = wage_dm.train_dataloader()

# Only the first batch is inspected; nothing is printed if the loader is empty.
first_batch = next(iter(train_loader), None)
if first_batch is not None:
    X_, Y_ = first_batch
    print("Training batch shapes:")
    print(f"X_ (features) shape: {X_.shape}")
    print(f"Y_ (targets) shape: {Y_.shape}")

Training batch shapes:
X_ (features) shape: torch.Size([64, 4])
Y_ (targets) shape: torch.Size([64])


### Part B: define the neural network

In [49]:
class WageModel(nn.Module):
    """Small feed-forward regression network for the tabular Wage features.

    Architecture: input_dim -> 64 -> 32 -> 1, with ReLU activations and
    dropout (p=0.3) after each hidden layer.

    Parameters
    ----------
    input_dim : int
        Number of feature columns in each input row.

    Notes
    -----
    ``forward`` returns a tensor of shape (batch, 1).
    NOTE(review): the targets built earlier in this notebook are 1-D
    (shape (batch,)). Confirm the loss/metric used downstream reconciles the
    two shapes (e.g. by flattening the output); otherwise broadcasting can
    distort the loss.
    """

    def __init__(self, input_dim):
        super().__init__()

        # First hidden block. Flatten is a no-op for 2-D (batch, features)
        # input; it collapses any extra trailing dimensions.
        first_hidden = [
            nn.Flatten(),
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Dropout(0.3),
        ]
        self.layer1 = nn.Sequential(*first_hidden)

        # Second hidden block: 64 -> 32 units, again with ReLU and dropout.
        second_hidden = [
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Dropout(0.3),
        ]
        self.layer2 = nn.Sequential(*second_hidden)

        # Full pipeline: both hidden blocks followed by a single-unit linear
        # output layer (one prediction per row).
        output_layer = nn.Linear(32, 1)
        self._forward = nn.Sequential(self.layer1, self.layer2, output_layer)

    def forward(self, x):
        """Run ``x`` through the network and return the (batch, 1) output."""
        prediction = self._forward(x)
        return prediction

In [50]:
# Instantiate the network with one input unit per feature column, then show a
# layer-by-layer summary for a batch of 64 rows.
input_dim = X_train_t.size(1)
wage_model = WageModel(input_dim)
summary(wage_model, input_size=(64, input_dim))

Layer (type:depth-idx)                   Output Shape              Param #
WageModel                                [64, 1]                   --
├─Sequential: 1-1                        [64, 1]                   --
│    └─Sequential: 2-1                   [64, 64]                  --
│    │    └─Flatten: 3-1                 [64, 4]                   --
│    │    └─Linear: 3-2                  [64, 64]                  320
│    │    └─ReLU: 3-3                    [64, 64]                  --
│    │    └─Dropout: 3-4                 [64, 64]                  --
│    └─Sequential: 2-2                   [64, 32]                  --
│    │    └─Linear: 3-5                  [64, 32]                  2,080
│    │    └─ReLU: 3-6                    [64, 32]                  --
│    │    └─Dropout: 3-7                 [64, 32]                  --
│    └─Linear: 2-3                       [64, 1]                   33
Total params: 2,433
Trainable params: 2,433
Non-trainable params: 0
Total mult-ad