In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset
from torchvision import transforms 
import pandas as pd
import numpy as np

In [70]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset
from torchvision import transforms 
import pandas as pd
import numpy as np
class WFHEmployeeDataset(Dataset):
  """WFH Survey Dataset, targeting employee wishes for WFH."""

  def __init__(self, csv_file, transform=None):
    """
    Args:
        csv_file (string): Path to the csv file with annotations.
        root_dir (string): Directory with all the images.
        transform (callable, optional): Optional transform to be applied
            on a sample.
    """
    self.variables = [
      "income",
      "age_quant",
      "educ_years",
      "wfhcovid_frac",
      "commutetime_quant",
      "wfh_eff_COVID_quant",
      "gender",
      "wfh_days_postCOVID_ss",
      "live_children",
      "race_ethnicity",
      "hourly_wage",
      "workhours_preCOVID",
      "workhours_duringCOVID",
      "downloadspeed",
      "uploadspeed",
      "grass_color_attnfull",
      "party_affiliation_s",
    ]

    self.data = pd.read_csv(csv_file, usecols=self.variables)

    # Data Filtering
    # Must have answered 3 for the questionnaire to be filled out correctly
    self.data = self.data[self.data.grass_color_attnfull == 3]
    self.data = self.data.fillna(value={"workhours_preCOVID": 0, "workhours_duringCOVID": 0, "party_affiliation_s":0})
    self.data = self.data.dropna()

    print(len(data))

    self.transform = transform

    self.targets = np.array(self.data['wfh_days_postCOVID_ss'] == 1).astype(np.float32)

    self.variables.remove("wfh_days_postCOVID_ss")

    self.features = self.data[self.variables].to_numpy(dtype='float32')

    # .astype(np.float32) is necessary because PyTorch doesn't like float64

  def __len__(self):
    return len(self.data)

  def __getitem__(self, idx):
    target = self.targets[idx]
    features = self.features[idx]

    if self.transform:
      features = self.transform(features)

    return features, target

In [55]:
transform = transforms.Compose([
    transforms.ToTensor(),
])

data = WFHEmployeeDataset('WFHdata_January22.csv', transform=torch.tensor)

train_size = int(0.8 * len(data))
test_size = len(data) - train_size

train_data, test_data = torch.utils.data.random_split(
    data, 
    [train_size, test_size]
)

5151


In [56]:
train_loader = torch.utils.data.DataLoader(train_data, batch_size = 128)
test_loader = torch.utils.data.DataLoader(test_data, batch_size = 128)

## Training and Testing

In [57]:
def train(model, loader, optimizer):
  total_loss = 0.
  iter = 0

  model.train()

  for data, target in loader:
    optimizer.zero_grad()

    data = data
    pred = model(data)
    loss = loss_function(pred.view(-1), target.view(-1))

    total_loss += loss.item()

    loss.backward()
    optimizer.step()

    iter += 1
  
  return total_loss / iter

In [58]:
def test(model, loader):
  total_loss = 0.
  iter = 0

  model.eval()

  with torch.no_grad():
    for data, target in loader:
      data = data
      pred = model(data)
      loss = loss_function(pred.view(-1), target.view(-1))

      total_loss += loss.item()

      iter += 1
  
  return total_loss / iter

In [59]:
class MLPModel(torch.nn.Module):
  def __init__(self):
    super(MLPModel, self).__init__()

    self.layers = nn.Sequential(
      nn.Linear(16, 64),
      nn.Tanh(),
      nn.Linear(64, 32),
      nn.Dropout(p=0.5),
      nn.Tanh(),
      nn.Linear(32, 16),
      nn.Dropout(p=0.5),
      nn.Tanh(),
      nn.Linear(16, 8),
      nn.Dropout(p=0.5),
      nn.Tanh(),
      nn.Linear(8, 1)
    )
  
  def forward(self, x):
    return self.layers(x)

In [65]:
mlp_model = MLPModel()

optimizer = optim.Adam(mlp_model.parameters(), lr=1e-4, weight_decay=5e-4)
loss_function = nn.MSELoss()

In [66]:
for epoch in range(200):
  loss = train(mlp_model, train_loader, optimizer)
  test_loss = test(mlp_model, test_loader)
  print(f'Epoch: {epoch} | Loss: {loss:4} | Test Loss: {test_loss:4}')

Epoch: 0 | Loss: 0.185813924128359 | Test Loss: 0.13339596500413287
Epoch: 1 | Loss: 0.15714176292672302 | Test Loss: 0.10384277740700378
Epoch: 2 | Loss: 0.14143514836376364 | Test Loss: 0.0960672048644887
Epoch: 3 | Loss: 0.13657742377483484 | Test Loss: 0.09523353001309766
Epoch: 4 | Loss: 0.1327425537235809 | Test Loss: 0.09568920565976037
Epoch: 5 | Loss: 0.13230649223833374 | Test Loss: 0.09596805481447114
Epoch: 6 | Loss: 0.1283673524405017 | Test Loss: 0.09567859727475378
Epoch: 7 | Loss: 0.12358564160989993 | Test Loss: 0.09537872299551964
Epoch: 8 | Loss: 0.12515240275498593 | Test Loss: 0.09502491561902894
Epoch: 9 | Loss: 0.1223241960008939 | Test Loss: 0.09492791278494729
Epoch: 10 | Loss: 0.12344582582061941 | Test Loss: 0.09441603637403911
Epoch: 11 | Loss: 0.12162074649875815 | Test Loss: 0.09397064770261447
Epoch: 12 | Loss: 0.1201124545751196 | Test Loss: 0.09363315420018302
Epoch: 13 | Loss: 0.12276667672576326 | Test Loss: 0.09279983614881833
Epoch: 14 | Loss: 0.116

## Sensitivity Analysis

In [71]:
data = WFHEmployeeDataset('WFHdata_January22.csv', transform=torch.tensor)

5151


In [72]:
mlp_model.eval()

columns = data.variables

bvs = []
features = data.data[data.variables].to_numpy(dtype='float32')
for survey in features:
    bvs.append(mlp_model(torch.tensor(survey)).detach().numpy()[0])
bvs = np.array(bvs)

mean_dev = {}

for variable in columns:
    std_dev = np.std(data.data[variable])
    print(std_dev)

    difference = 0
    
    # Subtract 1 std. dev
    data.data[variable] = data.data[variable] - std_dev
    features = data.data[data.variables].to_numpy(dtype='float32')

    adj = []
    for survey in features:
        adj.append(mlp_model(torch.tensor(survey)).detach().numpy()[0])
    adj = np.array(adj)

    diff = np.abs(adj - bvs)

    # Add 1 std. dev (add two due to subtracting one earlier)
    data.data[variable] = data.data[variable] + 2 * std_dev
    features = data.data[data.variables].to_numpy(dtype='float32')

    adj = []
    for survey in features:
        adj.append(mlp_model(torch.tensor(survey)).detach().numpy()[0])
    adj = np.array(adj)

    diff = diff + np.abs(adj - bvs)

    # Reset by subtracting 1 std. dev again
    data.data[variable] = data.data[variable] - std_dev

    mean_dev[variable] = diff.mean()

print(mean_dev)

169.9696904021095
8.957967992092463
2.1863018830184626
37.24955670274001
25.24253979467531
17.41056695365713
0.4868630040849206
1.1249208286974446
1.9440096215793632
641.9956642495715
15.201763562440355
15.32297187230933
128.93218933500688
107.70580571693812
0.0
0.9226649295101861
{'income': 0.32286134, 'age_quant': 0.043398183, 'educ_years': 0.009114683, 'wfhcovid_frac': 0.12647003, 'commutetime_quant': 0.08064015, 'wfh_eff_COVID_quant': 0.109367594, 'gender': 0.0006728243, 'live_children': 0.0018368693, 'race_ethnicity': 0.0067131645, 'hourly_wage': 0.22932464, 'workhours_preCOVID': 0.052451037, 'workhours_duringCOVID': 0.05553634, 'downloadspeed': 0.16223021, 'uploadspeed': 0.17457998, 'grass_color_attnfull': 0.0, 'party_affiliation_s': 0.0011143773}
