In [None]:
#%matplotlib inline

import numpy as np
from pprint import pprint
import pandas as pd
import random

import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import grad
import torchvision
from torchvision import models, datasets, transforms
torch.manual_seed(50)

print(torch.__version__, torchvision.__version__)

In [None]:
num_feature = 3

# Prepare Dataset


In [None]:
df = pd.read_csv("BRAZPD_UnofM_all.csv", encoding="ISO-8859-1")
display(df)

In [None]:
df['followup1yfull'] = df['Followup'].apply(lambda x: 1 if x >= 12 else 0)
df

In [5]:
columns = list(df)
for i in columns: 
  if df[i].isnull().sum() > 0: 
    df.drop(columns=i, inplace=True)

# just drop any column with NaN because lazy to deal with those

In [6]:
df.drop(columns='nome_cidade', inplace=True)
df.drop(columns='DataincioPDNEt', inplace=True)
# these two columns are not numbers
df.drop(columns='Followup', inplace=True)
df.drop(columns='Id', inplace=True)

In [7]:
df

Unnamed: 0,CODPAX,FRR,codigoclinica,ModalidadeCAPD0APD1Mix2,CenterSizenpatients,Deathevent,TechniqueFailureevent,Age,Dropoutsim1,Causeofdropout,...,N_AH_69,N_AH_70,N_AH_71,N_AH_72,N_AH_73,N_AH_74,Modalidade,Ms10ou1,_merge,followup1yfull
0,1349037,0,1,0,128,1,0,71.2,1,10,...,0,0,0,0,0,0,0,0,3,0
1,1349040,0,1,0,128,0,1,23.3,1,2,...,0,0,0,0,0,0,0,0,3,1
2,1349048,1,1,1,128,0,1,49.2,1,1,...,0,0,0,0,0,0,1,1,3,0
3,1349051,1,1,0,128,1,0,48.3,1,10,...,0,0,0,0,0,0,0,0,3,1
4,1349055,0,1,2,128,1,0,93.7,1,10,...,0,0,0,0,0,0,2,0,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5702,347457070,1,347,1,66,0,0,31.8,0,15,...,0,0,0,0,0,0,1,1,3,0
5703,347457071,1,347,1,66,0,0,57.0,0,15,...,0,0,0,0,0,0,1,1,3,0
5704,347457072,1,347,1,66,0,0,84.0,0,15,...,0,0,0,0,0,0,1,1,3,0
5705,347457073,1,347,1,66,0,0,54.2,0,15,...,0,0,0,0,0,0,1,1,3,0


# The model

In [8]:
def weights_init(m):
    if hasattr(m, "weight"):
        m.weight.data.uniform_(-0.5, 0.5)
    if hasattr(m, "bias"):
        m.bias.data.uniform_(-0.5, 0.5)

class NN(nn.Module):
  def __init__(self):
    super(NN, self).__init__()

    self.l1 = nn.Linear(num_feature, 100)
    self.l2 = nn.Linear(100, 2)

  def forward(self, x):
    x = F.relu(self.l1(x))
    out = self.l2(x)
    return out

# Functions

In [9]:
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
print("Running on %s" % device)

Running on cpu


In [10]:
def onehot_labels(label, num_classes=2):
  label = torch.unsqueeze(label, 1)
  onehot_label = torch.zeros(label.size(0), num_classes, device=label.device)
  onehot_label.scatter_(1, label, 1)
  return onehot_label

In [11]:
def cross_entropy_for_onehot(pred, target):
    return torch.mean(torch.sum(- target * F.log_softmax(pred, dim=-1), 1))

In [12]:
class BRAZPDDataset(Dataset):
  def __init__(self, dataframe):
    self.dataframe = dataframe
    self.len = len(dataframe)

  def __len__(self):
      return self.len

  def __getitem__(self, idx):
    if torch.is_tensor(idx):
      idx = idx.tolist()
    data = torch.tensor(df.iloc[idx][0:num_feature].values, dtype=torch.float)
    label = torch.tensor(df['followup1yfull'].iloc[idx])
    label.view(1)
    
    return data, label


In [13]:
def get_batch(batch_size):
    loader = DataLoader(dataset=dst, batch_size=batch_size, shuffle=True)
    it = iter(loader)
    data, label = next(it)
    return data, label

# Experiments

In [14]:
batch_size = 1
net = NN()
net.apply(weights_init)
dst = BRAZPDDataset(df)
criterion = nn.CrossEntropyLoss()

In [15]:
gt_data, gt_label = get_batch(batch_size)
out = net(gt_data)
loss = criterion(out, gt_label)

In [16]:
print("gt_data: ", gt_data)
print("gt_label: ", gt_label)
print("out: ", out)
print("loss: ", loss)


gt_data:  tensor([[6.3366e+07, 0.0000e+00, 6.3000e+01]])
gt_label:  tensor([1])
out:  tensor([[ -900929.7500, -5046908.5000]], grad_fn=<AddmmBackward0>)
loss:  tensor(4145978.7500, grad_fn=<NllLossBackward0>)


In [17]:
dy_dx = torch.autograd.grad(loss, net.parameters())
original_dy_dx = list((_.detach().clone() for _ in dy_dx))

In [18]:
dummy_data = []
dummy_labels = []
for i in range(batch_size):
    dm_dt = torch.randn(gt_data[i].size()).to(device).requires_grad_(True)
    dummy_data.append(dm_dt)
    dummy_labels.append(torch.randn(1,2).to(device).requires_grad_(True)) # 2 here is the number of classes, hard coded for now

# dummy_data and dummy_labels are lists of tensors here, rather than tensor because calling torch.stack makes them non-leaf

In [19]:
print("dm dt: ", dummy_data)
print("dm lb: ", dummy_labels)

dm dt:  [tensor([-1.1644,  0.2247,  1.3256], requires_grad=True)]
dm lb:  [tensor([[-0.8695,  0.2507]], requires_grad=True)]


In [20]:
def predict(data_list):
    pred_list = []
    for data in data_list:
        pred = net(data).view(1,2)
        # print(pred)
        pred_list.append(pred)
    return pred_list        

In [21]:
def batch_crossentropy(pred_list, label_list):
    total_loss = 0
    if (len(pred_list) != len(label_list)):
        print("len(pred_list) != len(label_list)!!")
    
    for i in range(len(pred_list)):
        total_loss += criterion(pred_list[i], label_list[i])
    return total_loss/len(label_list)

In [22]:
print(dummy_labels[0])

tensor([[-0.8695,  0.2507]], requires_grad=True)


In [26]:
history = []
for i in range(batch_size): 
        optimizer = torch.optim.LBFGS([dummy_data[i], dummy_labels[i]]) # we are only optimizing one single image at a time
        history.append([])
        for iters in range(10):
            def closure():
                optimizer.zero_grad()

                pred_list =  predict(dummy_data)
                # print("dummy pred size: ", len(pred))
                print("pred: ", pred_list)
                print("dm dt: ", dummy_data[i])
                # print("dummy label size: ", dummy_ts_labels.size())
                dummy_loss = batch_crossentropy(pred_list, dummy_labels)
                dummy_dy_dx = torch.autograd.grad(dummy_loss, net.parameters(), create_graph=True)
                
                grad_diff = 0
                grad_count = 0
                
                for gx, gy in zip(dummy_dy_dx, original_dy_dx): 
                    print("gx: ", gx)
                    print("gy: ", gy)
                    grad_diff += ((gx - gy) ** 2).sum()
                    grad_count += gx.nelement()
                
                print("old:", grad_diff)
                grad_diff.backward()
                print("new:", grad_diff)
                
                return grad_diff
            
            optimizer.step(closure)
            # if iters % 10 == 0: 
            current_loss = closure()
            print(iters, "%.4f" % current_loss.item())
            history[i].append(dummy_data[i].cpu())

pred:  [tensor([[ 0.0308, -0.8300]], grad_fn=<ViewBackward0>)]
dm dt:  tensor([ 2.4029e-10, -2.0296e-11,  1.8856e-11], requires_grad=True)
gx:  tensor([[ 0.0000e+00, -0.0000e+00,  0.0000e+00],
        [ 0.0000e+00, -0.0000e+00,  0.0000e+00],
        [ 0.0000e+00, -0.0000e+00,  0.0000e+00],
        [ 0.0000e+00, -0.0000e+00,  0.0000e+00],
        [ 0.0000e+00, -0.0000e+00,  0.0000e+00],
        [ 0.0000e+00, -0.0000e+00,  0.0000e+00],
        [ 0.0000e+00, -0.0000e+00,  0.0000e+00],
        [ 0.0000e+00, -0.0000e+00,  0.0000e+00],
        [ 4.4343e-21, -3.7454e-22,  3.4796e-22],
        [ 0.0000e+00, -0.0000e+00,  0.0000e+00],
        [ 0.0000e+00, -0.0000e+00,  0.0000e+00],
        [-4.3414e-21,  3.6670e-22, -3.4067e-22],
        [-8.8586e-21,  7.4824e-22, -6.9514e-22],
        [ 1.6287e-21, -1.3757e-22,  1.2781e-22],
        [ 0.0000e+00, -0.0000e+00,  0.0000e+00],
        [-1.2356e-20,  1.0437e-21, -9.6962e-22],
        [-3.4973e-22,  2.9540e-23, -2.7444e-23],
        [ 2.0145e-21, -

In [24]:
print(history)

[[tensor([ 3.9380e-07, -5.7863e-08,  1.5799e-07], requires_grad=True), tensor([ 3.9380e-07, -5.7863e-08,  1.5799e-07], requires_grad=True), tensor([ 3.9380e-07, -5.7863e-08,  1.5799e-07], requires_grad=True), tensor([ 3.9380e-07, -5.7863e-08,  1.5799e-07], requires_grad=True), tensor([ 3.9380e-07, -5.7863e-08,  1.5799e-07], requires_grad=True), tensor([ 3.9380e-07, -5.7863e-08,  1.5799e-07], requires_grad=True), tensor([ 3.9380e-07, -5.7863e-08,  1.5799e-07], requires_grad=True), tensor([ 3.9380e-07, -5.7863e-08,  1.5799e-07], requires_grad=True), tensor([ 3.9380e-07, -5.7863e-08,  1.5799e-07], requires_grad=True), tensor([ 3.9380e-07, -5.7863e-08,  1.5799e-07], requires_grad=True), tensor([ 3.9380e-07, -5.7863e-08,  1.5799e-07], requires_grad=True), tensor([ 3.9380e-07, -5.7863e-08,  1.5799e-07], requires_grad=True), tensor([ 3.9380e-07, -5.7863e-08,  1.5799e-07], requires_grad=True), tensor([ 3.9380e-07, -5.7863e-08,  1.5799e-07], requires_grad=True), tensor([ 3.9380e-07, -5.7863e-08