<a href="https://colab.research.google.com/github/VishnuSriraj/Data-Influence/blob/main/InfluenceScores_dropout.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
import numpy as np
import pandas as pd
import scipy
import math
import copy
from torch.autograd import grad as gradient

In [2]:
class InfluenceFunction(object):
	"""Influence scores of single records, restricted to one ``torch.nn.Linear`` layer.

	On construction the loss over the whole training set is differentiated twice
	with respect to the parameters (weight and, if present, bias) of
	``model.layers[layer_index]``. The resulting Hessian is inverted once and
	reused by every ``influence_*`` call.

	The model is deep-copied and the copy is put into evaluation mode, so that
	stochastic layers such as ``torch.nn.Dropout`` are disabled. Without this the
	Hessian, the total gradient and each per-record gradient would be computed
	under different random masks. The caller's model is left untouched.
	"""

	def __init__(self, model, X_train, y_train, loss_func, layer_index):
		"""Validate the inputs and precompute the Hessian and its inverse.

		Args:
			model: ``torch.nn.Module`` exposing an indexable ``layers`` attribute.
			X_train: training inputs (``torch.Tensor``), fed to ``model`` as one batch.
			y_train: training targets (``torch.Tensor``), passed to ``loss_func``.
			loss_func: callable ``loss_func(y_pred, y_true)`` returning a scalar tensor.
			layer_index: index into ``model.layers``; must select a ``torch.nn.Linear``.

		Raises:
			RuntimeError: on a wrong argument type, a missing ``layers`` attribute,
				or a selected layer that is not ``torch.nn.Linear``.
			numpy.linalg.LinAlgError: if the Hessian is singular.
		"""
		if not isinstance(model, torch.nn.Module):
			raise RuntimeError(f"Only torch.nn.Module models are supported, got {type(model)}")
		if not isinstance(X_train, torch.Tensor):
			raise RuntimeError(f"X_train must be <torch.Tensor>, got {type(X_train)}")
		if not isinstance(y_train, torch.Tensor):
			raise RuntimeError(f"y_train must be <torch.Tensor>, got {type(y_train)}")
		if not hasattr(model, "layers"):
			raise RuntimeError('The model must have an attribute "layers".')

		self._model = copy.deepcopy(model)
		# Disable dropout / batch-norm updates on the private copy so that every
		# gradient below is taken through the same deterministic forward pass.
		self._model.eval()
		self._X_train = X_train
		self._y_train = y_train
		self._loss_func = loss_func
		self._layer_index = layer_index

		# Filled in as a side effect of _calculate_Hessian_matrix().
		self._total_training_loss_grad = None
		self._Hessian_matrix = self._calculate_Hessian_matrix()
		self._Hessian_inv = np.linalg.inv(self._Hessian_matrix)

	def _layer_params(self):
		"""Return the differentiated parameters of the selected layer as a tuple."""
		layer = self._model.layers[self._layer_index]
		if not isinstance(layer, torch.nn.Linear):
			raise RuntimeError(
				f"Only support layer type torch.nn.Linear, got {type(layer)}."
			)
		if layer.bias is None:
			return (layer.weight,)
		return (layer.weight, layer.bias)

	@staticmethod
	def _flatten(tensors):
		"""Concatenate tensors into one 1-D tensor (weight entries first, then bias)."""
		return torch.cat([t.reshape(-1) for t in tensors])

	def _calculate_Hessian_matrix(self):
		"""Return the Hessian of the full training loss w.r.t. the layer parameters.

		Also stores the flattened first-order gradient of the training loss in
		``self._total_training_loss_grad``.

		Returns:
			Square ``numpy.ndarray`` whose side equals the number of parameters
			of the selected layer.
		"""
		params = self._layer_params()

		y_pred = self._model(self._X_train)
		loss = self._loss_func(y_pred, self._y_train)

		# create_graph=True keeps the gradient differentiable for the second pass.
		first_order = gradient(loss, params, retain_graph=True, create_graph=True)
		flat_grad = self._flatten(first_order)
		self._total_training_loss_grad = np.array(flat_grad.tolist())

		# One Hessian row per parameter: differentiate each gradient entry again.
		Hessian = []
		for k in range(flat_grad.shape[0]):
			second_order = gradient(flat_grad[k], params, retain_graph=True)
			Hessian.append(self._flatten(second_order).tolist())

		return np.array(Hessian)

	def influence_remove_single(self, index):
		"""Influence of removing training record ``index`` (negated add-influence)."""
		return -self.influence_add_single(self._X_train[index], self._y_train[index])

	def influence_modify_single(self, index, new_x, new_y):
		"""Influence of replacing training record ``index`` by ``(new_x, new_y)``."""
		return self.influence_add_single(new_x, new_y) + self.influence_remove_single(index)

	def influence_add_single(self, x, y):
		"""Influence of adding the single record ``(x, y)``.

		Computes ``g_total . (-H^-1 g_single)``, where ``g_single`` is the gradient
		of the loss on ``(x, y)`` and ``g_total`` / ``H`` are the training-loss
		gradient / Hessian cached at construction time.

		Args:
			x: ``torch.Tensor`` holding one input record; reshaped to ``(1, -1)``.
			y: ``torch.Tensor`` holding its target; reshaped to ``(1, -1)``.

		Returns:
			The influence score as a NumPy floating-point scalar.

		Raises:
			RuntimeError: if ``x`` or ``y`` is not a ``torch.Tensor``.
		"""
		if not isinstance(x, torch.Tensor):
			raise RuntimeError(f"Added x must be <torch.Tensor>, got {type(x)}")
		if not isinstance(y, torch.Tensor):
			raise RuntimeError(f"Added y must be <torch.Tensor>, got {type(y)}")

		x = x.reshape(1, -1).detach()
		y = y.reshape(1, -1).detach()

		y_pred = self._model(x)
		loss_single = self._loss_func(y_pred, y)

		# The graph is rebuilt on every call, so it does not need to be retained.
		grad = gradient(loss_single, self._layer_params())
		grad = np.array(self._flatten(grad).tolist())

		param_offset = - np.dot(self._Hessian_inv, grad)

		influence = np.dot(self._total_training_loss_grad, param_offset)

		return influence

In [3]:
class TorchNNCore(torch.nn.Module):
    """Fully connected binary classifier with dropout applied to its input.

    The network is a stack of ``Linear`` layers of widths
    ``[inps, *hiddens, 1]``. Each hidden ``Linear`` is followed by
    ``hidden_activation()`` and the final one by ``Sigmoid``.

    Args:
        inps: number of input features.
        hiddens: sizes of the hidden layers (any sequence); ``None`` or empty
            means no hidden layer.
        bias: whether the ``Linear`` layers carry a bias term.
        seed: if given, passed to ``torch.manual_seed`` before the layers are
            created.
        hidden_activation: zero-argument factory for the hidden activation
            module.
    """

    def __init__(
        self, inps, hiddens=None, bias=True, seed=None, hidden_activation=torch.nn.ReLU
    ):
        super(TorchNNCore, self).__init__()
        if seed is not None:
            torch.manual_seed(seed)
        # Copy into a fresh list: avoids a shared mutable default and accepts tuples.
        hidden_sizes = [] if hiddens is None else list(hiddens)
        struct = [inps] + hidden_sizes + [1]
        # Plain Python list of the layers in forward order. InfluenceFunction
        # requires this "layers" attribute to look up a layer by index.
        self.layers = []
        last = len(struct) - 1
        for i in range(1, len(struct)):
            self.layers.append(
                torch.nn.Linear(
                    in_features=struct[i - 1], out_features=struct[i], bias=bias
                )
            )
            # The output layer gets a Sigmoid; all others the hidden activation.
            self.layers.append(
                torch.nn.Sigmoid() if i == last else hidden_activation()
            )
        # Sequential registers the layers as sub-modules (the list alone would not).
        self.model = torch.nn.Sequential(*self.layers)
        # Dropout on the input features; only active in training mode.
        self.dropout = torch.nn.Dropout(0.20)

    def forward(self, x):
        """Apply input dropout, then the layer stack; returns the sigmoid output."""
        return self.model(self.dropout(x))

In [4]:
# Load the three CSV datasets. In each file the last column is used as the
# label and all preceding columns as features.
def _read_features_and_labels(csv_path):
    """Read `csv_path` and return (raw array, float feature tensor, float (N, 1) label tensor)."""
    raw = pd.read_csv(csv_path).to_numpy()
    features = torch.tensor(raw[:, :-1], dtype=torch.float)
    labels = torch.tensor(raw[:, -1].reshape(-1, 1), dtype=torch.float)
    return raw, features, labels


data1, X_train1, y_train1 = _read_features_and_labels('./adult.csv')
data2, X_train2, y_train2 = _read_features_and_labels('./broward.csv')
data3, X_train3, y_train3 = _read_features_and_labels('./hospital.csv')

# Report the tensor shapes as a quick sanity check.
print("\nData1: ", X_train1.shape, y_train1.shape)
print("\nData2: ", X_train2.shape, y_train2.shape)
print("\nData3: ", X_train3.shape, y_train3.shape)


Data1:  torch.Size([45222, 98]) torch.Size([45222, 1])

Data2:  torch.Size([7214, 8]) torch.Size([7214, 1])

Data3:  torch.Size([52778, 124]) torch.Size([52778, 1])


In [5]:
# Binary cross-entropy is shared by all three models.
loss_func = torch.nn.BCELoss()


def _new_model_and_optimizer(n_features):
    """Build a 512-256-128 LeakyReLU classifier and its Adam optimizer (lr=0.001)."""
    net = TorchNNCore(
        inps=n_features,
        hiddens=[512, 256, 128],
        hidden_activation=torch.nn.LeakyReLU,
    )
    return net, torch.optim.Adam(net.parameters(), lr=0.001)


# One model/optimizer pair per dataset, created in dataset order.
model1, optim1 = _new_model_and_optimizer(X_train1.shape[1])
model2, optim2 = _new_model_and_optimizer(X_train2.shape[1])
model3, optim3 = _new_model_and_optimizer(X_train3.shape[1])

# Show the module structure before the influence function is applied.
print(model1)
print(model2)
print(model3)

# Show the "layers" attribute that the influence function indexes into.
for _heading, _net in (
    ("\nModel 1:", model1),
    ("\nModel 2:", model2),
    ("\nModel 3:", model3),
):
    print(_heading)
    for item in _net.layers:
        print(item)

TorchNNCore(
  (model): Sequential(
    (0): Linear(in_features=98, out_features=512, bias=True)
    (1): LeakyReLU(negative_slope=0.01)
    (2): Linear(in_features=512, out_features=256, bias=True)
    (3): LeakyReLU(negative_slope=0.01)
    (4): Linear(in_features=256, out_features=128, bias=True)
    (5): LeakyReLU(negative_slope=0.01)
    (6): Linear(in_features=128, out_features=1, bias=True)
    (7): Sigmoid()
  )
  (dropout): Dropout(p=0.2, inplace=False)
)
TorchNNCore(
  (model): Sequential(
    (0): Linear(in_features=8, out_features=512, bias=True)
    (1): LeakyReLU(negative_slope=0.01)
    (2): Linear(in_features=512, out_features=256, bias=True)
    (3): LeakyReLU(negative_slope=0.01)
    (4): Linear(in_features=256, out_features=128, bias=True)
    (5): LeakyReLU(negative_slope=0.01)
    (6): Linear(in_features=128, out_features=1, bias=True)
    (7): Sigmoid()
  )
  (dropout): Dropout(p=0.2, inplace=False)
)
TorchNNCore(
  (model): Sequential(
    (0): Linear(in_features

In [None]:
# Running sums of the per-record influence scores over all repetitions.
sum_influences1 = np.zeros(X_train1.shape[0])
sum_influences2 = np.zeros(X_train2.shape[0])
sum_influences3 = np.zeros(X_train3.shape[0])

# Labels as NumPy arrays for the accuracy printout; they never change, so
# convert them once instead of once per repetition.
y_train_np1 = y_train1.detach().numpy()
y_train_np2 = y_train2.detach().numpy()
y_train_np3 = y_train3.detach().numpy()


def _train_full_batch(model, optimizer, X, y, y_np, n_epochs, tag):
    """Run `n_epochs` full-batch optimizer steps and log every 10th epoch.

    `tag` is the dataset number used in the log line. Accuracy is computed from
    the same forward pass as the loss, thresholded at 0.5.
    """
    for epoch in range(n_epochs):
        optimizer.zero_grad()
        y_pred = model(X)
        loss = loss_func(y_pred, y)
        loss.backward()
        optimizer.step()
        if epoch % 10 == 0:
            hits = (y_pred.detach().numpy() > 0.5) == y_np
            accuracy = float(hits.mean())
            print('Epoch = %d, loss%d = %.4f, accuracy=%.4f' % (epoch, tag, loss.item(), accuracy))
    print('\n')


# For 10 times averaged result (the averaging cell below divides by the same 10).
# NOTE(review): the models and optimizers are not re-created between
# repetitions, so each pass continues training from the previous one — confirm
# this is intended rather than 10 independent runs.
# To time this cell, put `%%time` on its very first line; a cell magic is not
# valid anywhere else in a cell.
for i in range(0, 10):
    # Training for 455 / 1443 / 660 epochs respectively.
    _train_full_batch(model1, optim1, X_train1, y_train1, y_train_np1, 455, 1)
    _train_full_batch(model2, optim2, X_train2, y_train2, y_train_np2, 1443, 2)
    _train_full_batch(model3, optim3, X_train3, y_train3, y_train_np3, 660, 3)

    optim1.zero_grad()
    optim2.zero_grad()
    optim3.zero_grad()

    # layer_index=-2 selects the last Linear layer (the Sigmoid is at -1).
    infl1 = InfluenceFunction(
        model=model1,
        X_train=X_train1,
        y_train=y_train1,
        loss_func=loss_func,
        layer_index=-2,
    )
    infl2 = InfluenceFunction(
        model=model2,
        X_train=X_train2,
        y_train=y_train2,
        loss_func=loss_func,
        layer_index=-2,
    )
    infl3 = InfluenceFunction(
        model=model3,
        X_train=X_train3,
        y_train=y_train3,
        loss_func=loss_func,
        layer_index=-2,
    )

    # Measuring the influence of data by removing records, one record at a time.
    influences1 = [infl1.influence_remove_single(index) for index in range(X_train1.shape[0])]
    influences2 = [infl2.influence_remove_single(index) for index in range(X_train2.shape[0])]
    influences3 = [infl3.influence_remove_single(index) for index in range(X_train3.shape[0])]

    # Note: The influence scores depend on the current state of the model.
    # Even for the same model configuration, a different random seed may make
    # the model converge to a different point in parameter space, which
    # results in different scores for the same records.
    # Note 2: The absolute value of an influence score is meaningless, but
    # the scores are comparable with each other.

    sum_influences1 = sum_influences1 + np.asarray(influences1)
    sum_influences2 = sum_influences2 + np.asarray(influences2)
    sum_influences3 = sum_influences3 + np.asarray(influences3)

Epoch = 0, loss1 = 2.2777, accuracy=0.2365
Epoch = 10, loss1 = 3.6823, accuracy=0.7746
Epoch = 20, loss1 = 3.6089, accuracy=0.7734
Epoch = 30, loss1 = 3.6063, accuracy=0.7734
Epoch = 40, loss1 = 3.5229, accuracy=0.7752
Epoch = 50, loss1 = 5.7546, accuracy=0.7537
Epoch = 60, loss1 = 5.7507, accuracy=0.7522
Epoch = 70, loss1 = 5.7502, accuracy=0.7522
Epoch = 80, loss1 = 5.7493, accuracy=0.7537
Epoch = 90, loss1 = 5.7478, accuracy=0.7540
Epoch = 100, loss1 = 5.7475, accuracy=0.7555
Epoch = 110, loss1 = 5.7476, accuracy=0.7543
Epoch = 120, loss1 = 5.7475, accuracy=0.7551
Epoch = 130, loss1 = 5.7475, accuracy=0.7552
Epoch = 140, loss1 = 5.7476, accuracy=0.7552
Epoch = 150, loss1 = 5.7473, accuracy=0.7556
Epoch = 160, loss1 = 5.7472, accuracy=0.7556
Epoch = 170, loss1 = 5.7471, accuracy=0.7551
Epoch = 180, loss1 = 5.7033, accuracy=0.7555
Epoch = 190, loss1 = 5.6866, accuracy=0.7553
Epoch = 200, loss1 = 5.6726, accuracy=0.7555
Epoch = 210, loss1 = 5.6604, accuracy=0.7555
Epoch = 220, loss1 = 

In [None]:
# Turn the accumulated sums into means over the 10 repetitions of the loop above.
avg_influences1 = [total / 10 for total in sum_influences1]
avg_influences2 = [total / 10 for total in sum_influences2]
avg_influences3 = [total / 10 for total in sum_influences3]

In [None]:
# Map each record index to its average influence score, then print all pairs.
result1 = dict(enumerate(avg_influences1))
print(list(result1.items()))
result2 = dict(enumerate(avg_influences2))
print(list(result2.items()))
result3 = dict(enumerate(avg_influences3))
print(list(result3.items()))

In [None]:
# Print the three full score lists, one list per line.
print(avg_influences1, avg_influences2, avg_influences3, sep="\n")

In [None]:
# Save the averaged influence scores of dataset 1 (adult.csv) to an .npz
# archive; the array is stored under the key "x".
# NOTE(review): the file name mentions an L2 setting that is not visible in
# this notebook — confirm the naming.
np.savez("adult_scores_l2_0.0001.npz", x = avg_influences1)

In [None]:
# Save the averaged influence scores of dataset 2 (broward.csv) to an .npz
# archive; the array is stored under the key "x".
np.savez("broward_scores_l2_0.0001.npz", x = avg_influences2)

In [None]:
# Save the averaged influence scores of dataset 3 (hospital.csv) to an .npz
# archive; the array is stored under the key "x".
np.savez("hospital_scores_l2_0.0001.npz", x = avg_influences3)

In [None]:
# Colab-only: mount Google Drive at /content/drive so the result files can be
# copied there. The import is kept in this cell because `google.colab` is only
# available inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Copy the archive written by the np.savez cell above; the file name must match it.
!cp adult_scores_l2_0.0001.npz "drive/MyDrive/Colab Notebooks/Influence results"

In [None]:
# Copy the archive written by the np.savez cell above; the file name must match it.
!cp broward_scores_l2_0.0001.npz "drive/MyDrive/Colab Notebooks/Influence results"

In [None]:
# Copy the archive written by the np.savez cell above; the file name must match it.
!cp hospital_scores_l2_0.0001.npz "drive/MyDrive/Colab Notebooks/Influence results"
