<a href="https://colab.research.google.com/github/VishnuSriraj/Data-Influence/blob/main/DataToDelete.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
import numpy as np
import pandas as pd
import scipy
import math

In [2]:
# A sample torch model with layer attribute
# inps: Number of input features
# hiddens: Number of neurons on each layer, 
#     e.g., [] means no hidden layer, 
#     [128] means one hidden layer with 128 neurons
# bias: Decide if there is a bias on each layer, must be true in the example
# seed: Reproductivity, None means random seed, otherwise specifiy a integer
# hidden_activation: Activation function after each hidden layer

class TorchNNCore(torch.nn.Module):
    def __init__(
        self, inps, hiddens=[], bias=True, seed=None, hidden_activation=torch.nn.ReLU
    ):
        super(TorchNNCore, self).__init__()
        if seed is not None:
            torch.manual_seed(seed)
        struct = [inps] + hiddens + [1]
        self.layers = [] # This layer attribute is required under 
        for i in range(1, len(struct)):
            self.layers.append(
                torch.nn.Linear(
                    in_features=struct[i - 1], out_features=struct[i], bias=bias
                )
            )
            if i == len(struct) - 1:
                self.layers.append(torch.nn.Sigmoid())
            else:
                self.layers.append(hidden_activation())
        self.model = torch.nn.Sequential(*self.layers)

    def forward(self, x):
        output = self.model(x)
        return output

In [3]:
# Prepare training & testing dataset
data1 = pd.read_csv('./adult.csv').to_numpy()
data2 = pd.read_csv('./broward.csv').to_numpy()
data3 = pd.read_csv('./hospital.csv').to_numpy()

X_train1 = torch.tensor(data1[:,:-1], dtype=torch.float)
y_train1 = torch.tensor(data1[:,-1].reshape(-1,1), dtype=torch.float)

X_train2 = torch.tensor(data2[:,:-1], dtype=torch.float)
y_train2 = torch.tensor(data2[:,-1].reshape(-1,1), dtype=torch.float)

X_train3 = torch.tensor(data3[:,:-1], dtype=torch.float)
y_train3 = torch.tensor(data3[:,-1].reshape(-1,1), dtype=torch.float)

print("\nData1: ",X_train1.shape, y_train1.shape)
print("\nData2: ",X_train2.shape, y_train2.shape)
print("\nData3: ",X_train3.shape, y_train3.shape)


Data1:  torch.Size([45222, 98]) torch.Size([45222, 1])

Data2:  torch.Size([7214, 8]) torch.Size([7214, 1])

Data3:  torch.Size([52778, 124]) torch.Size([52778, 1])


In [4]:
# Specify loss function, define model and optimizer
loss_func = torch.nn.BCELoss()

model1 = TorchNNCore(inps=X_train1.shape[1], hiddens=[128], hidden_activation=torch.nn.LeakyReLU)
optim1 = torch.optim.Adam(model1.parameters(),lr=0.001)
model2 = TorchNNCore(inps=X_train2.shape[1], hiddens=[128], hidden_activation=torch.nn.LeakyReLU)
optim2 = torch.optim.Adam(model2.parameters(),lr=0.001)
model3 = TorchNNCore(inps=X_train3.shape[1], hiddens=[128], hidden_activation=torch.nn.LeakyReLU)
optim3 = torch.optim.Adam(model3.parameters(),lr=0.001)

# Before using influence function, we show the structure of the model
print(model1)
print(model2)
print(model3)

# And we print the "layer" attribute, which is used to fetch the layers above
print("\nModel 1:")
for item in model1.layers:
    print(item)
print("\nModel 2:")
for item in model2.layers:
    print(item)
print("\nModel 3:")
for item in model3.layers:
    print(item)

TorchNNCore(
  (model): Sequential(
    (0): Linear(in_features=98, out_features=128, bias=True)
    (1): LeakyReLU(negative_slope=0.01)
    (2): Linear(in_features=128, out_features=1, bias=True)
    (3): Sigmoid()
  )
)
TorchNNCore(
  (model): Sequential(
    (0): Linear(in_features=8, out_features=128, bias=True)
    (1): LeakyReLU(negative_slope=0.01)
    (2): Linear(in_features=128, out_features=1, bias=True)
    (3): Sigmoid()
  )
)
TorchNNCore(
  (model): Sequential(
    (0): Linear(in_features=124, out_features=128, bias=True)
    (1): LeakyReLU(negative_slope=0.01)
    (2): Linear(in_features=128, out_features=1, bias=True)
    (3): Sigmoid()
  )
)

Model 1:
Linear(in_features=98, out_features=128, bias=True)
LeakyReLU(negative_slope=0.01)
Linear(in_features=128, out_features=1, bias=True)
Sigmoid()

Model 2:
Linear(in_features=8, out_features=128, bias=True)
LeakyReLU(negative_slope=0.01)
Linear(in_features=128, out_features=1, bias=True)
Sigmoid()

Model 3:
Linear(in_feature

In [5]:
sum_influences1 = np.zeros(X_train1.shape[0])
sum_influences2 = np.zeros(X_train2.shape[0])
sum_influences3 = np.zeros(X_train3.shape[0])

# For 10 times averaged result
for i in range(0,10):
  y_train_np1 = y_train1.detach().numpy()
  y_train_np2 = y_train2.detach().numpy()
  y_train_np3 = y_train3.detach().numpy()
  for epoch in range(0,200):
      optim1.zero_grad()
      optim2.zero_grad()
      optim3.zero_grad()
      
      y_pred1 = model1(X_train1)
      y_pred2 = model2(X_train2)
      y_pred3 = model3(X_train3)
      
      loss1 = loss_func(y_pred1, y_train1)
      loss2 = loss_func(y_pred2, y_train2)
      loss3 = loss_func(y_pred3, y_train3)
      
      loss1.backward()
      loss2.backward()
      loss3.backward()

      optim1.step()
      optim2.step()
      optim3.step()
      if epoch%10==0:
          y_pred_np1 = (y_pred1.detach().numpy()) > 0.5
          y_pred_np2 = (y_pred2.detach().numpy()) > 0.5
          y_pred_np3 = (y_pred3.detach().numpy()) > 0.5

          accuracy1 = sum(y_pred_np1 == y_train_np1)/y_train_np1.shape[0]
          accuracy2 = sum(y_pred_np2 == y_train_np2)/y_train_np2.shape[0]
          accuracy3 = sum(y_pred_np3 == y_train_np3)/y_train_np3.shape[0]

          print('\nEpoch = %d, loss1 = %.4f, accuracy=%.4f'%(epoch, loss1.tolist(), accuracy1))
          print('Epoch = %d, loss2 = %.4f, accuracy=%.4f'%(epoch, loss2.tolist(), accuracy2))
          print('Epoch = %d, loss3 = %.4f, accuracy=%.4f'%(epoch, loss3.tolist(), accuracy3))

  optim1.zero_grad()
  optim2.zero_grad()
  optim3.zero_grad()

  from InfluenceFunction import InfluenceFunction

  infl1 = InfluenceFunction(
      model = model1, # Warning: the class will take a snapshot of the model, any further change requires new instance
      X_train = X_train1, # Features, must be torch.Tensor
      y_train = y_train1, # Labels, must be torch.Tensor
      loss_func = loss_func, # In this example, it's BCELoss
      layer_index = -2, # In this example, as shown in the model structure, we use the second last layer 
  )
  infl2 = InfluenceFunction(
      model = model2, # Warning: the class will take a snapshot of the model, any further change requires new instance
      X_train = X_train2, # Features, must be torch.Tensor
      y_train = y_train2, # Labels, must be torch.Tensor
      loss_func = loss_func, # In this example, it's BCELoss
      layer_index = -2, # In this example, as shown in the model structure, we use the second last layer 
  )
  infl3 = InfluenceFunction(
      model = model3, # Warning: the class will take a snapshot of the model, any further change requires new instance
      X_train = X_train3, # Features, must be torch.Tensor
      y_train = y_train3, # Labels, must be torch.Tensor
      loss_func = loss_func, # In this example, it's BCELoss
      layer_index = -2, # In this example, as shown in the model structure, we use the second last layer 
  )

  # Measuring the influence of data by removing records
  influences1 = []
  influences2 = []
  influences3 = []

  for index in range(0, X_train1.shape[0]):
      influences1.append(infl1.influence_remove_single(index))

  for index in range(0, X_train2.shape[0]):
      influences2.append(infl2.influence_remove_single(index))

  for index in range(0, X_train3.shape[0]):
      influences3.append(infl3.influence_remove_single(index))

  # Note: The influence scores depend the current status of the model,
  # even for the same model configuration with different random seed,
  # the model may converge to different point in hyperspace, which will
  # result in different score for the same records.
  # Note2: The absolute value of the influece score is meaningless, but
  # they are comparable with each other.

  sum_influences1 = [sum_influences1[i] + influences1[i] for i in range(len(influences1))]
  sum_influences2 = [sum_influences2[i] + influences2[i] for i in range(len(influences2))]
  sum_influences3 = [sum_influences3[i] + influences3[i] for i in range(len(influences3))]


Epoch = 0, loss1 = 3.3331, accuracy=0.2466
Epoch = 0, loss2 = 1.9113, accuracy=0.4507
Epoch = 0, loss3 = 0.7275, accuracy=0.3456

Epoch = 10, loss1 = 3.5739, accuracy=0.7734
Epoch = 10, loss2 = 0.8434, accuracy=0.5495
Epoch = 10, loss3 = 0.6549, accuracy=0.6544

Epoch = 20, loss1 = 3.5605, accuracy=0.7734
Epoch = 20, loss2 = 0.7380, accuracy=0.5549
Epoch = 20, loss3 = 0.6360, accuracy=0.6544

Epoch = 30, loss1 = 3.5060, accuracy=0.7761
Epoch = 30, loss2 = 0.6931, accuracy=0.4831
Epoch = 30, loss3 = 0.6293, accuracy=0.6544

Epoch = 40, loss1 = 3.4929, accuracy=0.7771
Epoch = 40, loss2 = 0.6379, accuracy=0.6297
Epoch = 40, loss3 = 0.6215, accuracy=0.6545

Epoch = 50, loss1 = 3.4772, accuracy=0.7768
Epoch = 50, loss2 = 0.6145, accuracy=0.6654
Epoch = 50, loss3 = 0.6131, accuracy=0.6565

Epoch = 60, loss1 = 3.4618, accuracy=0.7804
Epoch = 60, loss2 = 0.6089, accuracy=0.6770
Epoch = 60, loss3 = 0.6040, accuracy=0.6637

Epoch = 70, loss1 = 3.4462, accuracy=0.7844
Epoch = 70, loss2 = 0.6075,

In [31]:
avg_influences1 = [val / 10 for val in sum_influences1]
avg_influences2 = [val / 10 for val in sum_influences2]
avg_influences3 = [val / 10 for val in sum_influences3]

In [32]:
# Converting the influence scores to absolute values
avg_influences1 = [abs(number) for number in avg_influences1]
avg_influences2 = [abs(number) for number in avg_influences2]
avg_influences3 = [abs(number) for number in avg_influences3]

In [33]:
avg_influences1

[0.021516909718732685,
 0.3756625381141868,
 3.0396977785765866,
 0.0019505382192298068,
 0.043563691566087845,
 12.158091268883183,
 0.005066887379295476,
 0.17904857421523993,
 0.005417241335956715,
 2.918929776649961,
 0.015607010479438813,
 13.071861846579656,
 0.13823279094706986,
 0.006942787270344931,
 0.4493900029920872,
 0.020281575890839294,
 0.44509991485881545,
 0.021948961806303454,
 0.12120779887425062,
 0.5020084230483718,
 0.012049622586393225,
 0.011680856118083225,
 0.009719983175509985,
 0.08532473582928554,
 0.3102019273821631,
 0.0006840446855823989,
 0.03976948322878706,
 0.11206908511419082,
 0.6097854827233765,
 0.0410966526056171,
 0.008318236854259331,
 0.013695695255371737,
 0.03198833751367991,
 0.021402013341920843,
 0.6123245906151342,
 1.0032497356006727,
 0.11391371007337563,
 0.20930024166381775,
 0.012635193746612367,
 1.2458006177210041,
 0.04208466725635782,
 3.6330066823070473,
 0.023243437975174232,
 0.051732421247932,
 0.0013071126033960686,
 0.03

In [34]:
avg_influences2

[0.003098580156666495,
 0.04782860935853799,
 0.04262153135908468,
 0.08808248033753743,
 0.005815334316535964,
 0.01695520875547676,
 0.027533323775255504,
 0.019422510403795216,
 0.0036689173281775726,
 0.007874375772584662,
 0.006752150508906654,
 0.006640859187615841,
 0.007672972904689962,
 0.0035697224672085652,
 0.03452174512113327,
 0.0010511956443838527,
 0.027825954586580205,
 0.001130946413837345,
 0.004891243800667806,
 0.010557749298240698,
 0.12095322919112868,
 0.0033830696058738826,
 0.05959454013648417,
 0.008142647769484979,
 0.02057269351560926,
 0.007893351366309805,
 0.01994141585837334,
 0.02075006606417116,
 0.02166457468946175,
 0.0682997648140272,
 0.16513874667995504,
 0.014866768680854278,
 0.013245831876781444,
 0.012054900436123162,
 0.005185651772678985,
 0.06529603750880733,
 0.03369600473761948,
 0.05639674552179763,
 0.05313835860020116,
 0.0319158240066907,
 0.10647362914329758,
 0.006591815719290314,
 0.021564437064153907,
 0.08369891788047468,
 0.062

In [35]:
avg_influences3

[0.021235858270378397,
 0.02581368691118993,
 0.11271247654833154,
 0.032572242166086576,
 0.0012529969706742638,
 0.024456601224718988,
 0.08412937313852692,
 0.106587308736967,
 0.03081043862739798,
 0.03618293321308231,
 0.10959980301760282,
 0.04052066737620903,
 0.07031492730604948,
 0.24583928964866536,
 0.03803702821516142,
 0.11542792476928738,
 0.017164773627112467,
 0.02872223813191263,
 0.08926732786175709,
 0.1708203751011903,
 0.1416642414827,
 0.1741005576870107,
 0.06329898661911551,
 0.16265381527440062,
 0.017711422979923112,
 0.02336413744041089,
 0.11244212872196044,
 0.09262580293891114,
 0.15292812804448555,
 0.02699660607220662,
 0.04011321456076163,
 0.021518670502299372,
 0.0633314552818635,
 0.07406333708038867,
 0.03819920148751227,
 0.060449601981317325,
 0.0754206685229237,
 0.02396797341093709,
 3.6112688103230247e-06,
 0.08472064477138344,
 0.06603351876145108,
 0.020761683221533822,
 0.02156110578920173,
 0.001241254914476627,
 0.04763557038678125,
 0.005

In [38]:
# Implement Mean
def mn(data):
  mn = sum(data) / len(data)
  return mn

# Implementation of Variance
def variance(data):
  mean = mn(data)
  return sum((x - mean) ** 2 for x in data) / len(data)

# Implementation of Standard Deviation
def stdev(data):
  var = variance(data)
  std_dev = math.sqrt(var)
  return std_dev

# Count values under a certain threshold
def count_of_vals_under(num, infl):
  count = 0
  for i in infl:
    if i <= num:
      count += 1
  return count

In [39]:
# Analysing the data and printing the number of data below the certain influence scores
print("\nData1(Adult):")
print("Total number of values in Data 1:",X_train1.shape[0])
print("Standard Deviation of Data 1:",stdev(avg_influences1))
print("Mean of Data 1:", mn(avg_influences1))
print("Number of values under influence score of 1:",count_of_vals_under(1, avg_influences1))
print("Number of values under influence score of 0.5:",count_of_vals_under(0.5, avg_influences1))
print("Number of values under influence score of 0.1:",count_of_vals_under(0.1, avg_influences1))
print("Number of values under influence score of 0.05:",count_of_vals_under(0.05, avg_influences1))
print("Number of values under influence score of 0.01:",count_of_vals_under(0.01, avg_influences1))
print("Number of values with an influence score of 0:",count_of_vals_under(0, avg_influences1))

print("\nData2(Broward):")
print("Total number of values in Data 2:",X_train2.shape[0])
print("Standard Deviation of Data 2:",stdev(avg_influences2))
print("Mean of Data 2:", mn(avg_influences2))
print("Number of values under influence score of 1:",count_of_vals_under(1, avg_influences2))
print("Number of values under influence score of 0.5:",count_of_vals_under(0.5, avg_influences2))
print("Number of values under influence score of 0.1:",count_of_vals_under(0.1, avg_influences2))
print("Number of values under influence score of 0.05:",count_of_vals_under(0.05, avg_influences2))
print("Number of values under influence score of 0.01:",count_of_vals_under(0.01, avg_influences2))
print("Number of values under influence score of 0:",count_of_vals_under(0, avg_influences2))

print("\nData3(Hospital):")
print("Total number of values in Data 3:",X_train3.shape[0])
print("Standard Deviation of Data 3:",stdev(avg_influences3))
print("Mean of Data 3:", mn(avg_influences3))
print("Number of values under influence score of 1:",count_of_vals_under(1, avg_influences3))
print("Number of values under influence score of 0.5:",count_of_vals_under(0.5, avg_influences3))
print("Number of values under influence score of 0.1:",count_of_vals_under(0.1, avg_influences3))
print("Number of values under influence score of 0.05:",count_of_vals_under(0.05, avg_influences3))
print("Number of values under influence score of 0.01:",count_of_vals_under(0.01, avg_influences3))
print("Number of values with an influence score of 0:",count_of_vals_under(0, avg_influences3))


Data1(Adult):
Total number of values in Data 1: 45222
Standard Deviation of Data 1: 190.426797092564
Mean of Data 1: 6.238587768209289
Number of values under influence score of 1: 38106
Number of values under influence score of 0.5: 32866
Number of values under influence score of 0.1: 20411
Number of values under influence score of 0.05: 15577
Number of values under influence score of 0.01: 5830
Number of values with an influence score of 0: 888

Data2(Broward):
Total number of values in Data 2: 7214
Standard Deviation of Data 2: 0.06389953918766562
Mean of Data 2: 0.040791887901062286
Number of values under influence score of 1: 7210
Number of values under influence score of 0.5: 7205
Number of values under influence score of 0.1: 6680
Number of values under influence score of 0.05: 5383
Number of values under influence score of 0.01: 1452
Number of values under influence score of 0: 0

Data3(Hospital):
Total number of values in Data 3: 52778
Standard Deviation of Data 3: 0.052812127

In [40]:
# Changing the influence scores below 0.01 in the data to False(data to be deleted)

def data_to_be_deleted(infl, threshold):
  for idx, val in enumerate(infl):
    if val <= threshold:
        infl[idx] = False
    else:
        infl[idx] = True

data_to_be_deleted(avg_influences1, 0.01)
data_to_be_deleted(avg_influences2, 0.01)
data_to_be_deleted(avg_influences3, 0.01)

In [41]:
# Verifying the data set to False
def verify(infl):
  count = 0
  for i in infl:
    if i == False:
      count += 1
  return count

print("\nTotal number of values to be deleted in data 1:",verify(avg_influences1))
print("\nTotal number of values to be deleted in data 2:",verify(avg_influences2))
print("\nTotal number of values to be deleted in data 3:",verify(avg_influences3))



Total number of values to be deleted in data 1: 5830

Total number of values to be deleted in data 2: 1452

Total number of values to be deleted in data 3: 7100
