<a href="https://colab.research.google.com/github/VishnuSriraj/Data-Influence/blob/main/DataToDelete.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
import numpy as np
import pandas as pd
import scipy
import math

In [2]:
# A sample torch model with layer attribute
# inps: Number of input features
# hiddens: Number of neurons on each layer, 
#     e.g., [] means no hidden layer, 
#     [128] means one hidden layer with 128 neurons
# bias: Decide if there is a bias on each layer, must be true in the example
# seed: Reproductivity, None means random seed, otherwise specifiy a integer
# hidden_activation: Activation function after each hidden layer

class TorchNNCore(torch.nn.Module):
    def __init__(
        self, inps, hiddens=[], bias=True, seed=None, hidden_activation=torch.nn.ReLU
    ):
        super(TorchNNCore, self).__init__()
        if seed is not None:
            torch.manual_seed(seed)
        struct = [inps] + hiddens + [1]
        self.layers = [] # This layer attribute is required under 
        for i in range(1, len(struct)):
            self.layers.append(
                torch.nn.Linear(
                    in_features=struct[i - 1], out_features=struct[i], bias=bias
                )
            )
            if i == len(struct) - 1:
                self.layers.append(torch.nn.Sigmoid())
            else:
                self.layers.append(hidden_activation())
        self.model = torch.nn.Sequential(*self.layers)

    def forward(self, x):
        output = self.model(x)
        return output

In [6]:
# Prepare training & testing dataset
data1 = pd.read_csv('./adult.csv').to_numpy()
data2 = pd.read_csv('./broward.csv').to_numpy()
data3 = pd.read_csv('./hospital.csv').to_numpy()

X_train1 = torch.tensor(data1[:,:-1], dtype=torch.float)
y_train1 = torch.tensor(data1[:,-1].reshape(-1,1), dtype=torch.float)

X_train2 = torch.tensor(data2[:,:-1], dtype=torch.float)
y_train2 = torch.tensor(data2[:,-1].reshape(-1,1), dtype=torch.float)

X_train3 = torch.tensor(data3[:,:-1], dtype=torch.float)
y_train3 = torch.tensor(data3[:,-1].reshape(-1,1), dtype=torch.float)

print("\nData1: ",X_train1.shape, y_train1.shape)
print("\nData2: ",X_train2.shape, y_train2.shape)
print("\nData3: ",X_train3.shape, y_train3.shape)


Data1:  torch.Size([45222, 98]) torch.Size([45222, 1])

Data2:  torch.Size([7214, 8]) torch.Size([7214, 1])

Data3:  torch.Size([52778, 124]) torch.Size([52778, 1])


In [7]:
# Specify loss function, define model and optimizer
loss_func = torch.nn.BCELoss()

model1 = TorchNNCore(inps=X_train1.shape[1], hiddens=[128], hidden_activation=torch.nn.LeakyReLU)
optim1 = torch.optim.Adam(model1.parameters(),lr=0.001)
model2 = TorchNNCore(inps=X_train2.shape[1], hiddens=[128], hidden_activation=torch.nn.LeakyReLU)
optim2 = torch.optim.Adam(model2.parameters(),lr=0.001)
model3 = TorchNNCore(inps=X_train3.shape[1], hiddens=[128], hidden_activation=torch.nn.LeakyReLU)
optim3 = torch.optim.Adam(model3.parameters(),lr=0.001)

# Before using influence function, we show the structure of the model
print(model1)
print(model2)
print(model3)

# And we print the "layer" attribute, which is used to fetch the layers above
print("\nModel 1:")
for item in model1.layers:
    print(item)
print("\nModel 2:")
for item in model2.layers:
    print(item)
print("\nModel 3:")
for item in model3.layers:
    print(item)

TorchNNCore(
  (model): Sequential(
    (0): Linear(in_features=98, out_features=128, bias=True)
    (1): LeakyReLU(negative_slope=0.01)
    (2): Linear(in_features=128, out_features=1, bias=True)
    (3): Sigmoid()
  )
)
TorchNNCore(
  (model): Sequential(
    (0): Linear(in_features=8, out_features=128, bias=True)
    (1): LeakyReLU(negative_slope=0.01)
    (2): Linear(in_features=128, out_features=1, bias=True)
    (3): Sigmoid()
  )
)
TorchNNCore(
  (model): Sequential(
    (0): Linear(in_features=124, out_features=128, bias=True)
    (1): LeakyReLU(negative_slope=0.01)
    (2): Linear(in_features=128, out_features=1, bias=True)
    (3): Sigmoid()
  )
)

Model 1:
Linear(in_features=98, out_features=128, bias=True)
LeakyReLU(negative_slope=0.01)
Linear(in_features=128, out_features=1, bias=True)
Sigmoid()

Model 2:
Linear(in_features=8, out_features=128, bias=True)
LeakyReLU(negative_slope=0.01)
Linear(in_features=128, out_features=1, bias=True)
Sigmoid()

Model 3:
Linear(in_feature

In [12]:
sum_influences1 = np.zeros(X_train1.shape[0])
sum_influences2 = np.zeros(X_train2.shape[0])
sum_influences3 = np.zeros(X_train3.shape[0])

# For 10 times averaged result
for i in range(0,10):
  y_train_np1 = y_train1.detach().numpy()
  y_train_np2 = y_train2.detach().numpy()
  y_train_np3 = y_train3.detach().numpy()
  %%timeit
  for epoch in range(0,500):
      optim1.zero_grad()
      optim2.zero_grad()
      optim3.zero_grad()
      
      y_pred1 = model1(X_train1)
      y_pred2 = model2(X_train2)
      y_pred3 = model3(X_train3)
      
      loss1 = loss_func(y_pred1, y_train1)
      loss2 = loss_func(y_pred2, y_train2)
      loss3 = loss_func(y_pred3, y_train3)
      
      loss1.backward()
      loss2.backward()
      loss3.backward()

      optim1.step()
      optim2.step()
      optim3.step()
      if epoch%10==0:
          y_pred_np1 = (y_pred1.detach().numpy()) > 0.5
          y_pred_np2 = (y_pred2.detach().numpy()) > 0.5
          y_pred_np3 = (y_pred3.detach().numpy()) > 0.5

          accuracy1 = sum(y_pred_np1 == y_train_np1)/y_train_np1.shape[0]
          accuracy2 = sum(y_pred_np2 == y_train_np2)/y_train_np2.shape[0]
          accuracy3 = sum(y_pred_np3 == y_train_np3)/y_train_np3.shape[0]

          # print('\nEpoch = %d, loss1 = %.4f, accuracy=%.4f'%(epoch, loss1.tolist(), accuracy1))
          # print('Epoch = %d, loss2 = %.4f, accuracy=%.4f'%(epoch, loss2.tolist(), accuracy2))
          # print('Epoch = %d, loss3 = %.4f, accuracy=%.4f'%(epoch, loss3.tolist(), accuracy3))

  optim1.zero_grad()
  optim2.zero_grad()
  optim3.zero_grad()

  from InfluenceFunction import InfluenceFunction

  infl1 = InfluenceFunction(
      model = model1, # Warning: the class will take a snapshot of the model, any further change requires new instance
      X_train = X_train1, # Features, must be torch.Tensor
      y_train = y_train1, # Labels, must be torch.Tensor
      loss_func = loss_func, # In this example, it's BCELoss
      layer_index = -2, # In this example, as shown in the model structure, we use the second last layer 
  )
  infl2 = InfluenceFunction(
      model = model2, # Warning: the class will take a snapshot of the model, any further change requires new instance
      X_train = X_train2, # Features, must be torch.Tensor
      y_train = y_train2, # Labels, must be torch.Tensor
      loss_func = loss_func, # In this example, it's BCELoss
      layer_index = -2, # In this example, as shown in the model structure, we use the second last layer 
  )
  infl3 = InfluenceFunction(
      model = model3, # Warning: the class will take a snapshot of the model, any further change requires new instance
      X_train = X_train3, # Features, must be torch.Tensor
      y_train = y_train3, # Labels, must be torch.Tensor
      loss_func = loss_func, # In this example, it's BCELoss
      layer_index = -2, # In this example, as shown in the model structure, we use the second last layer 
  )

  # Measuring the influence of data by removing records
  influences1 = []
  influences2 = []
  influences3 = []

  for index in range(0, X_train1.shape[0]):
      influences1.append(infl1.influence_remove_single(index))

  for index in range(0, X_train2.shape[0]):
      influences2.append(infl2.influence_remove_single(index))

  for index in range(0, X_train3.shape[0]):
      influences3.append(infl3.influence_remove_single(index))

  # Note: The influence scores depend the current status of the model,
  # even for the same model configuration with different random seed,
  # the model may converge to different point in hyperspace, which will
  # result in different score for the same records.
  # Note2: The absolute value of the influece score is meaningless, but
  # they are comparable with each other.

  sum_influences1 = [sum_influences1[i] + influences1[i] for i in range(len(influences1))]
  sum_influences2 = [sum_influences2[i] + influences2[i] for i in range(len(influences2))]
  sum_influences3 = [sum_influences3[i] + influences3[i] for i in range(len(influences3))]

In [13]:
avg_influences1 = [val / 10 for val in sum_influences1]
avg_influences2 = [val / 10 for val in sum_influences2]
avg_influences3 = [val / 10 for val in sum_influences3]

In [14]:
# Converting the influence scores to absolute values
avg_influences1 = [abs(number) for number in avg_influences1]
avg_influences2 = [abs(number) for number in avg_influences2]
avg_influences3 = [abs(number) for number in avg_influences3]

In [15]:
# Creating a dictionary of average influence scores with index
result1 = {idx: val for idx, val in enumerate(avg_influences1)}
print(result1)
result2 = {idx: val for idx, val in enumerate(avg_influences2)}
print(result2)
result3 = {idx: val for idx, val in enumerate(avg_influences3)}
print(result3)

{0: 0.0002963327780287423, 1: 0.06967158241800239, 2: 0.05133991940225315, 3: 0.0, 4: 0.0003668807627790605, 5: 0.0, 6: 0.001206911530167198, 7: 0.0102334942868737, 8: 0.0, 9: 0.0376670046993, 10: 0.0006998608710344268, 11: 0.0, 12: 0.007747110220670454, 13: 7.095834063685964e-05, 14: 0.05041749666672708, 15: 0.0003667250476836235, 16: 0.11871749626791302, 17: 0.0010535082576904945, 18: 0.0013155910161564298, 19: 0.16281632923739547, 20: 0.0, 21: 0.0001008074943247663, 22: 0.0017839567055532729, 23: 0.03407004088450295, 24: 0.10265260234033798, 25: 0.0, 26: 0.006609724890277227, 27: 0.1694445097681325, 28: 0.02316177599861057, 29: 0.001138730337642303, 30: 0.03741232614610583, 31: 0.0009299056609782374, 32: 0.0002451329380316697, 33: 0.0001977605885974284, 34: 0.04952969088499777, 35: 0.02753906910127783, 36: 0.03535593179114388, 37: 0.010901571446296896, 38: 6.029160938914353e-05, 39: 0.17952162027377064, 40: 0.0027814608020133255, 41: 0.0, 42: 1.4578913926106317e-06, 43: 0.1107919354

In [16]:
# Implement Mean
def mn(data):
  mn = sum(data) / len(data)
  return mn

# Implementation of Variance
def variance(data):
  mean = mn(data)
  return sum((x - mean) ** 2 for x in data) / len(data)

# Implementation of Standard Deviation
def stdev(data):
  var = variance(data)
  std_dev = math.sqrt(var)
  return std_dev

# Count values under a certain threshold
def count_of_vals_under(num, infl):
  count = 0
  for i in infl:
    if i <= num:
      count += 1
  return count

In [17]:
# Analysing the data and printing the number of data below the certain influence scores
print("\nData1(Adult):")
print("Total number of values in Data 1:",X_train1.shape[0])
print("Standard Deviation of Data 1:",stdev(avg_influences1))
print("Mean of Data 1:", mn(avg_influences1))
print("Number of values under influence score of 1:",count_of_vals_under(1, avg_influences1))
print("Number of values under influence score of 0.5:",count_of_vals_under(0.5, avg_influences1))
print("Number of values under influence score of 0.1:",count_of_vals_under(0.1, avg_influences1))
print("Number of values under influence score of 0.05:",count_of_vals_under(0.05, avg_influences1))
print("Number of values under influence score of 0.01:",count_of_vals_under(0.01, avg_influences1))
print("Number of values with an influence score of 0:",count_of_vals_under(0, avg_influences1))

print("\nData2(Broward):")
print("Total number of values in Data 2:",X_train2.shape[0])
print("Standard Deviation of Data 2:",stdev(avg_influences2))
print("Mean of Data 2:", mn(avg_influences2))
print("Number of values under influence score of 1:",count_of_vals_under(1, avg_influences2))
print("Number of values under influence score of 0.5:",count_of_vals_under(0.5, avg_influences2))
print("Number of values under influence score of 0.1:",count_of_vals_under(0.1, avg_influences2))
print("Number of values under influence score of 0.05:",count_of_vals_under(0.05, avg_influences2))
print("Number of values under influence score of 0.01:",count_of_vals_under(0.01, avg_influences2))
print("Number of values under influence score of 0:",count_of_vals_under(0, avg_influences2))

print("\nData3(Hospital):")
print("Total number of values in Data 3:",X_train3.shape[0])
print("Standard Deviation of Data 3:",stdev(avg_influences3))
print("Mean of Data 3:", mn(avg_influences3))
print("Number of values under influence score of 1:",count_of_vals_under(1, avg_influences3))
print("Number of values under influence score of 0.5:",count_of_vals_under(0.5, avg_influences3))
print("Number of values under influence score of 0.1:",count_of_vals_under(0.1, avg_influences3))
print("Number of values under influence score of 0.05:",count_of_vals_under(0.05, avg_influences3))
print("Number of values under influence score of 0.01:",count_of_vals_under(0.01, avg_influences3))
print("Number of values with an influence score of 0:",count_of_vals_under(0, avg_influences3))


Data1(Adult):
Total number of values in Data 1: 45222
Standard Deviation of Data 1: 0.1229641748878543
Mean of Data 1: 0.0536082959816943
Number of values under influence score of 1: 45114
Number of values under influence score of 0.5: 44576
Number of values under influence score of 0.1: 38472
Number of values under influence score of 0.05: 33664
Number of values under influence score of 0.01: 22919
Number of values with an influence score of 0: 3782

Data2(Broward):
Total number of values in Data 2: 7214
Standard Deviation of Data 2: 0.046797126626487516
Mean of Data 2: 0.028618250201900872
Number of values under influence score of 1: 7213
Number of values under influence score of 0.5: 7206
Number of values under influence score of 0.1: 6940
Number of values under influence score of 0.05: 6189
Number of values under influence score of 0.01: 2544
Number of values under influence score of 0: 1

Data3(Hospital):
Total number of values in Data 3: 52778
Standard Deviation of Data 3: 0.060

In [18]:
# Changing the influence scores below 0.01 in the data to False(data to be deleted)

def data_to_be_deleted(infl, threshold):
  for idx, val in enumerate(infl):
    if val <= threshold:
        infl[idx] = False
    else:
        infl[idx] = True

data_to_be_deleted(avg_influences1, 0.01)
data_to_be_deleted(avg_influences2, 0.01)
data_to_be_deleted(avg_influences3, 0.01)

In [19]:
# Verifying the data set to False
def verify(infl):
  count = 0
  for i in infl:
    if i == False:
      count += 1
  return count

print("\nTotal number of values to be deleted in data 1:",verify(avg_influences1))
print('Result 1: ',avg_influences1)
print("\nTotal number of values to be deleted in data 2:",verify(avg_influences2))
print('Result 2: ',avg_influences2)
print("\nTotal number of values to be deleted in data 3:",verify(avg_influences3))
print('Result 3: ',avg_influences3)


Total number of values to be deleted in data 1: 22919
Result 1:  [False, True, True, False, False, False, False, True, False, True, False, False, False, False, True, False, True, False, False, True, False, False, False, True, True, False, False, True, True, False, True, False, False, False, True, True, True, True, False, True, False, False, False, True, False, False, False, False, True, False, True, False, False, True, True, True, True, False, False, False, False, True, False, False, True, False, False, True, True, True, True, False, False, False, True, True, True, False, True, False, True, False, False, False, True, False, False, True, True, False, False, False, False, False, False, True, False, False, False, False, True, True, False, False, True, False, False, False, False, False, False, False, False, False, True, False, True, False, True, False, False, True, True, True, True, True, False, True, False, False, True, False, False, False, True, False, False, False, True, True, False, T

### Deleting data by entering percentage value

In [20]:
# Sorting the dictionary
sorted_result1 = {k: v for k, v in sorted(result1.items(), key=lambda item: item[1])}
print(sorted_result1)
sorted_result2 = {k: v for k, v in sorted(result2.items(), key=lambda item: item[1])}
print(sorted_result2)
sorted_result3 = {k: v for k, v in sorted(result3.items(), key=lambda item: item[1])}
print(sorted_result3)

{3: 0.0, 5: 0.0, 8: 0.0, 11: 0.0, 20: 0.0, 25: 0.0, 41: 0.0, 47: 0.0, 51: 0.0, 52: 0.0, 59: 0.0, 63: 0.0, 82: 0.0, 96: 0.0, 129: 0.0, 135: 0.0, 136: 0.0, 167: 0.0, 168: 0.0, 195: 0.0, 207: 0.0, 209: 0.0, 217: 0.0, 235: 0.0, 244: 0.0, 254: 0.0, 274: 0.0, 284: 0.0, 288: 0.0, 289: 0.0, 305: 0.0, 313: 0.0, 316: 0.0, 318: 0.0, 319: 0.0, 328: 0.0, 379: 0.0, 384: 0.0, 402: 0.0, 408: 0.0, 411: 0.0, 434: 0.0, 436: 0.0, 443: 0.0, 461: 0.0, 466: 0.0, 486: 0.0, 487: 0.0, 494: 0.0, 510: 0.0, 531: 0.0, 535: 0.0, 538: 0.0, 541: 0.0, 543: 0.0, 548: 0.0, 566: 0.0, 573: 0.0, 580: 0.0, 588: 0.0, 593: 0.0, 595: 0.0, 600: 0.0, 613: 0.0, 621: 0.0, 624: 0.0, 629: 0.0, 636: 0.0, 637: 0.0, 643: 0.0, 645: 0.0, 651: 0.0, 652: 0.0, 657: 0.0, 667: 0.0, 693: 0.0, 712: 0.0, 721: 0.0, 723: 0.0, 735: 0.0, 755: 0.0, 764: 0.0, 766: 0.0, 815: 0.0, 820: 0.0, 825: 0.0, 830: 0.0, 834: 0.0, 847: 0.0, 887: 0.0, 898: 0.0, 899: 0.0, 906: 0.0, 907: 0.0, 912: 0.0, 918: 0.0, 919: 0.0, 921: 0.0, 946: 0.0, 960: 0.0, 964: 0.0, 985: 0

In [21]:
# Indices remaining in each dataset
ind_rem1 = []
ind_rem2 = []
ind_rem3 = []

def delete_data(data, percent):
  cut_val = math.floor(len(data) * percent / 100)
  return list(data.keys())[cut_val:]

ind_rem1 = delete_data(sorted_result1, 10) # Enter data and percentage value to delete
print('Remaining indices in Adult data: ',ind_rem1)
ind_rem2 = delete_data(sorted_result2, 10) # Enter data and percentage value to delete
print('Remaining indices in Broward data: ',ind_rem2)
ind_rem3 = delete_data(sorted_result3, 10) # Enter data and percentage value to delete
print('Remaining indices in Hospital data: ',ind_rem3)

Remaining indices in Adult data:  [39871, 2686, 20867, 37805, 28448, 31810, 21537, 18010, 42889, 44433, 30084, 35267, 36072, 1020, 35002, 23494, 4033, 12825, 21889, 23993, 12490, 19437, 44889, 6448, 34589, 6366, 5234, 10898, 41101, 5992, 43351, 17661, 3159, 395, 4303, 22524, 13367, 9680, 32411, 18873, 32844, 31562, 30047, 34315, 9373, 20941, 11831, 30149, 28723, 39992, 704, 496, 2701, 12738, 17538, 41631, 42992, 13711, 12884, 22650, 9642, 30734, 36427, 5945, 39476, 30988, 27661, 5660, 43120, 22954, 3084, 29135, 28503, 41341, 280, 22536, 38251, 18902, 27135, 20486, 27540, 18915, 7154, 8705, 19909, 35348, 26904, 32033, 14794, 38733, 16015, 37844, 4051, 15624, 31740, 44176, 17213, 37134, 28341, 2527, 37033, 7899, 1493, 2772, 31482, 7254, 6944, 31704, 32689, 24362, 42604, 43142, 1504, 24712, 40167, 24417, 10668, 33248, 20801, 7629, 1958, 43275, 6462, 16628, 19365, 36123, 3496, 8802, 14846, 7265, 23925, 3002, 10117, 26756, 37084, 7983, 962, 7110, 9072, 14786, 15799, 17663, 21933, 22556, 258