In [None]:
!pip install adversarial-robustness-toolbox
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
from datetime import datetime 
from tqdm.notebook import tqdm 
import statistics
from math import log10
import struct
from random import randrange
import multiprocessing
import concurrent.futures
import time
import torch.utils.data as data_utils


from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix
import joblib 


# Directory holding the raw feature and label CSV files.
path = '../dataset/texas/100'


def _read_csv_tolerant(csv_path):
    '''
    Read a CSV file with a header row, dropping malformed lines instead of failing.

    pandas >= 1.3 spells this `on_bad_lines='warn'`; the legacy spelling
    `error_bad_lines=False` was removed in pandas 2.0. Try the modern keyword
    first and fall back to the legacy one on older pandas.
    '''
    try:
        return pd.read_csv(csv_path, header=0, on_bad_lines='warn')
    except TypeError:
        # Older pandas does not know `on_bad_lines`.
        return pd.read_csv(csv_path, header=0, error_bad_lines=False)


df_feats = _read_csv_tolerant(f'{path}/feats')
df_labels = _read_csv_tolerant(f'{path}/labels')

# Bad lines are dropped independently per file, so a dropped line in only one
# of them would silently pair features with the wrong label below.
if len(df_feats) != len(df_labels):
    print(f'WARNING: feats has {len(df_feats)} rows but labels has {len(df_labels)} rows; '
          'rows may be misaligned after concatenation')

# Column 0 is the label, the remaining columns are the features
# (ignore_index=True renumbers the columns 0..N).
df = pd.concat([df_labels, df_feats], axis=1, ignore_index=True)
# Make 100 labels as [0,1,...,99]
df[df.columns[0]] = df[df.columns[0]] - 1

# check number of classes
print('total class:', len(df[0].unique()))
print('class labels:', df[0].unique())
# records per class
print('records per class:\n', df[0].value_counts())


labels = df[0].unique()

df_list1 = []
df_list2 = []
df_list3 = []
df_list4 = []

# Per class: the first 100 rows feed the member halves, the last 100 rows feed
# the non-member halves (50 rows each).
for label in labels:
    df_tmp = df[df[0] == label]
    if len(df_tmp) < 200:
        # head(100) and tail(100) share rows when a class has fewer than 200 records.
        print(f'WARNING: class {label} has only {len(df_tmp)} records; '
              'member and non-member splits overlap')
    first_100 = df_tmp.head(100)
    last_100 = df_tmp.tail(100)
    df_list1.append(first_100.head(50))
    df_list2.append(first_100.tail(50))
    df_list3.append(last_100.head(50))
    df_list4.append(last_100.tail(50))

df_mem5k1 = pd.concat(df_list1, axis=0)
df_mem5k2 = pd.concat(df_list2, axis=0)
df_nonmem5k1 = pd.concat(df_list3, axis=0)
df_nonmem5k2 = pd.concat(df_list4, axis=0)

# shuffle dataframe (unseeded, so the row order differs between runs)
df_mem5k1 = df_mem5k1.sample(frac=1)
df_mem5k2 = df_mem5k2.sample(frac=1)
df_nonmem5k1 = df_nonmem5k1.sample(frac=1)
df_nonmem5k2 = df_nonmem5k2.sample(frac=1)


df_train = pd.concat([df_mem5k1, df_mem5k2], axis=0)
df_test = pd.concat([df_nonmem5k1, df_nonmem5k2], axis=0)

# Persist both splits as headerless, tab-separated files.
filename = '../dataset/texas_member.csv'
df_train.to_csv(filename, index=None, header=False, sep='\t', encoding='utf-8')
filename = '../dataset/texas_non_member.csv'
df_test.to_csv(filename, index=None, header=False, sep='\t', encoding='utf-8')

# separating label from data and converting dataframe into Torch Tensor
col_0 = df_train.columns[0]  # 1st column is label
col_rest = df_train.columns[1:]  # rest are data
X_train = torch.tensor(df_train[col_rest].values, dtype=torch.float32)
y_train = torch.tensor(df_train[col_0].values)  # 1-D vector of labels

print('-' * 30, 'train', '-' * 30)
display(X_train.size(), y_train.size())
display(X_train, y_train)


BATCH_SIZE = 100

train = data_utils.TensorDataset(X_train, y_train)
train_loader = data_utils.DataLoader(train, batch_size=BATCH_SIZE, shuffle=False)

print(len(train_loader.dataset))


# Sanity check: show the first batch and its dtypes, then stop.
for X, y_true in train_loader:
    print(X)
    print(y_true)
    print(X.dtype, y_true.dtype)
    break


def get_accuracy(model, data_loader):
    '''
    Compute the top-1 accuracy of `model` over every batch of `data_loader`.

    Args:
        model: a torch.nn.Module mapping a batch of inputs to per-class scores
            of shape (batch, num_classes).
        data_loader: iterable yielding (X, y_true) batches, where y_true holds
            the integer class index of each sample.

    Returns:
        0-dim float tensor: correctly classified samples / total samples.

    Raises:
        ValueError: if `data_loader` yields no samples (accuracy is undefined).

    The model is switched to eval mode for the duration of the call and is put
    back into whatever mode it was in before, so calling this in the middle of
    training does not silently leave the model in eval mode.
    '''

    correct_pred = 0
    n = 0

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for X, y_true in data_loader:

                y_hat = model(X)
                # index of the highest score along the class dimension
                _, predicted_labels = torch.max(y_hat, 1)

                n += y_true.size(0)
                correct_pred += (predicted_labels == y_true).sum()
    finally:
        # restore the caller's train/eval mode even if a batch raised
        model.train(was_training)

    if n == 0:
        raise ValueError('data_loader yielded no samples; accuracy is undefined')

    return correct_pred.float() / n



# TexasClassifier model
class TexasClassifier(nn.Module):
    '''
    Fully connected classifier: num_features -> 1024 -> 512 -> 256 -> 128 -> num_classes.

    Every hidden layer is followed by a Tanh activation; the output layer
    returns raw logits with no softmax or sigmoid applied.
    '''

    def __init__(self, num_features = 600, num_classes=100):
        super().__init__()
        # Layers are created in fc1..fc5 order so that seeded initialisation
        # draws from the RNG in the same sequence.
        self.fc1 = nn.Linear(num_features, 1024)
        self.fc2 = nn.Linear(1024, 512)
        self.fc3 = nn.Linear(512, 256)
        self.fc4 = nn.Linear(256, 128)
        self.fc5 = nn.Linear(128, num_classes)
        # NOTE: despite the attribute name, the activation is Tanh, not ReLU.
        self.relu = nn.Tanh()

    def forward(self, x):
        '''Return the logits for a batch `x` of shape (batch, num_features).'''
        hidden = x
        for layer in (self.fc1, self.fc2, self.fc3, self.fc4):
            hidden = self.relu(layer(hidden))

        # final projection, left unnormalised
        return self.fc5(hidden)




RANDOM_SEED = 42

# Seed before building the model so the weight initialisation is reproducible.
torch.manual_seed(RANDOM_SEED)
target_model = TexasClassifier()
optimizer = torch.optim.Adam(target_model.parameters(), lr=0.0005)
criterion = nn.CrossEntropyLoss()


N_EPOCHS = 475
print_every = 10

# Epoch index -> learning rate that takes effect from that epoch on.
# A fresh Adam optimizer is created at each milestone (which also resets its
# moment estimates); this is kept as-is so training dynamics are unchanged,
# including the milestone at epoch 20 that re-uses the initial rate.
LR_SCHEDULE = {
    20: 0.0005,
    40: 0.0001,
    60: 0.00005,
    80: 0.00001,
    100: 0.000005,
}


for epoch in range(N_EPOCHS):  # loop over the dataset multiple times
    running_loss = 0.0

    if epoch in LR_SCHEDULE:
        new_lr = LR_SCHEDULE[epoch]
        optimizer = torch.optim.Adam(target_model.parameters(), lr=new_lr)
        # Report the real value; the previous "decay to 1/10" / "1/100"
        # messages did not match the rates actually applied.
        print(f"\nlearning rate set to {new_lr:g} (optimizer re-created)...")

    # Loop variables are named so they do not overwrite the module-level
    # `labels` array of class ids.
    for inputs, targets in train_loader:
        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        output = target_model(inputs)

        loss = criterion(output, targets)
        loss.backward()
        # parameter update
        optimizer.step()

        # criterion returns the batch mean, so weight it by the batch size
        running_loss += loss.item() * inputs.size(0)

    # print statistics
    if epoch % print_every == (print_every - 1):
        epoch_loss = running_loss / len(train_loader.dataset)
        print(f'{datetime.now().time().replace(microsecond=0)} --- '
        f'Epoch: {epoch+1}\t'
        f'Train loss: {epoch_loss:.4f}\t')

print(f'-'*30, 'training is done', f'-'*30)

# Save first, so a failure during evaluation cannot discard the trained weights.
filename = '../dataset/Texas100.pth'
torch.save(target_model.state_dict(), filename)

# `valid_loader` was never defined, which raised NameError here after training.
# Build it from the rows the model was not trained on.
# NOTE(review): assumes the non-member split (df_test) is the intended
# validation set - confirm.
X_valid = torch.tensor(df_test[col_rest].values, dtype=torch.float32)
y_valid = torch.tensor(df_test[col_0].values)
valid_loader = data_utils.DataLoader(
    data_utils.TensorDataset(X_valid, y_valid),
    batch_size=BATCH_SIZE,
    shuffle=False,
)

valid_acc = get_accuracy(target_model, valid_loader)
print(f'Valid accuracy: {100 * valid_acc:.2f}')

print('Finished Training.')
