In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import warnings
import numpy as np
from pathlib import Path

import torch
from torch import nn
import torch.nn.functional as F
from torch import optim
from torch.autograd import Variable
import tqdm
import pickle
import matplotlib.pyplot as plt
import matplotlib

from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader

In [2]:
# Report the installed PyTorch build, its CUDA version and the GPU count.
print('torch version', torch.__version__)
print('cuda version',torch.version.cuda)
print(torch.cuda.device_count())

# Use GPU 0 when CUDA is available, otherwise CPU (also with index 0).
use_cuda = torch.cuda.is_available()
device = torch.device("cuda", 0) if use_cuda else torch.device("cpu", 0)
print (device)

if device.type == 'cuda':
    print(torch.cuda.get_device_name())
    print('Memory Usage:')
    # (label, byte count) pairs for device 0, printed in GB with one decimal.
    gpu_memory_report = (
        ('Allocated:   ', torch.cuda.memory_allocated(0)),
        ('Cached:      ', torch.cuda.memory_reserved(0)),
        ('Total memory:', torch.cuda.get_device_properties(0).total_memory),
    )
    for label, n_bytes in gpu_memory_report:
        print(label, round(n_bytes/1024**3,1), 'GB')

torch version 1.8.0
cuda version 11.1
1
cuda:0
NVIDIA GeForce RTX 2080 Ti
Memory Usage:
Allocated:    0.0 GB
Cached:       0.0 GB
Total memory: 10.8 GB


In [3]:
%matplotlib inline
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['ps.fonttype'] = 42
matplotlib.rc('xtick', labelsize=20) 
matplotlib.rc('ytick', labelsize=20)
import pylab
pylab.rcParams['figure.figsize'] = (10.0, 8.0)

In [17]:
# --- Experiment configuration -------------------------------------------
# Flow length: `min_idx` selects one entry of `minutes` and is also used to
# index the pickled flow collections.
# NOTE(review): the entries of `minutes` look like durations in seconds
# (3, 5, 15 and 30 minutes) -- confirm.
minutes = [180, 300, 900, 1800]
min_idx = 2
flowSize = minutes[min_idx]

# Training schedule.
batch_size = 125
num_epoches = 4
negative_samples = 4 # how many false for every correct sample

# Not referenced by any of the visible cells.
scale = 0.5
pad_ratio = 0.5
pb = 0.0005

In [18]:
# Locations of the pre-processed traces and of the derived artefacts.
root_dir_wire_normal_no_photos = 'path/to/traces/for/training/and/validation'
results_dir = Path(root_dir_wire_normal_no_photos).joinpath('deepcorr')
pickles_dir = results_dir.joinpath('pickles')

# Load the pickled user-side and channel-side flow collections.
# NOTE: pickle.load executes arbitrary code; only load files you produced.
with (results_dir / 'all_user_flows.pickle').open('rb') as handle:
    all_user_flows = pickle.load(handle)
with (results_dir / 'all_channel_flows.pickle').open('rb') as handle:
    all_channel_flows = pickle.load(handle)

In [19]:
# Pick the flow collection for the configured flow length (index 2 -> 900)
# and convert both sides to NumPy arrays.
userFlows, advFlows = (
    np.array(flow_sets[min_idx])
    for flow_sets in (all_user_flows, all_channel_flows)
)

In [20]:
# Show the shapes of the user-side and channel-side flow arrays.
tuple(flows.shape for flows in (userFlows, advFlows))

((268, 46, 900), (268, 46, 900))

In [21]:
# Randomly split the first-axis indices of `userFlows`: the first 200 shuffled
# indices are used for training, the remaining ones for validation.
length = len(userFlows)
rr = list(range(length))
np.random.shuffle(rr)
train_index = rr[:200]
test_index = rr[200:] # validation

# Persist the split the first time it is generated so it can be reloaded on
# later runs. An existing file is never overwritten, so a split saved earlier
# stays authoritative and the notebook still runs top to bottom on a fresh
# results directory.
split_path = pickles_dir / 'Wire_train_test_indices_deepcorr.pickle'
if not split_path.exists():
    split_path.parent.mkdir(parents=True, exist_ok=True)
    with open(split_path, 'wb') as f:
        pickle.dump((train_index, test_index), f)

In [22]:
# Reload the persisted train/validation split so every run uses the same
# indices.
indices_path = pickles_dir / 'Wire_train_test_indices_deepcorr.pickle'
with indices_path.open('rb') as handle:
    train_index, test_index = pickle.load(handle)

In [None]:
def generate_data(user_flows, adv_flows, train_index):
    """Build one epoch of training pairs with freshly sampled negatives.

    For every training index ``i`` and every slot ``j`` along the second axis,
    one positive sample is written (``user_flows[i, j]`` paired with
    ``adv_flows[i, j]``, label 1). It is immediately followed by
    ``negative_samples`` negatives that pair the same ``adv_flows[i, j]`` with
    ``user_flows[idx, j]`` for randomly chosen other training indices
    (label 0).

    Reads the notebook-level settings ``negative_samples`` and ``flowSize``.

    Args:
        user_flows: array indexed as ``[flow index, slot, position]`` whose
            last axis has ``flowSize`` entries.
        adv_flows: array with the same layout as ``user_flows``.
        train_index: sequence of first-axis indices to draw samples from.

    Returns:
        ``(x_train, y_train)`` with shapes ``(N, 1, 2, flowSize)`` and
        ``(N, 1)``, where
        ``N = len(train_index) * user_flows.shape[1] * (negative_samples + 1)``.

    Raises:
        RuntimeError: if fewer than ``negative_samples`` other training
            indices are available for a positive sample.
    """
    group_size = negative_samples + 1
    # Use the function argument, not the notebook global, for the slot count.
    n_slots = user_flows.shape[1]
    n_samples = len(train_index) * n_slots * group_size

    x_train = np.zeros((n_samples, 1, 2, flowSize))
    y_train = np.zeros((n_samples, 1))

    print ('x_train shape is {}'.format(x_train.shape))

    index = 0
    # Private copy that is reshuffled for every positive sample; the caller's
    # sequence is left untouched.
    random_ordering = list(train_index)
    for i in tqdm.tqdm(train_index):
        for j in range(n_slots):
            # Positive pair: both sides come from the same flow index.
            x_train[index, 0, 0, :] = user_flows[i, j, :]
            x_train[index, 0, 1, :] = adv_flows[i, j, :]
            y_train[index, 0] = 1
            index += 1

            # Negative pairs: same channel-side flow, user-side flow taken
            # from other training indices in random order.
            np.random.shuffle(random_ordering)
            m = 0
            for idx in random_ordering:
                if m >= negative_samples:
                    break
                if idx == i:
                    continue
                x_train[index, 0, 0, :] = user_flows[idx, j, :]
                x_train[index, 0, 1, :] = adv_flows[i, j, :]
                y_train[index, 0] = 0
                m += 1
                index += 1

            if m != negative_samples:
                # Without this check the fixed group layout (one positive
                # followed by `negative_samples` negatives) would silently
                # break.
                raise RuntimeError(
                    'only {} of {} negative samples available for training '
                    'index {}'.format(m, negative_samples, i))
    return x_train, y_train

# Validation set: for every validation index i and slot k, the user-side flow
# (i, k) is paired with the channel-side flow (j, k) of every validation
# index j. Samples are laid out in consecutive runs of len(test_index) rows
# (one run per (i, k)); the row with j == i is the only one labelled 1.
n_test_pairs = len(test_index)*len(test_index)*userFlows.shape[1]
x_test = np.zeros((n_test_pairs, 1, 2, flowSize))
y_test = np.zeros((n_test_pairs, 1))

index = 0
for i in tqdm.tqdm(test_index):
    for k in range(len(userFlows[i])):
        for j in test_index:
            x_test[index, 0, 0, :] = userFlows[i, k, :]
            x_test[index, 0, 1, :] = advFlows[j, k, :]
            y_test[index, 0] = 1 if i == j else 0
            index += 1


In [33]:
class Net(nn.Module):
    """Convolutional network scoring a (user flow, channel flow) pair.

    The input is a batch of shape ``(N, 1, 2, flow_size)``: one channel, two
    rows (the two flows of the pair) and ``flow_size`` positions. Two
    convolution / max-pool stages are followed by four fully connected layers;
    the output is one logit per sample, shape ``(N, 1)``.

    Args:
        flow_size: length of the last input axis. The default of 900 gives
            the 80800 flattened features the network was written for.

    Raises:
        ValueError: if ``flow_size`` is too short for the convolution stack.
    """

    def __init__(self, flow_size=900):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 1000, (2,40), stride=2)
        self.max_pool1 = nn.MaxPool2d((1,4), stride=1)

        self.conv2 = nn.Conv2d(1000, 400, (1,20), stride=2)
        self.conv2_bn = nn.BatchNorm2d(400)
        self.max_pool2 = nn.MaxPool2d((1,4), stride=1)

        self.fc1 = nn.Linear(self._flat_features(flow_size), 3000)

        self.fc2 = nn.Linear(3000, 800)
        self.fc3 = nn.Linear(800, 100)
        self.fc4 = nn.Linear(100, 1)

    @staticmethod
    def _flat_features(flow_size):
        """Return the number of features left after the convolutional stack."""
        width = (flow_size - 40) // 2 + 1   # conv1: kernel width 40, stride 2
        width -= 3                          # max_pool1: window 4, stride 1
        width = (width - 20) // 2 + 1       # conv2: kernel width 20, stride 2
        width -= 3                          # max_pool2: window 4, stride 1
        if width < 1:
            raise ValueError(
                'flow_size={} is too short for the convolution stack'.format(flow_size))
        # The height collapses to 1 after conv1 (two input rows, kernel
        # height 2), so only the 400 channels and the width remain.
        return 400 * width

    def weight_init(self):
        """Initialise conv/linear weights from N(0, 0.01) and zero the biases."""
        # `self.modules()` yields the module objects themselves. Iterating
        # `self._modules` yields only their names, so no layer would match
        # the isinstance checks below.
        for m in self.modules():
            if isinstance(m, (nn.Linear, nn.Conv2d)):
                nn.init.normal_(m.weight, 0.0, 0.01)
                if m.bias is not None:
                    nn.init.zeros_(m.bias)

    def forward(self, inp, dropout_prob):
        """Return one logit per sample.

        Args:
            inp: float tensor of shape ``(N, 1, 2, flow_size)``.
            dropout_prob: dropout probability applied after each of the first
                three fully connected layers. Dropout is only active while
                the module is in training mode.

        Returns:
            Tensor of shape ``(N, 1)`` with raw (pre-sigmoid) scores.
        """
        x = F.relu(self.conv1(inp))
        x = self.max_pool1(x)
        x = F.relu(self.conv2_bn(self.conv2(x)))
        x = self.max_pool2(x)

        # Flatten per sample using the actual batch size, so short final
        # batches and single samples work as well.
        x = x.view(x.size(0), -1)

        # Element-wise dropout is the right variant for 2-D activations of
        # fully connected layers (dropout2d is meant for feature maps).
        x = F.leaky_relu(self.fc1(x))
        x = F.dropout(x, p=dropout_prob, training=self.training)
        x = F.leaky_relu(self.fc2(x))
        x = F.dropout(x, p=dropout_prob, training=self.training)
        x = F.leaky_relu(self.fc3(x))
        x = F.dropout(x, p=dropout_prob, training=self.training)
        x = self.fc4(x)
        return x

In [34]:
class TrainData(Dataset):
    """Dataset over NumPy training samples and their labels.

    Each item is the pair ``(x[index], y[index])`` converted to float tensors
    and moved to the notebook-level ``device``.
    """

    def __init__(self, x_train, y_train):
        self.x, self.y = x_train, y_train

    def __len__(self):
        return len(self.x)

    def __getitem__(self, index):
        sample = torch.from_numpy(self.x[index]).float()
        label = torch.from_numpy(self.y[index]).float()
        return sample.to(device), label.to(device)
    

class TestData(Dataset):
    """Dataset over NumPy evaluation samples (no labels).

    Each item is ``x[index]`` converted to a float tensor and moved to the
    notebook-level ``device``.
    """

    def __init__(self, x_test):
        self.x = x_test

    def __len__(self):
        return len(self.x)

    def __getitem__(self, index):
        sample = torch.from_numpy(self.x[index]).float()
        return sample.to(device)


In [35]:
# Validation loader: the sample order must be preserved (no shuffling)
# because the evaluation code matches scores to labels by position.
test_dataset = TestData(x_test)
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=batch_size,
    shuffle=False,
)

In [15]:
def train(model, device, train_loader, optimizer):
    """Run one training epoch and collect the sigmoid score of every sample.

    Args:
        model: module called as ``model(data, dropout_prob)`` that returns one
            logit per sample.
        device: device the batches are moved to before the forward pass
            (a no-op when they already live there); ``None`` skips the move.
        train_loader: loader yielding ``(data, labels)`` batches and exposing
            the underlying ``dataset`` (used to size the score buffer).
        optimizer: optimizer over the parameters of ``model``.

    Returns:
        ``(train_loss, train_corrs)``: the mean batch loss of the epoch and a
        ``(len(dataset), 1)`` array of sigmoid scores. Rows are stored in the
        order the loader produced the samples, so with a shuffling loader
        they do not follow the dataset order.
    """
    model.train()
    running_loss = 0.0
    # Size the buffer from the loader itself instead of notebook globals.
    train_corrs = np.zeros((len(train_loader.dataset), 1))
    offset = 0
    for batch_idx, (data, labels) in enumerate(train_loader):
        if device is not None:
            data, labels = data.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(data, 0.4) # dropout = 0.4
        loss = F.binary_cross_entropy_with_logits(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

        corrs = torch.sigmoid(outputs).detach().cpu().numpy()
        # Advance by the real batch length so a short final batch is stored
        # in the right place.
        train_corrs[offset:offset + len(corrs)] = corrs
        offset += len(corrs)

        if batch_idx % 100 == 0:
            print ("Loss: {:0.6f} for batch_idx {}".format(loss.item(), batch_idx))
    # Guard against an empty loader.
    train_loss = running_loss / max(len(train_loader), 1)
    print("Training loss: {} for epoch".format(train_loss))
    return train_loss, train_corrs

def test(model, device, test_loader, index):
    test_corrs = np.zeros((len(index)*len(index)*userFlows.shape[1], 1))
    model.eval()
    with torch.no_grad():
        for batch_idx, data in enumerate(test_loader):
            if data.size(0) != batch_size: ### ?
                print('size is {}'.format(data.size(0)))
                continue
            output = model(data, 0.0)
            o = torch.sigmoid(output)
            corrs = o.data.cpu().numpy()

            if batch_idx % 100 == 0:
                print ("batch_idx {}".format(batch_idx))
            test_corrs[batch_idx*batch_size:(batch_idx+1)*batch_size] = corrs
    return test_corrs
            
    
def cal_tp_fp(corrs):
    """True- and false-positive rates of a score matrix at ten thresholds.

    Diagonal entries of ``corrs`` are treated as the matching pairs and all
    other entries as non-matching pairs. For each threshold ``t`` in
    ``np.arange(0, 1, 0.1)`` the function reports the fraction of diagonal
    entries with score ``>= t`` (TP rate) and the fraction of off-diagonal
    entries with score ``>= t`` (FP rate).

    Args:
        corrs: non-empty 2-D array of scores.

    Returns:
        ``(tp, fp)``: two lists of ten floats, one entry per threshold.

    Raises:
        ValueError: if ``corrs`` is not a non-empty 2-D array.
    """
    corrs = np.asarray(corrs)
    if corrs.ndim != 2 or corrs.size == 0:
        raise ValueError('corrs must be a non-empty 2-D array')

    diag = np.diagonal(corrs)
    # Everything that is not on the main diagonal.
    off_diag = corrs[~np.eye(*corrs.shape, dtype=bool)]

    tp = []
    fp = []
    for t in np.arange(0,1,0.1):
        # Count the entries above the threshold; summing their values would
        # not give a rate.
        tp.append(float(np.count_nonzero(diag >= t)) / len(diag))
        if off_diag.size:
            fp.append(float(np.count_nonzero(off_diag >= t)) / off_diag.size)
        else:
            # 1x1 matrix: there are no non-matching pairs at all.
            fp.append(0.0)

    return (tp,fp)

In [17]:
# Build the network on the selected device, run its custom weight
# initialisation and optimise all parameters with Adam (learning rate 1e-4).
model = Net()
model = model.to(device)
model.weight_init()
optimizer = optim.Adam(params=model.parameters(), lr=1e-4)

In [None]:
def _top1_counts(scores, labels, group_size):
    """Count groups whose best-scoring row is / is not labelled 1.

    Rows are split into consecutive groups of ``group_size``; a trailing
    incomplete group is ignored. Returns ``(fp, tp)``.
    """
    n_groups = int(len(scores) / group_size)
    hits = 0
    for g in range(n_groups):
        start = g * group_size
        best = np.argmax(scores[start:start + group_size])
        if labels[best + start] == 1:
            hits += 1
    return n_groups - hits, hits


last_acc = 0
last_loss = 100
for epoch in range(num_epoches):

    print ('Epoch: ', epoch)

    # Negatives are re-sampled every epoch, so the training set is rebuilt.
    x_train, y_train = generate_data(userFlows, advFlows, train_index)

    train_dataset = TrainData(x_train, y_train)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    print('now calculating loss')
    loss, train_corrs = train(model, device, train_loader, optimizer)

    # NOTE(review): the 'train' accuracy below is not a reliable metric.
    # The loader shuffles while train() stores scores in loader order, so
    # rows of train_corrs do not line up with y_train. In addition,
    # generate_data lays samples out in groups of negative_samples + 1, not
    # len(train_index). The computation is kept as it was; fixing it needs
    # unshuffled scores.
    fp, tp = _top1_counts(train_corrs, y_train, len(train_index))
    acc = float(tp) / float(tp+fp) if (tp+fp) else 0.0
    print ('train', fp, tp, acc)

    print('now testing (validation)')
    corrs = test(model, device, test_loader, test_index)

    # Validation samples come in runs of len(test_index) candidates with
    # exactly one true match each; a run counts as a hit when the true match
    # has the highest score.
    fp, tp = _top1_counts(corrs, y_test, len(test_index))
    acc = float(tp) / float(tp+fp) if (tp+fp) else 0.0
    print ('test (validation)', fp, tp, acc)

    # Checkpoint only when validation accuracy is acceptable and the training
    # loss improved on the best saved so far.
    if acc > 0.4 and loss < last_loss:
        print ("saving...")
        checkpoint_path = results_dir / 'models/{}/wire-{}-epoch{}-acc{:.2f}-nsp{}.pth.tar'.format(flowSize, flowSize, epoch, acc, negative_samples)
        # torch.save does not create missing directories.
        checkpoint_path.parent.mkdir(parents=True, exist_ok=True)
        torch.save(model.state_dict(), checkpoint_path)

        print ("saved for epoch {} and loss {} and accuracy {}".format(epoch, loss, acc))
        last_acc = acc
        last_loss = loss