In [8]:
import json
import os
import torch
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils

from pathlib import Path

import math
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from sklearn.metrics import f1_score  

  warn(f"Failed to load image Python extension: {e}")


In [9]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

### Helper functions

In [10]:
def list_files(dir):                                                                                                  
    r = []                                                                                                            
    subdirs = [x[0] for x in os.walk(dir)]
    for subdir in subdirs:
        files = os.walk(subdir).__next__()[2]
        if (len(files) > 0):                                                                                          
            for file in files:
                r.append(os.path.join(subdir, file))                   
    return r

def custom_split(x):
    return [x[i:i+20].strip() for i in range(0, len(x), 20)]


### Paths

In [11]:
train_path='./CSV_Data/Training'
test_path='./CSV_Data/Performance Test'
validation_path='./CSV_Data/Validation'

### Preprocessing

In [12]:
class preprocess:
    def __init__(self,dir_path,start_index,end_index):
        self.x=None
        self.y = None
        self.vocab= dict()
        
        self.load(dir_path,start_index,end_index)
        self.tosplit()
        self.build_vocab()
        self.tokenize()
        print("#### Done ####")
        
    def len_vocab(self):
        return len(self.vocab)
    
    def load(self,dir_path,start_index,end_index):
        print('#### Loading files ####')
        files=list_files(dir_path)
        files.sort()
        files=files[start_index:end_index]
        dataframes=[]
        req_cols=["hex_values","class"]
        for file in files:
            print(file)
            df = pd.read_csv(file,sep='\t',usecols=req_cols)
            dataframes.append(df)
        data=pd.concat(dataframes,ignore_index=True)
        self.y=data['class']
        self.x=data['hex_values']
    
    def tosplit(self):
        for idx, value in self.x.iteritems():
            self.x[idx]=custom_split(value)
          
    def build_vocab(self):  
        print('#### Building vocab ####')     
        i=1
        for idx, value in self.x.iteritems():
            for element in value:
                if element in self.vocab:
                    pass
                else:
                    self.vocab[element]=i
                    i=i+1
            
    def tokenize(self):
        print('#### Tokenization ####')
        for idx, value in self.x.iteritems():
            for i in range(len(value)):
                try:
                    value[i]=self.vocab[value[i]]
                except:
                    value[i]=0
                
        

        
        

### Dataset

In [13]:
class classification_set(Dataset):
    def __init__(self,x,y):
        self.x=x
        self.y=y
        self.to_tensor()
    def __len__(self):
        return len(self.x)
    def __getitem__(self, idx):
        return self.x[idx], self.y[idx]
    def __setitem__(self,idx,value):
        self.x[idx]=value[0]
        self.y[idx]=value[1]
    def to_tensor(self):
        for i in range(len(self.x)):
            temp=list(self[i])
            temp[0]=torch.Tensor(temp[0]).int()
            temp[1]=float(temp[1])
            temp=tuple(temp)
            self[i]=temp
        

In [17]:
%%time
train_data=preprocess(train_path,21,29)

#### Loading files ####
./CSV_Data/Training/basic-V_7_8_P1-16.csv
./CSV_Data/Training/basic-V_7_8_P1-24.csv
./CSV_Data/Training/basic-V_7_8_P1-32.csv
./CSV_Data/Training/basic-V_7_8_P1-64.csv
./CSV_Data/Training/basic-V_7_9_P1-16.csv
./CSV_Data/Training/basic-V_7_9_P1-24.csv
./CSV_Data/Training/basic-V_7_9_P1-32.csv
./CSV_Data/Training/basic-V_7_9_P1-64.csv
#### Building vocab ####
#### Tokenization ####
#### Done ####
CPU times: user 34.2 s, sys: 2.3 s, total: 36.5 s
Wall time: 1min 7s


In [18]:
%%time
test_data=preprocess(test_path,0,9)

#### Loading files ####
./CSV_Data/Performance Test/V_7_1_P1-16.csv
./CSV_Data/Performance Test/V_7_1_P1-24.csv
./CSV_Data/Performance Test/V_7_1_P1-32.csv
./CSV_Data/Performance Test/V_7_8_P1-16.csv
./CSV_Data/Performance Test/V_7_8_P1-24.csv
./CSV_Data/Performance Test/V_7_8_P1-32.csv
./CSV_Data/Performance Test/V_7_9_P1-16.csv
./CSV_Data/Performance Test/V_7_9_P1-24.csv
./CSV_Data/Performance Test/V_7_9_P1-32.csv
#### Building vocab ####
#### Tokenization ####
#### Done ####
CPU times: user 5.28 s, sys: 110 ms, total: 5.39 s
Wall time: 5.67 s


In [9]:
# train_data.vocab.update(test_data.vocab)
# test_data.vocab=train_data.vocab
# test_data.tokenize()

In [10]:
# %%time
# validation_data=preprocess(validation_path)

In [20]:
train_set= classification_set(train_data.x,train_data.y)
test_set= classification_set(test_data.x,test_data.y)
# validation_set= classification_set(validation_data.x,validation_data.y)

In [12]:
# validation_dataloader = DataLoader(validation_set, batch_size)

In [21]:
len(test_data.vocab)

2929679

In [22]:
len(train_data.vocab)

14986772

### Model

In [36]:
import n_gram_cnn

In [37]:
model= n_gram_cnn.model

In [18]:
import gc
gc.collect()
torch.cuda.empty_cache()

In [19]:
# del model

In [34]:
print("allocated",torch.cuda.memory_allocated())
print("cached",torch.cuda.memory_reserved())

allocated 1923833344
cached 1941962752


In [33]:
model.to(device)

classifier(
  (dropout): Dropout(p=0.5, inplace=False)
  (embedding): Embedding(14986773, 32)
  (conv_1): Conv1d(200, 256, kernel_size=(2,), stride=(2,))
  (conv_2): Conv1d(200, 256, kernel_size=(4,), stride=(2,))
  (conv_3): Conv1d(200, 256, kernel_size=(8,), stride=(2,))
  (conv_4): Conv1d(200, 256, kernel_size=(10,), stride=(2,))
  (pool_1): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (pool_2): MaxPool1d(kernel_size=4, stride=2, padding=0, dilation=1, ceil_mode=False)
  (pool_3): MaxPool1d(kernel_size=8, stride=2, padding=0, dilation=1, ceil_mode=False)
  (pool_4): MaxPool1d(kernel_size=10, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc): Linear(in_features=4864, out_features=1, bias=True)
)

### Training / Evaluation


In [29]:
def train(model, train_set, test_set):
    optimizer = optim.Adam(model.parameters(), lr=0.0001)
    batch_size = 100
    train_dataloader = DataLoader(train_set, batch_size)
    test_dataloader = DataLoader(test_set, batch_size)
    
    for epoch in range(10):
        model.train()
        predictions1 =[]
        for x_batch, y_batch in train_dataloader:

            y_batch = y_batch.type(torch.FloatTensor)
            x_batch= x_batch.to(device)
            y_batch = y_batch.to(device)
            y_pred = model(x_batch)
            #remove squeeze in case batch size > 1
            y_batch=torch.squeeze(y_batch)

            
            loss = F.binary_cross_entropy(y_pred, y_batch)
            optimizer.zero_grad()
            loss.backward()
            # predictions.append(y_pred.cpu().detach().numpy())

            predictions1.extend(list(y_pred.cpu().detach().numpy()))
        predictions1=np.round_(predictions1)
        predictions = [int(a) for a in predictions1]


        test_predictions = evaluation(model, test_dataloader)
        train_accuary = calculate_accuray(train_set.y.to_list(), predictions1)
        test_accuracy = calculate_accuray(test_set.y.to_list(), test_predictions)
        # train_f1_score= calculate_f1_score(train_set.y.to_list(), predictions1)
        # test_f1_score= calculate_f1_score(test_set.y.to_list(), test_predictions)
        print("Epoch: %d, loss: %.5f, Train accuracy: %.5f,  Test accuracy: %.5f" % (epoch+1, loss.item(), train_accuary, test_accuracy))
        

def evaluation(model, test_dataloader):        
    model.eval()
    predictions2 = []
    # with torch.no_grad():
    for x_batch, y_batch in test_dataloader:
        
        x_batch= x_batch.to(device)
        y_batch = y_batch.to(device)
        y_pred = model(x_batch)
        # predictions.append(y_pred.cpu().detach().numpy())
        predictions2.extend(list(y_pred.cpu().detach().numpy()))
    predictions2=np.round_(predictions2)
    predictions=[int(a) for a in predictions2]
    return predictions

def calculate_accuray(grand_truth, predictions):
    true_positives = 0
    true_negatives = 0
    for true, pred in zip(grand_truth, predictions):
        if (pred == 1 ) and (true == 1):
            true_positives += 1
        elif (pred == 0) and (true == 0):
            true_negatives += 1
        else:
            pass
    return (true_positives+true_negatives) / len(grand_truth)

def calculate_f1_score(grand_truth, predictions):
    return f1_score(grand_truth,predictions)


In [24]:
%%time
train(model,train_set,test_set)

Epoch: 1, loss: 0.70495, Train accuracy: 0.65623,  Test accuracy: 0.43796
Epoch: 2, loss: 0.71717, Train accuracy: 0.65652,  Test accuracy: 0.43796
Epoch: 3, loss: 0.78768, Train accuracy: 0.65599,  Test accuracy: 0.43796
Epoch: 4, loss: 0.75863, Train accuracy: 0.65705,  Test accuracy: 0.43796
Epoch: 5, loss: 0.75729, Train accuracy: 0.65647,  Test accuracy: 0.43796
Epoch: 6, loss: 0.71620, Train accuracy: 0.65713,  Test accuracy: 0.43796
Epoch: 7, loss: 0.73988, Train accuracy: 0.65710,  Test accuracy: 0.43796
Epoch: 8, loss: 0.73501, Train accuracy: 0.65746,  Test accuracy: 0.43796
Epoch: 9, loss: 0.72632, Train accuracy: 0.65603,  Test accuracy: 0.43796
Epoch: 10, loss: 0.75637, Train accuracy: 0.65649,  Test accuracy: 0.43796
CPU times: user 11min 14s, sys: 1.03 s, total: 11min 15s
Wall time: 11min 15s
