# import

In [0]:
import string
import sys
import warnings
from collections import OrderedDict
from collections import defaultdict
import os
import numpy as np
import pandas as pd
import math
import csv
import pickle
import torch
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import argparse
import shutil

# Preprocessing

In [0]:
if sys.version_info < (3,):
    maketrans = string.maketrans
else:
    maketrans = str.maketrans

def text_to_word_sequence(text,
                          filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                          lower=True, split=" "):
    
    if lower:
        text = text.lower()

    if sys.version_info < (3,):
        if isinstance(text, unicode):
            translate_map = dict((ord(c), unicode(split)) for c in filters)
            text = text.translate(translate_map)
        elif len(split) == 1:
            translate_map = maketrans(filters, split * len(filters))
            text = text.translate(translate_map)
        else:
            for c in filters:
                text = text.replace(c, split)
    else:
        translate_dict = dict((c, split) for c in filters)
        translate_map = maketrans(translate_dict)
        text = text.translate(translate_map)

    seq = text.split(split)
    return [i for i in seq if i]

In [0]:
class Tokenizer(object):

    def __init__(self, num_words=None,
                 filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                 lower=True,
                 split=' ',
                 char_level=False,
                 oov_token=None,
                 document_count=0,
                 **kwargs):
        # Legacy support
        if 'nb_words' in kwargs:
            warnings.warn('The `nb_words` argument in `Tokenizer` '
                          'has been renamed `num_words`.')
            num_words = kwargs.pop('nb_words')
        if kwargs:
            raise TypeError('Unrecognized keyword arguments: ' + str(kwargs))

        self.word_counts = OrderedDict()
        self.word_docs = defaultdict(int)
        self.filters = filters
        self.split = split
        self.lower = lower
        self.num_words = num_words
        self.document_count = document_count
        self.char_level = char_level
        self.oov_token = oov_token
        self.index_docs = defaultdict(int)
        self.word_index = dict()
        self.index_word = dict()
    def fit_on_texts(self, texts):
        
        for text in texts:
            self.document_count += 1
            if self.char_level or isinstance(text, list):
                if self.lower:
                    if isinstance(text, list):
                        text = [text_elem.lower() for text_elem in text]
                    else:
                        text = text.lower()
                seq = text
            else:
                seq = text_to_word_sequence(text,
                                            self.filters,
                                            self.lower,
                                            self.split)
            for w in seq:
                if w in self.word_counts:
                    self.word_counts[w] += 1
                else:
                    self.word_counts[w] = 1
            for w in set(seq):
                # In how many documents each word occurs
                self.word_docs[w] += 1

        wcounts = list(self.word_counts.items())
        wcounts.sort(key=lambda x: x[1], reverse=True)
        # forcing the oov_token to index 1 if it exists
        if self.oov_token is None:
            sorted_voc = []
        else:
            sorted_voc = [self.oov_token]
        sorted_voc.extend(wc[0] for wc in wcounts)

        # note that index 0 is reserved, never assigned to an existing word
        self.word_index = dict(
            list(zip(sorted_voc, list(range(1, len(sorted_voc) + 1)))))

        self.index_word = dict((c, w) for w, c in self.word_index.items())

        for w, c in list(self.word_docs.items()):
            self.index_docs[self.word_index[w]] = c

    def fit_on_sequences(self, sequences):
        self.document_count += len(sequences)
        for seq in sequences:
            seq = set(seq)
            for i in seq:
                self.index_docs[i] += 1

    def texts_to_sequences(self, texts):
        return list(self.texts_to_sequences_generator(texts))

    def texts_to_sequences_generator(self, texts):
        num_words = self.num_words
        oov_token_index = self.word_index.get(self.oov_token)
        for text in texts:
            if self.char_level or isinstance(text, list):
                if self.lower:
                    if isinstance(text, list):
                        text = [text_elem.lower() for text_elem in text]
                    else:
                        text = text.lower()
                seq = text
            else:
                seq = text_to_word_sequence(text,
                                            self.filters,
                                            self.lower,
                                            self.split)
            vect = []
            for w in seq:
                i = self.word_index.get(w)
                if i is not None:
                    if num_words and i >= num_words:
                        if oov_token_index is not None:
                            vect.append(oov_token_index)
                    else:
                        vect.append(i)
                elif self.oov_token is not None:
                    vect.append(oov_token_index)
            yield vect

def pad_sequences(sequences, maxlen=None, dtype='int32',
                  padding='pre', truncating='pre', value=0.):
    
    if not hasattr(sequences, '__len__'):
        raise ValueError('`sequences` must be iterable.')
    lengths = []
    for x in sequences:
        if not hasattr(x, '__len__'):
            raise ValueError('`sequences` must be a list of iterables. '
                             'Found non-iterable: ' + str(x))
        lengths.append(len(x))

    num_samples = len(sequences)
    if maxlen is None:
        maxlen = np.max(lengths)

    # take the sample shape from the first non empty sequence
    # checking for consistency in the main loop below.
    sample_shape = tuple()
    for s in sequences:
        if len(s) > 0:
            sample_shape = np.asarray(s).shape[1:]
            break

    is_dtype_str = np.issubdtype(dtype, np.str_) or np.issubdtype(dtype, np.unicode_)

    x = np.full((num_samples, maxlen) + sample_shape, value, dtype=dtype)
    for idx, s in enumerate(sequences):
        if not len(s):
            continue  # empty list/array was found
        if truncating == 'pre':
            trunc = s[-maxlen:]
        elif truncating == 'post':
            trunc = s[:maxlen]
        else:
            raise ValueError('Truncating type "%s" '
                             'not understood' % truncating)

        # check `trunc` has expected shape
        trunc = np.asarray(trunc, dtype=dtype)
        if trunc.shape[1:] != sample_shape:
            raise ValueError('Shape of sample %s of sequence at position %s '
                             'is different from expected shape %s' %
                             (trunc.shape[1:], idx, sample_shape))

        if padding == 'post':
            x[idx, :len(trunc)] = trunc
        elif padding == 'pre':
            x[idx, -len(trunc):] = trunc
        else:
            raise ValueError('Padding type "%s" not understood' % padding)
    return x

# loaddata

In [0]:
class dataset:
    def __init__(self,filename,line=3):
        self.output = []
        self.content = []
        self.columns=line
        self.loadcsv(filename)
        
    def loadcsv(self, filename):
        reader = csv.reader(open(filename, "rt", encoding = "utf8"))
        count = 0
        for row in reader:
            if not row:
                continue
            if self.columns==2:
                self.output.append(int(row[0])-1)
                self.content.append((row[1]).lower())
            elif self.columns==3:
                self.output.append(int(row[0])-1)
                self.content.append((row[1] + " " + row[2]).lower())           
            elif self.columns==4:
                self.output.append(int(row[0])-1)
                self.content.append((row[1] + " " + row[2] + " " + row[3]).lower())       
def loaddata(i = 0):
    datanames = ['covid'] 
    lines = [2]
    classes= [2]
    # if not 'blog' in datanames[i]:
    trainadd = 'covid_wordrnn_charDelete_advTrain_25000.csv'
    valadd='covid_val_7500.csv'
    testadd = 'covid_test_17500.csv'
    advtestadd='covid_wordrnn_charDelete_test_adv_samples.csv'

    traindata = dataset(trainadd,lines[i])
    valdata=dataset(valadd,lines[i])
    testdata = dataset(testadd,lines[i])
    advtestdata = dataset(advtestadd,lines[i])                  
    return(traindata,valdata,testdata,advtestdata,classes[i])#valdata,
    # else:
    #     data = dataset()
    #     return (traindata,testdata,classes[i])

def loaddatawithtokenize(i = 0, nb_words = 20000, start_char = 1, oov_char=2, index_from=3, withraw = False, datalen = 500):
    (traindata,valdata,testdata,advtestdata,numclass) = loaddata(i)
    rawtrain = traindata.content[:]
    rawval = valdata.content[:]
    rawtest = testdata.content[:]
    rawadvtest=advtestdata.content[:]

    tokenizer = Tokenizer(lower=True)
    tokenizer.fit_on_texts(traindata.content + valdata.content+ testdata.content+ advtestdata.content)

    traindata.content = tokenizer.texts_to_sequences(traindata.content)
    valdata.content  = tokenizer.texts_to_sequences(valdata.content)
    testdata.content = tokenizer.texts_to_sequences(testdata.content)
    advtestdata.content  = tokenizer.texts_to_sequences(advtestdata.content)
    
    if start_char==None:
        traindata.content = [[w + index_from for w in x] for x in traindata.content]
        valdata.content = [[w + index_from for w in x] for x in valdata.content]
        testdata.content = [[w + index_from for w in x] for x in testdata.content]
        advtestdata.content = [[w + index_from for w in x] for x in advtestdata.content]
    else:
        traindata.content = [[start_char]+[w + index_from for w in x] for x in traindata.content]
        valdata.content = [[start_char]+[w + index_from for w in x] for x in valdata.content]
        testdata.content = [[start_char]+[w + index_from for w in x] for x in testdata.content]
        advtestdata.content = [[start_char]+[w + index_from for w in x] for x in advtestdata.content]
    
    traindata.content = [[w if w < nb_words else oov_char for w in x] for x in traindata.content]
    valdata.content = [[w if w < nb_words else oov_char for w in x] for x in valdata.content]
    testdata.content = [[w if w < nb_words else oov_char for w in x] for x in testdata.content]
    advtestdata.content = [[w if w < nb_words else oov_char for w in x] for x in advtestdata.content]
    
    traindata.content = pad_sequences(traindata.content, maxlen=datalen)
    valdata.content = pad_sequences(valdata.content, maxlen=datalen)
    testdata.content = pad_sequences(testdata.content, maxlen=datalen)
    advtestdata.content = pad_sequences(advtestdata.content, maxlen=datalen)
    if withraw:
        return traindata,valdata,testdata,advtestdata,tokenizer,numclass,rawtrain,rawtest
    else:
        return traindata,valdata,testdata,advtestdata,tokenizer,numclass

# dataloader

In [0]:
class Worddata(Dataset):
    def __init__(self, data, tokenizer = True, length=1014, space = False, backward = -1, getidx = False, rawdata = None):
        self.backward = backward
        self.length = length
        (self.inputs,self.labels) = (data.content,data.output)
        self.labels = torch.LongTensor(self.labels)
        self.inputs = torch.from_numpy(self.inputs).long()
        self.getidx = getidx
        if rawdata:
            self.raw = rawdata
    def __len__(self):
        return len(self.inputs)
    def __getitem__(self,idx):
        x = self.inputs[idx]
        y = self.labels[idx]
        if self.getidx==True:
            if self.raw:
                return x,y,idx,self.raw[idx]
            else:
                return x,y,idx
        else:
            return x,y

# model

In [104]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [0]:
class WordRNN(nn.Module):
    def __init__(self, classes=2, bidirection = False, layernum=1, length=20000,embedding_size =100, hiddensize = 100):
        super(WordRNN, self).__init__()
        self.embd = nn.Embedding(length, embedding_size)
        # self.lstm = nn.LSTMCell(hiddensize, hiddensize)
        self.lstm = nn.LSTM(embedding_size, hiddensize, layernum, bidirectional = bidirection)
        self.hiddensize = hiddensize
        numdirections = 1 + bidirection
        self.hsize = numdirections * layernum
        self.linear = nn.Linear(hiddensize * numdirections, classes)
        self.log_softmax = nn.LogSoftmax()
    def forward(self, x, returnembd = False):
        embd = self.embd(x)
        if returnembd:
            embd = Variable(embd.data, requires_grad=True).to(device)
            embd.retain_grad()
            # print embd.size()
        h0 = Variable(torch.zeros(self.hsize, embd.size(0), self.hiddensize)).to(device)
        c0 = Variable(torch.zeros(self.hsize, embd.size(0), self.hiddensize)).to(device)
        # for inputs in x:
        x = embd.transpose(0,1)
        x,(hn,cn) = self.lstm(x,(h0,c0))
        x = x[-1]
        # x = x[-1].transpose(0,1)
        # x = x.view(x.size(0),-1)
        x = self.log_softmax(self.linear(x))
        if returnembd:
            return embd,x
        else:
            return x

In [0]:
def save_checkpoint(state, is_best, filename='checkpoint.dat'):
    torch.save(state, filename + '_checkpoint.dat')
    if is_best:
        shutil.copyfile(filename + '_checkpoint.dat', filename + "_bestmodel.dat")

In [0]:
data=0
wordlength=200
dictionarysize=20000
batchsize=128
backward=-1
epochs=5
power=10

In [108]:
torch.manual_seed(7)
torch.cuda.manual_seed_all(7)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print("Loading data..")
(train,val,test,advtest,tokenizer,numclass)=loaddatawithtokenize(data,nb_words=dictionarysize,datalen=wordlength)
word_index = tokenizer.word_index
trainword = Worddata(train,backward = backward)
valword = Worddata(val,backward = backward)
testword = Worddata(test,backward = backward)
advtestword = Worddata(advtest,backward = backward)
train_loader = DataLoader(trainword,batch_size=batchsize, shuffle = True)#, num_workers=4
val_loader = DataLoader(valword,batch_size=batchsize, shuffle = True)#, num_workers=4
test_loader = DataLoader(testword,batch_size=batchsize)#, num_workers=4
advtest_loader = DataLoader(advtestword,batch_size=batchsize)#, num_workers=4

maxlength =wordlength

Loading data..


In [0]:
model = WordRNN(classes = 2)
model = model.to(device)
#print(model)
optimizer = torch.optim.AdamW(model.parameters(),weight_decay=0.05)

In [112]:
bestacc = 0
beta=1
for epoch in range(epochs):
    #print('Start epoch %d' % epoch)
    model.train()
    correct_train = .0
    correct_adv = .0
    total_loss_train = 0
    for dataid, data in enumerate(train_loader):
        inputs,target = data
        inputs,target = Variable(inputs),  Variable(target)
        inputs, target = inputs.to(device), target.to(device)
        output = model(inputs)
        natural_loss=F.nll_loss(output, target)
        pred_train = torch.max(output, 1)[1].view(target.size())
        #pred_train = output.data.max(1, keepdim=True)[1]
        correct_train += (pred_train == target).sum().item()
        
        #advinputs=generate_adv(model, inputs, pred_train, numclass)
        #output_adv = model(advinputs)
        #adv_loss=F.nll_loss(output_adv, target)
        #pred_adv = torch.max(output_adv, 1)[1].view(target.size())
        #correct_adv += (pred_adv == target).sum().item()
        
        loss =natural_loss #+ beta *(adv_loss)
        total_loss_train += loss.item()
        

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    correct_val= .0
    total_loss_val = 0
    model.eval()
    for dataid, data in enumerate(val_loader):
        inputs,target = data
        inputs, target = inputs.to(device), target.to(device)
        output = model(inputs)
        loss = F.nll_loss(output, target)
        total_loss_val += loss.item()
        pred = output.data.max(1, keepdim=True)[1]
        correct_val += pred.eq(target.data.view_as(pred)).cpu().sum().item()

    acc_train = correct_train/len(train_loader.dataset)
    #acc_train_adv=correct_adv/len(train_loader.dataset)
    avg_loss_train = total_loss_train/len(train_loader.dataset)
    acc_val = correct_val/len(val_loader.dataset)
    avg_loss_val = total_loss_val/len(val_loader.dataset)
    print('Epoch %d :'%(epoch+1))
    print('Train_Loss: %.4f Train_Accuracy: %.5f ' % (avg_loss_train,acc_train))
    print('Validation_Loss: %.4f Validation_Accuracy: %.5f' % (avg_loss_val,acc_val))
    is_best = acc_val > bestacc
    if is_best:
        bestacc = acc_val
    if dictionarysize!=20000:
        fname = "covid_wordrnn_adv" +str(dictionarysize) + "_" 
    else:
        fname = "covid_wordrnn_adv" + "_" 
        
    save_checkpoint({
            'epoch': epoch + 1,
            'state_dict': model.state_dict(),
            'bestacc': bestacc,
            'optimizer' : optimizer.state_dict(),
        }, is_best, filename = fname)



Epoch 1 :
Train_Loss: 0.0044 Train_Accuracy: 0.69725 
Validation_Loss: 0.0040 Validation_Accuracy: 0.73308
Epoch 2 :
Train_Loss: 0.0030 Train_Accuracy: 0.82918 
Validation_Loss: 0.0032 Validation_Accuracy: 0.81381
Epoch 3 :
Train_Loss: 0.0021 Train_Accuracy: 0.89420 
Validation_Loss: 0.0029 Validation_Accuracy: 0.84626
Epoch 4 :
Train_Loss: 0.0014 Train_Accuracy: 0.93177 
Validation_Loss: 0.0030 Validation_Accuracy: 0.85184
Epoch 5 :
Train_Loss: 0.0010 Train_Accuracy: 0.95585 
Validation_Loss: 0.0031 Validation_Accuracy: 0.86607


# Evaluate

In [0]:
modelpath='covid_wordrnn_adv__bestmodel.dat'
model = WordRNN(classes = 2)

state = torch.load(modelpath)
model = model.to(device)
try:
    model.load_state_dict(state['state_dict'])
except:
    model = torch.nn.DataParallel(model).to(device)
    model.load_state_dict(state['state_dict'])
    model = model.module

In [114]:
correct = 0
total = 0
with torch.no_grad():
    for data in test_loader:
        inputs, labels = data
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print('Accuracy of the network on the 17500 clean test images: %.2f %%' % (100 * correct / total))



Accuracy of the network on the 17500 clean test images: 86.40 %


In [115]:
correct = 0
total = 0
lab=[]
pred=[]
with torch.no_grad():
    for data in advtest_loader:
        inputs, labels = data
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        #print(labels.tolist(),predicted.tolist())
        lab.extend(labels)
        pred.extend(predicted)
        correct += (predicted == labels).sum().item()

print('Accuracy of the network on the 17500 adversarial test images: %.2f %%' % (100 * correct / total))



Accuracy of the network on the 17500 adversarial test images: 81.85 %


In [0]:
lab=[i.item() for i in lab]
pred=[i.item() for i in pred]

In [88]:
from sklearn.metrics import classification_report
#target_names = ['class 0', 'class 1']
print(classification_report(lab, pred))#, target_names=target_names

              precision    recall  f1-score   support

           0       0.73      0.82      0.77      8748
           1       0.79      0.69      0.74      8796

    accuracy                           0.76     17544
   macro avg       0.76      0.76      0.75     17544
weighted avg       0.76      0.76      0.75     17544

