In [1]:
import argparse
import sys
sys.path.append("..")
import os

In [12]:
import torch 
#print(torch.__version__)

In [13]:
# Hyper-parameters are declared through argparse so the notebook mirrors a
# command-line script; inside the notebook only the defaults are used.
parser = argparse.ArgumentParser()
# Embedding layer type: 'tt', 'tr' or a plain dense 'full' embedding.
parser.add_argument(
    '--embedding',
    default='full',
    choices=['tt', 'tr', 'full'],
    type=str)
# Rank handed to the factorized embedding (tt_rank / tr_rank).
parser.add_argument('--ranks', type=int, default=8)
# Passed as `d=` to the factorized embedding constructors.
parser.add_argument('--d', type=int, default=3)
parser.add_argument('--embed_dim', type=int, default=64)
# Vocabulary budget; the vocabulary is built with max_size = voc_dim - 2.
parser.add_argument('--voc_dim', default=250, type=int)
# type=float so that a value given on the command line is parsed as a number
# instead of being kept as a string.
parser.add_argument('--lr', default=5e-4, type=float)
# Value written to CUDA_VISIBLE_DEVICES.
parser.add_argument('--gpu', default='', type=str)
parser.add_argument('--hidden_dim', default=128, type=int)
parser.add_argument('--n_epochs',  default=10, type=int)
parser.add_argument('--fout',  default="logdir/", type=str)
parser.add_argument('--dropout', default=0.5, type=float)
parser.add_argument(
    '--dataset',
    default='TB',
    type=str)
# An explicit empty argument list keeps argparse away from the kernel's own
# sys.argv, so every option takes its default.
args = parser.parse_args([])

In [14]:
# Short tag describing the embedding type; every value other than
# 'tt' / 'tr' is labelled "full".
tt = args.embedding if args.embedding in ('tt', 'tr') else "full"

In [15]:
# Run identifier assembled from the dataset name, embedding size, `d`,
# rank and the embedding tag `tt`.
model_name = f"{args.dataset}-dim_{args.embed_dim}-d_{args.d}-ranks_{args.ranks}-{tt}"

In [16]:
import os
# Device selection is written to the environment before the bulk of the
# framework imports below.
# NOTE(review): an empty --gpu value presumably hides every GPU, which would
# make `device` fall back to CPU - confirm this is the intended default.
os.environ['CUDA_DEVICE_ORDER']='PCI_BUS_ID'
os.environ['CUDA_VISIBLE_DEVICES']=args.gpu
import utils
import torch
import numpy as np
import torch.nn.functional as F
import torch.nn as nn
import t3nsor as t3
from torchtext import data
from torchtext import datasets
import torch.optim as optim
from models import LSTM_Classifier
#from utils import binary_accuracy, train, evaluate
import pickle
import random
import spacy
from spacy.cli.download import download

# Seed every random number generator in use. Seeding only Python's `random`
# leaves weight initialisation and dropout (torch) and any numpy sampling
# different on each run.
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

# Use CUDA when torch reports it as available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
nlp = spacy.load("en_core_web_sm")

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [17]:
print("Building the dataset")
# The dataset location can be overridden through the SENTIMENT_DATA_PATH
# environment variable, so the notebook also runs on machines where the
# original absolute path does not exist; the default is unchanged.
data_path = os.environ.get(
    'SENTIMENT_DATA_PATH',
    '/home/mashjunior/Desktop/FactorEmbeddings/embedding_eperiments/tt_embeddings/sentiment/contig_data.tsv')
# Text is tokenized with spaCy and every example is fixed to 100 tokens.
TEXT = data.Field(tokenize='spacy', fix_length=100)
LABEL = data.LabelField(dtype=torch.float)
# Column order of the file: text first, label second.
fields = [('text', TEXT),('label', LABEL)]
# NOTE(review): the file name ends in .tsv but it is parsed with
# format='csv' - confirm which delimiter the file actually uses.
training_data = data.TabularDataset(
    path=data_path,
    format='csv',
    fields=fields,
    skip_header=True,
)
print("done")

Building the dataset
done


In [18]:
# random.seed() returns None, so `random_state=random.seed(42)` handed None to
# split() and only worked through the seeding side effect. Seed explicitly and
# pass the generator state that split() documents as its random_state.
random.seed(42)
# 70% of the examples go to train_data, the remainder to val_data.
train_data, val_data = training_data.split(split_ratio=0.7, random_state=random.getstate())

In [19]:
# random.seed() returns None, so `random_state=random.seed(42)` handed None to
# split(); seed explicitly and pass the resulting generator state instead.
random.seed(42)
# The held-out part is split again: 20% stays in val_data, the rest becomes
# test_data. This cell rebinds val_data, so running it twice splits again.
val_data, test_data = val_data.split(split_ratio=0.2, random_state=random.getstate())

In [20]:
# Display the selected torch device.
device

device(type='cpu')

In [21]:
# The classifier is built with a single output unit.
OUTPUT_DIM = 1
# Disabled IMDB-based data pipeline, kept for reference:
#train_data, test_ = datasets.IMDB.splits(TEXT, LABEL)
#test_list = list(test_)
#random.shuffle(test_list)
#test_data_ = test_list[:12500]
#val_data_ = test_list[12500:]
#train_data = data.dataset.Dataset(train_data, fields=[('text', TEXT), ('label', LABEL)])
# Wrap the validation and test splits in Dataset objects that use the same
# field list. `test_data` is rebound here, so re-running this cell wraps the
# already wrapped object again.
valid_data = data.dataset.Dataset(val_data, fields=fields)
test_data = data.dataset.Dataset(test_data, fields=fields)

In [22]:
def sort_key(ex):
    """Return the length of an example's ``text`` attribute (used as a sort key)."""
    text_length = len(ex.text)
    return text_length

In [23]:
BATCH_SIZE = 64

# The vocabulary is capped two entries below the requested size.
# NOTE(review): presumably this leaves room for two special tokens added by
# the field - confirm against the torchtext Field defaults.
vocab_cap = args.voc_dim - 2
TEXT.build_vocab(train_data, max_size=vocab_cap)
# Both vocabularies are built from the training split only.
LABEL.build_vocab(train_data)

In [24]:
# One bucketing iterator per split, all sharing batch size and device.
dataset_splits = (train_data, valid_data, test_data)
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    dataset_splits, batch_size=BATCH_SIZE, device=device)

# Only the evaluation iterators receive the length-based sort key.
# NOTE(review): presumably needed because the wrapped datasets define no
# sort_key of their own - confirm.
valid_iterator.sort_key = test_iterator.sort_key = sort_key

In [25]:
# Model sizes: the input size comes from the built vocabulary, the rest from
# the parsed arguments or fixed architecture choices.
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM, HIDDEN_DIM, DROPOUT = args.embed_dim, args.hidden_dim, args.dropout
N_LAYERS, BIDIRECTIONAL = 2, True

In [26]:
# Number of entries in the vocabulary's string-to-index table.
actual_vocab_size = len(TEXT.vocab.stoi)

In [27]:
# Display the vocabulary size computed in the previous cell.
actual_vocab_size

250

In [28]:
# Collect the classifier settings in one place, then build the model that
# consumes the embedded token sequences.
lstm_kwargs = dict(
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    output_dim=OUTPUT_DIM,
    n_layers=N_LAYERS,
    bidirectional=BIDIRECTIONAL,
    dropout=DROPOUT,
)
lstm_model = LSTM_Classifier(**lstm_kwargs)

In [29]:
# Keyword arguments common to both factorized (TT / TR) embedding layers.
factorized_kwargs = dict(
    voc_size=INPUT_DIM,
    emb_size=EMBEDDING_DIM,
    auto_shapes=True,
    auto_shape_mode='mixed',
    d=args.d,
    padding_idx=1,
)
# Parameter count of an uncompressed embedding table of the same size.
dense_size = INPUT_DIM * EMBEDDING_DIM

# Build the requested embedding layer and record how much smaller it is than
# the dense table (dense size divided by the layer's `dof`).
if args.embedding == 'tt':
    embed_model = t3.TTEmbedding(tt_rank=args.ranks, **factorized_kwargs)
    compression_rate = dense_size / embed_model.tt_matrix.dof
elif args.embedding == 'tr':
    embed_model = t3.TREmbedding(tr_rank=args.ranks, **factorized_kwargs)
    compression_rate = dense_size / embed_model.tr_matrix.dof
else:
    # NOTE(review): unlike the factorized layers, the dense baseline is built
    # without padding_idx - confirm whether that difference is intended.
    embed_model = nn.Embedding(
        num_embeddings=INPUT_DIM,
        embedding_dim=EMBEDDING_DIM,
    )
    compression_rate = 1.0


def cross_entropy_loss(logits, target):
    """Cross-entropy between ``logits`` and ``target``.

    ``target`` is converted to 64-bit integer class indices on the same
    device as ``logits`` before the loss is evaluated.
    """
    class_ids = target.to(device=logits.device, dtype=torch.long)
    loss_fn = nn.CrossEntropyLoss()
    return loss_fn(logits, class_ids)

In [30]:
# Chain the embedding layer and the LSTM classifier into one module, so the
# embedding output is fed straight into the classifier.
model = nn.Sequential(embed_model, lstm_model)

In [31]:
def binary_accuracy(preds, y):
    """
    Fraction of predictions in the batch that equal ``y``
    (8 correct out of 10 gives 0.8, not 8).

    One-dimensional ``preds`` are treated as logits: they go through a
    sigmoid and are rounded to 0/1. Otherwise the predicted class is the
    argmax over dimension 1.
    """
    is_single_logit = preds.dim() == 1
    if is_single_logit:
        predicted = torch.sigmoid(preds).round()
    else:
        predicted = torch.argmax(preds, dim=1)

    # Cast the boolean matches to float so they can be summed and divided.
    hits = (predicted == y).float()
    return hits.sum() / len(hits)

In [32]:
def train(model, iterator, optimizer, criterion):
    """Run one optimisation pass over ``iterator``.

    Args:
        model: module called as ``model(batch.text)``; switched to train mode.
        iterator: iterable of batches exposing ``.text`` and ``.label``;
            must support ``len()``.
        optimizer: optimizer whose ``zero_grad()`` / ``step()`` are called
            once per batch.
        criterion: an ``nn.CrossEntropyLoss`` or ``nn.BCEWithLogitsLoss``
            instance; it decides the dtype the labels are cast to.

    Returns:
        Tuple ``(mean_loss, mean_accuracy)``, both weighted by batch size.

    Raises:
        TypeError: if ``criterion`` is neither of the supported losses.
        ZeroDivisionError: if the iterator yields no examples.
    """
    # Pick the label dtype up front. Previously an unsupported criterion left
    # the dtype unbound and failed with an UnboundLocalError inside the loop.
    if isinstance(criterion, nn.CrossEntropyLoss):
        label_dtype = torch.long
    elif isinstance(criterion, nn.BCEWithLogitsLoss):
        label_dtype = torch.float
    else:
        raise TypeError(
            "criterion must be nn.CrossEntropyLoss or nn.BCEWithLogitsLoss, "
            f"got {type(criterion).__name__}")

    epoch_loss = 0
    epoch_acc = 0
    total_len = 0
    n_batches = len(iterator)

    model.train()

    for i, batch in enumerate(iterator):

        optimizer.zero_grad()
        device = batch.text.device
        labels = batch.label.to(device=device, dtype=label_dtype)
        predictions = model(batch.text).squeeze(1)
        loss = criterion(predictions, labels)
        acc = binary_accuracy(predictions, labels)
        loss.backward()
        optimizer.step()

        B = batch.label.shape[0]

        # Weight by batch size so a smaller final batch does not skew the mean.
        epoch_loss += B * loss.item()
        epoch_acc += B * acc.item()

        total_len += B

        # Stop after exactly one epoch even if the iterator repeats forever.
        # The old check `i > len(iterator)` only fired two batches too late.
        if i + 1 >= n_batches:
            break

    return epoch_loss / total_len, epoch_acc / total_len

In [33]:
def evaluate(model, iterator, criterion):
    """Compute mean loss and accuracy over ``iterator`` without gradients.

    Args:
        model: module called as ``model(batch.text)``; switched to eval mode.
        iterator: iterable of batches exposing ``.text`` and ``.label``;
            must support ``len()``.
        criterion: an ``nn.CrossEntropyLoss`` or ``nn.BCEWithLogitsLoss``
            instance; it decides the dtype the labels are cast to.

    Returns:
        Tuple ``(mean_loss, mean_accuracy)``, both weighted by batch size.

    Raises:
        TypeError: if ``criterion`` is neither of the supported losses.
        ZeroDivisionError: if the iterator yields no examples.
    """
    # Pick the label dtype up front. Previously an unsupported criterion left
    # the dtype unbound and failed with an UnboundLocalError inside the loop.
    if isinstance(criterion, nn.CrossEntropyLoss):
        label_dtype = torch.long
    elif isinstance(criterion, nn.BCEWithLogitsLoss):
        label_dtype = torch.float
    else:
        raise TypeError(
            "criterion must be nn.CrossEntropyLoss or nn.BCEWithLogitsLoss, "
            f"got {type(criterion).__name__}")

    epoch_loss = 0
    epoch_acc = 0
    total_len = 0
    n_batches = len(iterator)

    model.eval()

    with torch.no_grad():

        for i, batch in enumerate(iterator):

            device = batch.text.device
            labels = batch.label.to(device=device, dtype=label_dtype)
            predictions = model(batch.text).squeeze(1)

            loss = criterion(predictions, labels)

            acc = binary_accuracy(predictions, labels)
            B = batch.label.shape[0]

            # Weight by batch size so a smaller final batch does not skew
            # the mean.
            epoch_loss += B * loss.item()
            epoch_acc += B * acc.item()
            total_len += B

            # Stop after exactly one pass even if the iterator repeats. The
            # old check `i > len(iterator)` only fired two batches too late.
            if i + 1 >= n_batches:
                break

    return epoch_loss / total_len, epoch_acc / total_len

In [34]:
# Only the 'TB' dataset is wired up; any other name is rejected.
# (A cross-entropy branch for 'sst*' datasets existed here but is disabled.)
if args.dataset != 'TB':
    raise NotImplementedError

# Binary classification on raw logits, with the loss module moved to the
# selected device.
criterion = nn.BCEWithLogitsLoss().to(device)

In [35]:
N_EPOCHS = args.n_epochs

# Move the full model to the selected device before handing its parameters
# to the optimizer.
model = model.to(device)
# NOTE(review): the learning rate is fixed at 1e-3 here and the parsed --lr
# option is not used - confirm which value is intended.
optimizer = optim.Adam(model.parameters(), lr=1e-3)
print(model)

Sequential(
  (0): Embedding(250, 64)
  (1): LSTM_Classifier(
    (rnn): LSTM(64, 128, num_layers=2, dropout=0.5, bidirectional=True)
    (fc): Linear(in_features=256, out_features=1, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)


In [36]:
# Per-epoch metric history: the compression rate plus one list per
# split/metric pair, each filled inside the training loop.
log = {'compression_rate': compression_rate}
for _history_key in ('train_loss', 'test_loss', 'valid_loss',
                     'train_acc', 'test_acc', 'valid_acc'):
    log[_history_key] = []
# Metrics of the epoch with the highest validation accuracy seen so far.
# The literal used to list "train_acc" twice and omit "test_acc", so the
# "test_acc" lookup after each epoch could raise KeyError when no epoch had
# yet improved on the initial validation accuracy of 0.
best_result = {
    "epoch": 0, "train_acc": 0, "valid_acc": 0, "test_acc": 0}

In [37]:
# Main loop: one training pass per epoch followed by evaluation on the test
# and validation splits; every metric is appended to `log`.
for epoch in range(N_EPOCHS):

    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    test_loss, test_acc = evaluate(model, test_iterator, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)

    log['train_loss'].append(train_loss)
    log['test_loss'].append(test_loss)
    log['train_acc'].append(train_acc)
    log['test_acc'].append(test_acc)
    log['valid_acc'].append(valid_acc)
    log['valid_loss'].append(valid_loss)

    # Model selection uses validation accuracy only; the test accuracy of
    # that epoch is stored next to it for reporting.
    if best_result["valid_acc"] < valid_acc:
        best_result["epoch"] = epoch
        best_result["train_acc"] = train_acc
        best_result["valid_acc"] = valid_acc
        best_result["test_acc"] = test_acc

    # Saving best_result to disk is currently disabled:
    #if args.fout is not None:
    #    with open(args.fout+f"{model_name}-best.pkl", 'wb') as f:
    #        pickle.dump(best_result, f)
    # The epoch counter is an integer, so it is printed zero-padded ("01")
    # instead of as a float ("1.00").
    print(f'| Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% | Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}% | Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}% |')
    # .get() avoids a KeyError while no epoch has improved on the initial
    # validation accuracy and "test_acc" has therefore not been stored yet.
    print ("TEST ACCURACY:", np.round(best_result.get("test_acc", 0) * 100, 2))

| Epoch: 1.00 | Train Loss: 0.690 | Train Acc: 51.07% | Val. Loss: 0.690 | Val. Acc: 50.00% | Test Loss: 0.684 | Test Acc: 54.17% |
TEST ACCURACY: 54.17
| Epoch: 2.00 | Train Loss: 0.680 | Train Acc: 59.64% | Val. Loss: 0.667 | Val. Acc: 66.67% | Test Loss: 0.661 | Test Acc: 75.00% |
TEST ACCURACY: 75.0
| Epoch: 3.00 | Train Loss: 0.662 | Train Acc: 62.14% | Val. Loss: 0.657 | Val. Acc: 58.33% | Test Loss: 0.641 | Test Acc: 54.17% |
TEST ACCURACY: 75.0
| Epoch: 4.00 | Train Loss: 0.634 | Train Acc: 61.07% | Val. Loss: 0.623 | Val. Acc: 66.67% | Test Loss: 0.600 | Test Acc: 70.83% |
TEST ACCURACY: 75.0
| Epoch: 5.00 | Train Loss: 0.596 | Train Acc: 72.86% | Val. Loss: 0.616 | Val. Acc: 58.33% | Test Loss: 0.580 | Test Acc: 54.17% |
TEST ACCURACY: 75.0
| Epoch: 6.00 | Train Loss: 0.545 | Train Acc: 71.07% | Val. Loss: 0.475 | Val. Acc: 79.17% | Test Loss: 0.404 | Test Acc: 93.75% |
TEST ACCURACY: 93.75
| Epoch: 7.00 | Train Loss: 0.499 | Train Acc: 76.43% | Val. Loss: 0.420 | Val. Acc: 7

In [38]:
# Display the recorded per-epoch metric history.
log

{'compression_rate': 1.0,
 'train_loss': [0.6897736396108355,
  0.6804030707904271,
  0.6622858847890581,
  0.6343153732163566,
  0.5957407082830156,
  0.544941406590598,
  0.4992321653025491,
  0.5006222171442849,
  0.3826727219990322,
  0.2734083767448153],
 'test_loss': [0.6835191448529562,
  0.6609715024630228,
  0.6409523288408915,
  0.599717398484548,
  0.5798415144284567,
  0.40420785546302795,
  0.37961949904759723,
  0.3573540349801381,
  0.3450621763865153,
  0.1702261765797933],
 'valid_loss': [0.6895813941955566,
  0.6674940586090088,
  0.6571474671363831,
  0.6232240796089172,
  0.6163012385368347,
  0.47503402829170227,
  0.42000821232795715,
  0.46019086241722107,
  0.5238416790962219,
  0.19741792976856232],
 'train_acc': [0.5107142857142857,
  0.5964285697255816,
  0.6214285731315613,
  0.6107142857142858,
  0.7285714302744184,
  0.7107142840112959,
  0.7642857142857142,
  0.7321428554398673,
  0.8499999982970101,
  0.9142857125827244],
 'test_acc': [0.5416666666666666

In [39]:
# Display the metrics of the epoch with the best validation accuracy.
best_result

{'epoch': 9,
 'train_acc': 0.9142857125827244,
 'valid_acc': 1.0,
 'test_acc': 0.96875}