In [1]:
import pandas as pd
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

In [2]:
import torch 
import argparse
import sys
sys.path.append("..")

In [3]:
parser = argparse.ArgumentParser()
parser.add_argument(
    '--embedding', 
    default='tt',
    choices=['tt', 'tr', 'full'],
    type=str)
parser.add_argument('--ranks', type=int, default=8)
parser.add_argument('--d', type=int, default=3)
parser.add_argument('--embed_dim', type=int, default=64)
parser.add_argument('--voc_dim', default=250, type=int)
parser.add_argument('--lr', default=5e-4)
parser.add_argument('--gpu', default='', type=str)
parser.add_argument('--hidden_dim', default=128, type=int)
parser.add_argument('--n_epochs',  default=100, type=int)
parser.add_argument('--fout',  default="logdir/", type=str)
parser.add_argument('--dropout', default=0.5, type=float)
parser.add_argument(
    '--dataset',
    default='Pcam',
    type=str)
args = parser.parse_args('')

In [4]:
if args.embedding == 'tt':
    tt = "tt"
elif args.embedding == 'tr':
    tt = 'tr'
else:             
    tt = "full"

In [5]:
model_name = f"{args.dataset}-dim_{args.embed_dim}-d_{args.d}-ranks_{args.ranks}-{tt}"

In [6]:
import os
os.environ['CUDA_DEVICE_ORDER']='PCI_BUS_ID'
os.environ['CUDA_VISIBLE_DEVICES']=args.gpu
import utils
import torch
import numpy as np
import torch.nn.functional as F
import torch.nn as nn
import t3nsor as t3
from torchtext import data
from torchtext import datasets
import torch.optim as optim
from models import LSTM_Classifier
#from utils import binary_accuracy, train, evaluate
import pickle
import random
import spacy
from spacy.cli.download import download
random.seed(42)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
nlp = spacy.load("en_core_web_sm")

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [7]:
TEXT = data.Field(tokenize='spacy', fix_length=100)
LABEL = data.LabelField(dtype=torch.float)
fields = [('sequence', TEXT),('label', LABEL)]

In [8]:
print("Building the dataset...")
OUTPUT_DIM = 1000
fine_grained = (OUTPUT_DIM == 1000)
seq_train = "/home/mashjunior/Downloads/Pcam_data/new_sequence_data/seq_train.csv"
seq_val = "/home/mashjunior/Downloads/Pcam_data/new_sequence_data/seq_val.csv"
seq_test = "/home/mashjunior/Downloads/Pcam_data/new_sequence_data/seq_test.csv"
    
training_data = data.TabularDataset(
    path=seq_train,
    format='csv',
    fields=fields,
    skip_header=True,
)

validation_data = data.TabularDataset(
    path=seq_val,
    format='csv',
    fields=fields,
    skip_header=True,
)

test_data = data.TabularDataset(
    path=seq_test,
    format='csv',
    fields=fields,
    skip_header=True,
)

Building the dataset...


In [9]:
def sort_key(ex):
    return len(ex.sequence)

In [10]:
TEXT.build_vocab(training_data, max_size=args.voc_dim - 2)
LABEL.build_vocab(training_data)

BATCH_SIZE = 64

In [11]:
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (training_data, validation_data, test_data),
    batch_size=BATCH_SIZE,
    device=device)

valid_iterator.sort_key = sort_key
test_iterator.sort_key = sort_key

In [12]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = args.embed_dim
HIDDEN_DIM = args.hidden_dim
N_LAYERS = 1
BIDIRECTIONAL = True
DROPOUT = args.dropout

In [13]:
actual_vocab_size = len(TEXT.vocab.stoi)

In [14]:
actual_vocab_size

250

In [15]:
lstm_model = LSTM_Classifier(embedding_dim=EMBEDDING_DIM,
                             hidden_dim=HIDDEN_DIM,
                             output_dim=OUTPUT_DIM,
                             n_layers=N_LAYERS,
                             bidirectional=BIDIRECTIONAL,
                             dropout=DROPOUT)

  "num_layers={}".format(dropout, num_layers))


In [16]:
if args.embedding == 'tt':
        embed_model = t3.TTEmbedding(
            voc_size=INPUT_DIM,
            emb_size=EMBEDDING_DIM,
            auto_shapes=True,
            auto_shape_mode='mixed',
            d=args.d,
            tt_rank=args.ranks,
            padding_idx=1
        )
        compression_rate = INPUT_DIM * EMBEDDING_DIM / embed_model.tt_matrix.dof
elif args.embedding == 'tr':
        embed_model = t3.TREmbedding(
            voc_size=INPUT_DIM,
            emb_size=EMBEDDING_DIM,
            auto_shapes=True,
            auto_shape_mode='mixed',
            d=args.d,
            tr_rank=args.ranks,
            padding_idx=1
        )
        compression_rate = INPUT_DIM * EMBEDDING_DIM / embed_model.tr_matrix.dof
else:
    embed_model = nn.Embedding(
        num_embeddings=INPUT_DIM,
        embedding_dim=EMBEDDING_DIM
    )
    compression_rate = 1.0


def cross_entropy_loss(logits, target):
    labels = target.type(torch.LongTensor).to(logits.device)
    return nn.CrossEntropyLoss()(logits, labels)

In [17]:
model = nn.Sequential(embed_model, lstm_model)

In [18]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """

    #round predictions to the closest integer
    if len(preds.shape) == 1:
        rounded_preds = torch.round(torch.sigmoid(preds))
    else:
        rounded_preds = preds.argmax(1)
    correct = (rounded_preds == y).float() #convert into float for division
    acc = correct.sum()/len(correct)
    return acc

In [19]:
def train(model, iterator, optimizer, criterion):

    epoch_loss = 0
    epoch_acc = 0
    total_len = 0

    model.train()
    
    if isinstance(criterion, nn.CrossEntropyLoss):
        dtype = torch.LongTensor
    elif isinstance(criterion, nn.BCEWithLogitsLoss):
        dtype = torch.FloatTensor

    for i, batch in enumerate(iterator):

        optimizer.zero_grad()
        device = batch.sequence.device
        labels = batch.label.type(dtype).to(device)
        predictions = model(batch.sequence).squeeze(1)
        loss = criterion(predictions, labels)
        acc = binary_accuracy(predictions, labels)
        loss.backward()
        optimizer.step()

        B = batch.label.shape[0]

        epoch_loss += B * loss.item()
        epoch_acc += B * acc.item()

        total_len += B


        if i > len(iterator):
            break

    return epoch_loss / total_len, epoch_acc / total_len

In [20]:
def evaluate(model, iterator, criterion):

    epoch_loss = 0
    epoch_acc = 0
    total_len = 0

    model.eval()
    
    if isinstance(criterion, nn.CrossEntropyLoss):
        dtype = torch.LongTensor
    elif isinstance(criterion, nn.BCEWithLogitsLoss):
        dtype = torch.FloatTensor

    with torch.no_grad():

        for i, batch in enumerate(iterator):
            
            device = batch.sequence.device
            labels = batch.label.type(dtype).to(device)
            predictions = model(batch.sequence).squeeze(1)

            loss = criterion(predictions, labels)

            acc = binary_accuracy(predictions, labels)
            B = batch.label.shape[0]

            epoch_loss += B * loss.item()
            epoch_acc += B * acc.item()
            total_len += B

            if i > len(iterator):
                break

    return epoch_loss / total_len, epoch_acc / total_len

In [21]:
criterion = nn.CrossEntropyLoss()
criterion = criterion.to(device)

In [22]:
model = model.to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-3)
print(model)
N_EPOCHS = args.n_epochs

Sequential(
  (0): TTEmbedding(
    (parameters): ParameterList(
        (0): Parameter containing: [torch.FloatTensor of size 1x5x4x8]
        (1): Parameter containing: [torch.FloatTensor of size 8x6x4x8]
        (2): Parameter containing: [torch.FloatTensor of size 8x10x4x1]
    )
  )
  (1): LSTM_Classifier(
    (rnn): LSTM(64, 128, dropout=0.5, bidirectional=True)
    (fc): Linear(in_features=256, out_features=1000, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)


In [23]:
log = {
    'compression_rate':compression_rate,
    'train_loss':[], 'test_loss':[], 'valid_loss':[],
    'train_acc':[], 'test_acc':[], 'valid_acc':[]}
best_result = {
    "epoch": 0, "train_acc": 0, "valid_acc": 0, "train_acc": 0}

In [24]:
for epoch in range(N_EPOCHS):

    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    test_loss, test_acc = evaluate(model, test_iterator, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)

    log['train_loss'].append(train_loss)
    log['test_loss'].append(test_loss)
    log['train_acc'].append(train_acc)
    log['test_acc'].append(test_acc)
    log['valid_acc'].append(valid_acc)
    log['valid_loss'].append(valid_loss)

    if best_result["valid_acc"] < valid_acc:
        best_result["epoch"] = epoch
        best_result["train_acc"] = train_acc
        best_result["valid_acc"] = valid_acc
        best_result["test_acc"] = test_acc

    #if args.fout is not None:
    #    with open(args.fout+f"{model_name}-best.pkl", 'wb') as f:
    #        pickle.dump(best_result, f)
    print(f'| Epoch: {epoch+1:.2f} | Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% | Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}% | Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}% |')
    print ("TEST ACCURACY:", np.round(best_result["test_acc"] * 100, 2))

| Epoch: 1.00 | Train Loss: 6.173 | Train Acc: 1.60% | Val. Loss: 5.590 | Val. Acc: 3.21% | Test Loss: 5.590 | Test Acc: 3.21% |
TEST ACCURACY: 3.21
| Epoch: 2.00 | Train Loss: 5.529 | Train Acc: 3.42% | Val. Loss: 5.280 | Val. Acc: 4.74% | Test Loss: 5.280 | Test Acc: 4.74% |
TEST ACCURACY: 4.74
| Epoch: 3.00 | Train Loss: 5.326 | Train Acc: 4.35% | Val. Loss: 5.118 | Val. Acc: 6.32% | Test Loss: 5.118 | Test Acc: 6.32% |
TEST ACCURACY: 6.32
| Epoch: 4.00 | Train Loss: 5.212 | Train Acc: 5.21% | Val. Loss: 5.067 | Val. Acc: 6.66% | Test Loss: 5.067 | Test Acc: 6.66% |
TEST ACCURACY: 6.66
| Epoch: 5.00 | Train Loss: 5.135 | Train Acc: 5.91% | Val. Loss: 4.961 | Val. Acc: 7.67% | Test Loss: 4.961 | Test Acc: 7.67% |
TEST ACCURACY: 7.67
| Epoch: 6.00 | Train Loss: 5.046 | Train Acc: 6.74% | Val. Loss: 4.994 | Val. Acc: 6.47% | Test Loss: 4.994 | Test Acc: 6.47% |
TEST ACCURACY: 7.67
| Epoch: 7.00 | Train Loss: 5.007 | Train Acc: 7.40% | Val. Loss: 4.858 | Val. Acc: 9.36% | Test Loss: 4.8

| Epoch: 55.00 | Train Loss: 4.687 | Train Acc: 12.59% | Val. Loss: 4.592 | Val. Acc: 13.48% | Test Loss: 4.592 | Test Acc: 13.48% |
TEST ACCURACY: 13.5
| Epoch: 56.00 | Train Loss: 4.690 | Train Acc: 12.34% | Val. Loss: 4.592 | Val. Acc: 13.34% | Test Loss: 4.592 | Test Acc: 13.34% |
TEST ACCURACY: 13.5
| Epoch: 57.00 | Train Loss: 4.686 | Train Acc: 12.62% | Val. Loss: 4.598 | Val. Acc: 13.33% | Test Loss: 4.598 | Test Acc: 13.33% |
TEST ACCURACY: 13.5
| Epoch: 58.00 | Train Loss: 4.685 | Train Acc: 12.52% | Val. Loss: 4.591 | Val. Acc: 13.44% | Test Loss: 4.591 | Test Acc: 13.44% |
TEST ACCURACY: 13.5
| Epoch: 59.00 | Train Loss: 4.681 | Train Acc: 12.48% | Val. Loss: 4.597 | Val. Acc: 12.94% | Test Loss: 4.597 | Test Acc: 12.94% |
TEST ACCURACY: 13.5
| Epoch: 60.00 | Train Loss: 4.680 | Train Acc: 12.49% | Val. Loss: 4.590 | Val. Acc: 13.41% | Test Loss: 4.590 | Test Acc: 13.41% |
TEST ACCURACY: 13.5
| Epoch: 61.00 | Train Loss: 4.681 | Train Acc: 12.59% | Val. Loss: 4.594 | Val. A

In [25]:
log

{'compression_rate': 7.936507936507937,
 'train_loss': [6.173078198089599,
  5.529343283691406,
  5.325558062438965,
  5.212327967681885,
  5.135382705078125,
  5.045652011260986,
  5.006695288238525,
  4.971650762863159,
  4.957384234924317,
  4.938258407592773,
  4.899241761779785,
  4.881121518249512,
  4.86749458114624,
  4.853388886108398,
  4.8375276414489745,
  4.832554296722412,
  4.82023506362915,
  4.80969801864624,
  4.804461853027344,
  4.810269001159668,
  4.788954376983643,
  4.782254061584473,
  4.77753087600708,
  4.776470971069336,
  4.774167734222412,
  4.7621871069335935,
  4.76187069152832,
  4.768544646759033,
  4.754573500671387,
  4.7456744482421875,
  4.752347720642089,
  4.743860781402588,
  4.742851383666992,
  4.7347481719970705,
  4.735160696411133,
  4.734422542572021,
  4.756992329101562,
  4.722698518066406,
  4.729533582992554,
  4.739586484375,
  4.723484573974609,
  4.713653395080566,
  4.715449489746094,
  4.717872949676513,
  4.70818833480835,
  4.70

In [26]:
best_result

{'epoch': 98, 'train_acc': 0.13016, 'valid_acc': 0.13646, 'test_acc': 0.13646}