In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt

from tqdm import tqdm
import seaborn as sns
plt.style.use('fivethirtyeight')

In [None]:
train_df = pd.read_csv('Data/yelp_review_polarity_csv/train.csv', header=None)
train_df.shape

In [None]:
train_df = train_df.loc[:30000,:]
train_df.shape

In [None]:
test_df = pd.read_csv('Data/yelp_review_polarity_csv/test.csv', header=None)
test_df.shape

In [None]:
train_df['Label'] = train_df[0].apply(lambda x:int(x==2))

In [None]:
train_df = pd.DataFrame({'id':train_df.index.values, 'label':train_df['Label'],
                        'alpha':['a']*train_df.shape[0], 'text':train_df[1].replace(r'\n',' ', regex=True) })
train_df.head()

In [None]:
test_df = pd.DataFrame({'id':test_df.index.values, 'label':test_df[0].apply(lambda x:int(x==2)),
                        'alpha':['a']*test_df.shape[0], 'text':test_df[1].replace(r'\n',' ', regex=True) })
test_df.head()

In [None]:
train_df.to_csv('Data/train.tsv', sep='\t', index=False, header=False)
test_df.to_csv('Data/dev.tsv', sep='\t', index=False, header=False)

In [None]:
from __future__ import absolute_import, division, print_function

import glob
import logging
import os
import random
import json

import numpy as np
import torch
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                              TensorDataset)
import random
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm_notebook, trange

from transformers import (WEIGHTS_NAME, BertConfig, BertForSequenceClassification, BertTokenizer,
                                  XLMConfig, XLMForSequenceClassification, XLMTokenizer, 
                                  XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer,
                                  RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer)

from transformers import AdamW, get_linear_schedule_with_warmup

from utils import (convert_examples_to_features,
                        output_modes, processors)

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

args = {
    'data_dir': 'Data/',
    'model_type':  'xlnet',
    'model_name': 'xlnet-base-cased',
    'task_name': 'binary',
    'output_dir': 'outputs/',
    'cache_dir': 'cache/',
    'do_train': True,
    'do_eval': True,
    'fp16': False,
    'fp16_opt_level': 'O1',
    'max_seq_length': 128,
    'output_mode': 'classification',
    'train_batch_size': 32,
    'eval_batch_size': 32,

    'gradient_accumulation_steps': 1,
    'num_train_epochs': 1,
    'weight_decay': 0,
    'learning_rate': 4e-5,
    'adam_epsilon': 1e-8,
    'warmup_steps': 0,
    'max_grad_norm': 1.0,

    'logging_steps': 50,
    'evaluate_during_training': False,
    'save_steps': 2000,
    'eval_all_checkpoints': True,

    'overwrite_output_dir': False,
    'reprocess_input_data': True,
    'notes': 'Using Yelp Reviews dataset'
}
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
with open('args.json', 'w') as f:
    json.dump(args, f)


In [None]:
if os.path.exists(args['output_dir']) and os.listdir(args['output_dir']) and args['do_train'] and not args['overwrite_output_dir']:
    raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args['output_dir']))

In [None]:
MODEL_CLASSES = {
    'bert': (BertConfig, BertForSequenceClassification, BertTokenizer),
    'xlnet': (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer),
    'xlm': (XLMConfig, XLMForSequenceClassification, XLMTokenizer),
    'roberta': (RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer)
}

config_class, model_class, tokenizer_class = MODEL_CLASSES[args['model_type']]

In [None]:
config = config_class.from_pretrained(args['model_name'], num_labels=2, finetuning_task=args['task_name'],
                                     proxies={'https':'//proxy-chain.intel.com:911'}), 
tokenizer = tokenizer_class.from_pretrained(args['model_name'],proxies={'https':'//proxy-chain.intel.com:911'})

In [None]:
model = model_class.from_pretrained(args['model_name'])
model.to(device)

task = args['task_name']

processor = processors[task]()
label_list = processor.get_labels()
num_labels = len(label_list)

In [None]:
model

In [None]:
def load_and_cache_examples(task, tokenizer, evaluate=False):
    processor = processors[task]()
    output_mode = args['output_mode']
    
    mode = 'dev' if evaluate else 'train'
    cached_features_file = os.path.join(args['data_dir'], f"cached_{mode}_{args['model_name']}_{args['max_seq_length']}_{task}")
    
    if os.path.exists(cached_features_file) and not args['reprocess_input_data']:
        logger.info("Loading features from cached file %s", cached_features_file)
        features = torch.load(cached_features_file)
               
    else:
        logger.info("Creating features from dataset file at %s", args['data_dir'])
        label_list = processor.get_labels()
        examples = processor.get_dev_examples(args['data_dir']) if evaluate else processor.get_train_examples(args['data_dir'])
        
        features = convert_examples_to_features(examples, label_list, args['max_seq_length'], tokenizer, output_mode,
            cls_token_at_end=bool(args['model_type'] in ['xlnet']),            # xlnet has a cls token at the end
            cls_token=tokenizer.cls_token,
            sep_token=tokenizer.sep_token,
            cls_token_segment_id=2 if args['model_type'] in ['xlnet'] else 0,
            pad_on_left=bool(args['model_type'] in ['xlnet']),                 # pad on the left for xlnet
            pad_token_segment_id=4 if args['model_type'] in ['xlnet'] else 0)
        
        logger.info("Saving features into cached file %s", cached_features_file)
        torch.save(features, cached_features_file)
        
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    if output_mode == "classification":
        all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)
    elif output_mode == "regression":
        all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.float)

    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
    return dataset

In [None]:
def train(train_dataset, model, tokenizer):
    try:
        from torch.utils.tensorboard import SummaryWriter
    except AttributeError:
        from tensorboardX import SummaryWriter
    tb_writer = SummaryWriter()
    
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args['train_batch_size'])
    
    t_total = len(train_dataloader) // args['gradient_accumulation_steps'] * args['num_train_epochs']
    
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args['weight_decay']},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args['learning_rate'], eps=args['adam_epsilon'])
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args['warmup_steps'], num_training_steps=t_total)
    
    if args['fp16']:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args['fp16_opt_level'])
        
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args['num_train_epochs'])
    logger.info("  Total train batch size  = %d", args['train_batch_size'])
    logger.info("  Gradient Accumulation steps = %d", args['gradient_accumulation_steps'])
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(int(args['num_train_epochs']), desc="Epoch")
    
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration")
        for step, batch in enumerate(epoch_iterator):
            model.train()
            batch = tuple(t.to(device) for t in batch)
            inputs = {'input_ids':      batch[0],
                      'attention_mask': batch[1],
                      'token_type_ids': batch[2] if args['model_type'] in ['bert', 'xlnet'] else None,  # XLM don't use segment_ids
                      'labels':         batch[3]}
            outputs = model(**inputs)
            loss = outputs[0]  # model outputs are always tuple in pytorch-transformers (see doc)
            print("\r%f" % loss, end='')

            if args['gradient_accumulation_steps'] > 1:
                loss = loss / args['gradient_accumulation_steps']

            if args['fp16']:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args['max_grad_norm'])
                
            else:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), args['max_grad_norm'])

            tr_loss += loss.item()
            if (step + 1) % args['gradient_accumulation_steps'] == 0:
                scheduler.step()  # Update learning rate schedule
                optimizer.step()
                model.zero_grad()
                global_step += 1

                if args['logging_steps'] > 0 and global_step % args['logging_steps'] == 0:
                    # Log metrics
                    if args['evaluate_during_training']:  # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(model, tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
                    tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args['logging_steps'], global_step)
                    logging_loss = tr_loss

                if args['save_steps'] > 0 and global_step % args['save_steps'] == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(args['output_dir'], 'checkpoint-{}'.format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    logger.info("Saving model checkpoint to %s", output_dir)


    return global_step, tr_loss / global_step

In [None]:
from sklearn.metrics import mean_squared_error, matthews_corrcoef, confusion_matrix
from scipy.stats import pearsonr

def get_mismatched(labels, preds):
    mismatched = labels != preds
    examples = processor.get_dev_examples(args['data_dir'])
    wrong = [i for (i, v) in zip(examples, mismatched) if v]
    
    return wrong

def get_eval_report(labels, preds):
    mcc = matthews_corrcoef(labels, preds)
    tn, fp, fn, tp = confusion_matrix(labels, preds).ravel()
    return {
        "mcc": mcc,
        "tp": tp,
        "tn": tn,
        "fp": fp,
        "fn": fn
    }, get_mismatched(labels, preds)

def compute_metrics(task_name, preds, labels):
    assert len(preds) == len(labels)
    return get_eval_report(labels, preds)

def evaluate(model, tokenizer, prefix=""):
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_output_dir = args['output_dir']

    results = {}
    EVAL_TASK = args['task_name']

    eval_dataset = load_and_cache_examples(EVAL_TASK, tokenizer, evaluate=True)
    if not os.path.exists(eval_output_dir):
        os.makedirs(eval_output_dir)


    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args['eval_batch_size'])

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args['eval_batch_size'])
    eval_loss = 0.0
    nb_eval_steps = 0
    preds = None
    out_label_ids = None
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            inputs = {'input_ids':      batch[0],
                      'attention_mask': batch[1],
                      'token_type_ids': batch[2] if args['model_type'] in ['bert', 'xlnet'] else None,  # XLM don't use segment_ids
                      'labels':         batch[3]}
            outputs = model(**inputs)
            tmp_eval_loss, logits = outputs[:2]

            eval_loss += tmp_eval_loss.mean().item()
        nb_eval_steps += 1
        if preds is None:
            preds = logits.detach().cpu().numpy()
            out_label_ids = inputs['labels'].detach().cpu().numpy()
        else:
            preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
            out_label_ids = np.append(out_label_ids, inputs['labels'].detach().cpu().numpy(), axis=0)

    eval_loss = eval_loss / nb_eval_steps
    if args['output_mode'] == "classification":
        preds = np.argmax(preds, axis=1)
    elif args['output_mode'] == "regression":
        preds = np.squeeze(preds)
    result, wrong = compute_metrics(EVAL_TASK, preds, out_label_ids)
    results.update(result)

    output_eval_file = os.path.join(eval_output_dir, "eval_results.txt")
    with open(output_eval_file, "w") as writer:
        logger.info("***** Eval results {} *****".format(prefix))
        for key in sorted(result.keys()):
            logger.info("  %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))

    return results, wrong

In [None]:
'''train_dataset = load_and_cache_examples(args['task_name'], tokenizer)'''

In [None]:
'''global_step, tr_loss = train(train_dataset, model, tokenizer)
logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)'''

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchtext import data
from tqdm import tqdm
softmax = True

In [2]:
text = data.Field(tokenize='spacy', batch_first=True, include_lengths=True)
labels = data.LabelField(dtype=torch.float, batch_first=True)

In [3]:
fields = [(None,None), ('label',labels), (None,None), ('text',text)]

In [4]:
training_data=data.TabularDataset(path = 'Data/train.tsv',format = 'tsv',fields = fields,skip_header = True)

#print preprocessed text
print(vars(training_data.examples[0]))

{'label': '1', 'text': ['Been', 'going', 'to', 'Dr.', 'Goldberg', 'for', 'over', '10', 'years', '.', 'I', 'think', 'I', 'was', 'one', 'of', 'his', '1st', 'patients', 'when', 'he', 'started', 'at', 'MHMG', '.', 'He', "'s", 'been', 'great', 'over', 'the', 'years', 'and', 'is', 'really', 'all', 'about', 'the', 'big', 'picture', '.', 'It', 'is', 'because', 'of', 'him', ',', 'not', 'my', 'now', 'former', 'gyn', 'Dr.', 'Markoff', ',', 'that', 'I', 'found', 'out', 'I', 'have', 'fibroids', '.', 'He', 'explores', 'all', 'options', 'with', 'you', 'and', 'is', 'very', 'patient', 'and', 'understanding', '.', 'He', 'does', "n't", 'judge', 'and', 'asks', 'all', 'the', 'right', 'questions', '.', 'Very', 'thorough', 'and', 'wants', 'to', 'be', 'kept', 'in', 'the', 'loop', 'on', 'every', 'aspect', 'of', 'your', 'medical', 'health', 'and', 'your', 'life', '.']}


In [5]:
train_data, valid_data = training_data.split(split_ratio=0.9)

In [6]:
text.build_vocab(train_data, min_freq=3, vectors='glove.6B.100d')
labels.build_vocab(train_data)

In [7]:
#No. of unique tokens in text
print("Size of TEXT vocabulary:",len(text.vocab))

#No. of unique tokens in label
print("Size of LABEL vocabulary:",len(labels.vocab))

#Commonly used words
print(text.vocab.freqs.most_common(10))  


Size of TEXT vocabulary: 26830
Size of LABEL vocabulary: 2
[('.', 188371), ('the', 146208), (',', 130604), ('and', 110111), ('I', 106418), ('to', 89923), ('a', 87587), (' ', 66662), ('was', 60242), ('of', 51669)]


In [8]:
#check whether cuda is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  

#set batch size
BATCH_SIZE = 64

#Load an iterator
train_iterator, valid_iterator = data.BucketIterator.splits(
    (train_data, valid_data), 
    batch_size = BATCH_SIZE,
    sort_key = lambda x: len(x.text),
    sort_within_batch=True,
    device = device)

In [9]:
import torch.nn as nn

class classifier(nn.Module):
    
    #define all the layers used in model
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, 
                 bidirectional, dropout):
        
        #Constructor
        super().__init__()          
        
        #embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        #lstm layer
        self.lstm = nn.LSTM(embedding_dim, 
                           hidden_dim, 
                           num_layers=n_layers, 
                           bidirectional=bidirectional, 
                           dropout=dropout,
                           batch_first=True)
        
        #dense layer
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        self.output_dim = output_dim

        #activation function
        self.act = nn.Sigmoid()
        
    def forward(self, text, text_lengths):
        
        #text = [batch size,sent_length]
        embedded = self.embedding(text)
        #embedded = [batch size, sent_len, emb dim]
        #packed sequence
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths,batch_first=True)
        
        packed_output, (hidden, cell) = self.lstm(packed_embedded)
        #hidden = [batch size, num layers * num directions,hid dim]
        #cell = [batch size, num layers * num directions,hid dim]
        
        #concat the final forward and backward hidden state
        hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)
                
        #hidden = [batch size, hid dim * num directions]
        dense_outputs=self.fc(hidden)

        #Final activation function
        outputs=self.act(dense_outputs)
        
        return dense_outputs

In [10]:
import torch.nn as nn
import torch.functional

class classifier_alt(nn.Module):  
    #define all the layers used in model
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, 
                 bidirectional, dropout):
        #Constructor
        super().__init__() 
        self.num_directions = 2 if bidirectional else 1
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.output_dim = output_dim
        #embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        #lstm layer
        self.lstm = nn.LSTM(embedding_dim, 
                           hidden_dim, 
                           num_layers=n_layers, 
                           bidirectional=bidirectional, 
                           dropout=dropout,
                           batch_first=True)
        #dense layer
        self.fc = nn.Linear(hidden_dim, self.output_dim)
        #activation function
        self.act = nn.Sigmoid()
        
    def forward(self, text, text_lengths):
        #text = [batch size,sent_length]
        embedded = self.embedding(text)
        #embedded = [batch size, sent_len, emb dim]
        #packed sequence
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths,batch_first=True)
        packed_output, (hidden, cell) = self.lstm(packed_embedded)
        output_padded, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output, batch_first=True)        #hidden = [batch size, num layers * num directions,hid dim]
        #cell = [batch size, num layers * num directions,hid dim]
        #concat the final forward and backward hidden state
        output_padded = torch.mean(output_padded[:,:,:self.hidden_dim] + output_padded[:,:,self.hidden_dim:], dim=1) # Sum bidirectional outputs
        dense_outputs = self.fc(output_padded.reshape(-1, self.hidden_dim))
        #Final activation function
        outputs=self.act(dense_outputs)
        return dense_outputs

In [11]:
#define hyperparameters
size_of_vocab = len(text.vocab)
embedding_dim = 100
num_hidden_nodes = 32
num_layers = 1
if softmax:
    num_output_nodes = 2
else:
    num_output_nodes = 1
bidirection = True
dropout = 0.2

#instantiate the model
model = classifier(size_of_vocab, embedding_dim, num_hidden_nodes,num_output_nodes, num_layers, 
                   bidirectional = True, dropout = dropout)

  "num_layers={}".format(dropout, num_layers))


In [12]:
model_alt = classifier_alt(size_of_vocab, embedding_dim, num_hidden_nodes,num_output_nodes, num_layers, 
                   bidirectional = True, dropout = dropout)

In [13]:
model_alt

classifier_alt(
  (embedding): Embedding(26830, 100)
  (lstm): LSTM(100, 32, batch_first=True, dropout=0.2, bidirectional=True)
  (fc): Linear(in_features=32, out_features=2, bias=True)
  (act): Sigmoid()
)

In [14]:
#No. of trianable parameters
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)
count_parameters(model)

2717434

In [15]:
count_parameters(model_alt)

2717370

In [16]:
#Initialize the pretrained embedding
pretrained_embeddings = text.vocab.vectors
model.embedding.weight.data.copy_(pretrained_embeddings)

print(pretrained_embeddings.shape)

torch.Size([26830, 100])


In [17]:
model_alt.embedding.weight.data.copy_(pretrained_embeddings)

print(pretrained_embeddings.shape)

torch.Size([26830, 100])


In [18]:
#define optimizer and loss
if not softmax:
    criterion = nn.BCEWithLogitsLoss(reduction='sum')
else: 
    criterion = nn.CrossEntropyLoss(reduction='sum')
optimizer = torch.optim.Adam(model.parameters())


In [19]:
if not softmax:
    criterion_alt = nn.BCEWithLogitsLoss(reduction='sum')
else:
    criterion_alt = nn.CrossEntropyLoss(reduction='sum')
optimizer_alt = torch.optim.Adam(model_alt.parameters(), lr=0.01)

In [20]:
from sklearn.metrics import f1_score, accuracy_score
def train(model, iterator, optimizer, criterion, epoch_number, softmax):
    
    #initialize every epoch 
    epoch_loss = 0
    epoch_acc = 0
     #set the model in training phase
    model.train()  
    iter_ = tqdm(iterator,total=len(iterator))
    i = 0
    for batch in iter_:    
        #resets the gradients after every batch
        optimizer.zero_grad()   
        #retrieve text and no. of words
        text, text_lengths = batch.text   
        #convert to 1D tensor
        if not softmax:
            predictions = model(text, text_lengths).squeeze()
            loss = criterion(predictions, batch.label)        
            acc = accuracy_score((predictions > 0.5).int().detach().numpy(), batch.label.numpy())  

        else: 
            predictions = model(text, text_lengths)
            loss = criterion(predictions, batch.label.long())        
            acc = accuracy_score(torch.argmax(predictions, dim=-1), batch.label.numpy())    
        
        #backpropage the loss and compute the gradients
        loss.backward()       
        
        #update the weights
        optimizer.step()      
        
        #loss and accuracy
        epoch_loss += loss.item()  
        epoch_acc += acc.item()
        i+= 1
        iter_.set_description(f"Epoch {epoch_number+1}: Train loss: {round(epoch_loss/i,3)}  Train acc {round(epoch_acc/i,3)}")

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [21]:
def evaluate(model, iterator, criterion, epoch_number, softmax):
    
    #initialize every epoch
    epoch_loss = 0
    epoch_acc = 0

    #deactivating dropout layers
    model.eval()
    
    #deactivates autograd
    with torch.no_grad():
        iter_ = tqdm(iterator, total=len(iterator))
        i=0
        for batch in iter_:
        
            #retrieve text and no. of words
            text, text_lengths = batch.text
            
            #convert to 1d tensor
            if not softmax:
                predictions = model(text, text_lengths).squeeze()
                loss = criterion(predictions, batch.label)
                acc = accuracy_score((predictions > 0.5).int().detach().numpy(), batch.label.numpy())  
            else: 
                predictions = model(text, text_lengths)
                loss = criterion(predictions, batch.label.long())  
                acc = accuracy_score(torch.argmax(predictions, dim=-1), batch.label.numpy())   

            
            #keep track of loss and accuracy
            epoch_loss += loss.item()
            epoch_acc += acc.item()
            i+=1
            iter_.set_description(f"Epoch {epoch_number+1}: Val loss: {round(epoch_loss/i, 3)}  Val acc {round(epoch_acc/i, 3)}")

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [None]:
N_EPOCHS = 5
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
     
    #train the model
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion, epoch, softmax)
    
    #evaluate the model
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion, epoch, softmax)
    
    #save the best model
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'saved_weights.pt')
    
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

In [22]:
N_EPOCHS = 5
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
     
    #train the model
    train_loss, train_acc = train(model_alt, train_iterator, optimizer_alt, criterion_alt, epoch, softmax)
    
    #evaluate the model
    valid_loss, valid_acc = evaluate(model_alt, valid_iterator, criterion_alt, epoch, softmax)
    
    #save the best model
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model_alt.state_dict(), 'saved_weights.pt')
    
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch 1: Train loss: 17.028  Train acc 0.883: 100%|██████████████████████████████████| 422/422 [35:05<00:00,  4.99s/it]
Epoch 1: Val loss: 12.847  Val acc 0.92: 100%|█████████████████████████████████████████| 47/47 [00:07<00:00,  6.15it/s]
  0%|                                                                                          | 0/422 [00:00<?, ?it/s]

	Train Loss: 17.028 | Train Acc: 88.31%
	 Val. Loss: 12.847 |  Val. Acc: 92.04%


Epoch 2: Train loss: 5.656  Train acc 0.968: 100%|███████████████████████████████████| 422/422 [37:53<00:00,  5.39s/it]
Epoch 2: Val loss: 15.728  Val acc 0.914: 100%|████████████████████████████████████████| 47/47 [00:08<00:00,  5.87it/s]
  0%|                                                                                          | 0/422 [00:00<?, ?it/s]

	Train Loss: 5.656 | Train Acc: 96.78%
	 Val. Loss: 15.728 |  Val. Acc: 91.35%


Epoch 3: Train loss: 2.255  Train acc 0.989: 100%|█████████████████████████████████| 422/422 [2:56:18<00:00, 25.07s/it]
Epoch 3: Val loss: 20.913  Val acc 0.899: 100%|████████████████████████████████████████| 47/47 [00:08<00:00,  5.82it/s]
  0%|                                                                                          | 0/422 [00:00<?, ?it/s]

	Train Loss: 2.255 | Train Acc: 98.89%
	 Val. Loss: 20.913 |  Val. Acc: 89.89%


Epoch 4: Train loss: 0.902  Train acc 0.995: 100%|███████████████████████████████████| 422/422 [30:53<00:00,  4.39s/it]
Epoch 4: Val loss: 28.687  Val acc 0.902: 100%|████████████████████████████████████████| 47/47 [00:05<00:00,  8.79it/s]
  0%|                                                                                          | 0/422 [00:00<?, ?it/s]

	Train Loss: 0.902 | Train Acc: 99.53%
	 Val. Loss: 28.687 |  Val. Acc: 90.15%


Epoch 5: Train loss: 0.34  Train acc 0.998: 100%|████████████████████████████████████| 422/422 [28:46<00:00,  4.09s/it]
Epoch 5: Val loss: 31.569  Val acc 0.904: 100%|████████████████████████████████████████| 47/47 [00:05<00:00,  7.98it/s]

	Train Loss: 0.340 | Train Acc: 99.84%
	 Val. Loss: 31.569 |  Val. Acc: 90.44%





In [None]:
for batch in train_iterator:
    text, text_lengths = batch.text

In [None]:
#text = [batch size,sent_length]
#optimizer_alt.zero_grad()
embedded = model_alt.embedding(text)
        #embedded = [batch size, sent_len, emb dim]
        #packed sequence
packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths,batch_first=True)
packed_output, (hidden, cell) = model_alt.lstm(packed_embedded)
output_padded, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output, batch_first=True)        #hidden = [batch size, num layers * num directions,hid dim]


output_padded = output_padded#[:, :, :model_alt.hidden_dim] + output_padded[:, : ,model_alt.hidden_dim:] # Sum bidirectional outputs
#output_padded = output_padded[:,-1,:]
#dense_outputs = model_alt.fc(output_padded.reshape(-1, model_alt.hidden_dim))
        #Final activation function
#outputs=model_alt.act(dense_outputs)

In [None]:
output_padded.shape

In [None]:
loss = criterion_alt(outputs.squeeze(), batch.label)
loss

In [None]:
loss.backward()
optimizer_alt.step()

In [None]:
packed_embedded = nn.utils.rnn.pack_padded_sequence(embeds, text_length,batch_first=True)


In [None]:
class classifier_alt(nn.Module):  
    #define all the layers used in model
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, 
                 bidirectional, dropout):
        #Constructor
        super().__init__() 
        self.num_directions = 2 if bidirectional else 1
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.output_dim = output_dim
        #embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        #lstm layer
        self.lstm = nn.LSTM(embedding_dim, 
                           hidden_dim, 
                           num_layers=n_layers, 
                           bidirectional=bidirectional, 
                           dropout=dropout,
                           batch_first=True)
        #dense layer
        self.fc = nn.Linear(hidden_dim, self.output_dim)
        #activation function
        self.act = nn.Sigmoid()
        
    def forward(self, text, text_lengths):
        #text = [batch size,sent_length]
        embedded = self.embedding(text)
        #embedded = [batch size, sent_len, emb dim]
        #packed sequence
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths,batch_first=True)
        packed_output, (hidden, cell) = self.lstm(packed_embedded)
        output_padded, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output, batch_first=True)        #hidden = [batch size, num layers * num directions,hid dim]
        #cell = [batch size, num layers * num directions,hid dim]
        #concat the final forward and backward hidden state
        output_padded = torch.mean(output_padded[:,:,:self.hidden_dim] + output_padded[:,:,self.hidden_dim:], dim=1) # Sum bidirectional outputs
        dense_outputs = self.fc(output_padded.reshape(-1, self.hidden_dim))
        #Final activation function
        outputs=self.act(dense_outputs)
        return dense_outputs

In [None]:
output_packed, hidden = model.lstm(packed_embedded)

In [None]:
output_packed.batch_sizes.shape

In [None]:
output_padded, output_lengths = nn.utils.rnn.pad_packed_sequence(output_packed, batch_first=True)

In [None]:
output_padded = torch.transpose(lstm_out, 0, 1)
output_padded = torch.transpose(lstm_out, 1, 2)

In [None]:
(torch.randint(0,30,(30,1)) > 15).int().numpy()

In [None]:
#I will also creating my own data loaders and compare the speed of training on a similar model
from tensorflow.keras.preprocessing.text import Tokenizer

Tokenizer().fit_on_texts