##Mount Drive


In [0]:
from google.colab import drive
drive.mount("./gdrive/")

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at ./gdrive/


In [0]:
import torch
import torchtext
import spacy

nlp=spacy.load("en")

epochs=30

MAX_CHARS=None
hidden_dim=256
linear_dim=128
num_layers=3
alpha=0.000
learning_rate=0.001
batch_size=32

device=torch.device("cuda" if torch.cuda.is_available() else "cpu")

domain_1="music"
domain_2="baby"

##Helper Functions

**read_split_save_json:** reads the training files, selects the examples from the two specified domains and saves the data into seperate files.

**tokenizer:** uses spacy tokenizer, truncating the length of the sentence to MAX_CHARS.

**create_fields:** creates the domain, text and labels fields

**create_dataset:** creates Tabular datasets from the train, test and validation files.

**print_progress:** prints the percentage progress of the training epoch.

**binary_accuracy:** used to calculate accuracy between logits and ground truths.

**create_iterators:** creates Bucket Iterators for the provided datasets.

**build_vocab:** builds vocabularies for the Fields

In [0]:
from torchtext.data import Field
from torchtext.data import TabularDataset
import pandas as pd
import json


def read_split_save_json(train_path, val_path, test_path, domain_tag_1, domain_tag_2):
  
  train=[]
  val=[]
  test=[]
  
  train_1=[]
  train_2=[]
  test_1=[]
  test_2=[]
  val_1=[]
  val_2=[]
  
  
  for line in open(train_path):
    train.append(json.loads(line))
  
  for line in open(val_path):
    val.append(json.loads(line))
  
  for line in open(test_path):
    test.append(json.loads(line))
  
  for d in train:
    if d["domain"]==domain_tag_1:
      train_1.append(d)
    elif d["domain"]==domain_tag_2:
      train_2.append(d)
   
  for d in val:
    if d["domain"]==domain_tag_1:
      val_1.append(d)
    elif d["domain"]==domain_tag_2:
      val_2.append(d)
      
  for d in test:
    if d["domain"]==domain_tag_1:
      test_1.append(d)
    elif d["domain"]==domain_tag_2:
      test_2.append(d)
  
  domain_path_1=("train_1.json","valid_1.json","test_1.json")
  domain_path_2=("train_2.json","valid_2.json","test_2.json")
  paths=list(domain_path_1)+list(domain_path_2)
  
  datas=[train_1,val_1,test_1,train_2,val_2,test_2]
  
  for path, data in zip(paths,datas):
    
    with open(path,'w') as outfile:
      for da in data:
        json.dump(da,outfile)
        outfile.write("\n")
  
  return domain_path_1, domain_path_2



def tokenizer(sentence):
  if MAX_CHARS and len(sentence)>MAX_CHARS:
    sentence=sentence[:MAX_CHARS]
  
  return [token.text for token in nlp.tokenizer(sentence)]



def create_fields(batch_first=True):
  TEXT=Field(sequential=True,lower=True,tokenize=tokenizer,init_token="<start>",eos_token="<end>",batch_first=batch_first)
  LABEL=Field(sequential=False,lower=False,batch_first=batch_first,unk_token=None)  
  DOMAIN=Field(sequential=False,lower=False,batch_first=batch_first,unk_token=None)
  return TEXT, LABEL, DOMAIN


def create_datasets(TEXT,LABEL,DOMAIN,root,paths,file_format):
  
  train_path, val_path, test_path= paths
  
  fields={"label":("label",LABEL),"sentence":("sentence",TEXT),"domain":("domain",DOMAIN)}
  
  train,val,test=TabularDataset.splits(path=root,train=train_path, validation=train_path, test=test_path, format=file_format,fields=fields)
  
  return train,val,test
  

def create_vocab(TEXT, LABEL, DOMAIN, data, max_size=None, min_freq=1):
  
  TEXT.build_vocab(data, vectors="glove.6B.300d",max_size=max_size,min_freq=min_freq)               # can specify vocab_size
  LABEL.build_vocab(data)
  DOMAIN.build_vocab(data)
  

def create_iterators(train, val, test, batch_size, device, sortkey):
  
  train_iter, val_iter, test_iter= torchtext.data.BucketIterator.splits( (train,val,test), sort_key=sortkey, batch_size=batch_size, device=device)         
  
  return train_iter,val_iter,test_iter
  
  
def binary_accuracy(pred, labels):
  
  rounded_pred=torch.round(pred)
  equal= (rounded_pred==labels.view(*rounded_pred.shape)).float()
  accuracy= equal.sum()/len(equal)
  
  return accuracy


def print_progress(step, t, max_len ):
  
  print('\r' + f'Progress: '
                f"[{'=' * int((t) * step) + ' ' * (24 - int((t) * step))}]"
                f"({math.ceil((t) * 100 /max_len)} %)",
                end='')

##Preprocessing

In [0]:
TEXT, LABEL,DOMAIN = create_fields(batch_first=True)

root="./gdrive/My Drive/Amazon Review Dataset/"
domain_path_1, domain_path_2 = read_split_save_json(root+"train.json", root+"valid.json", root+"test.json", domain_1, domain_2)


train,val,test=create_datasets(TEXT,LABEL,DOMAIN,root="./gdrive/My Drive/Amazon Review Dataset/",paths=("train.json", "valid.json", "test.json"),file_format="json")

train_d1,val_d1,test_d1 = create_datasets(TEXT,LABEL,DOMAIN, root="./",paths=domain_path_1,file_format="json")
train_d2,val_d2,test_d2 = create_datasets(TEXT,LABEL,DOMAIN, root="./",paths=domain_path_2,file_format="json")

create_vocab(TEXT,LABEL,DOMAIN,train)

train_iter, val_iter, test_iter = create_iterators(train, val, test, batch_size=batch_size, device=device, sortkey=lambda x: len(x.sentence))
train_iter_d1,val_iter_d1,test_iter_d1=create_iterators(train_d1, val_d1, test_d1, batch_size=batch_size, device=device, sortkey=lambda x: len(x.sentence))
train_iter_d2,val_iter_d2,test_iter_d2=create_iterators(train_d2, val_d2, test_d2, batch_size=batch_size, device=device, sortkey=lambda x: len(x.sentence))

weight_matrix=TEXT.vocab.vectors
vocab_size, emb_dim= weight_matrix.shape


##Tensorboard

In [0]:
from tensorboardcolab import TensorBoardColab
tb = TensorBoardColab()

Using TensorFlow backend.


Wait for 8 seconds...
TensorBoard link:
http://a8f644c5.ngrok.io


#Model

In [0]:
from torch import nn

def create_model( hidden_dim, device):
  
  class Classifier(nn.Module):
    
    def __init__(self , vocab_size, emb_dim, hidden_dim):
      super().__init__()
      
      self.embedding= nn.Embedding.from_pretrained(weight_matrix)
      self.lstm = nn.LSTM(emb_dim, hidden_dim, num_layers=num_layers, batch_first=True, bidirectional=True)
      self.ff1_1 = nn.Linear(2* num_layers * hidden_dim, 1)
      self.ff1_2 = nn.Linear(256,1)
      
      self.ff2_1 = nn.Linear(2* num_layers * hidden_dim, 1)
      self.ff2_2 = nn.Linear(256,1)
      
      self.relu = nn.ReLU()
      self.sigmoid = nn.Sigmoid()
      
    def forward(self, x, domain_tag):
      
      emb = self.embedding(x)
      lstm_out,hidden_states = self.lstm(emb)
      h,c = hidden_states
      h=h.permute(1,0,2)
      
      if domain_tag=="domain 1":
        
        #f1_out = self.relu(self.ff1_1(h.reshape(h.shape[0],-1)))
        #output = self.sigmoid(self.ff1_2(f1_out))
        output = self.sigmoid(self.ff1_1(h.reshape(h.shape[0],-1)))
      
      elif domain_tag=="domain 2":
        
        #f1_out = self.relu(self.ff2_1(h.reshape(h.shape[0],-1)))
        #output = self.sigmoid(self.ff2_2(f1_out))
        output = self.sigmoid(self.ff2_1(h.reshape(h.shape[0],-1)))
      
      return output.squeeze(-1)
    
  model=Classifier(vocab_size, emb_dim, hidden_dim)
  model.to(device)
  
  
  return model

In [0]:
model=create_model(hidden_dim, device)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

criterion_1 = nn.BCELoss()
criterion_2 = nn.BCELoss()


print(model)

Classifier(
  (embedding): Embedding(58714, 300)
  (lstm): LSTM(300, 256, num_layers=3, batch_first=True, bidirectional=True)
  (ff1_1): Linear(in_features=1536, out_features=1, bias=True)
  (ff1_2): Linear(in_features=256, out_features=1, bias=True)
  (ff2_1): Linear(in_features=1536, out_features=1, bias=True)
  (ff2_2): Linear(in_features=256, out_features=1, bias=True)
  (relu): ReLU()
  (sigmoid): Sigmoid()
)


In [0]:
!rm -rf text*

#Train

In [0]:
import math

def train(model, epochs, train_iter_d1, train_iter_d2, val_iter_d1, val_iter_d2):

  min_val_loss=1e07

  training_acc = []
  training_losses = []
  validation_accuracies = []
  validation_losses = []



  for epoch in range(1,epochs+1):

    iter_d1, iter_d2=iter(train_iter_d1), iter(train_iter_d2)
    train_acc = 0
    train_loss = 0
    val_acc = 0
    val_loss = 0
    iter_size_1=len(train_iter_d1)
    iter_size_2=len(train_iter_d2)

    print("###################### \t Epoch: ",epoch," \t ######################")

    model.train()

    t=0
    step=25/(len(train_iter_d1)+len(train_iter_d2))

    print("Training: ")
    while iter_size_1 or iter_size_2:
      

      optimizer.zero_grad()

      ########## for domain 1 ##########
      if iter_size_1:
        
        t+=1
        batch = next(iter_d1)

        batch.label=batch.label.type(torch.FloatTensor)
        batch.sentence=batch.sentence.to(device)
        batch.label=batch.label.to(device)

        predictions = model(batch.sentence,"domain 1")
        loss_1 = criterion_1(predictions,batch.label)
        train_acc+= binary_accuracy(predictions,batch.label).item()
        loss_1.backward()
        train_loss+= loss_1.item()
        iter_size_1= iter_size_1 -1

      ########## for domain 2 ###########
      if iter_size_2:
        
        t+=1
        batch = next(iter_d2)

        batch.label=batch.label.type(torch.FloatTensor)
        batch.sentence=batch.sentence.to(device)
        batch.label=batch.label.to(device)

        predictions = model(batch.sentence, "domain 2")
        loss_2 = criterion_2(predictions,batch.label)
        train_acc+= binary_accuracy(predictions,batch.label).item()
        loss_2.backward()
        train_loss+=loss_2.item()
        iter_size_2= iter_size_2 -1

      optimizer.step()



      print_progress(step, t, len(train_iter_d1)+len(train_iter_d2))
    
    
    print()
    train_acc/=len(train_iter_d1)+len(train_iter_d2)
    train_loss/=len(train_iter_d1)+len(train_iter_d2)

    tb.save_value("Loss","train loss",epoch, train_loss)
    tb.save_value("Accuracy","train accuracy",epoch, train_acc)

    print('\nTraining Loss: {:.6f} \t Training accuracy: {: .6f}'.format( train_loss,train_acc))
    training_acc.append(train_acc)
    training_losses.append(train_loss)
    print()

    with torch.no_grad():

      losses=[]
      accuracies=[]

      model.eval()
      domain=1
      
      domain_loss=[]
      domain_acc=[]
      
      for val_iter in [val_iter_d1, val_iter_d2]:
        
      
        t=0
        step=25/len(val_iter)

        
        print("Validation (domain "+str(domain)+" ) :" )
        for val_batch in val_iter:
          t+=1

          val_batch.label=val_batch.label.type(torch.FloatTensor)

          val_batch.sentence=val_batch.sentence.to(device)
          val_batch.label=val_batch.label.to(device)

          predictions = model(val_batch.sentence,"domain "+str(domain))
          if domain==1:
            loss = criterion_1(predictions,val_batch.label)
          else:
            loss = criterion_2(predictions,val_batch.label)

          acc=binary_accuracy(predictions,val_batch.label)


          val_loss+=loss.item()
          val_acc+=acc.item()
          losses.append(loss.item())
          accuracies.append(acc.item())

          print_progress(step, t, len(val_iter))
          
          
        
        
        print()
        val_acc/=len(val_iter)
        val_loss/=len(val_iter)

        

        tb.save_value("Domain Loss","Domain"+str(domain)+"validation loss",epoch, val_loss)
        tb.save_value("Domain Accuracy","Domain"+str(domain)+"validation accuracy",epoch, val_acc)

        domain=domain+1

      print("Domain 1 loss: ",sum(losses[:len(val_iter_d1)])/len(val_iter_d1),"\t","Domain 2 loss: ",sum(losses[len(val_iter_d1):])/len(val_iter_d2))
      print("Domain 1 accuracy: ",sum(accuracies[:len(val_iter_d1)])/len(val_iter_d1),"\t","Domain 2 accuracy: ",sum(accuracies[len(val_iter_d1):])/len(val_iter_d2))
      

      validation_loss= sum(losses)/len(losses)
      validation_acc= sum(accuracies)/len(accuracies)

      print('Validation Loss: {:.6f} \t Validation accuracy: {: .6f}'.format( validation_loss,validation_acc))
      validation_accuracies.append(validation_acc)
      validation_losses.append(validation_loss)

      tb.save_value("Loss","val loss",epoch, validation_loss)
      tb.save_value("Accuracy","val accuracy",epoch, validation_acc)


      if validation_loss< min_val_loss:
        min_val_loss= validation_loss
        print("Validation loss reduced to {:.6f}, saving model............".format(validation_loss))
        torch.save(model.state_dict(), 'text-model.epoch='+str(epoch)+'.pt')
        print()
  
      print()
      #print(len(losses),losses)
      #print(len(accuracies),accuracies)

In [0]:
train(model, epochs, train_iter_d1, train_iter_d2, val_iter_d1, val_iter_d2)

###################### 	 Epoch:  1  	 ######################
Training: 

W0718 21:10:03.844654 140508989204352 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/tensorboardcolab/core.py:49: The name tf.summary.FileWriter is deprecated. Please use tf.compat.v1.summary.FileWriter instead.

W0718 21:10:03.849380 140508989204352 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/tensorboardcolab/core.py:101: The name tf.Summary is deprecated. Please use tf.compat.v1.Summary instead.




Training Loss: 0.679444 	 Training accuracy:  0.564749

Validation (domain 1 ) :
Validation (domain 2 ) :
Domain 1 loss:  0.6692618091901144 	 Domain 2 loss:  0.6494306020958479
Domain 1 accuracy:  0.6069444444444444 	 Domain 2 accuracy:  0.6121608531752298
Validation Loss: 0.659572 	 Validation accuracy:  0.609493
Validation loss reduced to 0.659572, saving model............


###################### 	 Epoch:  2  	 ######################
Training: 

Training Loss: 0.643503 	 Training accuracy:  0.636245

Validation (domain 1 ) :
Validation (domain 2 ) :
Domain 1 loss:  0.600323932700687 	 Domain 2 loss:  0.582449747379436
Domain 1 accuracy:  0.6972222222222222 	 Domain 2 accuracy:  0.714631783407788
Validation Loss: 0.591590 	 Validation accuracy:  0.705729
Validation loss reduced to 0.591590, saving model............


###################### 	 Epoch:  3  	 ######################
Training: 

Training Loss: 0.616274 	 Training accuracy:  0.666312

Validation (domain 1 ) :
Validation (d

##Save Model

In [0]:
import shutil

for i in range(1,7):
  shutil.copy("./text-model.epoch="+str(i)+".pt","./gdrive/My Drive/Amazon Review Dataset/text-model.epoch="+str(i)+".pt")
  

#Testing

In [0]:
def testing(model_path=None,model=None):
  
  if model_path:
    state_dict=torch.load(model_path)
    model.load_state_dict(state_dict)
  
  model.eval()
  count=0
  with torch.no_grad():
    
    domain=["domain 1","domain 2"]
    test_data= [test_iter_d1, test_iter_d2]   
    
    for test_iter,domain in zip(test_data, domain):
        print(domain,"\n")
        test_acc =0
        test_loss=0
        t=0
        step=25/len(test_iter)
        for test_batch in test_iter:
          t+=1

          test_batch.label=test_batch.label.type(torch.FloatTensor)

          test_batch.sentence=test_batch.sentence.to(device)
          test_batch.label=test_batch.label.to(device)

          predictions = model(test_batch.sentence,domain)
          loss = criterion_1(predictions,test_batch.label)
          acc=binary_accuracy(predictions,test_batch.label)


          test_loss+=loss.item()
          test_acc+=acc.item()
          
          count+=1

          print("Testing", end="")
          print_progress(step, t, len(test_iter))

        test_acc/=len(test_iter)
        test_loss/=len(test_iter)
        print('\nTest Loss: {:.6f} \t Test accuracy: {: .6f}'.format( test_loss,test_acc))
        
        print()

In [0]:
testing(model=model)

domain 1 

Test Loss: 1.343824 	 Test accuracy:  0.776442

domain 2 

Test Loss: 1.229939 	 Test accuracy:  0.781250



In [0]:
testing(model_path="./text-model.epoch=16.pt",model=model)

domain 1 

Test Loss: 0.601931 	 Test accuracy:  0.766827

domain 2 

Test Loss: 0.606000 	 Test accuracy:  0.762019



In [0]:
testing(model_path="./text-model.epoch=18.pt",model=model)

domain 1 

Test Loss: 0.817655 	 Test accuracy:  0.783654

domain 2 

Test Loss: 0.815865 	 Test accuracy:  0.786058



Hence we safely assume the max length to be 500 words (tokens) in order to improve model's efficiency.

##Tests (Optional)

In [0]:
print(list(map(lambda x:x.domain,train[:10])))
print(list(map(lambda x:x.domain,train_d1[:10])))
print(list(map(lambda x:x.domain,train_d2[:10])))

['apparel', 'apparel', 'apparel', 'apparel', 'apparel', 'apparel', 'apparel', 'apparel', 'apparel', 'apparel']
['apparel', 'apparel', 'apparel', 'apparel', 'apparel', 'apparel', 'apparel', 'apparel', 'apparel', 'apparel']
['baby', 'baby', 'baby', 'baby', 'baby', 'baby', 'baby', 'baby', 'baby', 'baby']


In [0]:
print(len(TEXT.vocab))
print(LABEL.vocab.stoi)
print(LABEL.vocab.freqs)
print(DOMAIN.vocab.stoi)

58714
defaultdict(<function _default_unk_index at 0x7f84dfb038c8>, {1: 0, 0: 1})
Counter({1: 10056, 0: 9908})
defaultdict(<function _default_unk_index at 0x7f84dfb038c8>, {'apparel': 0, 'books': 1, 'health_personal_care': 2, 'imdb': 3, 'kitchen_housewares': 4, 'music': 5, 'sports_outdoors': 6, 'toys_games': 7, 'video': 8, 'electronics': 9, 'camera_photo': 10, 'magazines': 11, 'software': 12, 'baby': 13})


In [0]:
print(TEXT.vocab.stoi["<end>"])
print(TEXT.vocab.stoi["<pad>"])
print(TEXT.vocab.stoi["n't"])

print(TEXT.vocab.itos[55])


3
1
30
has


In [0]:
print(TEXT.vocab.freqs.most_common(20))

[('the', 125253), ('.', 121329), (',', 96666), ('and', 66508), ('i', 62321), ('to', 60863), ('a', 59332), ('it', 51535), ('of', 46838), ('is', 41696), (' ', 41276), ('this', 34363), ('in', 30303), ('that', 27083), ('for', 25881), ('you', 20498), ('with', 19470), ('was', 18007), ("'s", 17939), ('on', 17860)]


In [0]:
for batch in train_iter_d1:
  print(batch.label)
  print(batch.label.shape)
  print(batch.sentence)
  print(batch.sentence.shape)
  break

tensor([0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0,
        1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0,
        0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0], device='cuda:0')
torch.Size([64])
tensor([[   2,  197,   70,  ...,    1,    1,    1],
        [   2,  515,  466,  ...,    1,    1,    1],
        [   2,   15,   13,  ...,    1,    1,    1],
        ...,
        [   2,   93,   32,  ...,    1,    1,    1],
        [   2,   23,   75,  ...,    1,    1,    1],
        [   2, 4035, 3580,  ...,    1,    1,    1]], device='cuda:0')
torch.Size([64, 540])
