In [None]:
# Mount Google Drive at /content/drive and make the dataset folder the
# working directory, so later relative paths resolve inside it.
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/My Drive/colab_data/datasets/quora-question-pairs

# Load Data

In [None]:
import ast
import os

import pandas as pd

# Never truncate long cell contents when a DataFrame is displayed.
pd.options.display.max_colwidth = None
# Folder holding the dataset files used by this notebook.
ROOT_DIR = '/content/drive/My Drive/colab_data/datasets/quora-question-pairs'

In [None]:
# Read the preprocessed training file and keep only the two question
# columns and the duplicate label.
df = pd.read_csv(
    os.path.join(ROOT_DIR, 'train_preprocessed.csv')
).loc[:, ['question1', 'question2', 'is_duplicate']]
df.head()

# PyTorch and RNN

In [None]:
import torch
from torch import nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch.nn.functional as F
import numpy as np

In [None]:
from sklearn.model_selection import train_test_split

# Fixed seed so the train/validation membership is identical on every
# Restart & Run All; without it each run trains and evaluates on different rows.
SPLIT_SEED = 42
train, val = train_test_split(df, test_size=0.2, random_state=SPLIT_SEED)
print(train.shape)
print(val.shape)

In [None]:
class DuplicateDetector(nn.Module):
    """Siamese LSTM classifier scoring whether two questions are duplicates.

    Both questions go through the same embedding table and the same LSTM.
    The final hidden state of the top LSTM layer of each question is
    concatenated per sample and mapped to one probability by a linear layer
    followed by a sigmoid.

    Args:
        pretrained_embedding: 2-D numpy array of shape
            (vocab_size, embedding_dim) copied into the embedding weights.
        nb_layers: number of stacked LSTM layers.
    """

    def __init__(self, pretrained_embedding, nb_layers=1):
        super(DuplicateDetector, self).__init__()
        self.vocab_size, self.embedding_dim = pretrained_embedding.shape
        self.nb_layers = nb_layers
        self.word_embedding = nn.Embedding(
            num_embeddings=self.vocab_size,
            embedding_dim=self.embedding_dim,
            padding_idx=0,
            sparse=False
        )
        # Copies every row of the pretrained matrix, including row 0 (the padding index).
        self.word_embedding.load_state_dict({'weight': torch.from_numpy(pretrained_embedding)})
        self.lstm = nn.LSTM(
            input_size=self.embedding_dim,
            hidden_size=self.embedding_dim,
            num_layers=self.nb_layers,
            batch_first=True
        )
        self.mlc = nn.Linear(self.embedding_dim * 2, 1)

    def _encode(self, tokens, lengths):
        """Return the top-layer final hidden state, shape (batch, embedding_dim)."""
        embedded = self.word_embedding(tokens)
        packed = torch.nn.utils.rnn.pack_padded_sequence(
            embedded, lengths, batch_first=True, enforce_sorted=False
        )
        _, (hidden, _) = self.lstm(packed)
        # hidden is (num_layers, batch, hidden_size); keep only the last layer
        # so the result has one row per sample whatever nb_layers is.
        return hidden[-1]

    def forward(self, q1, q1_lengths, q2, q2_lengths):
        """Score a batch of question pairs.

        Args:
            q1: integer tensor (batch, max_len_1) of token ids, padded.
            q1_lengths: true (unpadded) length of every row of ``q1``.
            q2: integer tensor (batch, max_len_2) of token ids, padded.
            q2_lengths: true (unpadded) length of every row of ``q2``.

        Returns:
            Tensor of shape (batch, 1) with values in (0, 1).
        """
        q1_encoded = self._encode(q1, q1_lengths)
        q2_encoded = self._encode(q2, q2_lengths)

        # Concatenate along the feature axis so row i holds
        # [q1_i ; q2_i]. Concatenating along dim 0 and then reshaping would
        # mix hidden states that belong to different samples.
        features = torch.cat((q1_encoded, q2_encoded), dim=1)
        return torch.sigmoid(self.mlc(features))

In [None]:
# Pick the device the model will run on and report what was found.
use_cuda = torch.cuda.is_available()
print (use_cuda)

if not use_cuda:
  # No GPU visible: fall back to the CPU device object.
  current_device = torch.device("cpu")
else:
  # torch.cuda.current_device() yields the index of the active GPU.
  current_device = torch.cuda.current_device()
  print(torch.cuda.get_device_name(current_device))

In [None]:
# Resolve the embedding matrix against ROOT_DIR, like the CSV load, so this
# cell does not depend on the current working directory.
emb_weights = np.load(os.path.join(ROOT_DIR, 'pretrained_emb.npy'))
emb_weights.shape

In [None]:
# Build the model from the pretrained embedding matrix and move it to the
# selected device.
model =  DuplicateDetector(emb_weights)
model.to(current_device)

optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
# The model emits one sigmoid probability per pair, so the matching loss is
# binary cross-entropy. CrossEntropyLoss expects one logit per class and
# class-index targets; with a single output column it is 0 for target 0 and
# raises for target 1.
# BCELoss needs float targets with the same shape as the model output.
loss_fn = nn.BCELoss()
epochs = 100

def print_(loss):
    """Print ``loss`` prefixed with a fixed label."""
    print ("The loss calculated: ", loss)

print(model)

In [None]:
class QuoraDataset(Dataset):
  """Map-style dataset over a question-pair DataFrame.

  The frame must provide the columns ``question1`` and ``question2``, each
  holding the string form of a Python list of token ids, and
  ``is_duplicate``.

  Args:
    df: the DataFrame this dataset serves rows from.
  """

  def __init__(self, df):
    self.df = df

  def __len__(self):
    return len(self.df)

  def __getitem__(self, idx):
    """Return the ``idx``-th row (by position) as a dict of q1, q2 and label."""
    # Read from this dataset's own frame, by position. The frame may be a
    # shuffled subset whose index labels are not 0..len-1, and a lookup on
    # the notebook-level `df` would serve rows outside this split.
    row = self.df.iloc[idx]
    # literal_eval parses the stored list literal without executing
    # arbitrary code, unlike eval().
    q1 = torch.LongTensor(ast.literal_eval(row['question1']))
    q2 = torch.LongTensor(ast.literal_eval(row['question2']))
    return {'q1': q1,
            'q2': q2,
            'label': row['is_duplicate']}

In [None]:
def pad_collate(batch):
  """Collate a list of sample dicts into padded question tensors.

  Each sample is a dict with the keys 'q1', 'q2' and 'label'. The questions
  are right-padded with 0 up to the longest sequence in the batch.

  Returns:
    (q1_padded, q2_padded, q1_lengths, q2_lengths, labels), where the
    lengths and labels are plain Python lists in batch order.
  """
  wanted = ('q1', 'q2', 'label')
  template = batch[0]
  columns = {}
  for name in template:
    if name in wanted:
      columns[name] = [sample[name] for sample in batch]

  pad = torch.nn.utils.rnn.pad_sequence
  q1_padded = pad(columns['q1'], batch_first=True, padding_value=0)
  q2_padded = pad(columns['q2'], batch_first=True, padding_value=0)

  # Original lengths are needed later to pack the padded sequences.
  q1_lengths = list(map(len, columns['q1']))
  q2_lengths = list(map(len, columns['q2']))

  return q1_padded, q2_padded, q1_lengths, q2_lengths, columns['label']

In [None]:
# Wrap both splits in datasets and batch them with identical loader settings.
loader_options = dict(batch_size=10, shuffle=True, num_workers=4, collate_fn=pad_collate)

train_dataset, val_dataset = QuoraDataset(train), QuoraDataset(val)
train_dataloader = DataLoader(train_dataset, **loader_options)
val_dataloader = DataLoader(val_dataset, **loader_options)

# Number of batches per epoch for each split.
nb_train_batchs, nb_val_batchs = len(train_dataloader), len(val_dataloader)

# Exercises:

1.   Implement the training component using what you learned from Session 2