### Installing neccessary packages:

In [4]:
!pip install transformers
# https://huggingface.co/transformers/installation.html
!pip install sentencepiece
# https://pypi.org/project/sentencepiece/
# Python wrapper for SentencePiece. This API will offer the encoding, decoding and training of Sentencepiece.
!pip install Cython
# https://pypi.org/project/Cython/

Collecting transformers
  Downloading transformers-4.12.3-py3-none-any.whl (3.1 MB)
[K     |████████████████████████████████| 3.1 MB 4.1 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.1.0-py3-none-any.whl (59 kB)
[K     |████████████████████████████████| 59 kB 6.2 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 32.0 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 46.2 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 44.2 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attem

## Checking the GPU availabilty

In [5]:
import torch
if torch.cuda.is_available():
    device = torch.device("cuda:0") 
    print("GPU")
else:
    device = torch.device("cpu")
    print("CPU")

GPU


In [6]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Importing the required packages:

In [7]:
import torch
import torch.nn as nn
import torch.optim as optim

In [8]:
import os
import sys
from transformers.optimization import Adafactor 
import time
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5Tokenizer,
    get_linear_schedule_with_warmup
)
import torch
import random
import re

os.chdir('/content/drive/My Drive/Colab Notebooks/NLP-Project')

In [9]:
import pandas as pd
# Reading csv
data = pd.read_csv('/content/drive/My Drive/Colab Notebooks/NLP-Project/dataset.csv', header=None, names=['inputs', 'target'])
print(data.head(5))

                                      inputs  target
0  The sum of 875 and <extra_id_0>21 is 1096       2
1  The sum of 875 and 221 is <extra_id_0>096       1
2    The sum of <extra_id_0>33 and 27 is 360       3
3    The sum of 333 and <extra_id_0>7 is 360       2
4  The sum of 855 and 7<extra_id_0>8 is 1583       2


In [10]:
from sklearn.model_selection import train_test_split

# Test and validation split
train, validation = train_test_split(data, test_size=0.2)

data_train = train.reset_index(drop=True)
data_valid = validation.reset_index(drop=True)

In [11]:
# Initializing Parameters 
batch_size, num_of_epochs = 8, 2
num_of_batches = int(len(data_train)/batch_size)

In [12]:
# Reference
# https://huggingface.co/transformers/model_doc/t5.html
# https://medium.com/analytics-vidhya/t5-a-detailed-explanation-a0ac9bc53e51
# https://towardsdatascience.com/data-to-text-generation-with-t5-building-a-simple-yet-advanced-nlg-model-b5cce5a6df45

In [19]:
# T5-base
tokenizer = T5Tokenizer.from_pretrained('t5-base')

model = T5ForConditionalGeneration.from_pretrained('t5-base', return_dict=True)
# moving the model to device(GPU/CPU)
model.to(device)

Downloading:   0%|          | 0.00/773k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.32M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.17k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/850M [00:00<?, ?B/s]

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseReluDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dr

In [20]:
# Optimizer
# https://huggingface.co/transformers/model_doc/t5.html#overview
optimizer = Adafactor(
    model.parameters(),
    lr=3e-4, # Initializing the learning Rate as suggested in the T5 official documentation
    eps=(1e-30, 1e-3),
    clip_threshold=1.0,
    decay_rate=-0.8,
    beta1=None,
    weight_decay=0.0,
    relative_step=False,
    scale_parameter=False,
    warmup_init=False
)

In [21]:
from IPython.display import HTML, display

# Setting the progress, with html as UI.
def progress(loss, value, max=100):
    return HTML(""" Batch loss :{loss}
        <progress
            value='{value}'
            max='{max}',
            style='width: 100%'
        >
            {value}
        </progress>
    """.format(loss=loss,value=value, max=max))

In [22]:
import gc

# Sets the module in training mode
model.train()

for epoch in range(1,num_of_epochs+1):
  print('Running epoch: {}'.format(epoch))
  running_loss=0
  out = display(progress(1, num_of_batches+1), display_id=True)

  for i in range(num_of_batches):
    new_df = data_train[i*batch_size:i*batch_size+batch_size]
    inputbatch, labelbatch = [], []

    for index, row in new_df.iterrows():
      inputbatch.append(row['inputs'])
      labelbatch.append(str(row['target']))
    
    # Encoding the input text in batches and picking up the input Ids
    inputbatch=tokenizer.batch_encode_plus(inputbatch, padding=True, max_length=400, return_tensors='pt')["input_ids"]
    labelbatch=tokenizer.batch_encode_plus(labelbatch, padding=True, max_length=400, return_tensors="pt")["input_ids"]

    # pushing to device
    inputbatch=inputbatch.to(device)
    labelbatch=labelbatch.to(device)
  
    # clear out the gradients of all Variables 
    optimizer.zero_grad()
    gc.collect()
    torch.cuda.empty_cache()

    # Forward propogation
    outputs = model(input_ids=inputbatch, labels=labelbatch)
    loss = outputs.loss
    loss_num=loss.item()
    logits = outputs.logits
    running_loss+=loss_num
    out.update(progress(loss_num,i, num_of_batches+1))

    # calculating the gradients
    loss.backward()

    # updating the params
    optimizer.step()
    
  running_loss=running_loss/int(num_of_batches)
  print('Epoch: {} , Running loss: {}'.format(epoch,running_loss))

Running epoch: 1


  "`max_length` is ignored when `padding`=`True` and there is no truncation strategy. "


Epoch: 1 , Running loss: 0.9187459457311193
Running epoch: 2


Epoch: 2 , Running loss: 0.6483135477502


In [13]:
# Changing the directory to store the model there.
print(os.getcwd())
os.chdir('/content/drive/My Drive/Colab Notebooks/NLP-Project/')
print(os.getcwd())

/content/drive/My Drive/Colab Notebooks/NLP-Project
/content/drive/My Drive/Colab Notebooks/NLP-Project


In [14]:
# Loading the configuration file for 't5-base' model
!wget https://s3.amazonaws.com/models.huggingface.co/bert/t5-base-config.json

--2021-11-03 14:04:12--  https://s3.amazonaws.com/models.huggingface.co/bert/t5-base-config.json
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.217.229.32
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.217.229.32|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1199 (1.2K) [application/json]
Saving to: ‘t5-base-config.json.1’


2021-11-03 14:04:13 (25.3 MB/s) - ‘t5-base-config.json.1’ saved [1199/1199]



#### Saving the Model (creating checkpoint)

In [25]:
# saving the state
torch.save(model.state_dict(),'Masked_number_prediction_model.bin')

In [26]:
PATH = "Masked_number_prediction_model.pt"
torch.save({
            'epoch': num_of_epochs,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': running_loss,
            }, PATH)

In [15]:
model_load = T5ForConditionalGeneration.from_pretrained('Masked_number_prediction_model.bin', return_dict=True, config='t5-base-config.json')

In [24]:
# Function to generate sentences from symptoms on the test dataset
def generateText(text):
  model_load.eval()
  input_ids = tokenizer.encode(text, return_tensors="pt")  # Batch size 1
  # input_ids.to(dev)
  s = time.time()
  outputs = model_load.generate(input_ids)
  gen_text=tokenizer.decode(outputs[0]).replace('<pad>','').replace('</s>','')
  elapsed = time.time() - s
  print('Generated in {} seconds'.format(str(elapsed)[:4]))

  return gen_text

In [26]:
data_valid
# testing on this for now

Unnamed: 0,inputs,target
0,The sum of 593 and <extra_id_0>42 is 1035,4
1,The sum of 741 and 5<extra_id_0>3 is 1244,0
2,The sum of <extra_id_0>6 and 844 is 1410,56
3,The sum of <extra_id_0>5 and 714 is 1679,96
4,The sum of 82 and <extra_id_0>4 is 476,39
...,...,...
15319,The sum of 304 and <extra_id_0>19 is 723,4
15320,The sum of 936 and 348 is <extra_id_0>84,12
15321,The sum of <extra_id_0>03 and 183 is 786,6
15322,The sum of 986 and 396 is 1<extra_id_0>82,3


In [31]:
generateText("The sum of <extra_id_0>03 and 183 is 786") # example

Generated in 0.23 seconds


' 6'

In [None]:
data_test['predictions'] = data_valid.apply(lambda x: generateText(x))
# should try this.