In [None]:
# Show the GPU attached to this runtime (model, driver/CUDA version, memory use).
# The leading `!` hands the command to a shell, so `nvidia-smi` must be installed.
!nvidia-smi

Sun Aug 30 07:05:50 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.66       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   48C    P0    66W / 149W |      0MiB / 11441MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
!pip install -q transformers

In [None]:
# Importing stock libraries
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

from transformers import T5Tokenizer, T5ForConditionalGeneration

# Preparing for TPU usage
#import torch_xla
#import torch_xla.core.xla_model as xm
#device = xm.xla_device()

In [None]:
# Pick the compute device: the GPU when CUDA is usable, otherwise the CPU.

from torch import cuda
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [None]:
# Load the dataset from `news.csv` (path is relative to the working directory).
# Later cells read its `text` and `ctext` columns.
df = pd.read_csv('news.csv')

In [None]:
# Preview the first rows to sanity-check the loaded columns.
df.head()

Unnamed: 0,text,ctext
0,mumbai extending its gain for the second cons...,summarize: extending its gain for the second c...
1,indian equity market may deliver better return...,summarize: morgan stanley says macros are driv...
2,new delhi the sp bse consumer durables index ...,summarize: the s p bse consumer durables index...
3,sydney caution gripped asian share markets on...,summarize: msci s broadest index of asia pacif...
4,mumbai brokerages have turned bearish on bank...,summarize: morgan stanley is of the view that ...


In [None]:
# Configuration

# Key variables used by the dataset, dataloader, optimizer and training cells below.

# Tokenizer for the pretrained "t5-base" checkpoint.
tokenizer = T5Tokenizer.from_pretrained("t5-base")
# Token length used for the source (`ctext`) sequences; passed to CustomDataset as `source_len`.
MAX_LEN = 512
# Token length used for the target (`text`) sequences; passed to CustomDataset as `summ_len`.
SUMMARY_LEN = 60
# Batch size of the training DataLoader.
TRAIN_BATCH_SIZE = 4
# Batch size of the test DataLoader.
VALID_BATCH_SIZE = 1
# Number of passes over the training data.
EPOCHS = 2
# Learning rate handed to the Adam optimizer.
LEARNING_RATE = 1e-4

In [None]:
# Custom dataset: reads rows from the dataframe and tokenizes them so a DataLoader can feed them to the model, first for fine-tuning and later for predictions.


class CustomDataset(Dataset):
    """Tokenized view over a dataframe with `ctext` (model input) and `text` (target).

    Args:
        dataframe: frame exposing `text` and `ctext` columns.
        tokenizer: object providing `batch_encode_plus` (here a T5 tokenizer).
        source_len: length every encoded `ctext` is padded/truncated to.
        summ_len: length every encoded `text` is padded/truncated to.
    """

    def __init__(self, dataframe, tokenizer, source_len, summ_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.source_len = source_len
        self.summ_len = summ_len
        self.text = self.data.text
        self.ctext = self.data.ctext

    def __len__(self):
        """Return the number of rows in the underlying frame."""
        return len(self.text)

    def _encode(self, raw, max_length):
        """Collapse whitespace in `raw` and encode it to exactly `max_length` tokens."""
        cleaned = ' '.join(str(raw).split())
        # Ask for padding and truncation explicitly instead of relying on the
        # deprecated `pad_to_max_length` flag and the implicit truncation fallback.
        return self.tokenizer.batch_encode_plus(
            [cleaned],
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

    def __getitem__(self, index):
        """Return the encoded source/target tensors for the row at position `index`.

        Returns:
            dict with `source_ids`, `source_mask`, `target_ids` and `target_ids_y`
            (the latter two hold the same target ids), all `torch.long`.
        """
        # Positional lookup, so the dataset also works when the frame's index
        # was not reset to 0..n-1.
        source = self._encode(self.ctext.iloc[index], self.source_len)
        target = self._encode(self.text.iloc[index], self.summ_len)

        # Drop only the leading batch dimension added by `batch_encode_plus`.
        source_ids = source['input_ids'].squeeze(0)
        source_mask = source['attention_mask'].squeeze(0)
        target_ids = target['input_ids'].squeeze(0)

        return {
            'source_ids': source_ids.to(dtype=torch.long),
            'source_mask': source_mask.to(dtype=torch.long),
            'target_ids': target_ids.to(dtype=torch.long),
            'target_ids_y': target_ids.to(dtype=torch.long)
        }

In [None]:
# Creating the dataset and dataloader for the neural network

train_size = 0.9
# Sample the training rows first and drop them from `df` by their ORIGINAL index
# labels. Resetting the index before the drop would remove labels 0..N-1 instead
# of the sampled rows, leaving a "test" set that overlaps the training data.
train_rows = df.sample(frac=train_size, random_state=42)
test_rows = df.drop(train_rows.index)
assert len(train_rows) + len(test_rows) == len(df), "train/test split must partition df"

# Renumber both frames to 0..n-1 only after the split is final.
train_dataset = train_rows.reset_index(drop=True)
test_dataset = test_rows.reset_index(drop=True)


print("FULL Dataset: {}".format(df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

# Wrap both frames so the DataLoaders can tokenize rows on demand.
training_set = CustomDataset(train_dataset, tokenizer, MAX_LEN, SUMMARY_LEN)
testing_set = CustomDataset(test_dataset, tokenizer, MAX_LEN, SUMMARY_LEN)

FULL Dataset: (7622, 2)
TRAIN Dataset: (6860, 2)
TEST Dataset: (762, 2)


In [None]:
# DataLoader settings: the training loader shuffles, the test loader keeps row order.
train_params = dict(batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=0)
test_params = dict(batch_size=VALID_BATCH_SIZE, shuffle=False, num_workers=0)

# Build one loader per split from the tokenizing datasets.
training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [None]:
# Load the pretrained "t5-base" checkpoint and place it on the selected device.
model = T5ForConditionalGeneration.from_pretrained("t5-base").to(device)

Some weights of T5ForConditionalGeneration were not initialized from the model checkpoint at t5-base and are newly initialized: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Adam over every model parameter, using the learning rate from the config cell.
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [None]:
def train(epoch, log_every=1000):
    """Run one pass over `training_loader`, updating `model` in place.

    Reads the notebook-level `model`, `training_loader`, `tokenizer`,
    `optimizer` and `device`.

    Args:
        epoch: epoch number, used only in the progress message.
        log_every: print the loss every `log_every` batches (positive int).
    """
    model.train()
    for step, data in enumerate(training_loader):
        y = data['target_ids'].to(device, dtype=torch.long)
        # Teacher forcing: the decoder sees tokens [0, n-1) and must predict [1, n).
        y_ids = y[:, :-1].contiguous()
        lm_labels = y[:, 1:].clone().detach()
        # -100 marks padding positions so they are ignored by the loss.
        lm_labels[y[:, 1:] == tokenizer.pad_token_id] = -100
        ids = data['source_ids'].to(device, dtype=torch.long)
        mask = data['source_mask'].to(device, dtype=torch.long)

        # `labels` is the supported keyword; `lm_labels` is its deprecated alias.
        outputs = model(input_ids=ids, attention_mask=mask, decoder_input_ids=y_ids, labels=lm_labels)
        loss = outputs[0]

        if step % log_every == 0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # TPU variant (disabled):
        # xm.optimizer_step(optimizer)
        # xm.mark_step()

In [None]:
# Set the root logger's threshold to ERROR so lower-severity records are not printed.
import logging
logging.basicConfig(level=logging.ERROR)

In [None]:
# Fine-tune for EPOCHS passes over the training loader; `train` prints the loss periodically.
for epoch in range(EPOCHS):
    train(epoch)

Epoch: 0, Loss:  8.970845222473145
Epoch: 0, Loss:  2.7758710384368896
Epoch: 1, Loss:  2.0630688667297363
Epoch: 1, Loss:  3.1202943325042725


In [None]:
# Save the fine-tuned model to disk so it can be reloaded with `from_pretrained`.
# NOTE(review): the absolute path is hard-coded; the reload cell must use the same one.
model.save_pretrained('/content/model/t5')


In [None]:
# Reload the saved checkpoint, replacing the in-memory `model`.
# The reloaded model is not moved to `device`; the inference cells below likewise
# pass tensors without `.to(device)` — NOTE(review): confirm CPU inference is intended.
model = T5ForConditionalGeneration.from_pretrained('/content/model/t5')

In [None]:
# Re-create the "t5-base" tokenizer for inference (same checkpoint name as in the config cell).
tokenizer = T5Tokenizer.from_pretrained('t5-base')

In [None]:
#text = 'Hours after Rajasthan chief minister Ashok Gehlot reiterated his “forgive and forget” mantra on Thursday with the rebels back in party fold, the Congress revoked the suspension of two MLAs who were suspended on July 17 following allegations that they were involved in a conspiracy to topple the state government. MLAs Bhanwar Lal Sharma and Vishvendra Singh were part of the rebel camp supporting former Rajasthan deputy chief minister Sachin Pilot. The suspension followed the leak of audio tapes with conversations allegedly between Bhanwar Lal Sharma, Sanjay Jain, an intermediary, and Union minister of Jal Shakti Gajendra Singh Shekhawat, which hinted at a conspiracy to bring down the government.'

In [None]:
# Pick one example to run through the model. During fine-tuning the encoder input
# is the `ctext` column (the one carrying the "summarize: " prefix) and `text` is
# the target, so the inference input has to come from `ctext` as well.
text = df['ctext'][2]

In [None]:
# Tokenize the example into PyTorch tensors. Despite its name, `input_ids` holds the
# whole encoding; the token ids are read from its 'input_ids' entry in the next cell.
input_ids = tokenizer.encode_plus(text, return_tensors="pt")

In [None]:
# Generate the output sequence. The length cap is set to SUMMARY_LEN, the same
# length the training targets were encoded to, rather than the library default.
# The attention mask is passed explicitly alongside the token ids.
outputs = model.generate(
    input_ids=input_ids['input_ids'],
    attention_mask=input_ids['attention_mask'],
    max_length=SUMMARY_LEN,
)

In [None]:
# Decode every generated id sequence back to text, dropping special tokens
# and cleaning up tokenization spaces.
pred = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in outputs]

In [None]:
# Display the decoded prediction(s).
pred

['whirlpool of india down 0 93 per cent were trading with losses']

In [None]:
# Show the `ctext` value of the same row next to the prediction.
# NOTE(review): CustomDataset encodes `ctext` as the model input and `text` as the
# target — confirm which column is the intended reference for this comparison.
df['ctext'][2]

'summarize: the s p bse consumer durables index was trading 0 04 per cent down at 22 332 55 around 11 57 am '