## Loading data from kaggle

In [2]:
!pip install transformers -q


[0m

In [3]:

import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

# Importing the T5 modules from huggingface/transformers
from transformers import T5TokenizerFast, T5ForConditionalGeneration

In [4]:

from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [5]:
class CustomDataset(Dataset):

    def __init__(self, dataframe, tokenizer, source_len, summ_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.source_len = source_len
        self.summ_len = summ_len
        self.article = self.data.article
        self.highlights = self.data.highlights

    def __len__(self):
        return len(self.article)

    def __getitem__(self, index):
        highlights = str(self.highlights[index])
        highlights = ' '.join(highlights.split())

        article = str(self.article[index])
        article = ' '.join(article.split())

        source = self.tokenizer.batch_encode_plus([highlights], max_length= self.source_len, padding="max_length",return_tensors='pt', truncation= True)
        target = self.tokenizer.batch_encode_plus([article], max_length= self.summ_len, padding="max_length",return_tensors='pt', truncation= True)

        source_ids = source['input_ids'].squeeze()
        source_mask = source['attention_mask'].squeeze()
        target_ids = target['input_ids'].squeeze()
        target_mask = target['attention_mask'].squeeze()

        return {
            'source_ids': source_ids.to(dtype=torch.long), 
            'source_mask': source_mask.to(dtype=torch.long), 
            'target_ids': target_ids.to(dtype=torch.long),
            'target_ids_y': target_ids.to(dtype=torch.long)
        }


In [6]:
def train(epoch, tokenizer, model, device, loader, optimizer):
    model.train()
    for _,data in enumerate(loader, 0):
        y = data['target_ids'].to(device, dtype = torch.long)
        y_ids = y[:, :-1].contiguous()
        lm_labels = y[:, 1:].clone().detach()
        lm_labels[y[:, 1:] == tokenizer.pad_token_id] = -100
        ids = data['source_ids'].to(device, dtype = torch.long)
        mask = data['source_mask'].to(device, dtype = torch.long)

        outputs = model(input_ids = ids, attention_mask = mask, decoder_input_ids=y_ids, labels=lm_labels)
        loss = outputs[0]
        

        if _%500==0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [7]:
def validate(epoch, tokenizer, model, device, loader):
    model.eval()
    predictions = []
    actuals = []
    with torch.no_grad():
        for _, data in enumerate(loader, 0):
            y = data['target_ids'].to(device, dtype = torch.long)
            ids = data['source_ids'].to(device, dtype = torch.long)
            mask = data['source_mask'].to(device, dtype = torch.long)

            generated_ids = model.generate(
                input_ids = ids,
                attention_mask = mask, 
                max_length=150, 
                num_beams=2,
                repetition_penalty=2.5, 
                length_penalty=1.0, 
                early_stopping=True
                )
            preds = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in generated_ids]
            target = [tokenizer.decode(t, skip_special_tokens=True, clean_up_tokenization_spaces=True)for t in y]
            if _%100==0:
                print(f'Completed {_}')

            predictions.extend(preds)
            actuals.extend(target)
    return predictions, actuals

In [None]:
! pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/

Collecting sentencepiece

  Downloading sentencepiece-0.1.98-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m

[?25hInstalling collected packages: sentencepiece

Successfully installed sentencepiece-0.1.98


In [8]:
from transformers import T5Config

In [9]:
from transformers import T5Tokenizer

In [10]:
tokenizer = T5TokenizerFast.from_pretrained("t5-base")

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [11]:
df = pd.read_csv("/kaggle/input/neutral-da/neut.csv")

In [12]:

df.head(5)

Unnamed: 0,index,id,article,highlights,bias
0,272581,ed0fed726929c1eeabe6c390e47128dbb7d7a055,By . Mia De Graaf . Britons flocked to beaches...,People enjoyed temperatures of 17C at Brighton...,neutral
1,171868,6a70a0d8d3ed365fe1df6d35f1587a8b9b298618,Video footage shows the heart stopping moment ...,A 17-year-old boy suffering lacerations to his...,neutral
2,63167,b37204c13ea38b511265e41ac69fb12acfb63f85,"Istanbul, Turkey (CNN) -- About 250 people rac...",Syrians citizens hightail it to Turkey .\nMost...,neutral
3,68522,c24e5805afd5145bc48410e876db91d44a06be5e,By . Daily Mail Reporter . PUBLISHED: . 12:53 ...,The Xue Long had provided the helicopter that ...,neutral
4,81888,e80e130d55bf30e5a0f547aaaa4cd9930635bfbd,(CNN) -- Place a tennis ball into a yellow soc...,'Muggle quidditch' replicates Harry Potter's m...,neutral


In [13]:
df = df[['article','highlights']]

In [14]:
  train_size = 0.8
  train_dataset=df.sample(frac=train_size,random_state = 42)
  val_dataset=df.drop(train_dataset.index).reset_index(drop=True)
  train_dataset = train_dataset.reset_index(drop=True)

In [15]:
    print("FULL Dataset: {}".format(df.shape))
    print("TRAIN Dataset: {}".format(train_dataset.shape))
    print("TEST Dataset: {}".format(val_dataset.shape))

FULL Dataset: (15467, 2)
TRAIN Dataset: (12374, 2)
TEST Dataset: (3093, 2)


In [23]:
           # Initialize config
TRAIN_BATCH_SIZE = 4   # input batch size for training (default: 64)
VALID_BATCH_SIZE = 4   # input batch size for testing (default: 1000)
TRAIN_EPOCHS = 2        # number of epochs to train (default: 10)
VAL_EPOCHS = 1 
LEARNING_RATE = 2e-4    # learning rate (default: 0.01)
SEED = 42               # random seed (default: 42)
MAX_LEN = 512
SUMMARY_LEN = 350

In [24]:
training_set = CustomDataset(train_dataset, tokenizer, MAX_LEN, SUMMARY_LEN)
val_set = CustomDataset(val_dataset, tokenizer, MAX_LEN, SUMMARY_LEN)

In [25]:
train_params = {
        'batch_size': TRAIN_BATCH_SIZE,
        'shuffle': True,
        'num_workers': 0
        }

val_params = {
        'batch_size': VALID_BATCH_SIZE,
        'shuffle': False,
        'num_workers': 0
        }

In [26]:
training_loader = DataLoader(training_set, **train_params)
val_loader = DataLoader(val_set, **val_params)

In [20]:
model = T5ForConditionalGeneration.from_pretrained("t5-base")
model = model.to(device)

Downloading pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [27]:
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [None]:
!pip install --upgrade transformers


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


















In [28]:
for epoch in range(TRAIN_EPOCHS):
        train(epoch, tokenizer, model, device, training_loader, optimizer)

print('Now generating summaries on our fine tuned model for the validation dataset and saving it in a dataframe')
for epoch in range(VAL_EPOCHS):
    predictions, actuals = validate(epoch, tokenizer, model, device, val_loader)
    final_df = pd.DataFrame({'Generated Text':predictions,'Actual Text':actuals})
    final_df.to_csv('/kaggle/working//predictions.csv')
    print('Output Files generated for review')

Epoch: 0, Loss:  3.083691358566284
Epoch: 0, Loss:  2.8568475246429443
Epoch: 0, Loss:  2.916836977005005
Epoch: 0, Loss:  2.491539478302002
Epoch: 0, Loss:  2.756544351577759
Epoch: 0, Loss:  2.3584907054901123
Epoch: 0, Loss:  2.7519772052764893
Epoch: 1, Loss:  2.3865394592285156
Epoch: 1, Loss:  2.126655340194702
Epoch: 1, Loss:  2.5361714363098145
Epoch: 1, Loss:  2.680100679397583
Epoch: 1, Loss:  2.4016871452331543
Epoch: 1, Loss:  2.4307668209075928
Epoch: 1, Loss:  2.42146372795105
Now generating summaries on our fine tuned model for the validation dataset and saving it in a dataframe
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Completed 0
Completed 100
Completed 200
Completed 300
Completed 400
Completed 500
Completed 600
Completed 700
Output

In [29]:
pred1 = pd.read_csv("/kaggle/working/predictions.csv")
pred1.head(5)

Unnamed: 0.1,Unnamed: 0,Generated Text,Actual Text
0,0,"'It's not easy,' says Sam Smith, 17, who was s...",Video footage shows the heart stopping moment ...
1,1,'Muggle quidditch' is a fictional recreation o...,(CNN) -- Place a tennis ball into a yellow soc...
2,2,"'It's not like it's the same thing,' says one ...",‘Gone fishing’ is a phrase usually associated ...
3,3,Several Republicans are running for Congress i...,If right-wingers catch the wave of an election...
4,4,Hundreds of people have died amid political tu...,Cairo (CNN) -- Violence marked the beginning o...


In [32]:
pred1['Generated Text'][1]

"'Muggle quidditch' is a fictional recreation of Harry Potter's magical sport in the real world. The documentary follows UCLA's quidditch team to the Quidditch World Cup, where they compete against each other in a match for the first time. The film was created in 2005 and now supports 170 US college teams and five full-time staff. Scroll down for video. Quidditch: A documentary follows UCLA's quidditch team to the Quidditch World Cup. It recreates Harry Potter's magical sport in the real world. This documentary follows UCLA's"

In [33]:
pred1['Actual Text'][1]

'(CNN) -- Place a tennis ball into a yellow sock and tuck it into the back of your pants. Now start running. Congratulations! You are a snitch. Don\'t get caught, or the game ends. This is quidditch, as played by muggles. In JK Rowling\'s novels, the snitch is a sort-of magical, winged squash ball, pursued by "seeker" Harry Potter in a variety of high-stakes, airborne quidditch contests. Nobody on the UCLA campus has a magical, winged squash ball. A man with a tennis ball '

In [34]:
import pickle

In [35]:
pickle.dump(model, open('/kaggle/working/T5','wb'))

In [43]:
    import os
    from pydrive.auth import GoogleAuth
    from pydrive.drive import GoogleDrive
    from oauth2client.client import GoogleCredentials

ModuleNotFoundError: No module named 'pydrive'

In [42]:
gauth = GoogleAuth()
credential_file = 'gdrive.json'
gauth.LoadCredentialsFile(credential_file)

NameError: name 'GoogleAuth' is not defined

In [41]:
from IPython.display import FileLink
FileLink(r'/kaggle/working/T5')