In [1]:
# Mount Google Drive into the Colab runtime's filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
%%time
%%capture
!pip install transformers

CPU times: user 31.5 ms, sys: 8.98 ms, total: 40.5 ms
Wall time: 3.23 s


In [1]:
import os
import io
import requests
import numpy as np
import pandas as pd
import re
import zipfile
import random
import time
import csv
import datetime
from itertools import compress
from collections import Counter, defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity

from transformers import AutoTokenizer, AutoConfig, AutoModelForPreTraining, \
                         AdamW, get_linear_schedule_with_warmup, \
                         TrainingArguments, BeamScorer, Trainer

import torch
from torch.utils.data import Dataset, random_split, DataLoader, \
                             RandomSampler, SequentialSampler

from IPython.display import clear_output

# Record the PyTorch build in the notebook output so runs can be compared later.
print(f"PyTorch version: {torch.__version__}")

PyTorch version: 1.10.0+cu111


In [2]:
# Choose the compute device once: 'cuda' when a CUDA device is usable, else 'cpu'.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

In [3]:
# Location handed to `from_pretrained` for both the config and the model, and
# the directory the checkpoint file is read from. Here it is a local folder;
# the original note listed gpt2, gpt2-medium, gpt2-large and gpt2-xl as options.
MODEL = 'models/'

# Control tokens registered with the tokenizer and used to lay out the prompt.
SPECIAL_TOKENS = {
    "bos_token": "<|BOS|>",
    "eos_token": "<|EOS|>",
    "unk_token": "<|UNK|>",
    "pad_token": "<|PAD|>",
    "sep_token": "<|SEP|>",
}

# Passed as `max_length` when generating text.
MAXLEN = 768

In [4]:
def get_tokenier(special_tokens=None):
    """Load the 'gpt2' tokenizer, optionally registering extra special tokens.

    Args:
        special_tokens: Optional mapping passed to
            ``tokenizer.add_special_tokens``. Nothing is added when it is
            ``None`` or empty.

    Returns:
        The tokenizer produced by ``AutoTokenizer.from_pretrained('gpt2')``.
    """
    tok = AutoTokenizer.from_pretrained('gpt2')

    # Nothing to register: hand back the stock tokenizer untouched.
    if not special_tokens:
        return tok

    tok.add_special_tokens(special_tokens)
    print("Special tokens added")
    return tok

def get_model(tokenizer, special_tokens=None, load_model_path=None):
    """Build the model from ``MODEL``, optionally load a checkpoint, move it to ``DEVICE``.

    Args:
        tokenizer: Supplies the token ids written into the model config and,
            when special tokens are in use, the embedding size via ``len(tokenizer)``.
        special_tokens: Only its truthiness is used. When truthy, the
            tokenizer's bos/eos/sep/pad ids go into the config and the token
            embeddings are resized; otherwise the eos id doubles as the pad id.
        load_model_path: Optional path to a state-dict file that is loaded
            into the model after construction.

    Returns:
        The model instance after ``model.to(DEVICE)`` has been called on it.
    """
    uses_special_tokens = bool(special_tokens)

    # Collect the token-id overrides for the config first, then build it once.
    if uses_special_tokens:
        config_kwargs = {
            "bos_token_id": tokenizer.bos_token_id,
            "eos_token_id": tokenizer.eos_token_id,
            "sep_token_id": tokenizer.sep_token_id,
            "pad_token_id": tokenizer.pad_token_id,
        }
    else:
        config_kwargs = {"pad_token_id": tokenizer.eos_token_id}
    config_kwargs["output_hidden_states"] = False

    config = AutoConfig.from_pretrained(MODEL, **config_kwargs)

    # The original note labels this model as GPT2LMHeadModel.
    model = AutoModelForPreTraining.from_pretrained(MODEL, config=config)

    if uses_special_tokens:
        # The vocabulary grew with the added tokens, so the embedding matrix
        # has to match the tokenizer's new length.
        model.resize_token_embeddings(len(tokenizer))

    if load_model_path:
        # torch.load unpickles the file: only point this at trusted checkpoints.
        state_dict = torch.load(load_model_path, map_location=torch.device(DEVICE))
        model.load_state_dict(state_dict)

    model.to(DEVICE)
    return model

In [5]:
# Build the tokenizer with the custom special tokens, then the model whose
# weights are loaded from the checkpoint file inside the MODEL directory.
tokenizer = get_tokenier(special_tokens=SPECIAL_TOKENS)
model = get_model(
    tokenizer,
    special_tokens=SPECIAL_TOKENS,
    load_model_path=os.path.join(MODEL, 'pytorch_model.bin'),
)

Special tokens added


In [6]:
# Example conditioning inputs: a headline and a comma-joined keyword list.
title = "Australia beats India by 7 wickets in Nagpur test"
keywords = ['Nagpur', 'Cricket', 'test', 'Kohli', 'win']
kw = ','.join(keywords)

# Prompt layout: <BOS> title <SEP> keywords <SEP>
prompt = ''.join([
    SPECIAL_TOKENS['bos_token'],
    title,
    SPECIAL_TOKENS['sep_token'],
    kw,
    SPECIAL_TOKENS['sep_token'],
])

# Wrap the token ids in a list to get a batch of one, then move it to the model's device.
generated = torch.tensor([tokenizer.encode(prompt)]).to(DEVICE)

# Switch to inference mode; the trailing ';' hides the module repr in the cell output.
model.eval();

In [7]:
# Sampled text generation with top-k and top-p (nucleus) filtering; 3 samples.
# Sampling is stochastic and no seed is set in this cell, so the generated
# text differs from run to run.
sample_outputs = model.generate(generated,
                                do_sample=True,
                                min_length=100,
                                max_length=MAXLEN,
                                top_k=30,
                                top_p=0.7,
                                temperature=0.9,
                                repetition_penalty=4.0,
                                early_stopping=True,
                                num_return_sequences=3
                                )

# Every returned sequence starts with the prompt's token ids, so strip the
# prompt by token count before decoding. Slicing the decoded string by
# len(title) + len(keywords) only works while decoding reproduces the prompt
# text character-for-character.
prompt_len = generated.shape[-1]

for i, sample_output in enumerate(sample_outputs):
    text = tokenizer.decode(sample_output[prompt_len:], skip_special_tokens=True)
    print("{}: {}\n\n".format(i+1, text))

1: New Delhi: Australia beat South Africa and Bangladesh on Sunday to claim a record seven consecutive victories against the Indian Premier League (IPL) champions. The result came after five straight losses with Pakistan winning four of them – defeating Sri Lanka 3-0 at Kohima Stadium last month while scoring two runs from Tinku Thaouyun’s double off Anuradha Patel over three balls — including one run during an innings that ended 4 overs before it was completed as Karnataka crossed its 100th win under new captain Sushil Chandra Bose ahead for his maiden Test series debut since 2014 when he joined West Indies earlier this year amid speculation about him being brought back into contention if required due transfer fee issues were not addressed soon enough following suspension or criminal charges levelled out upon allegations made regarding Naveen Kabrawala/Rajesh Bhansali vs Kailash Vijayawada between 2013 & 2015.  In fact all six Tests won either via home ground matches versus IPLS oppon

In [17]:
# Decode the first sample in full, without stripping the prompt. With special
# tokens skipped, the title, keywords and generated text run together with no
# separators between them.
text = tokenizer.decode(sample_outputs[0], skip_special_tokens=True)
text



'Australia beats India by 7 wickets in Nagpur testNagpur,Cricket,test,Kohli,winNew Delhi: Australia beat South Africa and Bangladesh on Sunday to claim a record seven consecutive victories against the Indian Premier League (IPL) champions. The result came after five straight losses with Pakistan winning four of them – defeating Sri Lanka 3-0 at Kohima Stadium last month while scoring two runs from Tinku Thaouyun’s double off Anuradha Patel over three balls — including one run during an innings that ended 4 overs before it was completed as Karnataka crossed its 100th win under new captain Sushil Chandra Bose ahead for his maiden Test series debut since 2014 when he joined West Indies earlier this year amid speculation about him being brought back into contention if required due transfer fee issues were not addressed soon enough following suspension or criminal charges levelled out upon allegations made regarding Naveen Kabrawala/Rajesh Bhansali vs Kailash Vijayawada between 2013 & 2015.