In [1]:
import argparse
import os

from bs4 import BeautifulSoup
from googlesearch import search
import numpy as np
import requests
from transformers import GPT2Config, GPT2LMHeadModel
import torch
from tqdm import tnrange, tqdm_notebook

from dataset import GPT21024Dataset 
from utils import add_special_tokens, beam_search, generate_beam_sample, generate_sample, sample_seq, set_seed, top_k_top_p_filtering

In [2]:
# Notebook configuration: change the defaults below if needed.
# parse_args([]) parses an empty argument list instead of sys.argv, so every
# option takes its default value.

parser = argparse.ArgumentParser()

parser.add_argument("--seed", default=42, type=int, help="seed to replicate results")
parser.add_argument("--num_workers", default=4, type=int, help="num of cpus available")
# Fall back to the CPU when CUDA is not available so the later
# `.to(args.device)` calls do not fail on GPU-less machines.
# `type=torch.device` converts a device given as a string (e.g. "cuda:1").
parser.add_argument(
    "--device",
    default=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
    type=torch.device,
    help="torch.device object",
)
parser.add_argument("--output_dir", default='./output', type=str, help="path to save evaluation results")
parser.add_argument("--model_dir", default='./weights', type=str, help="path to save trained model")
parser.add_argument("--root_dir", default='./CNN/gpt2_1024_data', type=str, help="location of json dataset.")
parser.add_argument("--ids_file", default='./CNN/ids.json', type=str, help="location of train, valid and test file indexes")
args = parser.parse_args([])
print(args)

Namespace(device=device(type='cuda'), ids_file='./CNN/ids.json', model_dir='./weights', n_gpu=1, num_workers=4, output_dir='./output', root_dir='./CNN/gpt2_1024_data', seed=42)


In [3]:
# Create the tokenizer via the project's add_special_tokens() helper and load
# the dataset in 'test' mode (length=500) from args.root_dir / args.ids_file.
# The train and valid splits are not loaded in this notebook; the equivalent
# calls would be:
#   GPT21024Dataset(args.root_dir, args.ids_file, mode='train', length=3000)
#   GPT21024Dataset(args.root_dir, args.ids_file, mode='valid', length=500)
tokenizer = add_special_tokens()
test_data = GPT21024Dataset(
    args.root_dir,
    args.ids_file,
    mode='test',
    length=500,
)


In [4]:
# model_file and config_file are the files used to load the finetuned model;
# change these names to match your own files.

# model_file = os.path.join(args.model_dir, 'model_data{}_trained_after_{}_epochs_only_sum_loss_ignr_pad.bin'.format(len(train_data),args.num_train_epochs))
# config_file = os.path.join(args.model_dir, 'config_data{}_trained_after_{}_epochs_only_sum_loss_ignr_pad.json'.format(len(train_data),args.num_train_epochs))

# path to model and config files
model_file = "345-model_O0_data3000_trained_after_5_epochs_only_sum_loss_ignr_pad.bin"
config_file = "345-config_O0_data3000_trained_after_5_epochs_only_sum_loss_ignr_pad.json"

# Build an untrained model with the saved architecture, then fill in the weights.
config = GPT2Config.from_json_file(config_file)
model = GPT2LMHeadModel(config)
# map_location remaps the stored tensors onto args.device, so a checkpoint that
# was saved from a GPU can also be loaded on a machine without CUDA.
# NOTE(security): torch.load unpickles the file; only load trusted checkpoints.
state_dict = torch.load(model_file, map_location=args.device)
model.load_state_dict(state_dict)
# eval() switches the model to evaluation mode (disables the dropout layers).
model.eval()
model.to(args.device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50259, 1024)
    (wpe): Embedding(1024, 1024)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): Block(
        (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): Block(
        (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2):

In [5]:
# Run the project's generate_sample helper on the test split with the
# sampling settings below (2 samples, length 100).
generate_sample(
    test_data,
    tokenizer,
    model,
    num=2,
    length=100,
    temperature=1,
    top_k=10,
    top_p=0.5,
    device=args.device,
)

HBox(children=(IntProgress(value=0), HTML(value='')))


new_article

Rome -LRB- CNN -RRB- -- A cruise ship of the Costa Cruises line is adrift off the coast of the Seychelles after a fire in its engine room, the Italian coast guard said Monday. The ship, the Allegra, is a sister of the Costa Concordia, which wrecked off the coast of Italy on January 13, killing at least 21 people. The fire left the Allegra without propulsion, although its communications equipment is intact, the authorities said. The Allegra's fire has been put out, and the passengers are all in good health, the authorities said. The Seychelles is sending a tug, and merchant ships in the area are steaming toward the Allegra, the coast guard said.

generated_summary

 The ship is carrying cargo from the Seychelles . The ship was carrying cargo from the Seychelles . The ship was carrying cargo from the Seychelles . The ship was carrying cargo from the Seychelles . The ship was carrying cargo from the Seychelles . The ship was carrying cargo from the Seychelles . The ship was 

HBox(children=(IntProgress(value=0), HTML(value='')))


new_article

Islamabad, Pakistan -LRB- CNN -RRB- -- A Pakistani politician and his bodyguard were killed Monday in a suicide attack in northwest Pakistan, a police official told CNN. Hanif Jadoon had just finished morning prayers on the Islamic holiday of Eid al-Adha when a bomber approached his car and detonated his explosives, police official Muhammad Ejaz Khan said. The attack took place in the Swabi district of Khyber Pakhtunkhwa province, about 80 kilometers -LRB- 50 miles -RRB- west of Islamabad. Jadoon was a member of the Awami National Party, a secular party often targeted by the Taliban. Nine others were injured in the attack. No one has claimed responsibility for the attack, police said.

generated_summary

 Hanif Jadoon had just finished morning prayers on the Islamic holiday of Eid al-Adha . The attack took place in Swabi district of Khyber Pakhtunkhwa province , about 80 kilometers -LRB- 50 miles -RRB- west of Islamabad . Hanif Jadoon had just finished morning prayers on 

In [6]:
# Run the project's generate_beam_sample helper on the test split
# (2 samples, length 100, beam_size 3).
generate_beam_sample(
    test_data,
    tokenizer,
    model,
    num=2,
    length=100,
    beam_size=3,
    device=args.device,
)

  next_token_probs = F.softmax(next_token_logits)


HBox(children=(IntProgress(value=0, max=99), HTML(value='')))

  next_token_probs = F.softmax(next_token_logits)



new_article

Rome -LRB- CNN -RRB- -- A cruise ship of the Costa Cruises line is adrift off the coast of the Seychelles after a fire in its engine room, the Italian coast guard said Monday. The ship, the Allegra, is a sister of the Costa Concordia, which wrecked off the coast of Italy on January 13, killing at least 21 people. The fire left the Allegra without propulsion, although its communications equipment is intact, the authorities said. The Allegra's fire has been put out, and the passengers are all in good health, the authorities said. The Seychelles is sending a tug, and merchant ships in the area are steaming toward the Allegra, the coast guard said

actual_summary

An engine room fire leaves the Costa Allegra without propulsion, authorities say. Its sister ship, the Costa Concordia, shipwrecked last month, killing at least 21. <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> 

HBox(children=(IntProgress(value=0, max=99), HTML(value='')))


new_article

Islamabad, Pakistan -LRB- CNN -RRB- -- A Pakistani politician and his bodyguard were killed Monday in a suicide attack in northwest Pakistan, a police official told CNN. Hanif Jadoon had just finished morning prayers on the Islamic holiday of Eid al-Adha when a bomber approached his car and detonated his explosives, police official Muhammad Ejaz Khan said. The attack took place in the Swabi district of Khyber Pakhtunkhwa province, about 80 kilometers -LRB- 50 miles -RRB- west of Islamabad. Jadoon was a member of the Awami National Party, a secular party often targeted by the Taliban. Nine others were injured in the attack. No one has claimed responsibility for the attack, police said

actual_summary

Hanif Jadoon was a member of the Awami National Party. The secular party is often targeted by the Taliban. Police say no one has claimed responsibility for the attack. Nine others are injured. <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|pad|> <|p

## Download An Article Given A Query

In [7]:
def sentences_from_query(query, timeout=10):
    """Download a web page and return the text of all its <p> tags.

    Args:
        query: Either a URL (anything starting with "http"), which is fetched
            directly, or a search phrase, in which case the first result
            returned by `search` is fetched.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A single string: the whitespace-stripped text of every <p> tag on the
        page, joined with single spaces.

    Raises:
        ValueError: If the search for `query` yields no results.
        requests.HTTPError: If the page responds with an error status code.
    """
    # Get url
    if query.startswith("http"):
        url = query
    else:
        # `search` may hand back a list or a lazy generator depending on the
        # installed googlesearch version; next(iter(...)) works for both and
        # lets an empty result be reported explicitly instead of an IndexError.
        url = next(iter(search(query, num_results=1)), None)
        if url is None:
            raise ValueError("no search results found for query: {!r}".format(query))
    print(url)
    # A timeout keeps an unresponsive server from hanging the kernel forever.
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx rather than summarising an error page.
    response.raise_for_status()
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed (and to avoid bs4's warning).
    soup = BeautifulSoup(response.text, "html.parser")
    # Get the text from each of the "p" tags and strip surrounding whitespace.
    p_tags = soup.find_all('p')
    return " ".join(tag.get_text().strip() for tag in p_tags)

In [8]:
# Fetch the page text for the query, encode it with the tokenizer, and keep
# only the first 900 token ids.
# NOTE(review): 900 presumably leaves room for the generated tokens within the
# model's 1024 positions — confirm against sample_seq.
article_text = sentences_from_query("neural embedding")
article_ids = tokenizer.encode(article_text)
article = article_ids[:900]

https://towardsdatascience.com/neural-network-embeddings-explained-4d028e6f0526


Token indices sequence length is longer than the specified maximum sequence length for this model (1957 > 1024). Running this sequence through the model will result in indexing errors


In [9]:
# Call sample_seq with the article ids and a length argument of 50, then keep
# only the ids that come after the article in row 0 of the result.
sampled_ids = sample_seq(
    model,
    article,
    50,
    args.device,
    temperature=1,
    top_k=10,
    top_p=0.5,
)
generated_text = sampled_ids[0, len(article):].tolist()
# Convert ids -> tokens (special tokens skipped) -> plain string.
summary_tokens = tokenizer.convert_ids_to_tokens(generated_text, skip_special_tokens=True)
text = tokenizer.convert_tokens_to_string(summary_tokens)

HBox(children=(IntProgress(value=0, max=50), HTML(value='')))




In [10]:
# Show the (truncated) article followed by the generated summary.
# A single print with sep="\n" emits exactly what one print per item would.
print(
    "Article: \n",
    tokenizer.decode(article),
    "------------------------------------------------------------ \n",
    "Generated Summary: \n",
    text,
    sep="\n",
)

Article: 

Applications of neural networks have expanded significantly in recent years from image segmentation to natural language processing to time-series forecasting. One notably successful use of deep learning is embedding, a method used to represent discrete variables as continuous vectors. This technique has found practical applications with word embeddings for machine translation and entity embeddings for categorical variables. In this article, I’ll explain what neural network embeddings are, why we want to use them, and how they are learned. We’ll go through these concepts in the context of a real problem I’m working on: representing all the books on Wikipedia as vectors to create a book recommendation system. An embedding is a mapping of a discrete — categorical — variable to a vector of continuous numbers. In the context of neural networks, embeddings are low-dimensional, learned continuous vector representations of discrete variables. Neural network embeddings are useful bec