In [None]:
!pwd

# GPT Summarization Routines

In [None]:
!pip install openai tiktoken

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting openai
  Downloading openai-0.27.8-py3-none-any.whl (73 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.6/73.6 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tiktoken
  Downloading tiktoken-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m27.2 MB/s[0m eta [36m0:00:00[0m
Collecting aiohttp (from openai)
  Downloading aiohttp-3.8.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m63.2 MB/s[0m eta [36m0:00:00[0m
Collecting multidict<7.0,>=4.5 (from aiohttp->openai)
  Downloading multidict-6.0.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (114 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m114.5/

**Get OpenAI API Key:**

Enter API Key:

In [None]:
import openai
import getpass
import os

# OpenAI organization ID sent with API requests. An OPENAI_ORGANIZATION
# environment variable, when set and non-empty, overrides the notebook default
# so the notebook can be run under a different organization without editing it.
openai.organization = (os.environ.get('OPENAI_ORGANIZATION')
                       or 'org-eWvp2DebNMFgMDwVvbWAg5g3')

# Prompt for the key interactively so it is never stored in the notebook.
openai.api_key = getpass.getpass('API Key: ')

# Also expose the key through the environment for code that reads
# OPENAI_API_KEY instead of openai.api_key (presumably the LangChain clients
# that are created below without an explicit key).
os.environ["OPENAI_API_KEY"] = openai.api_key

API Key: ··········


In [None]:
# Test GPT-4 here

prompt = """
  Your task is to classify the text fragments in the backticks below. Classify each of the
  "Text" values as one of these "Aspect" labels - Kind,Mean.
  If none of these labels is suitable, label it as 'General'.
  Don't include backticks in your output.

  ```
  [
    { "Text": "You are a butt", "Aspect":""},
    { "Text": "You are a friend", "Aspect":""},
    { "Text": "You are a person", "Aspect":""}
  ]
  ```
  """

msgs = [{'role':'user', 'content':prompt}]

resp = openai.ChatCompletion.create(
      model='gpt-4',
      messages=msgs,
      temperature=0.75
)

print(resp['choices'][0]['message']['content'])

[
  { "Text": "You are a butt", "Aspect":"Mean"}, 
  { "Text": "You are a friend", "Aspect":"Kind"}, 
  { "Text": "You are a person", "Aspect":"General"}
]


In [None]:
for m in openai.Model.list()['data']:
  print(m['id'])

In [None]:
def gpt4_summarize_direct(text, prompt=None, system_msg=None, temperature=0.6):
  """Summarize `text` with a single GPT-4 chat request.

  Forwards every argument to `_gpt_summarize_common` with the model fixed to
  'gpt-4' and returns that function's result unchanged.
  """
  return _gpt_summarize_common(model='gpt-4', text=text, prompt=prompt,
                               system_msg=system_msg, temperature=temperature)


def gpt3_summarize_direct(text, prompt=None, system_msg=None, temperature=0.6):
  """Summarize `text` with a single gpt-3.5-turbo chat request.

  Forwards every argument to `_gpt_summarize_common` with the model fixed to
  'gpt-3.5-turbo' and returns that function's result unchanged.
  """
  return _gpt_summarize_common(model='gpt-3.5-turbo', text=text, prompt=prompt,
                               system_msg=system_msg, temperature=temperature)


def _gpt_summarize_common(model, text, prompt, system_msg, temperature):
  """Send `text` to an OpenAI chat model in one request and return its reply.

  The whole input goes into a single user message, so it is assumed to fit
  within the model's token limit; no chunking happens here.
  API Ref: https://platform.openai.com/docs/guides/chat

  Args:
    model: Chat model name passed to the API, e.g. 'gpt-4'.
    text: Text to summarize.
    prompt: Optional instruction. When truthy, the user message is the prompt,
      a blank line, then `text`. Otherwise a built-in prompt asking for a
      summary of at most 3 sentences is used, with `text` wrapped in triple
      backticks.
    system_msg: Optional system message; placed first when truthy.
    temperature: Sampling temperature passed to the API.

  Returns:
    A tuple of (message content of the first choice, full API response).
  """
  fallback_prompt = f"""
  Your task is to generate a short summary of an article on a website.

  Summarize the article below, delimited by triple backticks, in at most 3 sentences.

  ```{text}```
  """

  conversation = [{'role':'system', 'content':system_msg}] if system_msg else []

  user_content = f'{prompt}\n\n{text}' if prompt else fallback_prompt
  conversation.append({'role':'user', 'content':user_content})

  response = openai.ChatCompletion.create(model=model,
                                          messages=conversation,
                                          temperature=temperature)

  return response['choices'][0]['message']['content'], response

In [None]:
import tiktoken

def gpt4_count_tokens(text):
  """Return the number of tokens `text` encodes to under tiktoken's 'gpt-4' encoding."""
  return len(tiktoken.encoding_for_model('gpt-4').encode(text))

def gpt3_count_tokens(text):
  """Return the number of tokens `text` encodes to under tiktoken's 'gpt-3.5-turbo' encoding."""
  return len(tiktoken.encoding_for_model('gpt-3.5-turbo').encode(text))

# LangChain Summarization Routines

https://python.langchain.com/en/latest/modules/chains/index_examples/summarize.html

+ The `stuff` chain dumps the entire document. Not ideal for long docs.

+ The `map_reduce` chain uses chunking. Good for long docs. However, the chunking is naive rather than semantically-aware and context-aware.

+ The `refine` chain summarizes the first chunk, then passes that running summary together with each subsequent chunk back to the model to refine it step by step.

In [None]:
!pip install langchain

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting langchain
  Downloading langchain-0.0.166-py3-none-any.whl (803 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m803.5/803.5 kB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
Collecting aiohttp<4.0.0,>=3.8.3 (from langchain)
  Downloading aiohttp-3.8.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m50.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting async-timeout<5.0.0,>=4.0.0 (from langchain)
  Downloading async_timeout-4.0.2-py3-none-any.whl (5.8 kB)
Collecting dataclasses-json<0.6.0,>=0.5.7 (from langchain)
  Downloading dataclasses_json-0.5.7-py3-none-any.whl (25 kB)
Collecting openapi-schema-pydantic<2.0,>=1.2 (from langchain)
  Downloading openapi_schema_pydantic-1.2.4-py3-none-any.whl (90 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [3

## Prototypes

In [None]:
# Test basic langchain
from langchain.llms import OpenAI

llm = OpenAI(temperature=0.9)

text = "What would be a good company name for a company that makes colorful socks?"
print(llm(text))

In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.schema import (
    AIMessage,
    HumanMessage,
    SystemMessage
)

chat = ChatOpenAI(temperature=0)

chat([HumanMessage(content="Translate this sentence from English to French. I love programming.")])

In [None]:
from langchain.text_splitter import CharacterTextSplitter
from langchain.docstore.document import Document


text_splitter = CharacterTextSplitter()

with open("/content/drive/MyDrive/Colab Notebooks/example-news-article.txt") as f:
    fulltext = f.read()




In [None]:
from langchain.chat_models import ChatOpenAI

from langchain.chains.summarize import load_summarize_chain

chat = ChatOpenAI(openai_api_key=openai.api_key, openai_organization=openai.organization, temperature=0)

chain = load_summarize_chain(chat, chain_type="map_reduce")



In [None]:
full_doc = [Document(page_content=fulltext)]

chain.run(full_doc)

In [None]:
split_texts = text_splitter.split_text(fulltext)

split_docs = [Document(page_content=t) for t in split_texts]

chain.run(split_docs)

## Routines

In [None]:
# Routines for Abstractive summarization

import openai
from langchain.chat_models import ChatOpenAI
from langchain.chains.summarize import load_summarize_chain
from langchain.text_splitter import TokenTextSplitter
from langchain.docstore.document import Document


def gpt4_summarize_lc_abs_mr(text):
  """Abstractive map_reduce summary of `text` using GPT-4 (see `_gpt_summarize_lc_abs_common`)."""
  return _gpt_summarize_lc_abs_common(text=text, model='gpt-4')


def gpt3_summarize_lc_abs_mr(text):
  """Abstractive map_reduce summary of `text` using gpt-3.5-turbo (see `_gpt_summarize_lc_abs_common`)."""
  return _gpt_summarize_lc_abs_common(text=text, model='gpt-3.5-turbo')


def _gpt_summarize_lc_abs_common(text, model):
  """Summarize `text` with LangChain's map_reduce summarize chain and its default prompts.

  The text is split by a TokenTextSplitter (chunk_size=3000, chunk_overlap=100),
  each chunk is wrapped in a Document, and the documents are run through the
  chain using a ChatOpenAI model at temperature 0.75.

  Args:
    text: Full text to summarize.
    model: Chat model name passed to ChatOpenAI, e.g. 'gpt-4'.

  Returns:
    Whatever the chain's `run` returns for the chunk documents.
  """
  chunker = TokenTextSplitter(chunk_size=3000, chunk_overlap=100)
  chunk_docs = [Document(page_content=chunk) for chunk in chunker.split_text(text)]

  # Credentials are taken from the openai module configured earlier in the notebook.
  llm = ChatOpenAI(model_name=model, temperature=0.75,
                   openai_api_key=openai.api_key,
                   openai_organization=openai.organization)

  # Pass verbose=True to load_summarize_chain to trace the intermediate calls.
  summarize_chain = load_summarize_chain(llm, chain_type="map_reduce")

  return summarize_chain.run(chunk_docs)


In [None]:
# Routines for Extractive summarization

import openai
from langchain.chat_models import ChatOpenAI
from langchain.chains.summarize import load_summarize_chain
from langchain.text_splitter import TokenTextSplitter
from langchain.docstore.document import Document
from langchain.prompts import PromptTemplate

def gpt4_summarize_lc_ext_mr(text):
  """Extractive map_reduce summary of `text` using GPT-4 (see `_gpt_summarize_lc_ext_common`)."""
  return _gpt_summarize_lc_ext_common(text=text, model='gpt-4')


def gpt3_summarize_lc_ext_mr(text):
  """Extractive map_reduce summary of `text` using gpt-3.5-turbo (see `_gpt_summarize_lc_ext_common`)."""
  return _gpt_summarize_lc_ext_common(text=text, model='gpt-3.5-turbo')


def _gpt_summarize_lc_ext_common(text, model):
  """Pick summary sentences from `text` with a map_reduce chain and a custom prompt.

  The text is split by a TokenTextSplitter (chunk_size=3000, chunk_overlap=100)
  and each chunk is wrapped in a Document. The same "pick 3 sentences, do not
  rephrase" prompt is used for both the map step and the combine step, and the
  ChatOpenAI model runs at temperature 0.0.

  Args:
    text: Full text to summarize.
    model: Chat model name passed to ChatOpenAI, e.g. 'gpt-4'.

  Returns:
    Whatever the chain's `run` returns for the chunk documents.
  """
  chunker = TokenTextSplitter(chunk_size=3000, chunk_overlap=100)
  chunk_docs = [Document(page_content=chunk) for chunk in chunker.split_text(text)]

  # Credentials are taken from the openai module configured earlier in the notebook.
  llm = ChatOpenAI(model_name=model, temperature=0.0,
                   openai_api_key=openai.api_key,
                   openai_organization=openai.organization)

  # {text} is a PromptTemplate variable filled in by the chain, not an f-string field.
  pick_sentences_prompt = PromptTemplate(
      template="""Just pick 3 sentences from the text in backticks that cover its main ideas best.
  Don't rephrase any sentence. Output the 3 picked sentences as a single paragraph:

  ```{text}```
  """,
      input_variables=["text"])

  # Pass verbose=True to load_summarize_chain to trace the intermediate calls.
  summarize_chain = load_summarize_chain(llm, chain_type="map_reduce",
                                         map_prompt=pick_sentences_prompt,
                                         combine_prompt=pick_sentences_prompt)

  return summarize_chain.run(chunk_docs)


In [None]:
with open("/content/drive/MyDrive/Colab Notebooks/example-news-article.txt") as f:
    fulltext = f.read()

gpt4_summarize_lc_abs_mr(fulltext)

In [None]:
gpt4_summarize_lc_ext_mr(fulltext)

In [None]:
gpt3_summarize_lc_abs_mr(fulltext)

In [None]:
gpt3_summarize_lc_ext_mr(fulltext)

# HuggingFace Summarization Pipelines

In [None]:
!pip install transformers

In [None]:
import transformers

# https://huggingface.co/docs/transformers/main_classes/pipelines#transformers.SummarizationPipeline.example
#
# See https://huggingface.co/models?library=pytorch&pipeline_tag=summarization&sort=downloads&search=bart
# for the up-to-date list of models.

#hf_summarizer = transformers.pipeline('summarization', model='t5-base', tokenizer='t5-base')

#hf_summarizer = transformers.pipeline('summarization', model='facebook/bart-large-cnn')

#hf_summarizer = transformers.pipeline('summarization', model='facebook/bart-large-xsum')

# Keyword arguments for transformers.pipeline('summarization', ...) for each
# short model name accepted by hf_summarize().
_HF_SUMMARIZER_CONFIGS = {
    'bart': {'model': 'facebook/bart-large-cnn'},
    'brio': {'model': 'Yale-LILY/brio-cnndm-uncased',
             'tokenizer': 'facebook/bart-large'},
    't5': {'model': 't5-base', 'tokenizer': 't5-base'},
    'pegasus-news': {'model': 'google/pegasus-cnn_dailymail'},
    'pegasus-large': {'model': 'google/pegasus-large'},
}

# Pipelines already built in this kernel session, keyed by short model name.
# Re-running this cell clears the cache.
_HF_SUMMARIZER_CACHE = {}


def hf_summarize(input_text, model, max_length=70):
    """Summarize `input_text` with a Hugging Face summarization pipeline.

    The pipeline for each model name is built on first use and then reused, so
    calling this function in a loop does not reload the model for every input.
    Each cached pipeline stays in memory until the cache is cleared.

    Args:
        input_text: Text to summarize.
        model: One of 'bart', 'brio', 't5', 'pegasus-news', 'pegasus-large'.
        max_length: Max number of tokens in the summary. Tokens are usually a
            few characters each.

    Returns:
        The 'summary_text' of the first result returned by the pipeline.

    Raises:
        ValueError: If `model` is not one of the supported names. The check
            happens before any model is loaded.
    """
    if model not in _HF_SUMMARIZER_CONFIGS:
        raise ValueError(f'Unknown model: {model}')

    summarizer = _HF_SUMMARIZER_CACHE.get(model)
    if summarizer is None:
        summarizer = transformers.pipeline('summarization',
                                           **_HF_SUMMARIZER_CONFIGS[model])
        _HF_SUMMARIZER_CACHE[model] = summarizer

    summary = summarizer(input_text,
                         num_beams=5,
                         max_length=max_length,
                         min_length=5,
                         do_sample=False  # beam search without sampling
                        )[0]['summary_text']

    return summary

# QFSumm Summarization Routines

Bhaskar et al. use a combo of QFSumm+GPT as one of their pipelines. https://github.com/oja/aosumm

In [None]:
!git clone https://github.com/oja/aosumm

# Metrics Libraries

Evaluation metrics used by

Goyal et al. https://arxiv.org/pdf/2209.12356.pdf

and

Bhaskar et al. https://arxiv.org/pdf/2211.15914.pdf

## ROUGE

In [None]:
!pip install rouge

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [None]:
from rouge import Rouge

Rouge().get_scores('This is a summary of an article on machine learning.',
                   'This is a reference summary for a machine learning article.')

[{'rouge-1': {'r': 0.7777777777777778, 'p': 0.7, 'f': 0.7368421002770082},
  'rouge-2': {'r': 0.3333333333333333,
   'p': 0.3333333333333333,
   'f': 0.3333333283333334},
  'rouge-l': {'r': 0.6666666666666666, 'p': 0.6, 'f': 0.6315789423822715}}]

## Reference-free Evaluation Metrics

SUPERT https://github.com/yg211/acl20-ref-free-eval

In [None]:
!git clone https://github.com/yg211/acl20-ref-free-eval

In [None]:
%cd acl20-ref-free-eval

In [None]:
!pip install pytorch-transformers

In [None]:
from ref_free_metrics.supert import Supert
from utils.data_reader import CorpusReader

In [None]:
%cd /content

## BERTScore

https://github.com/Tiiiger/bert_score

In [None]:
!pip install bert-score

In [None]:
# Alternate impl available via Torch https://torchmetrics.readthedocs.io/en/stable/text/bert_score.html
!pip install torchmetrics transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torchmetrics
  Downloading torchmetrics-0.11.4-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.2/519.2 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers
  Downloading transformers-4.29.1-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m70.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m22.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m

In [None]:
# From https://torchmetrics.readthedocs.io/en/stable/text/bert_score.html

from torchmetrics.text.bert import BERTScore

test_summaries = ['This is a summary of an article on machine learning.']
test_references = ['This is a reference summary for a machine learning article.']

bert_scores = BERTScore('roberta-large')(test_summaries,
                          test_references)

bert_scores

Downloading (…)lve/main/config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.43G [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


{'precision': 0.9888111352920532,
 'recall': 0.9884637594223022,
 'f1': 0.9886374473571777}

## MoverScore

https://github.com/AIPHES/emnlp19-moverscore

In [None]:
!pip install moverscore

In [None]:
!pip install pyemd

In [None]:
from moverscore_v2 import get_idf_dict, word_mover_score
from collections import defaultdict

test_summaries = ['This is a summary of an article on machine learning.']
test_references = ['This is a reference summary for a machine learning article.']

idf_dict_hyp = get_idf_dict(test_summaries) # idf_dict_hyp = defaultdict(lambda: 1.)
idf_dict_ref = get_idf_dict(test_references) # idf_dict_ref = defaultdict(lambda: 1.)

scores = word_mover_score(test_references, test_summaries, idf_dict_ref, idf_dict_hyp, \
                          stop_words=[], n_gram=1, remove_subwords=True, device='cpu')

scores

## QAEval

https://github.com/danieldeutsch/qaeval

Setup and usage: https://github.com/danieldeutsch/sacrerouge/blob/master/doc/metrics/qaeval.md

DROPPING due to dependency version conflicts

In [None]:

!pip install sacrerouge qaeval

## Entailment-based Qualitative Metrics

From Bhaskar et al.

https://github.com/testzer0/ZS-Summ-GPT3

In [None]:
%cd /content

In [None]:
!git clone https://github.com/testzer0/ZS-Summ-GPT3

# Use Case #1: News Summarization

## Download News Datasets

In [None]:
!pip install datasets

### 1. CNN dataset

https://huggingface.co/datasets/cnn_dailymail

In [None]:
from datasets import load_dataset

cnn_dataset = load_dataset("cnn_dailymail", '3.0.0')

In [None]:
cnn_dataset

In [None]:
len(cnn_dataset['validation'])

### 2. XLSum dataset

https://github.com/csebuetnlp/xl-sum

In [None]:
!mkdir -p /content/data/xlsum

In [None]:
!cp '/content/drive/MyDrive/Colab Notebooks/xlsum-dataset/english_XLSum_v2.0.tar.bz2' /content/data/xlsum/

In [None]:
%cd /content/data/xlsum/

In [None]:
!tar -xf english_XLSum_v2.0.tar.bz2

In [None]:
!ls

In [None]:
!head -n1 english_val.jsonl

## Load Articles

In [None]:
example1 = cnn_dataset['validation'][0]

In [None]:
import textwrap

article1 = example1['article']
print(textwrap.fill(article1, width=70))

In [None]:
ref_summary1 = example1['highlights']
print(textwrap.fill(ref_summary1, width=70))

In [None]:
example2 = cnn_dataset['validation'][29]

In [None]:
import textwrap

article2 = example2['article']
print(textwrap.fill(article2, width=70))

In [None]:
ref_summary2 = example2['highlights']
print(textwrap.fill(ref_summary2, width=70))

In [None]:
# Indices into the CNN/DailyMail validation split of the examples used below.
sel_articles = [0, 29, 319, 495, 589]

validation_split = cnn_dataset['validation']

# Parallel lists: news_articles[k] is the article text and ref_summaries[k]
# its reference 'highlights' summary.
news_articles, ref_summaries = [], []

for i in sel_articles:
  entry = validation_split[i]
  news_articles.append(entry['article'])
  ref_summaries.append(entry['highlights'])

In [None]:
import tiktoken

tokenizer = tiktoken.encoding_for_model('gpt-4')
for a in news_articles:
  tokens = tokenizer.encode(a)
  print(len(tokens))

## Search Articles

In [None]:
import textwrap

# Print the id and wrapped text of validation articles that contain the word
# ' wine ' (case-insensitive, space-delimited), stopping after the fourth match.
i = 0  # number of matching articles printed so far
for entry in cnn_dataset['validation']:
  text = entry['article']
  if ' wine ' not in text.lower():
    continue

  print(entry['id'])
  print(textwrap.fill(text, width=80))

  i += 1
  if i >= 4:
    break


## GPT Summaries using Simple Prompts

Using the same prompt as Goyal et al:

> Article: {{article}}
>
> Summarize the above article in N sentences.

In [None]:
# One summary per article from each model, in the same order as news_articles.
gpt4_summaries, gpt3_summaries = [], []

for article in news_articles:
  # The article is embedded in the prompt itself, so an empty `text` is passed
  # to the summarize helpers below.
  summarize_news_simple_prompt = f"""
  Article: {article}

  Summarize the above article in 3 sentences.
  """

  # Each helper returns (summary, raw response); only the summary is kept.
  gpt4_summary_simple = gpt4_summarize_direct(
      '', summarize_news_simple_prompt, temperature=0.75)[0]
  gpt4_summaries.append(gpt4_summary_simple)

  gpt3_summary_simple = gpt3_summarize_direct(
      '', summarize_news_simple_prompt, temperature=0.75)[0]
  gpt3_summaries.append(gpt3_summary_simple)

In [None]:
import pickle

with open('simple-summaries.pkl', 'wb') as savefile:
  pickle.dump( (gpt4_summaries, gpt3_summaries), savefile )

In [None]:
!ls -lah

In [None]:
!cp simple-summaries.pkl '/content/drive/MyDrive/Colab Notebooks/gp4-summ-saved-data/'

In [None]:
!cp '/content/drive/MyDrive/Colab Notebooks/gp4-summ-saved-data/simple-summaries.pkl' ./

In [None]:
import pickle

with open('simple-summaries.pkl', 'rb') as saved_file:
  gpt4_summaries, gpt3_summaries = pickle.load(saved_file)

In [None]:
import textwrap

for s in gpt4_summaries:
  print(textwrap.fill(s,width=70))
  print('\n\n')

for s in gpt3_summaries:
  print(textwrap.fill(s,width=70))
  print('\n\n')

## GPT Summaries using Complex Prompts

In [None]:
system_instruction = """You are an assistant that summarizes a news article to a
simple summary at 8th grade reading level.
"""

summarize_news_prompt = f"""
  Summarize this news article in at most 3 sentences, each sentence not exceeding 15 words,
  keep only the most important details, rephrase all other details:
  """

In [None]:
# One summary per article from each model, in the same order as news_articles.
gpt4_summaries_complex, gpt3_summaries_complex = [], []

for article in news_articles:
  # Each helper returns (summary, raw response); only the summary is kept.
  gpt4_summary_complex = gpt4_summarize_direct(
      article, summarize_news_prompt,
      system_msg=system_instruction, temperature=0.75)[0]
  gpt4_summaries_complex.append(gpt4_summary_complex)

  gpt3_summary_complex = gpt3_summarize_direct(
      article, summarize_news_prompt,
      system_msg=system_instruction, temperature=0.75)[0]
  gpt3_summaries_complex.append(gpt3_summary_complex)


In [None]:
import pickle

with open('complex-summaries.pkl', 'wb') as savefile:
  pickle.dump( (gpt4_summaries_complex, gpt3_summaries_complex), savefile )

In [None]:
!cp complex-summaries.pkl '/content/drive/MyDrive/Colab Notebooks/gp4-summ-saved-data/'

In [None]:
!cp '/content/drive/MyDrive/Colab Notebooks/gp4-summ-saved-data/complex-summaries.pkl' ./

In [None]:
import pickle

with open('complex-summaries.pkl', 'rb') as saved_file:
  gpt4_summaries_complex, gpt3_summaries_complex = pickle.load(saved_file)

In [None]:
import textwrap

for s in gpt4_summaries_complex:
  print(textwrap.fill(s,width=70))
  print('\n\n')

for s in gpt3_summaries_complex:
  print(textwrap.fill(s,width=70))
  print('\n\n')

In [None]:
import textwrap

# NOTE(review): `gpt4_summary_example` is not assigned in any cell visible
# above. Confirm it is defined before this cell runs; otherwise this raises
# NameError on a fresh kernel.
print(textwrap.fill(gpt4_summary_example,width=70))

In [None]:
[i*0.01 for i in range(0, 101, 25)]

In [None]:
import textwrap

# (temperature, GPT-4 summary, GPT-3.5 summary) for each temperature tried.
news_results = []

# Bind the article explicitly instead of relying on whatever value a previous
# `for article in news_articles` loop happened to leave behind. The last
# selected article is the value such a loop leaves in `article`.
article = news_articles[-1]

# Sweep the temperature from 0 to 1 in steps of 0.25.
for temperature in [i*0.01 for i in range(0, 101, 25)]:
  print(f'\n\nTemperature:{temperature}')

  gpt4_summary, _ = gpt4_summarize_direct(article, summarize_news_prompt,
                                          system_msg=system_instruction,
                                          temperature=temperature)
  print('\n\nGPT4:')
  print(textwrap.fill(gpt4_summary,width=70))

  gpt3_summary, _ = gpt3_summarize_direct(article, summarize_news_prompt,
                                          system_msg=system_instruction,
                                          temperature=temperature)
  print('\n\nGPT3:')
  print(textwrap.fill(gpt3_summary,width=70))

  news_results.append( (temperature, gpt4_summary, gpt3_summary) )


## GPT ROUGE Metrics

In [None]:
from pprint import pprint
import statistics

from rouge import Rouge

# Mean ROUGE F-measures of the simple-prompt GPT-4 summaries.
R = Rouge()
rouge1_f, rouge2_f, rougel_f = [], [], []
f_by_variant = {'rouge-1': rouge1_f, 'rouge-2': rouge2_f, 'rouge-l': rougel_f}

# The generated summary is passed first and the reference second; get_scores
# returns a list, and [0] takes the scores of the single pair.
for s, r in zip(gpt4_summaries, ref_summaries):
  scores = R.get_scores(s, r)[0]
  for variant, f_values in f_by_variant.items():
    f_values.append(scores[variant]['f'])

print('Mean rouge-1 f:', statistics.mean(rouge1_f))
print('Mean rouge-2 f:', statistics.mean(rouge2_f))
print('Mean rouge-l f:', statistics.mean(rougel_f))


In [None]:
from pprint import pprint
import statistics

from rouge import Rouge

# Mean ROUGE F-measures of the complex-prompt GPT-4 summaries.
R = Rouge()
rouge1_f, rouge2_f, rougel_f = [], [], []
f_by_variant = {'rouge-1': rouge1_f, 'rouge-2': rouge2_f, 'rouge-l': rougel_f}

# The generated summary is passed first and the reference second; get_scores
# returns a list, and [0] takes the scores of the single pair.
for s, r in zip(gpt4_summaries_complex, ref_summaries):
  scores = R.get_scores(s, r)[0]
  for variant, f_values in f_by_variant.items():
    f_values.append(scores[variant]['f'])

print('Mean rouge-1 f:', statistics.mean(rouge1_f))
print('Mean rouge-2 f:', statistics.mean(rouge2_f))
print('Mean rouge-l f:', statistics.mean(rougel_f))


In [None]:
from pprint import pprint
import statistics

from rouge import Rouge

# Mean ROUGE F-measures of the simple-prompt GPT-3.5 summaries.
R = Rouge()
rouge1_f, rouge2_f, rougel_f = [], [], []
f_by_variant = {'rouge-1': rouge1_f, 'rouge-2': rouge2_f, 'rouge-l': rougel_f}

# The generated summary is passed first and the reference second; get_scores
# returns a list, and [0] takes the scores of the single pair.
for s, r in zip(gpt3_summaries, ref_summaries):
  scores = R.get_scores(s, r)[0]
  for variant, f_values in f_by_variant.items():
    f_values.append(scores[variant]['f'])

print('Mean rouge-1 f:', statistics.mean(rouge1_f))
print('Mean rouge-2 f:', statistics.mean(rouge2_f))
print('Mean rouge-l f:', statistics.mean(rougel_f))


In [None]:
from pprint import pprint
import statistics

from rouge import Rouge

# Mean ROUGE F-measures of the complex-prompt GPT-3.5 summaries.
R = Rouge()
rouge1_f, rouge2_f, rougel_f = [], [], []
f_by_variant = {'rouge-1': rouge1_f, 'rouge-2': rouge2_f, 'rouge-l': rougel_f}

# The generated summary is passed first and the reference second; get_scores
# returns a list, and [0] takes the scores of the single pair.
for s, r in zip(gpt3_summaries_complex, ref_summaries):
  scores = R.get_scores(s, r)[0]
  for variant, f_values in f_by_variant.items():
    f_values.append(scores[variant]['f'])

print('Mean rouge-1 f:', statistics.mean(rouge1_f))
print('Mean rouge-2 f:', statistics.mean(rouge2_f))
print('Mean rouge-l f:', statistics.mean(rougel_f))


## BRIO summaries

In [None]:


# BRIO summary of every selected article, in the same order as news_articles.
brio_news_summaries = []

for article in news_articles:
  brio_summary = hf_summarize(article, model='brio')
  brio_news_summaries += [brio_summary]


In [None]:
import pickle

with open('brio-summaries.pkl', 'wb') as savefile:
  pickle.dump( brio_news_summaries, savefile )

In [None]:
!cp brio-summaries.pkl '/content/drive/MyDrive/Colab Notebooks/gp4-summ-saved-data/'

In [None]:
!cp '/content/drive/MyDrive/Colab Notebooks/gp4-summ-saved-data/brio-summaries.pkl' ./

In [None]:
import pickle

with open('brio-summaries.pkl', 'rb') as saved_file:
  brio_news_summaries = pickle.load(saved_file)

In [None]:
import textwrap

for s in brio_news_summaries:
  print(textwrap.fill(s, width=70))

In [None]:
from pprint import pprint
import statistics

from rouge import Rouge

# Mean ROUGE F-measures of the BRIO summaries.
R = Rouge()
rouge1_f, rouge2_f, rougel_f = [], [], []
f_by_variant = {'rouge-1': rouge1_f, 'rouge-2': rouge2_f, 'rouge-l': rougel_f}

# The generated summary is passed first and the reference second; get_scores
# returns a list, and [0] takes the scores of the single pair.
for s, r in zip(brio_news_summaries, ref_summaries):
  scores = R.get_scores(s, r)[0]
  for variant, f_values in f_by_variant.items():
    f_values.append(scores[variant]['f'])

print('Mean rouge-1 f:', statistics.mean(rouge1_f))
print('Mean rouge-2 f:', statistics.mean(rouge2_f))
print('Mean rouge-l f:', statistics.mean(rougel_f))

## BERTScore Metrics

In [None]:
from torchmetrics.text.bert import BERTScore

scorer = BERTScore('roberta-large')

bert_scores = scorer(gpt4_summaries, ref_summaries)

bert_scores

In [None]:
import statistics

statistics.mean(bert_scores['f1'])

In [None]:
import statistics

bert_scores = scorer(gpt3_summaries, ref_summaries)

statistics.mean(bert_scores['f1'])

In [None]:
import statistics

bert_scores = scorer(gpt4_summaries_complex, ref_summaries)

statistics.mean(bert_scores['f1'])

In [None]:
import statistics

bert_scores = scorer(gpt3_summaries_complex, ref_summaries)

statistics.mean(bert_scores['f1'])

In [None]:
import statistics

bert_scores = scorer(brio_news_summaries, ref_summaries)

statistics.mean(bert_scores['f1'])



---

# Use Case #2: Long-form Post Summarization

## Set up WikiHow Dataset

**WikiHow dataset**

https://github.com/mahnazkoupaee/WikiHow-Dataset

This is a CSV file with 3 fields:

+ Title: the title of the article as it appears on the WikiHow knowledge base

+ Headline: the concatenation of all the bold lines (the summary sentences) of all the paragraphs to serve as the reference summary

+ Text: the concatenation of all paragraphs (except the bold lines) to generate the article to be summarized

Note that not all entries have summaries or even text

In [None]:
%cd /content

/content


In [None]:
!ls -lah

In [None]:
!curl 'https://public.boxcloud.com/d/1/b1!mFymrCPjULN50ZpfDmFcHlrzQfKFYMsRPXF76jOTsoDX5qnnNaHFjqGEZq5KCAZQ4BmvFXAWK1xmSqbM6Ut7OA8ngK9BID8KD1wljxn2HAydZyEmd8SccShcsek4J5Qfd_TXCDPpKOqKbNbHe0S_60Wbqbkq4pONecsU-p0LL7qEb9xkbL7lsxhQhdN8NKyxvZA5FZpmMYsFrFjPc1PRyvA7EZQ4yQDl4nhras6Pcv8dPrCiJ06lnhU2jjzzfwSeOGtF2xfiPwnmkH1Dq-bEFpwuHUUYn6L9XUbcDSZz1pL-4VOdjV1i5J4RCMbXcKXQxD0fzRB97ZRxBN32x-kM5fVhfGWu0t5etw9GSfwKPbrRgUyHyMhhTQBTPcyPmAdZOJ0tAkwWpxGcgo8lkHTXkuNrbsJOfyeBa9dI96zb0e4EtvpZp6S8KcvjxpYCNOqPGPEORQgGUAQz5GBtVP2uppcxjJPX5eSnwtBdqEI0BArJip398L8DDMMVUXO5VY719oNK3_D3fz2VUwhzG05UBkbpCYQlUU32y__w8DJaVQ4U7EFz069ufylbIrBMwS6RoOyA6NNKq6e_UFZ0MlO1ryX3jOYbXrTa-FjnFEpmMCHaynKXIk9cdanMdocBbwOY0JQw502FqMeStivYdAI6JcwnneBqiahXFY_CW9TZKRpcXdFzoDopwVPKczqkiQG142pHiI3dINyAC3UFJ24dwzlz1cDGYpkLjJ-DB3v1mnEBgccxq68honFTqSJmpFfQreNsjv8TJf3fvajvSrak_rOoLu8yDDFhQ21XxaQcjLCmndBi04pDxp7s322ibUmz8lisy83FRFhZz6yxmXEZstROBx_bELD1BAtDsGRcwtNGR-QdqXQDDLY4UYp7-wLOExePOzqS0FcFsxxVOIuZ0QltYILbVH7t03FYUhvl1TXPVqv9h6UiNcnEvuSeUVew3r_JXuczgdAdjGLFQ7-2absiQjEeRv0J1T6BI9PcLLuKlhsuFxAx1zaqeKXpIzVWqcv1YOyWlIVhpBF3Pefy0vPbibz1L4U02NIeuuSzPm5iEHbR0oqqeMx05SEQ8yUCVqQ0cmX7mACzOt1WnTLO-Te-nxJA25cpJ-IK9gMlpU9JxvZzYXyC82WUA8IbEDVK5rN67lIVxETs8RXu6-8UVHEMUgrwsbNYR2crOpq1vKpKrm_iu5BaZgZgbTV02WxoG3HfCCeisN5W87yfmSz3uc7eDgusiMI0xoi0TiDN99yVBA0gy_dIlADS/download' \
-H 'User-Agent: Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/112.0' -H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8' -H 'Accept-Language: en-US,en;q=0.5' -H 'Accept-Encoding: gzip, deflate, br' -H 'Alt-Used: public.boxcloud.com' -H 'Connection: keep-alive' -H 'Referer: https://ucsb.app.box.com/' -H 'Upgrade-Insecure-Requests: 1' -H 'Sec-Fetch-Dest: iframe' -H 'Sec-Fetch-Mode: navigate' -H 'Sec-Fetch-Site: cross-site' \
--output wikihowAll.csv.gz

In [None]:
!ls -lah

In [None]:
!cp wikihowAll.csv.gz '/content/drive/MyDrive/Colab Notebooks/wikihow-summ-dataset/'

In [None]:
!cp '/content/drive/MyDrive/Colab Notebooks/wikihow-summ-dataset/wikihowAll.csv.gz' ./

In [None]:
!gunzip -k wikihowAll.csv.gz

In [None]:
!git clone https://github.com/mahnazkoupaee/WikiHow-Dataset

In [None]:
%cd WikiHow-Dataset

In [None]:
!ln -s /content/wikihowAll.csv wikihowAll.csv

In [None]:
!ls -lah

In [None]:
!head -n20 wikihowAll.csv

In [None]:
# Find long posts in this dataset.
# Processing code based on https://github.com/mahnazkoupaee/WikiHow-Dataset/blob/master/process.py
# but without using Pandas.

import csv
import tiktoken

# Scan the WikiHow CSV and collect the posts whose text is longer than 8000
# GPT-4 tokens into `long_posts` as (row index, token count, title, summary, text).
with open('wikihowAll.csv', 'r', newline='', encoding='utf-8') as csvfile:

    csv_reader = csv.reader(csvfile)

    # Skip header. The None default keeps an empty file from raising StopIteration.
    next(csv_reader, None)

    tokenizer = tiktoken.encoding_for_model('gpt-4')

    long_posts = []

    # Tracked separately from the 0-based row index `i`: reporting `i` after the
    # loop undercounts by one and is undefined when the file has no data rows.
    num_entries = 0

    for i, row in enumerate(csv_reader):
      num_entries = i + 1

      # Columns are read by position; a missing column becomes ''.
      summary = row[0] if len(row) > 0 else ''
      title = row[1] if len(row) > 1 else ''
      text = row[2] if len(row) > 2 else ''

      # Cheap pre-filter: only texts longer than 25,000 characters are tokenized.
      if len(text) > 25000:
        tokens = tokenizer.encode(text)
        if len(tokens) > 8000:
          print(i, len(tokens), len(text), title)
          long_posts.append( (i, len(tokens), title, summary, text) )

    print('Total number of entries in CSV:', num_entries)


23665 11868 54475 How to Hike the John Muir Trail
82748 10699 33638 How to Solve Differential Equations
95398 8078 28299 How to Program Excel to Show Spheroids Visiting Their Home Planet
95489 8148 28448 How to Create the Idea of an Idea Image
98790 8161 28602 How to Create a Dakini and Boddhisattva Aspect of the Mother Planet
100039 15479 59544 How to Create an Overall Status Workbook in XL and VBA for Your wikiHow Articles
139904 14611 74231 How to Understand Your Website Traffic Variation with Time
147497 8572 42984 How to Understand Your Website Audience Profile
153631 13168 58892 How to Feed Cattle
Total number of entries in CSV: 215364


Out of 215,364 entries, only 9 exceed 8,000 tokens, the threshold used in the cell above (the three shortest of them fall just under 8,192 tokens).

Only a few of these 9 seem suitable for general-purpose summarization and evaluation:

\#23665 11868 54475 How to Hike the John Muir Trail

\#98790 8161 28602 How to Create a Dakini and Boddhisattva Aspect of the Mother Planet

\#147497 8572 42984 How to Understand Your Website Audience Profile

\#153631 13168 58892 How to Feed Cattle



In [None]:
sel_long_posts = [long_posts[i] for i in [0, 4, 7, 8]]

In [None]:
[s[0] for s in sel_long_posts]

In [None]:
# Print example [post]
print(sel_long_posts[0][4])

 The JMT is a high elevation (for the US at least) mountain trail through remote terrain.


The JMT ranges in elevation from about 4000 feet above sea level, to 14,505 ft at the summit of Mt. Whitney. All of the southern half is above 8000 ft.
The trail passes through 3 national parks, 5 wilderness areas, 2 national forests, and 1 national monument. There's a very helpful "tour" of each with photos here.
There are no huts or shelters along the trail, so you will be responsible for your own campsite and protection from the elements at all times.
There are 10 passes (the high points between valleys) over 10,000 ft, and you will usually be hiking several thousand feet up and then down in a single day.
The JMT grows gradually higher and the passes more demanding as you hike from north to south. In fact, if heading out from Muir Trail Ranch (the halfway resupply stop) with 100 miles (160 km) of food on your back and the most difficult terrain still ahead, it almost feels like the first 100 

In [None]:
# Print example summary
print(sel_long_posts[0][3])


Understand the beauty and challenge of the JMT.,
Research the trail.,
Decide when to hike.,
Decide which direction to hike.,
Decide how fast to hike.,
Decide who to hike with.,
Plan your resupplies.,
Plan a rough itinerary.,
Plan your transportation.,
Apply for and get your permit.,
Carefully choose the "big 3": backpack, shelter, and sleeping system.,
Choose your clothing.,
Plan your hydration system.,
Choose the rest of your gear.,
Remember safety essentials.,
Choose a couple of luxury items that will really enhance your trip.,
Consider and plan for your personal needs in the wilderness.,
Weigh your gear ahead of time and make sure it fits comfortably in your pack.,

Choose a bear canister.,
Plan your resupply details.,
Identify your meal and snack counts.,
Plan your menu.,

Make and/or buy your meals.,
Mail your resupplies and, if possible, confirm that they arrived.,
Do a test pack with your bear canister to make sure it all fits.,
Decide to train.,
Be generally active.,
Take prog

## GPT Abstractive Summarization with Simple Chunking

Uses simple overlapping chunking implementation from langchain.

In [None]:
!ln -s '/content/drive/MyDrive/Colab Notebooks/wikihow-summ-dataset' wikihow-summ

In [None]:
import os

# Abstractive map-reduce summarizers to run on every selected post, paired with
# the tag that is embedded in the output file name. Order matters: GPT-4 runs
# first, then GPT-3, exactly one API-backed call each per post.
abs_summarizers = (
    ('gpt4-abs', gpt4_summarize_lc_abs_mr),
    ('gpt3-abs', gpt3_summarize_lc_abs_mr),
)

for i, token_len, title, summary, text in sel_long_posts:
  # Spaces in the article title are replaced so it can be used as a file stem.
  title_comp = title.replace(' ', '-')

  for tag, summarize in abs_summarizers:
    # Generate first, then open the file, so a failed call never leaves an
    # empty/truncated summary file behind.
    generated = summarize(text)
    with open(f'wikihow-summ/{title_comp}.{tag}.txt', 'w') as out_file:
      out_file.write(generated)



## GPT Extractive Summarization with Simple Chunking

Uses simple overlapping chunking implementation from langchain.

In [None]:
import os

# Extractive map-reduce summarizers to run on every selected post, paired with
# the tag that is embedded in the output file name. Order matters: GPT-4 runs
# first, then GPT-3.
ext_summarizers = (
    ('gpt4-ext', gpt4_summarize_lc_ext_mr),
    ('gpt3-ext', gpt3_summarize_lc_ext_mr),
)

for i, token_len, title, summary, text in sel_long_posts:
  # Spaces in the article title are replaced so it can be used as a file stem.
  title_comp = title.replace(' ', '-')

  for tag, summarize in ext_summarizers:
    # Generate first, then open the file, so a failed call never leaves an
    # empty/truncated summary file behind.
    generated = summarize(text)
    with open(f'wikihow-summ/{title_comp}.{tag}.txt', 'w') as out_file:
      out_file.write(generated)



## WikiHow ROUGE Metrics

In [None]:
!cp wikihow-summ/*.txt '/content/drive/MyDrive/Colab Notebooks/wikihow-summ-dataset/'

In [None]:
!pip install rouge

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [None]:
from rouge import Rouge

Rouge().get_scores('this is a summary', 'this is a reference summary')

[{'rouge-1': {'r': 0.8, 'p': 1.0, 'f': 0.8888888839506174},
  'rouge-2': {'r': 0.5, 'p': 0.6666666666666666, 'f': 0.5714285665306124},
  'rouge-l': {'r': 0.8, 'p': 1.0, 'f': 0.8888888839506174}}]

In [None]:
from rouge import Rouge
import statistics

R = Rouge()

# ROUGE variants tracked for every run; each key maps to 'rouge-<key>' in the
# dict returned by Rouge.get_scores().
rouge_variants = ('1', '2', 'l')

# One accumulator per (model, summarization style); each holds a list of
# per-post F-scores for every ROUGE variant.
gpt4_abs_scores = {variant: [] for variant in rouge_variants}
gpt3_abs_scores = {variant: [] for variant in rouge_variants}
gpt4_ext_scores = {variant: [] for variant in rouge_variants}
gpt3_ext_scores = {variant: [] for variant in rouge_variants}

# (label used in the report, tag used in the summary file name, accumulator)
# listed in the order in which files are read and results are printed.
runs = (
    ('GPT4 Abs', 'gpt4-abs', gpt4_abs_scores),
    ('GPT3 Abs', 'gpt3-abs', gpt3_abs_scores),
    ('GPT4 Ext', 'gpt4-ext', gpt4_ext_scores),
    ('GPT3 Ext', 'gpt3-ext', gpt3_ext_scores),
)

for i, token_len, title, summary, text in sel_long_posts:
  title_comp = title.replace(' ', '-')

  for _label, file_tag, score_lists in runs:
    # Read back the summary generated earlier and score it against the
    # reference summary of the same post.
    with open(f'wikihow-summ/{title_comp}.{file_tag}.txt', 'r') as f:
      generated = f.read()
      scores = R.get_scores(generated, summary)[0]

    for variant in rouge_variants:
      score_lists[variant].append(scores[f'rouge-{variant}']['f'])

# Report the mean F-score of every ROUGE variant for every run.
for label, _file_tag, score_lists in runs:
  for variant in rouge_variants:
    print(f'{label} Mean rouge-{variant} f:', statistics.mean(score_lists[variant]))


GPT4 Abs Mean rouge-1 f: 0.14388908870860545
GPT4 Abs Mean rouge-2 f: 0.015779481242014805
GPT4 Abs Mean rouge-l f: 0.13451371756745656
GPT3 Abs Mean rouge-1 f: 0.17302190780729163
GPT3 Abs Mean rouge-2 f: 0.025540420004049436
GPT3 Abs Mean rouge-l f: 0.15577159152703804
GPT4 Ext Mean rouge-1 f: 0.1868099677950608
GPT4 Ext Mean rouge-2 f: 0.03667096883325839
GPT4 Ext Mean rouge-l f: 0.16843717796265487
GPT3 Ext Mean rouge-1 f: 0.17289398658150873
GPT3 Ext Mean rouge-2 f: 0.024418690008306217
GPT3 Ext Mean rouge-l f: 0.15923032795341682


## BertSum Extractive Summarization with Chunking

SOTA model : https://paperswithcode.com/paper/abstractive-summarization-of-spoken

https://github.com/alebryvas/berk266

Pretrained version of the model for extractive summarization on CNN dataset: https://w266.blob.core.windows.net/pretraining/model_step_50000.pt

In [None]:
# https://github.com/alebryvas/berk266/blob/master/code/bertsumabs/requirements.txt
!pip install pytorch_transformers tensorboardX pyrouge

In [None]:
%cd /content

!git clone https://github.com/alebryvas/berk266

In [None]:
%cd berk266

In [None]:
!mv '/content/berk266/wikihow all' '/content/berk266/wikihow_all'

In [None]:
!ls /content/berk266

In [None]:
!ls /content/berk266/wikihow_all

In [None]:
!wget https://w266.blob.core.windows.net/pretraining/model_step_50000.pt

In [None]:
!ls /content/berk266/

In [None]:
%cd code/bertsumabs/src/

In [None]:
!ls

In [None]:
!python train.py -h

In [None]:
# This script is not very usable. BERT_DATA_PATH  is supposed to be the dataset directory
# but it must be in some Pytorch .pt binary format.
#
# -bert_data_path BERT_DATA_PATH
#  -sep_optim true -use_interval true
  #-test_from /content/berk266/wikihow_all/bert.pt_files_wikihow.train.90.bert.pt \

!python train.py -task ext -mode test \
  -test_from /content/berk266/model_step_50000.pt \
  -visible_gpus -1 -max_pos 512 -max_length 200 -alpha 0.95 -min_length 50 \
  -result_path ../results -report_rouge True



Trying direct Pytorch logic based on https://github.com/alebryvas/berk266/blob/28c2015de7a42290b99aec0dab836472b352beb5/code/bertsumabs/src/train_extractive.py#L176


Forwarding data through this model requires some special inputs like "segs", "mask_src", etc. It looks like these are produced by BertSumAbs' data preprocessor.

In [None]:
import torch



In [None]:

#checkpoint = torch.load('/content/berk266/wikihow_all/bert.pt_files_wikihow.train.90.bert.pt')

checkpoint = torch.load('/content/berk266/model_step_50000.pt', map_location=torch.device('cpu'))


In [None]:
type(checkpoint)

In [None]:
checkpoint.keys()

In [None]:
# Args expected by the ExtSummarizer code in https://github.com/alebryvas/berk266/blob/28c2015de7a42290b99aec0dab836472b352beb5/code/bertsumabs/src/models/model_builder.py#L135

import argparse

# Stand-in for the parsed command-line options that ExtSummarizer reads;
# built in one call instead of attribute-by-attribute.
args = argparse.Namespace(
    large=False,
    temp_dir='../temp',
    encoder='bert',
    finetune_bert=True,
    max_pos=512,
    ext_ff_size=2048,
    ext_heads=8,
    ext_dropout=0.2,
    ext_layers=2,
)


In [None]:
from models.model_builder import ExtSummarizer

model = ExtSummarizer(args, 'cpu', checkpoint)

In [None]:
%cd /content

## Metrics

&nbsp;
&nbsp;
&nbsp;
&nbsp;
&nbsp;
# Use Case #3: Opinions / Reviews Summarization

Reproduce Zero-Shot Opinion Summarization with GPT-3, Bhaskar et al., https://arxiv.org/pdf/2211.15914.pdf  but with GPT-4 in the mix.

## Set up SPACE reviews dataset

SPACE = Summaries of Popular and Aspect-specific Customer Experiences

https://github.com/stangelid/qt/ => Its /data contains the gold reference summaries on different aspects.

https://drive.google.com/u/0/uc?id=1C6SaRQkas2B-9MolbwZbl0fuLgqdSKDT&export=download => Raw review data

&nbsp;

space_summ.json => Contains the golden summaries for 50 hotels.
+ It's a list of 50 dicts, one for each hotel. Each has 4 keys - 'entity_id', 'entity_name', 'reviews', 'summaries'.

+ `reviews` : A list of 100 reviews for the hotel. Each review is a dict with a `review_id`, a `sentences` list, and a `rating` integer.

+ `summaries` : A dict with the 7 aspects as keys and a list of golden summaries for each aspect. Each golden summary is a paragraph with one or more sentences.



In [None]:
%cd /content

/content


In [None]:
!cp '/content/drive/MyDrive/Colab Notebooks/SPACE-hotel-reviews-dataset/space.tar.gz' ./

In [None]:
!tar -xvf space.tar.gz

space_train.json
space_summ.json
space_summ_splits.txt


In [None]:
!ls -lah space*

-rw-r--r-- 1 1000 1000 5.1M Dec  9  2020 space_summ.json
-rw-r--r-- 1 1000 1000  588 Dec  9  2020 space_summ_splits.txt
-r-------- 1 root root 405M May 15 15:36 space.tar.gz
-rw------- 1 1000 1000 1.2G Dec  9  2020 space_train.json


In [None]:
!head -n512 space_summ.json

In [None]:
import json

with open('space_summ.json', 'r') as f:
  space = json.load(f)



In [None]:
type(space), type(space[0])

In [None]:
len(space)

50

In [None]:
for hotel in space:
  print(hotel['entity_id'])
  print(hotel['entity_name'])
  print(len(hotel['reviews']))
  print()

In [None]:
# These are the selected hotel entity_ids in the paper's code.
sel_hotels = ['100597', '112429', '120274', '121241', '182002',
              '183092', '185804', '1029276', '1113787', '1176198']

In [None]:
space[0].keys()

In [None]:
space[0]['entity_id']

'100597'

In [None]:
type(space[0]['reviews'])

list

In [None]:
space[0]['reviews'][0]

{'review_id': 'UR59977476',
 'sentences': ['We stayed here on a lay over home from Cancun.',
  'It was great to have a comfortable bed and room on our final night of holidays.',
  'The kids loved the pool which was warmer than the ones at the resort in Cancun which we could not believe as we were in Seattle!',
  'The staff was friendly and we appreciated the cookies after a long flight when we were waiting to check inn.',
  'Just a nice touch!',
  'Shuttle was convenient and would definitely stay here again.'],
 'rating': 5}

In [None]:
[ print(s) for s in space[0]['reviews'][0]['sentences'] ]

We stayed here on a lay over home from Cancun.
It was great to have a comfortable bed and room on our final night of holidays.
The kids loved the pool which was warmer than the ones at the resort in Cancun which we could not believe as we were in Seattle!
The staff was friendly and we appreciated the cookies after a long flight when we were waiting to check inn.
Just a nice touch!
Shuttle was convenient and would definitely stay here again.


[None, None, None, None, None, None]

In [None]:
space[0]['summaries']['building'], len(space[0]['summaries']['building'])

(['The historical hotel lobby were very attractive. The balcony had a great view of trees . The spa and heated pool is a kid-friendly area and also has wi-fi. There is even a laundry room available to the guests.',
  'Hotel with very nice lobby and relaxing spa/pool area with lounge and free wifi. The pool is big and kid-friendly. There is also a beautiful view of the trees from the balcony.',
  'Warm, beautiful, large pool for the family. Old fashioned interior but pleasant rooms, great balcony, and the view outside to the trees was relaxing.'],
 3)

In [None]:
!cat space_summ_splits.txt

In [None]:
!head -c 340960 space_train.json

In [None]:
%cd /content

In [None]:
!git clone https://github.com/stangelid/qt/  space_gold

In [None]:
!ls -lah

total 20K
drwxr-xr-x 1 root root 4.0K May 11 16:33 .
drwxr-xr-x 1 root root 4.0K May 11 16:14 ..
drwxr-xr-x 4 root root 4.0K May  9 13:34 .config
drwx------ 5 root root 4.0K May 11 16:15 drive
lrwxrwxrwx 1 root root   51 May 11 16:33 opinion-summ -> '/content/drive/MyDrive/Colab Notebooks/opinion-summ'
drwxr-xr-x 1 root root 4.0K May  9 13:35 sample_data


In [None]:
!ln -s '/content/drive/MyDrive/Colab Notebooks/opinion-summ' opinion-summ

In [None]:
!ls opinion-summ/space

reviews  sentaspects


In [None]:
!rm -rf opinion-summ/space/sentaspects/*

In [None]:
OUTPUT_DIR = '/content/opinion-summ'

SPACE_OUTPUT_DIR = '/content/opinion-summ/space'

FEWSUM_OUTPUT_DIR = '/content/opinion-summ/fewsum'

import os

os.makedirs(SPACE_OUTPUT_DIR, exist_ok=True)
os.makedirs(FEWSUM_OUTPUT_DIR, exist_ok=True)

## Topic-wise Clustering for Hotel reviews

1. Collect all review sentences for the hotel.

2. Keeping under the token limit, send batches of review sentences to GPT and ask it to generate aspect labels for each sentence.

3. Verify that the number of label rows matches the number of sentences passed in.

4. Cache all the aspect labels.

5. Group review sentences by aspects. Save them to different files.

6. Summarize each aspect review group.

In [None]:
import pickle

def savedata(filepath, obj):
  """Serialize `obj` with pickle and write it to `filepath`.

  The file is opened in binary write mode, so any existing file at that
  path is overwritten.
  """
  with open(filepath, mode='wb') as sink:
    pickle.Pickler(sink).dump(obj)

def loaddata(filepath):
  """Read the pickle file at `filepath` and return the object stored in it.

  Only use this on files this notebook wrote itself: unpickling data from an
  untrusted source can execute arbitrary code.
  """
  with open(filepath, mode='rb') as source:
    return pickle.Unpickler(source).load()

In [None]:
def get_all_review_sentences_for_hotel(eid):
  """Return every sentence of every review of hotel `eid` as one flat list.

  The hotel record is looked up with get_hotel(); sentences keep the order in
  which they appear, review by review. Whatever get_hotel() raises for an
  unknown id propagates to the caller.
  """
  return [
      sentence
      for review in get_hotel(eid)['reviews']
      for sentence in review['sentences']
  ]


def get_hotel(entity_id):
  """Return the first record in the global `space` list whose 'entity_id'
  equals `entity_id`.

  Raises:
    ValueError: if no record has that id.
  """
  found = next((hotel for hotel in space if hotel['entity_id'] == entity_id), None)
  if found is None:
    raise ValueError(f'Hotel {entity_id} does not exist')
  return found

In [None]:
import os

SPACE_REVIEWS_DIR = f'{SPACE_OUTPUT_DIR}/reviews/'
os.makedirs(SPACE_REVIEWS_DIR, exist_ok=True)

In [None]:
# Cache all review sentences as List objects.
import pickle

for eid in sel_hotels:
  all_sents = get_all_review_sentences_for_hotel(eid)
  savedata(f'{SPACE_REVIEWS_DIR}/{eid}_allsents.pkl', all_sents)


In [None]:
!pip install tiktoken

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import os

SPACE_SENTASPECTS_DIR = f'{SPACE_OUTPUT_DIR}/sentaspects/'
os.makedirs(SPACE_SENTASPECTS_DIR, exist_ok=True)

In [None]:
!ls $SPACE_SENTASPECTS_DIR

100597-aspect-sent-map.pkl   120274-aspect-sent-map.pkl
100597.pkl		     120274.pkl
1029276-aspect-sent-map.pkl  121241-aspect-sent-map.pkl
1029276.pkl		     121241.pkl
1113787-aspect-sent-map.pkl  182002-aspect-sent-map.pkl
1113787.pkl		     182002.pkl
112429-aspect-sent-map.pkl   183092-aspect-sent-map.pkl
112429.pkl		     183092.pkl
1176198-aspect-sent-map.pkl  185804-aspect-sent-map.pkl
1176198.pkl		     185804.pkl


### Classify sentences using GPT-4

Turns out to be extremely slow

In [None]:
import openai
import tiktoken
import pickle
import json

def gpt4_label_sentence_aspects(eid):
  """Label every cached review sentence of hotel `eid` with an aspect via GPT-4.

  Sentences are read from the '<eid>_allsents.pkl' cache and sent to
  _gpt_label_instruction() in fixed-size batches. The labels collected so far
  are re-saved to '<eid>.pkl' after every batch, so partial progress is kept
  on disk if a later batch fails.

  Returns:
    The accumulated list of labels for all sentences.

  Raises:
    RuntimeError: if a batch comes back with a different number of labels
      than sentences sent.
  """
  sentences = loaddata(f'{SPACE_REVIEWS_DIR}/{eid}_allsents.pkl')
  print('Total sentences:', len(sentences))

  # Packing batches right up to the token limit was tried and abandoned: with
  # many sentences close to the limit GPT returned only partial results (a
  # request with 200+ sentences produced just 11 labels, and halving the
  # input still failed near the end). A small fixed batch size is used
  # instead of trying to optimize.
  batch_count = 25
  labels_path = f'{SPACE_SENTASPECTS_DIR}/{eid}.pkl'

  all_labels = []
  for offset in range(0, len(sentences), batch_count):
    chunk = sentences[offset:offset + batch_count]
    expected = len(chunk)
    print(f'\nNum sents={expected}')

    chunk_labels = _gpt_label_instruction(chunk)
    print('Num labels:', len(chunk_labels))

    # A mismatch means labels can no longer be aligned with sentences.
    if len(chunk_labels) != expected:
      raise RuntimeError("Num labels doesn't match num sentences")

    all_labels.extend(chunk_labels)
    # Checkpoint after every batch.
    savedata(labels_path, all_labels)

  print('Total labels:', len(all_labels))
  return all_labels


def _gpt_label_instruction(sents_batch):
  """Ask GPT-4 to assign an aspect label to every sentence in `sents_batch`.

  The sentences are serialized as a JSON list of
  {"sentence": <text>, "aspect": ""} objects and embedded in the prompt
  between backticks; the model's reply is parsed as JSON.

  Args:
    sents_batch: list of review sentences (strings).

  Returns:
    The object produced by json.loads() on the model's reply. The caller
    checks that its length equals the number of sentences sent.

  Raises:
    json.JSONDecodeError: if the reply is not valid JSON.
    Any error raised by openai.ChatCompletion.create() propagates unchanged.
  """
  # The payload key is "sentence" so that it matches the wording of the
  # instructions below.
  batch = json.dumps([{"sentence": s, "aspect": ""} for s in sents_batch])

  # The JSON payload is interpolated through {batch}. Writing literal
  # {"sentence": ...} objects directly inside this f-string does not work:
  # the braces are parsed as replacement fields and formatting fails, and the
  # actual sentences would never reach the model.
  prompt = f"""
  Your task is to classify the sentences in the backticks below. Classify each
  "sentence" value as one of these 6 "aspect" labels - rooms, building, cleanliness, location, service, food.
  If none of these labels is suitable, label it as 'general'.
  Don't include backticks in your output.

  ```
  {batch}
  ```
  """

  print('Prompt length:', len(tiktoken.encoding_for_model('gpt-4').encode(prompt)))

  msgs = [{'role':'user', 'content':prompt}]

  resp = openai.ChatCompletion.create(
      model='gpt-4',
      messages=msgs,
      temperature=0.0 # No need of randomness; we need very specific labels.
  )

  sent_labels_str = resp['choices'][0]['message']['content']
  print(sent_labels_str)

  return json.loads(sent_labels_str)


### Classify sentences on aspects using OpenAI Embeddings

Extremely fast. Quite accurate, especially if top 2 labels are used instead of just the top-most.

In [None]:
# Sentence classification implementation using OpenAI Embeddings API instead of GPT-4 classification
# because the latter is extremely slow while the former is far faster and cheaper.
#
# Another observation: OpenAI's embeddings are unit normalized. So np.dot
# is far faster than using openai.embeddings_utils.
# Also the similarities returned by np.dot seem better than that returned by
# openai.embeddings_utils.distances_from_embeddings() which uses cosine similarity.

from openai import embeddings_utils
import numpy as np

def _get_embeddings(text_list):
  """Return the 'text-embedding-ada-002' embeddings for the strings in
  `text_list`, as produced by openai.embeddings_utils.get_embeddings()."""
  return openai.embeddings_utils.get_embeddings(
      text_list, engine='text-embedding-ada-002')

# Canonical aspect names. Position i in this list corresponds to position i in
# aspect_label_desc and in aspect_label_embeddings: the classifier maps the
# index of the most similar label embedding back to a name through this list,
# so the three sequences must stay in the same order.
aspect_labels = ["rooms", "building", "cleanliness", "location", "service",
                 "food", "general"]

# Use slightly descriptive aspect label phrases for better embeddings.
# The "general" aspect is embedded through the phrase "information".
# NOTE(review): presumably chosen as a neutral catch-all phrase - confirm.
aspect_label_desc = ["hotel rooms", "hotel building", "hotel cleanliness",
                 "hotel location", "hotel service", "hotel food",
                 "information"]

# Computed once when this cell runs (this calls the embeddings API) and reused
# for every batch of sentences.
aspect_label_embeddings = _get_embeddings(aspect_label_desc)

In [None]:

def openai_embeddings_label_sentence_aspects(eid):
  """Label every cached review sentence of hotel `eid` with its two closest aspects.

  Sentences are read from the '<eid>_allsents.pkl' cache, embedded in batches
  and compared against the precomputed `aspect_label_embeddings`. For each
  sentence the two aspect labels with the highest similarity are recorded.
  The labels collected so far are re-saved to '<eid>.pkl' after every batch.

  Returns:
    A list of (sentence, best_label, next_best_label) tuples, one per
    sentence, in the original sentence order. (Previously nothing was
    returned even though the caller assigns the result; the sibling
    gpt4_label_sentence_aspects() returns its labels in the same way.)
  """
  all_sents = loaddata(f'{SPACE_REVIEWS_DIR}/{eid}_allsents.pkl')
  print('Total sentences:', len(all_sents))

  all_labels = []

  # [D x 7] matrix of aspect label embeddings, D being the embedding
  # dimension. It does not change between batches, so build it once.
  label_matrix = np.array(aspect_label_embeddings).T

  batch_count = 100
  for startidx in range(0, len(all_sents), batch_count):
    batch_sents = all_sents[startidx:startidx+batch_count]
    num_sents_in_batch = len(batch_sents)

    print(f'\nNum sents={num_sents_in_batch}')

    sent_batch_embeddings = _get_embeddings(batch_sents)

    # Similarity between every sentence and every aspect label as the dot
    # product [BATCH x D] . [D x 7]. This equals cosine similarity provided
    # the embeddings are unit-normalized, as noted at the top of this cell.
    similarities = np.dot(np.array(sent_batch_embeddings), label_matrix)

    # Each row of `similarities` belongs to one sentence. np.argsort orders
    # the label indexes of every row by ascending similarity, so the last
    # column is the best match and the one before it the runner-up.
    sorted_idxes = np.argsort(similarities)
    best_label_idxes = sorted_idxes[:,-1]
    next_best_label_idxes = sorted_idxes[:,-2]

    for sent,idx1,idx2 in zip(batch_sents, best_label_idxes, next_best_label_idxes):
      best_label, next_best_label = aspect_labels[idx1], aspect_labels[idx2]
      all_labels.append( (sent, best_label, next_best_label) )

    # Checkpoint after every batch so partial progress survives a failure.
    savedata(f'{SPACE_SENTASPECTS_DIR}/{eid}.pkl', all_labels)

  # An earlier per-sentence implementation based on
  # openai.embeddings_utils.distances_from_embeddings() was dropped because it
  # was much slower than the direct numpy routines used above.
  return all_labels




In [None]:
for eid in sel_hotels:
  #all_labels = gpt4_label_sentence_aspects(eid)
  all_labels = openai_embeddings_label_sentence_aspects(eid)


Total sentences: 893

Num sents=100

Num sents=100

Num sents=100

Num sents=100

Num sents=100

Num sents=100

Num sents=100

Num sents=100

Num sents=93
Total sentences: 881

Num sents=100

Num sents=100

Num sents=100

Num sents=100

Num sents=100

Num sents=100

Num sents=100

Num sents=100

Num sents=81
Total sentences: 1003

Num sents=100

Num sents=100

Num sents=100

Num sents=100

Num sents=100

Num sents=100

Num sents=100

Num sents=100

Num sents=100

Num sents=100

Num sents=3
Total sentences: 948

Num sents=100

Num sents=100

Num sents=100

Num sents=100

Num sents=100

Num sents=100

Num sents=100

Num sents=100

Num sents=100

Num sents=48
Total sentences: 958

Num sents=100

Num sents=100

Num sents=100

Num sents=100

Num sents=100

Num sents=100

Num sents=100

Num sents=100

Num sents=100

Num sents=58
Total sentences: 843

Num sents=100

Num sents=100

Num sents=100

Num sents=100

Num sents=100

Num sents=100

Num sents=100

Num sents=100

Num sents=43
Total sent

In [None]:
!ls -lah {SPACE_SENTASPECTS_DIR}

total 829K
-rw------- 1 root root 79K May 16 08:13 100597.pkl
-rw------- 1 root root 85K May 16 08:13 1029276.pkl
-rw------- 1 root root 78K May 16 08:13 1113787.pkl
-rw------- 1 root root 78K May 16 08:13 112429.pkl
-rw------- 1 root root 84K May 16 08:14 1176198.pkl
-rw------- 1 root root 98K May 16 08:13 120274.pkl
-rw------- 1 root root 75K May 16 08:13 121241.pkl
-rw------- 1 root root 91K May 16 08:13 182002.pkl
-rw------- 1 root root 79K May 16 08:13 183092.pkl
-rw------- 1 root root 85K May 16 08:13 185804.pkl


### Group sentences by aspects

In [None]:
for eid in sel_hotels:

  # Aspect name -> list of sentences tagged with that aspect. Every aspect
  # gets an entry up front so that aspects with no sentences still show up.
  sentences_by_aspect = dict((aspect, []) for aspect in aspect_labels)

  # (sentence, best_label, next_best_label) tuples saved by the labelling step.
  all_labels = loaddata(f'{SPACE_SENTASPECTS_DIR}/{eid}.pkl')

  # A sentence is filed under both of its top-two aspects.
  for sent, best_label, next_best_label in all_labels:
    for label in (best_label, next_best_label):
      sentences_by_aspect[label].append(sent)

  # Per-hotel report of how many sentences landed in each aspect.
  print(eid)
  for aspect in sentences_by_aspect:
    print(aspect, len(sentences_by_aspect[aspect]))

  savedata(f'{SPACE_SENTASPECTS_DIR}/{eid}-aspect-sent-map.pkl', sentences_by_aspect)




100597
rooms 584
building 115
cleanliness 664
location 36
service 262
food 93
general 32
112429
rooms 494
building 159
cleanliness 639
location 108
service 233
food 91
general 38
120274
rooms 565
building 41
cleanliness 811
location 64
service 290
food 159
general 76
121241
rooms 569
building 100
cleanliness 786
location 56
service 212
food 101
general 72
182002
rooms 554
building 162
cleanliness 747
location 82
service 249
food 80
general 42
183092
rooms 448
building 166
cleanliness 593
location 104
service 244
food 98
general 33
185804
rooms 432
building 136
cleanliness 594
location 140
service 302
food 159
general 43
1029276
rooms 486
building 162
cleanliness 716
location 58
service 294
food 101
general 61
1113787
rooms 363
building 142
cleanliness 660
location 67
service 343
food 105
general 68
1176198
rooms 504
building 139
cleanliness 708
location 90
service 251
food 100
general 44


### Chunking summarization by aspect

In [None]:
import nltk

nltk.download('punkt')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
import nltk.tokenize

nltk.tokenize.sent_tokenize("This is a summary. This is generated by GPT-4.")

['This is a summary.', 'This is generated by GPT-4.']

In [None]:
import nltk.tokenize
import time
import tiktoken

token_counters = {
    'gpt-4': tiktoken.encoding_for_model('gpt-4'),
    'gpt-3.5-turbo': tiktoken.encoding_for_model('gpt-3.5-turbo')
}

def get_gpt_aspect_summary(model, aspect, sentences, group_size=30, is_summarized=False,
    return_all_levels=False):
    """Summarize what `sentences` say about `aspect`, recursively if needed.

    When there are more sentences than `group_size`, they are split into
    consecutive groups, each group is summarized on its own, the resulting
    summaries are split back into sentences with NLTK and the procedure is
    applied again to those sentences (now treated as "accounts" rather than
    raw reviews). Otherwise a single prompt is built and sent to the model.

    Args:
        model: chat model name passed to get_gpt_response().
        aspect: aspect name inserted into the prompt.
        sentences: list of strings to summarize.
        group_size: maximum number of sentences summarized in one request.
        is_summarized: True when `sentences` are themselves summaries; only
            changes the wording of the prompt.
        return_all_levels: when True, return a list of lists with the
            summaries of every recursion level (the final summary is the
            single element of the last list) instead of just the final
            summary.

    Returns:
        The final summary as returned by get_gpt_response(), or the list of
        per-level summary lists when `return_all_levels` is True.
    """
    total = len(sentences)

    if total > group_size:
        # Even out the groups: keep the number of groups a plain split would
        # need, but shrink the group size so they are of similar length.
        n_groups = (total + group_size - 1) // group_size
        group_size = total // n_groups

        summaries = []
        ss = []  # sentences of all group summaries: input of the next level
        cur = 0
        while cur < total:
            # Slicing past the end is harmless, so the last (possibly
            # shorter) group needs no special case.
            group_summary = get_gpt_aspect_summary(
                model, aspect, sentences[cur:cur + group_size], group_size, is_summarized)
            summaries.append(group_summary)
            ss.extend(nltk.tokenize.sent_tokenize(group_summary))
            cur += group_size

        if not return_all_levels:
            return get_gpt_aspect_summary(model, aspect, ss, group_size, is_summarized=True)

        deeper_levels = get_gpt_aspect_summary(
            model, aspect, ss, group_size, is_summarized=True, return_all_levels=True)
        return [summaries] + deeper_levels

    # Few enough sentences for a single request: pick the wording that
    # matches the kind of input.
    if is_summarized:
        intro = "Here are some accounts of the reviews of a hotel:\n\n"
        question = "Summarize what the accounts said of the {}:"
    else:
        intro = "Here are some reviews of a hotel:\n\n"
        question = "Summarize what the reviews said of the {}:"

    body = "\n".join(sentences)
    if len(body) > 3400:
        # Too long: cut at 3400 characters, then drop the trailing partial line.
        body = body[:3400]
        body = body[:body.rfind('\n')]

    prompt = intro + body + "\n\n" + question.format(aspect)

    answer = get_gpt_response(model, prompt)
    return [[answer]] if return_all_levels else answer


def get_gpt_response(model, prompt, tokenize=False):
  """Send `prompt` as a single user message to `model` and return the reply.

  Up to three attempts are made. API errors, rate-limit errors, timeouts and
  service-unavailable errors are logged and followed by a pause before the
  next attempt; any other exception propagates immediately.

  Args:
    model: chat model name; must also be a key of `token_counters`.
    prompt: text of the user message.
    tokenize: when True, the reply is split into a list of sentences with
      NLTK instead of being returned as one string.

  Returns:
    The stripped reply text (or list of sentences), or None if all three
    attempts failed with one of the handled errors.
  """
  print('Tokens:', len(token_counters[model].encode(prompt)))

  msgs = [{'role':'user', 'content':prompt}]

  for attempt in range(1,4):
    print(f'Attempt#:{attempt}')

    try:
      completion = openai.ChatCompletion.create(
          model=model,
          messages=msgs,
          temperature=0.7,
          max_tokens=256
      )

      answer = completion['choices'][0]['message']['content'].strip()
      if tokenize:
        answer = nltk.tokenize.sent_tokenize(answer)

      print(answer)

      time.sleep(10) # To avoid overloading the API
      return answer

    except (openai.error.APIError, openai.error.RateLimitError,
            openai.error.Timeout, openai.error.ServiceUnavailableError) as e:
      # Same precedence as separate except clauses: first matching type wins.
      if isinstance(e, openai.error.APIError):
        print(f"OpenAI API returned an API Error: {e}")
        pause = 10
      elif isinstance(e, openai.error.RateLimitError):
        print(f"OpenAI API returned a rate limit Error: {e}")
        pause = 15
      elif isinstance(e, openai.error.Timeout):
        print(f"OpenAI API returned a timeout Error: {e}")
        pause = 10
      else:
        print(f"OpenAI API returned a service unavailable Error: {e}")
        pause = 10

      time.sleep(pause) # Wait before retrying

  # Every attempt failed with a handled error.
  return None

In [None]:
from pprint import pprint

# Generate and cache a GPT-4 summary for every (hotel, aspect) pair.
# NOTE(review): the [1:] slice skips the first hotel in sel_hotels - presumably
# because its summaries were already produced in an earlier run; confirm before
# re-running from a clean state.
for eid in sel_hotels[1:]:

  # keys are aspects; values are lists of sentences with that aspect label.
  sentences_by_aspect = loaddata(f'{SPACE_SENTASPECTS_DIR}/{eid}-aspect-sent-map.pkl')

  for aspect in aspect_labels[:-1]: # Exclude 'general'
    # Progress output: aspect name and number of sentences to be summarized.
    print(aspect, len(sentences_by_aspect[aspect]))
    #pprint(sentences_by_aspect[aspect])

    # This returns a list of lists with the summaries of each recursion level.
    # summary[-1][0] is the final summary.
    summary = get_gpt_aspect_summary('gpt-4',
                                     aspect,
                                     sentences_by_aspect[aspect],
                                     is_summarized=False,
                                     return_all_levels=True)

    # One file per (hotel, aspect); the ROUGE cell below reads these back and
    # reports any that are missing.
    savedata(f'{SPACE_SENTASPECTS_DIR}/{eid}-{aspect}-summary.pkl', summary)

#  break



rooms 494
Tokens: 618
Attempt#:1
The reviews of the rooms at the hotel are mixed. Some guests found the rooms to be clean, stylish, comfortable, and well-equipped with amenities such as a refrigerator, comfortable furniture, and a great king bed. However, others found the rooms to be small, with issues such as poor air conditioning and cleanliness, as well as noise from garbage trucks and neighbors. The bathrooms were also described as small, with hot water taking a long time to warm up. Some guests mentioned an odd smell in the hallways and a daily deposit for room service that was not disclosed during booking.
Tokens: 513
Attempt#:1
The reviews mention that the rooms at the Paramount Hotel are nice, clean, and well-priced compared to other downtown hotels. Some rooms, particularly the corner ones, have nice city views. However, there were some issues with hot water and the in-room hairdryer being small and weak. Overall, the Paramount is a solid choice for a stay in Seattle.
Tokens: 

KeyboardInterrupt: ignored

### ROUGE metrics

In [None]:
from pprint import pprint
import os
import os.path
import statistics

from rouge import Rouge

# Parallel lists consumed by the BERTScore cell: one entry per
# (generated summary, reference summary) pair that was scored.
all_gen_summaries = []
all_ref_summaries = []

# ROUGE F1-scores of every scored pair, across all hotels and all aspects.
rouge1_f, rouge2_f, rougel_f = [], [], []
f_scores_by_metric = {
    'rouge-1': rouge1_f,
    'rouge-2': rouge2_f,
    'rouge-l': rougel_f,
}

R = Rouge()
for eid in sel_hotels:
  for aspect in aspect_labels[:-1]: # Exclude 'general'
    # File holding the generated summaries for this hotel and aspect; pairs
    # that were never summarized are reported and skipped.
    summ_file = f'{SPACE_SENTASPECTS_DIR}/{eid}-{aspect}-summary.pkl'
    if not os.path.exists(summ_file):
      print(f'{summ_file} does not exist.')
      continue

    # Gold reference summaries for this hotel and aspect.
    ref_summaries = get_hotel(eid)['summaries'][aspect]
    all_ref_summaries.extend(ref_summaries)

    # The last level of the cached recursion holds the final summary.
    gen_summary = loaddata(summ_file)[-1][0]

    # Score the same generated summary against every reference.
    for ref_summary in ref_summaries:
      all_gen_summaries.append(gen_summary)
      scores = R.get_scores(gen_summary, ref_summary)[0]
      for metric, collected in f_scores_by_metric.items():
        collected.append(scores[metric]['f'])

for metric, collected in f_scores_by_metric.items():
  print(f'Mean {metric} f:', statistics.mean(collected))

/content/opinion-summ/space/sentaspects//183092-rooms-summary.pkl does not exist.
/content/opinion-summ/space/sentaspects//183092-building-summary.pkl does not exist.
/content/opinion-summ/space/sentaspects//183092-cleanliness-summary.pkl does not exist.
/content/opinion-summ/space/sentaspects//183092-location-summary.pkl does not exist.
/content/opinion-summ/space/sentaspects//183092-service-summary.pkl does not exist.
/content/opinion-summ/space/sentaspects//183092-food-summary.pkl does not exist.
/content/opinion-summ/space/sentaspects//185804-rooms-summary.pkl does not exist.
/content/opinion-summ/space/sentaspects//185804-building-summary.pkl does not exist.
/content/opinion-summ/space/sentaspects//185804-cleanliness-summary.pkl does not exist.
/content/opinion-summ/space/sentaspects//185804-location-summary.pkl does not exist.
/content/opinion-summ/space/sentaspects//185804-service-summary.pkl does not exist.
/content/opinion-summ/space/sentaspects//185804-food-summary.pkl does n

### BERTScore Metrics

In [None]:
len(all_gen_summaries), len(all_ref_summaries)

(90, 90)

In [None]:
from torchmetrics.text.bert import BERTScore

# Build a BERTScore metric on top of the roberta-large checkpoint, then
# score the generated summaries against the references in one call.
scorer = BERTScore(model_name_or_path='roberta-large')
bert_scores = scorer(all_gen_summaries, all_ref_summaries)

# Left as the cell's last expression so the raw result is displayed.
bert_scores

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


{'precision': [0.9487131834030151,
  0.9459242820739746,
  0.9476812481880188,
  0.9486280679702759,
  0.9414752125740051,
  0.9455282688140869,
  0.9526387453079224,
  0.9524768590927124,
  0.9528357982635498,
  0.9490207433700562,
  0.9576573967933655,
  0.9519209265708923,
  0.9561426639556885,
  0.9485679864883423,
  0.9480463862419128,
  0.9469122290611267,
  0.9454221129417419,
  0.9510140419006348,
  0.947048008441925,
  0.9494012594223022,
  0.951958954334259,
  0.9473575353622437,
  0.9464818835258484,
  0.9480583071708679,
  0.9523980617523193,
  0.9491949677467346,
  0.9484856128692627,
  0.9481065273284912,
  0.9466752409934998,
  0.9518048763275146,
  0.9514337778091431,
  0.9505701661109924,
  0.9479382038116455,
  0.957195520401001,
  0.9528428316116333,
  0.951153039932251,
  0.9486376643180847,
  0.9470864534378052,
  0.9505252242088318,
  0.9476030468940735,
  0.9501947164535522,
  0.9453746676445007,
  0.947884738445282,
  0.9496846199035645,
  0.9448795318603516,
  

In [None]:
import statistics

# Collapse the BERTScore F1 values stored under the 'f1' key into one mean.
statistics.mean(bert_scores['f1'])

0.9564774228466881

## QFSumm-Long + GPT Summarization

In [None]:
# Switch the working directory to /content before cloning the data repository.
%cd /content

In [None]:
 # Fetch the repository whose saved-data directory holds the pickle loaded in the next cell.
 !git clone https://github.com/pathbreak/gpt4-opinion-summ

In [None]:
# Load the saved QFSumm-Long summaries from the cloned repository.
# The loaded object is a dict of dicts:
#  {eid: {
#     aspect: Newline-separated text summary
#  }}
# NOTE(review): loaddata is defined elsewhere in the notebook; if it
# unpickles, it should only ever be pointed at trusted files - confirm.
qfsumm = loaddata('/content/gpt4-opinion-summ/saved-data/space/all-new-pkls/summaries-pkl/qfsumm-long.pkl')

In [None]:
import nltk.tokenize

# For every selected hotel and every aspect, turn the QFSumm text into a
# GPT-4 summary and save the result as '<eid>-<aspect>-qfsummary.pkl'.
for eid in sel_hotels:
  for aspect in aspect_labels[:-1]: # every label except the trailing 'general'
    # qfsumm holds one newline-separated string per aspect; split it into
    # individual sentences before handing it to the summarizer.
    sel_aspect_sents = nltk.tokenize.sent_tokenize(qfsumm[eid][aspect])

    summary = get_gpt_aspect_summary(
        'gpt-4', aspect, sel_aspect_sents,
        is_summarized=False, return_all_levels=True)

    savedata(f'{SPACE_SENTASPECTS_DIR}/{eid}-{aspect}-qfsummary.pkl', summary)


### ROUGE metrics

In [None]:
from pprint import pprint
import statistics

from rouge import Rouge

# Parallel lists of generated / reference summaries, one entry per scored
# pair. They are reused by the BERTScore cell that follows.
all_gen_summaries = []
all_ref_summaries = []

# ROUGE F1-scores, one entry per (generated, reference) pair, pooled across
# all hotels and all aspects.
rouge1_f = []
rouge2_f = []
rougel_f = []

R = Rouge()
for eid in sel_hotels:
  # The hotel record depends only on eid, so fetch it once per hotel
  # rather than once per aspect.
  h = get_hotel(eid)

  for aspect in aspect_labels[:-1]: # Exclude 'general'
    # Skip hotel/aspect combinations whose GPT summary was never saved,
    # mirroring the SentAspects ROUGE cell, instead of failing in loaddata.
    summ_file = f'{SPACE_SENTASPECTS_DIR}/{eid}-{aspect}-qfsummary.pkl'
    if not os.path.exists(summ_file):
      print(f'{summ_file} does not exist.')
      continue

    # The SentAspects ROUGE cell treats h['summaries'][aspect] as a list of
    # reference summaries, so every reference is scored individually here
    # too. A bare string is tolerated and handled as a single reference.
    ref_summaries = h['summaries'][aspect]
    if isinstance(ref_summaries, str):
      ref_summaries = [ref_summaries]

    # NOTE(review): takes the first item of the last saved entry; presumably
    # that is the final summary text of the return_all_levels output - confirm.
    gen_summaries = loaddata(summ_file)
    gen_summary = gen_summaries[-1][0]

    for ref_summary in ref_summaries:
      # Keep both lists flat and aligned so they can be scored pairwise.
      all_gen_summaries.append(gen_summary)
      all_ref_summaries.append(ref_summary)

      scores = R.get_scores(gen_summary, ref_summary)[0]
      rouge1_f.append(scores['rouge-1']['f'])
      rouge2_f.append(scores['rouge-2']['f'])
      rougel_f.append(scores['rouge-l']['f'])

# statistics.mean raises StatisticsError on an empty list, which would happen
# if every summary file was missing.
if rouge1_f:
  print('Mean rouge-1 f:', statistics.mean(rouge1_f))
  print('Mean rouge-2 f:', statistics.mean(rouge2_f))
  print('Mean rouge-l f:', statistics.mean(rougel_f))
else:
  print('No summary pairs were scored; nothing to average.')

### BERTScore Metrics

In [None]:
from torchmetrics.text.bert import BERTScore

# Build a BERTScore metric on top of the roberta-large checkpoint, then
# score the QFSumm + GPT summaries against the references in one call.
# NOTE(review): both arguments should be flat, equal-length lists of
# strings - verify all_ref_summaries has that shape before trusting the scores.
scorer = BERTScore(model_name_or_path='roberta-large')
bert_scores = scorer(all_gen_summaries, all_ref_summaries)

# Left as the cell's last expression so the raw result is displayed.
bert_scores

In [None]:
import statistics

# Collapse the BERTScore F1 values stored under the 'f1' key into one mean.
statistics.mean(bert_scores['f1'])