# Experiment with quick summarizer
Quick summarizer - take full text, output summary

In [22]:
# useful for rouge scoring
# !pip install -r rouge/requirements.txt
# !pip install rouge-score

In [71]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [72]:
# Make the project package under ../rfpgo/ importable from this notebook.
import sys
sys.path.append('../rfpgo/')
# NOTE(review): star imports hide where names come from. OPENAI_KEY (used
# below) is presumably provided by `credentials`, and summary_prompt /
# field_prompt (used by Summary.run) presumably by quick_field_prompts —
# confirm and consider importing the names explicitly.
from credentials import *
from summarize.quick.prompts.quick_field_prompts import *
import os
# Publish the key through the OPENAI_API_KEY environment variable.
os.environ["OPENAI_API_KEY"] = OPENAI_KEY

In [79]:
import pandas as pd
from langchain.llms import Ollama
from langchain_openai import OpenAI, ChatOpenAI
import json

# Data locations, relative to the notebook's working directory.
DATA_FP = '../data'
# Label files live under <data>/labels.
LABEL_FP = f'{DATA_FP}/labels'
# Prompt text files live under <data>/prompts.
PROMPT_FP = f'{DATA_FP}/prompts'

In [80]:
# Candidate models: one Ollama model and two OpenAI chat models.
gemma = Ollama(model="gemma:7b")
oai_3 = ChatOpenAI(model='gpt-3.5-turbo')
oai_4 = ChatOpenAI(model='gpt-4-turbo')

# Load the JSON records; the context manager closes the file as soon as it
# has been parsed instead of leaving the handle open for the kernel's lifetime.
fp = 'company_conditions'
with open(f'{DATA_FP}/{fp}.json') as rfp_file:
    rfps = json.load(rfp_file)
# Text of the second message in the first record's prompt.
content = rfps[0]['prompt'][1]['content']

In [89]:
def call_llm(prompt, llm):
    """Send ``prompt`` to ``llm`` and return the reply as text.

    Args:
        prompt: Value passed unchanged to ``llm.invoke``.
        llm: Any object exposing ``invoke(prompt)``.

    Returns:
        The result of ``invoke`` when it is already a ``str``; otherwise the
        ``content`` attribute of the returned object.
    """
    reply = llm.invoke(prompt)
    return reply if isinstance(reply, str) else reply.content

In [82]:
# Read the quick-summarize instructions; the context manager closes the file
# once its text has been read.
with open(f'{PROMPT_FP}/quick_summarize.txt', 'r') as prompt_file:
    prompt = prompt_file.read()
# Append the document text on a new line and send one request through oai_3.
prompt_w_content = f'{prompt}\n{content}'
response = call_llm(prompt_w_content, oai_3)

### Testing harness
Inputs:
- Prompts
- llms (for now - just going to set these up with LC)
Output:
- prompt used
- llm used
- response
- length / total length
- ROUGE score

In [103]:
def score_rouge(label, response):
    """Score ``response`` against ``label`` with ROUGE-1 and ROUGE-L.

    Args:
        label: Reference text, passed as the first argument to the scorer.
        response: Generated text, passed as the second argument.

    Returns:
        Whatever ``RougeScorer.score`` returns for the 'rouge1' and 'rougeL'
        metrics with stemming enabled.
    """
    # Imported lazily so the notebook loads even when rouge-score is absent.
    from rouge_score import rouge_scorer

    metrics = ['rouge1', 'rougeL']
    return rouge_scorer.RougeScorer(metrics, use_stemmer=True).score(label, response)

def format_and_output(label, response):
    """Bundle a label, a model response and their ROUGE scores into one dict.

    Args:
        label: Reference text.
        response: Generated text.

    Returns:
        A dict with the keys 'label', 'response' and 'scores', where 'scores'
        is the result of ``score_rouge(label, response)``.
    """
    return {
        'label': label,
        'response': response,
        'scores': score_rouge(label, response),
    }

def run_test(label, llm):
    """Query ``llm`` and return the scored result tagged with the model name.

    Args:
        label: Text sent to the model and also used as the scoring reference.
        llm: Object accepted by ``call_llm`` that also exposes ``dict()``
            returning a mapping with a 'model' entry.

    Returns:
        The dict built by ``format_and_output`` with an added 'model' key.
    """
    # NOTE(review): the label itself is used as the prompt here — confirm
    # that this is intended rather than a separate prompt argument.
    result = format_and_output(label, call_llm(label, llm))
    result['model'] = llm.dict()['model']
    # TODO: LLM's opinion on the response
    return result


from collections import defaultdict
class Summary(object):
    """Extract labelled fields from one document with an LLM and score them.

    Attributes:
        document_fp: Path of the document, as given to the constructor.
        document: Full text read from ``document_fp``.
        label_dict: Mapping of field name to reference text.
        output_dict: Results collected by ``run``; per-field entries plus the
            'model', 'document_fp' and 'overall' entries.
    """

    def __init__(self, document_fp, label_dict):
        """Load the document text and the reference labels.

        Args:
            document_fp: Path to the text document.
            label_dict: Either a mapping of field name to reference text, or a
                path (``str`` or ``os.PathLike``) to a JSON file holding one.
        """
        self.document_fp = document_fp
        with open(document_fp, 'r') as document_file:
            self.document = document_file.read()
        # isinstance (rather than an exact type comparison) also accepts str
        # subclasses and path objects as "load this file".
        if isinstance(label_dict, (str, os.PathLike)):
            with open(label_dict) as label_file:
                label_dict = json.load(label_file)
        self.label_dict = label_dict
        self.output_dict = defaultdict(dict)

    def run(self, llm):
        """Query ``llm`` once per labelled field and record scored results.

        For every field the response, label and scores are stored under the
        lower-cased field name in ``output_dict``. Afterwards 'model',
        'document_fp' and 'overall' are set; 'overall' is the mean ROUGE-1
        F-measure across fields, or ``None`` when there are no labels.

        Args:
            llm: Object accepted by ``call_llm`` that also exposes ``dict()``
                returning a mapping with a 'model' entry.
        """
        collect_scores = []
        for field, label in self.label_dict.items():
            field = field.lower()
            if field == 'summary':
                # summary has a different prompt
                prompt = summary_prompt.format(document=self.document)
            else:
                prompt = field_prompt.format(field=field, document=self.document)
            response = call_llm(prompt, llm)
            formatted_output = format_and_output(label, response)
            collect_scores.append(formatted_output['scores'])
            self.output_dict[field]['response'] = formatted_output['response']
            self.output_dict[field]['label'] = formatted_output['label']
            self.output_dict[field]['scores'] = formatted_output['scores']
            # only for debugging
            #self.output_dict[field]['prompt'] = prompt

        # general request parameters
        self.output_dict['model'] = llm.dict()['model']
        self.output_dict['document_fp'] = self.document_fp
        if collect_scores:
            self.output_dict['overall'] = sum(
                s['rouge1'].fmeasure for s in collect_scores) / len(collect_scores)
        else:
            # No fields were scored; avoid dividing by zero.
            self.output_dict['overall'] = None

    def save(self, path):
        """Write ``output_dict`` to ``path`` as indented JSON.

        Missing parent directories are created first, and the file is closed
        before returning so the content is fully written.
        """
        parent = os.path.dirname(path)
        if parent:
            os.makedirs(parent, exist_ok=True)
        with open(path, 'w') as output_file:
            json.dump(self.output_dict, output_file, indent=4)

    def __repr__(self):
        """Return the document, labels and outputs as an indented JSON string."""
        return json.dumps(
            {'document': self.document,
             'label_dict': self.label_dict,
             'output_dict': self.output_dict},
            indent=4
        )


In [100]:
# Score document 0 against its label file using the gemma model.
s = Summary(
    document_fp=f'{DATA_FP}/0_synth_rfp.txt',
    label_dict=f'{LABEL_FP}/howard_09122024/0_summary.json')
s.run(llm=gemma)
# The output file name embeds the 'model' entry reported by gemma.dict().
s.save(f'{DATA_FP}/output/howard_09122024/0_summary_output_{gemma.dict()["model"]}.json')

project name
agency
solicitation number
contact person
email
submission deadline
contract term
source link
summary


In [104]:
# Produce missing outputs for the two OpenAI models over documents 0-3.
for llm in [oai_3, oai_4]:
    model_name = llm.dict()["model"]
    for f in range(4):
        output_fp = f'{DATA_FP}/output/howard_09122024/{f}_summary_output_{model_name}.json'
        # Skip documents that already have a saved result for this model.
        if os.path.exists(output_fp):
            continue
        s = Summary(
            document_fp=f'{DATA_FP}/{f}_synth_rfp.txt',
            label_dict=f'{LABEL_FP}/howard_09122024/{f}_summary.json'
        )
        s.run(llm=llm)
        s.save(output_fp)
        # NOTE(review): leaves the inner loop after the first newly written
        # file, so each execution adds at most one output per model —
        # confirm this is intended.
        break

In [48]:
# format howard desired output
from pathlib import Path
fps = Path(f'{LABEL_FP}/howard_09122024/raw').glob('*.txt')
for f in fps:
    # read_text() opens, reads and closes the file in one call.
    content = f.read_text()
    # summary, split: put the summary text on its own line like other values
    content = content.replace('Summary: ', 'Summary\n')
    # format as key-value: drop the first two lines, then pair alternating
    # lines as (key, value)
    content = content.split('\n')[2:]
    content = dict(zip(content[0::2], content[1::2]))
    # rewrite "insert deadline" to not specified
    if 'insert deadline' in content['Submission Deadline'].lower():
        content['Submission Deadline'] = 'Not specified'
    # output as json, one directory above raw/; write_text() closes the file
    # so the JSON is fully written before the next iteration.
    new_path = f.parent.parent / f.with_suffix('.json').name
    new_path.write_text(json.dumps(content))
# NOTE: `content` is left bound to the last file's dict; the field-extraction
# cell below indexes it with 'Summary'.

In [95]:
# summarizer as a compound set of fields
# template for structured information extraction
# NOTE(review): this rebinds the notebook-level name `prompt`, which an
# earlier cell uses for the quick-summarize text.
prompt = "You are filling in structured information from a document.\n\
What is the {field} in the document below?\n\
Do not respond if there is no {field} in the document.\n\
{document}\n\
{field}: "

# NOTE(review): neither `fields` nor `l` is assigned in this cell. `l` is the
# loop variable of the harness cell further down; `fields` is presumably
# supplied by a star import or earlier kernel state — confirm before running
# on a fresh kernel.
# `content` is indexed with 'Summary', so it must be the dict left behind by
# the label-formatting cell rather than raw text.
for field in fields:
    print(field)
    f_prompt = prompt.format(field=field, document=content['Summary'])
    response = call_llm(f_prompt, l)
    # NOTE(review): the whole `content` dict is passed as the label and the
    # returned value is discarded — confirm this is intended.
    format_and_output(content, response)

Project Name
The text does not specify a Project Name, therefore I cannot complete the requested task.
Agency/Department/Organization
The text does not mention an Agency/Department/Organization in the document, therefore I cannot provide the requested information.
Solicitation Number
The text does not contain any Solicitation Number, therefore I cannot complete the requested task.
Contact Person
The text does not specify a Contact Person, therefore I cannot fill in the requested information.
Email
Sure, here is the email extracted from the document:

**Email:** [Insert contact email]
Submission Deadline
The text does not specify a Submission Deadline, therefore I cannot complete the text.
Contract Term
The text does not mention a Contract Term, therefore I cannot provide the requested information.
Source Link
The text does not contain a Source Link, therefore I cannot provide an answer to the question.


In [68]:
# Harness inputs: prompt file names (without extension), documents, models.
prompts = ['quick_summarize']
rfps = [content]
llms = [Ollama(model="gemma:7b")]

# zip pairs the three lists element-wise: one (prompt, document, model)
# combination per position.
for p, r, l in zip(prompts, rfps, llms):
    # Use this iteration's prompt name and document instead of a fixed path
    # and the global `content`, so extra list entries are actually exercised.
    with open(f'{PROMPT_FP}/{p}.txt', 'r') as prompt_file:
        prompt_w_content = prompt_file.read()
    prompt_w_content = f'{prompt_w_content}\n{r}'
    format_and_output(r, call_llm(prompt_w_content, l))
    print()

{'Project Name': 'Wheels Supply for City X Transportation Department', 'Agency': 'City X Transportation Department', 'Solicitation Number': 'Not specified', 'Contact Person': 'Not specified', 'Email': 'Not specified', 'Submission Deadline': '[Insert deadline]', 'Contract Term': 'Not specified', 'Source Link': 'Not provided', 'Summary': 'The City X Transportation Department seeks proposals for eco-friendly, durable wheels for its vehicle fleet. Submission requirements include company background, product specs, pricing, and sustainability practices. Proposals should be submitted electronically to [Insert contact email] with the subject line "Wheels Supply RFP - City X Transportation Department." Evaluations will prioritize sustainability and product quality. For full details, refer to the complete RFP document.'}
Sure, here is a summary of the RFP: The City X Transportation Department is seeking proposals for eco-friendly, durable wheels for its vehicle fleet. Proposals must include comp

In [27]:
# NOTE(review): no visible cell assigns `scores` at notebook level (it only
# appears as a local inside score_rouge / format_and_output), so this display
# relies on earlier kernel state — confirm or assign it explicitly.
scores

{'rouge1': Score(precision=0.9655172413793104, recall=0.11498973305954825, fmeasure=0.2055045871559633),
 'rougeL': Score(precision=0.7931034482758621, recall=0.0944558521560575, fmeasure=0.1688073394495413)}

In [None]:
# testing harness must take a list of 