## Extraction Summarization

In [None]:
# Install the notebook's Python dependencies and clone the project repository
# (the app cell loads HackTheMist/logo.png from this clone).
# NOTE(review): `pdfminer.six` and the legacy `pdfminer` distribution both appear
# to provide a top-level `pdfminer` package; installing both side by side looks
# unintended -- confirm whether the plain `pdfminer` entry is needed.
!pip install datasets transformers rouge-score nltk pdfminer.six pdfminer
!pip install huggingface_hub torch
!git clone https://github.com/RyanSGoldberg/HackTheMist.git

In [None]:
from huggingface_hub import notebook_login

# Log this notebook session in to the Hugging Face Hub.
notebook_login()

## PDF Extraction

In [None]:
# Extract PDF to text
from __future__ import annotations
import requests
import sys
import urllib.parse
from bs4 import BeautifulSoup
import re
import urllib
from pdfminer.high_level import extract_text
import os
import logging

# Directory that get_search_results downloads PDFs into.
os.makedirs("pdfs", exist_ok=True)

def _download_url(filename, url, timeout=30):
    """Download *url* and save the response body to *filename*.

    The file is only written when the server answers with HTTP 200; any
    other status leaves the filesystem untouched.

    Args:
        filename: Path of the file to (over)write with the response body.
        url: Address to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled server.

    Returns:
        True if the file was written, False if the response was not a 200.

    Raises:
        requests.RequestException: On connection errors or timeouts.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        return False
    with open(filename, "wb") as fd:
        fd.write(response.content)
    return True

def _write_string_to_file(filename, content):
    """Write the string *content* to *filename*, replacing any existing file.

    The text is always encoded as UTF-8 so that non-ASCII characters are
    written the same way regardless of the platform's default encoding.
    """
    with open(filename, "w", encoding="utf-8") as fd:
        fd.write(content)

def _extract_pdf_text(in_file):
    """Return the text that ``extract_text`` produces for the PDF at *in_file*."""
    with open(in_file, "rb") as pdf_stream:
        return extract_text(pdf_stream)

def _pdf_url_from_result_link(href):
    """Return the PDF URL a Google ``/url?q=...`` redirect points at, or None."""
    try:
        parts = urllib.parse.urlsplit(href)
    except ValueError:
        # Malformed href; it cannot be a usable result link.
        return None
    if parts.path != "/url":
        return None
    # parse_qs percent-decodes the value, giving the real target address
    # instead of the encoded form embedded in the redirect link.
    targets = urllib.parse.parse_qs(parts.query).get("q")
    if not targets:
        return None
    target = targets[0]
    if re.search(r"\.pdf\b", target, re.IGNORECASE) is None:
        return None
    return target


def get_search_results(c_name, keywords):
    """Find a PDF via Google for a company and return its extracted text.

    Runs a ``filetype:pdf`` Google search for ``c_name`` plus ``keywords``,
    then walks the result links in page order. The first link whose PDF can
    be downloaded into ``pdfs/`` and parsed is used.

    Args:
        c_name: Company name; also used to build the local PDF filename.
        keywords: Additional search terms, e.g. "sustainability report".

    Returns:
        A ``(lines, pdf_url)`` tuple: the non-empty lines of the extracted
        text and the URL the PDF was fetched from, or ``([], "")`` when no
        PDF could be retrieved.

    Raises:
        requests.RequestException: If the search request itself fails.
    """
    # Only let errors through on the root logger.
    logging.getLogger().setLevel(logging.ERROR)

    query = urllib.parse.quote_plus(c_name + " " + keywords)
    search_url = "https://www.google.com/search?q=filetype:pdf+" + query
    # A timeout keeps a stalled connection from blocking the caller forever.
    response = requests.get(search_url, timeout=30)
    soup = BeautifulSoup(response.text, "html.parser")

    # The company name is user input: keep only filename-safe characters so
    # it cannot point outside the pdfs/ directory.
    safe_name = re.sub(r"[^\w-]", "_", c_name)
    pdf_path = f"pdfs/result_{safe_name}.pdf"

    for link in soup.find_all("a", href=True):
        pdf_url = _pdf_url_from_result_link(link["href"])
        if pdf_url is None:
            continue

        # Remove any PDF left over from an earlier search for this company so
        # a failed download is never mistaken for a fresh file.
        if os.path.exists(pdf_path):
            os.remove(pdf_path)
        try:
            _download_url(pdf_path, pdf_url)
        except requests.RequestException:
            continue  # unreachable host / timeout: try the next result
        if not os.path.exists(pdf_path):
            continue  # nothing was saved (non-200 response): try the next result

        try:
            extracted_text = _extract_pdf_text(pdf_path)
        except Exception:
            # The payload may not be a valid PDF; skip it instead of
            # aborting the whole search.
            logging.getLogger(__name__).warning(
                "Could not extract text from %s", pdf_url, exc_info=True)
            continue
        return [s for s in extracted_text.split("\n") if s != ""], pdf_url
    return [], ""

In [None]:
# get_search_results("tim hortons", "sustainability report")

## Summarization Model

In [None]:
from transformers import pipeline
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import AutoTokenizer, AutoModelForTokenClassification


# Checkpoint, tokenizer and model for the summarization pipeline.
sum_model_checkpoint = "t5-small" # options: model_checkpoint in ["t5-small", "t5-base", "t5-large", "t5-3b", "t5-11b"]
sum_tokenizer        = AutoTokenizer.from_pretrained(sum_model_checkpoint)
sum_model            = AutoModelForSeq2SeqLM.from_pretrained(sum_model_checkpoint)

# Summarization pipeline called by get_summary.
summarizer = pipeline("summarization", model=sum_model, tokenizer=sum_tokenizer)

example = """In 1964, the first Tim Hortons® restaurant in Hamilton, Ontario opened its doors and Canadians have been ordering Tim Hortons iconic Original Blend coffee, Double-Double ™ coffees, Donuts and Timbits® ever since. Over the years, Tim Hortons has captured the hearts and taste buds of Canadians. Tim Hortons is now proud to be Canada's largest restaurant chain serving over 5 million cups of coffee every day with 80% of Canadians visiting a Tims in Canada at least once a month. More than a coffee and bake shop, Tim Hortons is part of the fabric of Canada and a proud symbol of our country and its values.

We have always had a deep connection to our communities and our guests from coast to coast. Our more than 1,500 restaurant owners are Canadians who live in communities across the country employing over 100,000 people. Our local Owners and their teams care deeply about the communities they serve. That’s why we provide over $60 million per year to organizations and communities throughout Canada. From signature programs like Timbits® Sports and the Tim Hortons Foundation Camps, we invest in community projects and initiatives big and small."""

def get_summary(company, progress_bar = None, num_sentence = 25):
    """Summarize a company's sustainability report as HTML list items.

    The report text returned by ``get_search_results`` is split into windows
    of ``num_sentence`` lines and each window is summarized separately.

    Args:
        company: Company name to search for.
        progress_bar: Optional object with ``max`` and ``value`` attributes
            (e.g. an ``IntProgress`` widget); advanced once per window.
        num_sentence: Number of extracted text lines summarized together.

    Returns:
        A ``(summary, url)`` tuple: a list of ``<li>...</li>`` strings, one
        per window that was summarized successfully, and the URL of the
        source PDF ("" when none was found).
    """
    import html      # local imports keep this cell self-contained
    import logging

    result, url = get_search_results(company, "sustainability report")

    # Step through the text num_sentence lines at a time. The last window may
    # be shorter, so trailing lines (and documents shorter than one full
    # window) are still summarized.
    chunks = [result[start:start + num_sentence]
              for start in range(0, len(result), num_sentence)]
    if progress_bar is not None:
        progress_bar.max = len(chunks)

    summary = []
    for chunk in chunks:
        try:
            outputs = summarizer("\n".join(chunk), max_length=50)
            # The text originates from a downloaded PDF: escape it before it
            # is embedded in HTML.
            summary.append(f'<li>{html.escape(outputs[-1]["summary_text"])}</li>')
        except Exception:
            # Best effort: one window failing must not abort the whole report.
            logging.getLogger(__name__).warning(
                "Skipping a chunk that could not be summarized", exc_info=True)
        if progress_bar is not None:
            progress_bar.value += 1
    return summary, url

# summarizer(example)

In [None]:
from ipywidgets import IntProgress
# Standalone progress bar for trying get_summary by hand (see the commented call below).
progress = IntProgress()
display(progress)
# get_summary("tim hortons", progress)

## NER - Named-Entity Recognition Model

In [None]:
from IPython.core.formatters import default
# Checkpoint, tokenizer and model for the named-entity recognition pipeline.
ner_model_checkpoint = 'dslim/bert-base-NER'
ner_tokenizer        = AutoTokenizer.from_pretrained(ner_model_checkpoint)
ner_model            = AutoModelForTokenClassification.from_pretrained(ner_model_checkpoint)

# NER pipeline called by org_finder.
entity_recognizer = pipeline("ner", model=ner_model, tokenizer=ner_tokenizer)
# Sample sentence for the commented-out org_finder call below.
# NOTE: this rebinds `example`, which the summarization cell also assigns.
example = "I work at Samsung and am planning on moving to Sport-Ball Inc, Apple, or France. I went to Canada for lunch "

def org_finder(corpus, num_results = 5):
    """Return the longest distinct organisation names found in *corpus*.

    Every string in *corpus* is passed to ``entity_recognizer``. Tokens
    tagged ``B-ORG`` / ``I-ORG`` are merged into names: a ``B-ORG`` token
    starts a new name, a directly following ``I-ORG`` token is appended with
    a space, and WordPiece continuations (``##xyz``) are glued on without one.

    Args:
        corpus: Iterable of text strings to scan.
        num_results: Maximum number of names to return.

    Returns:
        Up to ``num_results`` names, longest first (ties in alphabetical
        order), keeping only names made of alphanumerics, spaces and periods.
    """
    orgs = []
    for text in corpus:
        # Reset per string so a name never continues across two inputs.
        in_org = False
        prev_index = None
        for token in entity_recognizer(text):
            label, word = token['entity'], token['word']
            if label not in ('B-ORG', 'I-ORG'):
                in_org = False
                continue

            # NOTE(review): the pipeline presumably omits untagged tokens, so
            # two ORG tokens can be consecutive in the output without being
            # neighbours in the text. When token positions are reported, use
            # them to require real adjacency -- confirm against the pipeline docs.
            index = token.get('index')
            adjacent = in_org and (
                index is None or prev_index is None or index == prev_index + 1)

            if adjacent and word.startswith('##'):
                orgs[-1] += word[2:]        # sub-word piece of the same word
            elif adjacent and label == 'I-ORG':
                orgs[-1] += " " + word      # next word of the same name
            else:
                orgs.append(word)           # start of a new name
            in_org = True
            prev_index = index

    names = [o for o in set(orgs) if o.replace(" ", "").replace(".", "").isalnum()]
    # Secondary alphabetical key makes the result independent of set ordering.
    return sorted(names, key=lambda o: (-len(o), o))[:num_results]

# org_finder([example])

In [None]:
# org_finder(get_search_results("tim hortons", "suppliers list")[0])

# APP

In [None]:
from ipywidgets import HTML, Label, Button, Tab, VBox, HBox, Text, IntProgress
from IPython.display import Image
import logging
# Only let errors through on the root logger.
logging.getLogger().setLevel(logging.ERROR)


# Banner image loaded from the cloned HackTheMist repository.
display(Image(filename='HackTheMist/logo.png', width=900, height=450))

# Search controls: company name input plus Search / Clear buttons.
search_bar = Text()
search_button = Button(description="Search", button_style='info', tooltip='Search',
                       icon='search')
reset_button = Button(description="Clear", button_style='info', tooltip='Reset',
                      icon='refresh')

# Root container. Its first child is the search row; make_tabs appends the
# progress bar / result tabs after it and reset() trims back to that row.
window = VBox([HBox([Label(value="Company: "), search_bar, 
                     search_button, reset_button]),
              ])

def reset(clear_search=False):
    """Collapse ``window`` back to its first child (the search row).

    When *clear_search* is true the search box is emptied first.
    """
    if clear_search:
        search_bar.value = ""
    search_row_only = window.children[:1]
    window.children = search_row_only

def make_tabs():
    """Search for the company in the search bar and display the results.

    Shows a progress bar while the report summary is produced, then replaces
    it with two tabs: "Report" (summary bullets plus a link to the source
    PDF) and "Suppliers and Sponsors" (organisation names found in a second
    search). The second tab shows a placeholder until its search finishes.
    Does nothing beyond clearing old results when the search bar is empty.
    """
    import html  # local import: escapes scraped text before it is rendered

    reset()
    company = search_bar.value.strip()
    if not company:
        return  # nothing to search for

    progress = IntProgress()
    window.children = window.children + (progress, )

    tabs = Tab([HTML(), HTML(value='<i>Suppliers list updating</i>')])
    tabs.set_title(0, "Report")
    tabs.set_title(1, "Suppliers and Sponsors")

    summary, url = get_summary(company, progress)
    # The URL comes from scraped search results: escape it before embedding.
    # Leave the link out entirely when no source PDF was found.
    safe_url = html.escape(url)
    source_link = f'<a href="{safe_url}">Source 0: {safe_url}</a>' if url else ''
    sum_html = HTML(value=f'<ul>{"".join(summary)}</ul>{source_link}')
    tabs.children = (sum_html, tabs.children[-1])

    # Swap the progress bar (last child) for the tabs.
    window.children = window.children[:-1] + (tabs, )

    suppliers = org_finder(get_search_results(company, "suppliers list")[0])
    suppliers_list = [f'<li>{html.escape(org)}</li>' for org in suppliers]
    sup_html = HTML(value=f'<ul>{" ".join(suppliers_list)}</ul>')
    tabs.children = (sum_html, sup_html)

# Wire the buttons: Search rebuilds the result tabs, Clear also empties the search box.
search_button.on_click(lambda _ : make_tabs())
reset_button.on_click(lambda _ : reset(clear_search=True))

# Start from just the search row, then render the app.
reset()

display(window)