# Run example in notebook

In [None]:
!pip install --upgrade newsapi-python transformers python-docx diffusers scipy

In [None]:
import pandas as pd
import warnings; warnings.simplefilter('ignore')
import time
from utils import readConfig, clear_article

from newsapi import NewsApiClient
# --------------------------

# Oldest publication date to request, passed to NewsAPI as `from_param`.
fdate = '2023-08-03'

## NEWS_API
# The API key is read from the 'local_session' section of the project config.
config = readConfig()
session_config = config['local_session']
api_key = session_config['api_key']
newsapi = NewsApiClient(api_key=api_key)

# Search parameters gathered in one place so they are easy to tweak.
search_params = dict(
    q='generative ai llms',
    language='en',
    from_param=fdate,
    sort_by='relevancy',
)
all_articles = newsapi.get_everything(**search_params)

print("#articles: ", len(all_articles["articles"]))

## SUMMARIZATION MODEL

In [None]:
from bs4 import BeautifulSoup
import requests
import re
from transformers import pipeline

# --------------------------

model_name = "sshleifer/distilbart-cnn-12-6" #"sshleifer/bart-large-cnn" #"sshleifer/distilbart-xsum-12-1" "google/flan-t5-base"
# Build the pipeline from `model_name` so that switching checkpoint is a one-line change.
summarizer = pipeline('summarization', model=model_name)#, device=0) # T4 GPU in Colab

dash_line = '-' * 99  # 99 dashes, the same separator the previous join-based expression produced

## Data to feed
N = int(round(len(all_articles["articles"])*0.1, 0)) #number to show, verbose
L = 3000 #context size
REQUEST_TIMEOUT = 30  # seconds to wait for a page before giving up on it
DF_COLUMNS = ["source", "url", "title", "description", "len_text", "summary"]
# Whole-word, case-insensitive match: catches "AI"/"LLMs" without matching "said" or "main".
RELEVANT_TERMS = r'\b(?:llms?|ai)\b'


def _fetch_article_text(url, max_chars):
  """Download `url` and return its cleaned paragraph text.

  Text is collected from <p> and <i> tags that carry no `class` attribute,
  and collection stops once `max_chars` characters have been gathered (the
  last paragraph added may overshoot that limit). The result is passed
  through `clear_article`. A page that cannot be downloaded is treated as
  having no text, so a single bad URL does not abort the whole run.
  """
  try:
    page = requests.get(url, timeout=REQUEST_TIMEOUT)
  except requests.RequestException as err:
    print('Could not download', url, '-', err)
    return ""

  # read url HTML web
  soup = BeautifulSoup(page.content, 'html.parser')
  text = ""
  for part in soup.find_all(["p", "i"]): # Slashdot is <i>
    if len(text) >= max_chars:
      break
    if part.get("class") is None:
      text = text + " " + part.get_text()

  return clear_article(text)


# Rows are collected in a plain list and turned into a DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0 and row-by-row growth is quadratic.
records = []

for i, a in enumerate(all_articles["articles"]):

  start = time.time()
  print(dash_line)
  print('Example ', i + 1)
  print(dash_line)

  ARTICLE = _fetch_article_text(a["url"], L)

  if len(ARTICLE)>0:
    # truncation=True keeps over-long inputs within the model's maximum length
    summarized_article = summarizer(ARTICLE, truncation=True)[0]["summary_text"]
    summarized_article = re.sub(r'\s([?.!"](?:\s|$))', r'\1', summarized_article) #remove whitespaces
  else:
    summarized_article = "empty"

  records.append({'source': a['source']['name'],
                  'url': a["url"],
                  'title': a["title"],
                  'description': a["description"],
                  'len_text': len(ARTICLE),
                  'summary': summarized_article})

  # print samples
  if (i<N) and (len(ARTICLE)>0) :
    print(a['source']['name'], a["url"])
    print('Description:  ', a['description'])
    print('LLM Summary')
    print(summarized_article)

  end = time.time()
  print(end - start)
  print(dash_line)
  print()

df = pd.DataFrame(records, columns=DF_COLUMNS)
# Keep the first copy of each (source, title) pair, drop articles without text,
# and keep only the summaries that actually mention the topic.
df = df.drop_duplicates(subset=['source', 'title'], keep='first')
df = df[df["summary"] != "empty"]
df = df[df["summary"].str.contains(RELEVANT_TERMS, case=False, regex=True)]
print(df.count())

# Image generation

In [None]:
import torch
from diffusers import StableDiffusionPipeline

model_id = "CompVis/stable-diffusion-v1-4"
# Prefer the GPU, but fall back to the CPU so the cell still runs on a runtime without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Half precision is only used on the GPU; the CPU path loads the weights in full precision.
torch_dtype = torch.float16 if device == "cuda" else torch.float32

pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch_dtype).to(device)

# Example of a one-off generation (kept for reference):
#prompt = "Google Assistant Is Getting a Major Upgrade, pastel colors, Victorian style"
#image = pipe(prompt).images[0]
#image.save("image.png")

## Document Generation

In [None]:
import os
from google.colab import drive

# Attach Google Drive to the Colab filesystem; the paths used further down live under this folder.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

In [None]:
from functions_doc import get_or_create_hyperlink_style, add_hyperlink
from docx import Document
from docx.shared import Inches
from docx.enum.text import WD_ALIGN_PARAGRAPH
from datetime import date

# -----------------------------------------------------
# document creation
# -----------------------------------------------------

# Files on the mounted Google Drive: header banner (input) and final report (output).
banner_path = '/content/drive/MyDrive/LLMs coursera/20230804_130845_0000.png'
output_path = '/content/drive/MyDrive/LLMs coursera/summarized_articles.docx'
# Style suffix appended to every article title to build the image prompt.
prompt_style = " sci-fi style, pen and ink, pastel colors"

today = date.today()
document = Document()
# document.add_heading("Check out the most popular articles about Generative AI and LLMs in a nutshell!", 0)
document.add_picture(banner_path, width=Inches(6), height=Inches(1.2))
# The picture sits in the paragraph that was just appended to the document.
last_paragraph = document.paragraphs[-1]
last_paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER

# Date range line: "<first date queried> - <today>", italic and right-aligned.
p = document.add_paragraph()
runner = p.add_run(fdate + " - " + str(today))  # keep the run itself, not the value assigned to .italic
runner.italic = True
p.alignment = WD_ALIGN_PARAGRAPH.RIGHT

for r in df.index:
  # A missing source/title (None/NaN) is rendered as an empty string instead of
  # raising a TypeError during string concatenation.
  source = df.loc[r, "source"]
  source = source if isinstance(source, str) else ""
  title = df.loc[r, "title"]
  title = title if isinstance(title, str) else ""

  # 1. add title
  document.add_heading('[' + source + '] ' + title, level=1)
  # 2. add link
  p = document.add_paragraph()
  add_hyperlink(p, 'Original article', df.loc[r, "url"])

  # 3. create image and add it
  image = pipe(title + prompt_style).images[0]
  image.save("image.png")
  document.add_picture('image.png', width=Inches(2), height=Inches(2))
  last_paragraph = document.paragraphs[-1]
  last_paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER

  # 4. add summary
  document.add_paragraph(df.loc[r, "summary"], style='Intense Quote')
  # empty paragraph used as vertical spacing between articles
  p = document.add_paragraph()
  p.paragraph_format.line_spacing = Inches(0.3)

document.save(output_path)