<a href="https://colab.research.google.com/github/VictorPelaez/genai_gazzete/blob/main/news.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Notebook to experiment with the project and create an example newsletter

Tutorial instructions:
- Clone the [GitHub](https://github.com/VictorPelaez/genai_gazzete) repository
- Install all the Python dependencies
- **Get an API key** from [newsapi.org](https://newsapi.org/docs/client-libraries/python) and set it in the `config.ini` file
- Run the summarization model to get all the article summaries
- Run the document generation with images to get a docx file as a newsletter

In [None]:
# Clone the project repository so the `genai_gazzete` package imported by the cells below is available.
!git clone https://github.com/VictorPelaez/genai_gazzete.git

In [None]:
# Install (or upgrade to the latest release of) the third-party packages used by the cells below.
# NOTE(review): versions are not pinned, so results may change between runs — consider pinning.
!pip install --upgrade newsapi-python transformers python-docx diffusers scipy

In [None]:
from genai_gazzete import utils
from newsapi import NewsApiClient

# -----------------------------------------------------
# Fetch the news articles published since `fdate`
# -----------------------------------------------------

fdate = '2023-07-31'  # start date handed to the API as `from_param`

# The API key is read from the DEFAULT section of whatever utils.readConfig() returns
# (presumably the parsed config.ini mentioned in the tutorial notes — TODO confirm).
config = utils.readConfig()
d_config = dict(config.items('DEFAULT'))
newsapi = NewsApiClient(api_key=d_config['api_key'])

# All search arguments gathered in one place so they are easy to tweak.
search_params = {
    'q': 'generative ai llms',
    'language': 'en',
    'from_param': fdate,
    'sort_by': 'relevancy',
    'page_size': 20,
}
all_articles = newsapi.get_everything(**search_params)

n_articles = len(all_articles["articles"])
print("#articles: ", n_articles)

## Summarization Model

In [None]:
import pandas as pd
import time
from bs4 import BeautifulSoup
import requests
import re
from transformers import pipeline

import warnings
warnings.filterwarnings('ignore')

# -----------------------------------------------------
# Run Summarization model for all the articles
# -----------------------------------------------------
# For every article returned by the news API: download the page, keep the text of
# the class-less <p>/<i> tags (up to roughly L characters), summarize it and store
# one record per article. The records are turned into the `df` DataFrame at the end.

model_name = "sshleifer/distilbart-cnn-12-6" # other models: "sshleifer/bart-large-cnn" "sshleifer/distilbart-xsum-12-1" "google/flan-t5-base"
summarizer = pipeline('summarization', model=model_name, device=0) # T4 GPU in Colab

dash_line = '-' * 99 # separator line printed around each example

## Data to feed
N = int(round(len(all_articles["articles"])*0.1, 0)) # Number of summaries to show, verbose
L = 3000 # Context size
REQUEST_TIMEOUT = 30 # seconds to wait for a web page before giving up on it

# Rows are collected in a plain list and converted once after the loop:
# DataFrame.append was removed in pandas 2.0 and was quadratic anyway.
records = []

for i, a in enumerate(all_articles["articles"]):

  start = time.time()
  print(dash_line)
  print('Example ', i + 1)
  print(dash_line)

  # read url HTML web
  ARTICLE = ""
  try:
    page = requests.get(a["url"], timeout=REQUEST_TIMEOUT)
    soup = BeautifulSoup(page.content, 'html.parser')
    for part in soup.find_all(["p","i"]): # Slashdot is <i>
      if len(ARTICLE) >= L:
        break # enough text collected, nothing else would be added
      if part.get("class") is None:
        ARTICLE = ARTICLE + " " + part.get_text()
  except requests.RequestException as err:
    # One unreachable page must not abort the whole run; the article is
    # recorded with an "empty" summary like any page without usable text.
    print('Could not download', a["url"], '-', err)

  ARTICLE = utils.clear_article(ARTICLE)

  if len(ARTICLE)>0:
    summarized_article = summarizer(ARTICLE)[0]["summary_text"] # Inference summarization model
    summarized_article = re.sub(r'\s([?.!"](?:\s|$))', r'\1', summarized_article) # Remove whitespaces
    summarized_article = summarized_article.lstrip().capitalize()

  else:
    summarized_article = "empty"

  records.append({'source': a['source']['name'],
                  'url': a["url"],
                  'title': a["title"],
                  'description': a["description"],
                  'len_text': len(ARTICLE),
                  'summary': summarized_article})

  # print samples
  if (i<N) and (len(ARTICLE)>0) :
    print(a['source']['name'], a["url"])
    print('Description:  ', a['description'])
    print('LLM Summary')
    print(summarized_article)

  end = time.time()
  print(end - start)
  print(dash_line)
  print()

# Explicit column list keeps the expected schema even when no article was returned.
df = pd.DataFrame(records, columns = ["source", "url", "title", "description", "len_text", "summary"])

df = utils.remove_summaries(df)
print('Final number of articles: ' + str(len(df. index)))

In [None]:
# Preview the first three rows of the summaries table.
df.head(3)

In [None]:
# Keep only the first ten rows (top 10 articles) for the newsletter.
df = df.iloc[:10]

## Document Generation with images

In [None]:
import os
from google.colab import drive

# Mount Google Drive under /content/drive; the generated images and the final
# .docx file are written below /content/drive/MyDrive/LLMs by the last cell.
drive.mount('/content/drive')

In [None]:
# import stable diffusion model

import torch
from diffusers import StableDiffusionPipeline

model_id = "CompVis/stable-diffusion-v1-4"

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Half precision is meant for the GPU; on the CPU fallback load the weights
# in float32 so the pipeline can actually run there.
torch_dtype = torch.float16 if device == "cuda" else torch.float32
pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch_dtype).to(device)

In [None]:
from genai_gazzete import functions_doc
from docx import Document
from docx.shared import Inches
from docx.enum.text import WD_ALIGN_PARAGRAPH
from datetime import date

# -----------------------------------------------------
# document creation
# -----------------------------------------------------
# Builds the newsletter: banner, title, date range and, for every row of `df`,
# a heading, a link to the original article, a generated image and the summary.
# The result is saved as a .docx file in OUTPUT_DIR.

OUTPUT_DIR = '/content/drive/MyDrive/LLMs/'
IMAGES_DIR = OUTPUT_DIR + 'images/'
# Create the target folders up front so saving images/document cannot fail on a missing directory.
os.makedirs(IMAGES_DIR, exist_ok=True)

today = date.today()
document = Document()

document.add_picture('genai_gazzete/banner_top.png', width=Inches(6), height=Inches(1))
last_paragraph = document.paragraphs[-1]
last_paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER

# Use the real number of articles so the title stays correct when fewer than 10 are left.
document.add_heading(str(len(df)) + " Most Relevant Articles about Generative AI and LLMs in a Nutshell!", level=0)
last_paragraph = document.paragraphs[-1]
last_paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER

p = document.add_paragraph()
# Keep the run object itself; chaining `.italic = True` onto the assignment bound the name to True.
runner = p.add_run("dates: " + fdate + " - " + str(today))
runner.italic = True
p.alignment = WD_ALIGN_PARAGRAPH.RIGHT

# Loop-invariant image settings
# prompt_style = "sci-fi painting by Ian McQue:1 sci-fi painting by Simon Stalenhag:0.5, pen and ink, pastel colors"
prompt_style = "sci-fi painting style by Simon Stalenhag:0.8 by Ian McQue:0.2, pen and ink, primary pastel colors"
num_images_per_prompt = 1

# `rank` numbers the articles 1..n as displayed; `r` is the DataFrame index label,
# which may have gaps if rows were filtered out earlier without resetting the index.
for rank, (r, row) in enumerate(df.iterrows(), start=1):
  # 1. add title
  document.add_heading('#' + str(rank) + '.   [' + row["source"] + '] ' + row["title"], level=1)
  # 2. add link
  p = document.add_paragraph()
  functions_doc.add_hyperlink(p, 'Original article', row["url"])

  # 3. create image and add it
  # Separate the title from the style keywords; they used to be glued together.
  prompt = row["title"] + ", " + prompt_style
  images = pipe(prompt,
                width=1024,
                height=512,
                num_images_per_prompt=num_images_per_prompt).images

  image_paths = []
  for idx, im in enumerate(images):
    image_path = IMAGES_DIR + "image_" + str(r) + "_" + str(idx) + ".png"
    im.save(image_path)
    image_paths.append(image_path)
  # Only the first generated image goes into the document.
  document.add_picture(image_paths[0], width=Inches(4.1), height=Inches(1.4))
  last_paragraph = document.paragraphs[-1]
  last_paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER

  # 4. add summary
  summary_paragraph = document.add_paragraph(row["summary"], style='Intense Quote')
  summary_paragraph.alignment = WD_ALIGN_PARAGRAPH.JUSTIFY
  # Empty paragraph used as vertical spacing between articles.
  p = document.add_paragraph()
  p.paragraph_format.line_spacing = Inches(0.2)

file_name = "summarized_articles" + "_" + today.strftime('%m_%d_%Y') + ".docx"
document.save(OUTPUT_DIR + file_name)