<a href="https://colab.research.google.com/github/VictorPelaez/genai_gazzete/blob/main/notebook_example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Notebook to experiment with the project and create an example newsletter

Tutorial instructions:
- Clone the [GitHub](https://github.com/VictorPelaez/genai_gazzete) repository
- Install all Python dependencies
- **Get an API key** from [newsapi.org](https://newsapi.org/docs/client-libraries/python) and set it in the `config.ini` file
- Run the summarization model to get a summary of every article
- Run the document generation (with images) to get a `.docx` file as the newsletter

In [None]:
# Fetch the project sources: later cells import the `genai_gazzete` package and read assets
# (banner image, disclaimer text) from this checkout.
!git clone https://github.com/VictorPelaez/genai_gazzete.git

Cloning into 'genai_gazzete'...
remote: Enumerating objects: 77, done.[K
remote: Counting objects: 100% (77/77), done.[K
remote: Compressing objects: 100% (60/60), done.[K
remote: Total 77 (delta 40), reused 43 (delta 15), pack-reused 0[K
Receiving objects: 100% (77/77), 262.30 KiB | 3.45 MiB/s, done.
Resolving deltas: 100% (40/40), done.


In [None]:
!pip install --upgrade newsapi-python transformers python-docx diffusers scipy

Collecting newsapi-python
  Downloading newsapi_python-0.2.7-py2.py3-none-any.whl (7.9 kB)
Collecting transformers
  Downloading transformers-4.32.1-py3-none-any.whl (7.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.5/7.5 MB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting python-docx
  Downloading python-docx-0.8.11.tar.gz (5.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m33.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting diffusers
  Downloading diffusers-0.20.1-py3-none-any.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m27.6 MB/s[0m eta [36m0:00:00[0m
Collecting scipy
  Downloading scipy-1.11.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (36.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m36.3/36.3 MB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingf

In [None]:
from genai_gazzete import utils
from datetime import date, timedelta

# -----------------------------------------------------
# Get news starting from `fdate` (today minus 30 days)
# -----------------------------------------------------

lookback = timedelta(days=30)  # size of the window handed to utils.get_news
today = date.today()
fdate = today - lookback

# `today` and `fdate` are reused later for the newsletter heading and file name.
all_articles = utils.get_news(fdate, n_articles=20)
article_count = len(all_articles["articles"])
print("#articles: ", article_count)

#articles:  20


## Summarization Model

In [None]:
import pandas as pd
from bs4 import BeautifulSoup
from transformers import pipeline
import requests
import re
import time
import warnings
warnings.filterwarnings('ignore')

# -----------------------------------------------------
# Run Summarization Model for all the articles
# other models: "sshleifer/bart-large-cnn" "sshleifer/distilbart-xsum-12-1" "google/flan-t5-base"
# -----------------------------------------------------
import torch  # local import: only used to detect whether a GPU is available

# model_name = "t5-base"
# model_name = "sshleifer/distilbart-cnn-12-6"
model_name = "facebook/bart-large-cnn"
# 0 selects the first GPU (T4 in Colab); -1 makes the pipeline run on CPU instead of crashing.
device_id = 0 if torch.cuda.is_available() else -1
summarizer = pipeline('summarization', model=model_name, device=device_id)

dash_line = '-' * 99  # horizontal rule printed around every example

## Data to feed
N = int(round(len(all_articles["articles"])*0.1, 0)) # Number of summaries to show, verbose
L = 3000 # Context size: stop collecting article text once this many characters are gathered
REQUEST_TIMEOUT = 15 # Seconds to wait for a web page before giving up on it
COLUMNS = ["source", "url", "title", "description", "len_text", "summary"]


def fetch_article_text(url, max_chars=L, timeout=REQUEST_TIMEOUT):
  """Download `url` and return the text of its class-less <p>/<i> tags.

  Text is accumulated tag by tag until at least `max_chars` characters have
  been collected, so the result may exceed `max_chars` by one tag's text.

  Returns an empty string when the page cannot be downloaded, so that a single
  unreachable site does not abort the whole run.
  """
  try:
    page = requests.get(url, timeout=timeout)
  except requests.RequestException as exc:
    print('Could not download', url, '-', exc)
    return ""

  soup = BeautifulSoup(page.content, 'html.parser')
  text = ""
  for part in soup.find_all(["p", "i"]): # Slashdot is <i>
    if len(text) >= max_chars:
      break
    # only tags without a CSS class are kept
    if part.get("class") is None:
      text = text + "\n" + part.get_text()

  # Replace Windows line breaks, both real CR/LF characters and the literal
  # backslash sequence "\r\n" (the only form the previous pattern matched).
  return re.sub(r'\\r\\n|\r\n', ' ', text)


records = []  # one dict per article; turned into a DataFrame once, after the loop

for i, a in enumerate(all_articles["articles"]):

  start = time.time()
  print(dash_line)
  print('Example ', i + 1)
  print(dash_line)

  # read url HTML web
  ARTICLE = fetch_article_text(a["url"])

  if len(ARTICLE)>0:
    # Inference summarization model. truncation=True clips inputs that are longer
    # than the model accepts instead of failing on them.
    summarized_article = summarizer(ARTICLE, min_length=100, max_length=200, truncation=True)[0]["summary_text"]
    summarized_article = utils.format_summary(summarized_article)

  else:
    summarized_article = "empty"

  records.append({'source': a['source']['name'],
                  'url': a["url"],
                  'title': a["title"],
                  'description': a["description"],
                  'len_text': len(ARTICLE),
                  'summary': summarized_article})

  # print samples
  if (i<N) and (len(ARTICLE)>0) :
    print(a['source']['name'], a["url"])
    print('Description:  ', a['description'])
    print('LLM Summary')
    print(summarized_article)

  end = time.time()
  print(end - start)
  print(dash_line)
  print()

# DataFrame.append was removed in pandas 2.0; building the frame once is also linear, not quadratic.
df_all = pd.DataFrame(records, columns=COLUMNS)
df = utils.remove_summaries(df_all, top=10)
print('Final number of articles: ' + str(len(df.index)))

---------------------------------------------------------------------------------------------------
Example  1
---------------------------------------------------------------------------------------------------
VentureBeat https://venturebeat.com/ai/how-customer-engagement-will-evolve-along-with-generative-ai/
Description:   Generative AI is reshaping customer and prospect engagement, elevating experiences at scale and driving growth.
LLM Summary
Generative ai is reshaping customer and prospect engagement, elevating experiences at scale and driving growth. Dive into the transformative potential of genai, from groundbreaking use cases across industries to strategies you can implement today, in this vb spotlight event. Register to watch free now. The event is hosted by sendbird and takes place in new york city on november 14. For more information, visit sendbird's website. The conference is also being held in san francisco and los angeles on november 15 and 16. 
2.866035223007202
-------

## Document Generation with images

In [None]:
import os
from google.colab import drive

# Mount Google Drive; the document-generation cell saves its images and the .docx under this path
mount_point = '/content/drive'
drive.mount(mount_point)

Mounted at /content/drive


In [None]:
# import stable diffusion model

import torch
from diffusers import StableDiffusionPipeline

model_id = "CompVis/stable-diffusion-v1-4"

# Use the GPU when there is one, otherwise fall back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Half precision is only requested on GPU; on CPU the weights are loaded as float32,
# since float16 inference is generally not supported (or very slow) there.
dtype = torch.float16 if device == "cuda" else torch.float32
pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=dtype).to(device)

Downloading (…)ain/model_index.json:   0%|          | 0.00/541 [00:00<?, ?B/s]

Fetching 16 files:   0%|          | 0/16 [00:00<?, ?it/s]

Downloading (…)rocessor_config.json:   0%|          | 0.00/342 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/472 [00:00<?, ?B/s]

Downloading (…)nfig-checkpoint.json:   0%|          | 0.00/209 [00:00<?, ?B/s]

Downloading (…)_checker/config.json:   0%|          | 0.00/4.56k [00:00<?, ?B/s]

Downloading (…)cheduler_config.json:   0%|          | 0.00/313 [00:00<?, ?B/s]

Downloading (…)tokenizer/merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/806 [00:00<?, ?B/s]

Downloading (…)38b/unet/config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

Downloading (…)tokenizer/vocab.json:   0%|          | 0.00/1.06M [00:00<?, ?B/s]

Downloading (…)_encoder/config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

Downloading (…)638b/vae/config.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/492M [00:00<?, ?B/s]

Downloading (…)ch_model.safetensors:   0%|          | 0.00/3.44G [00:00<?, ?B/s]

Downloading (…)ch_model.safetensors:   0%|          | 0.00/335M [00:00<?, ?B/s]

Cannot initialize model with low cpu memory usage because `accelerate` was not found in the environment. Defaulting to `low_cpu_mem_usage=False`. It is strongly recommended to install `accelerate` for faster and less memory-intense model loading. You can do so with: 
```
pip install accelerate
```
.


Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.
`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["bos_token_id"]` will be overriden.
`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["eos_token_id"]` will be overriden.


In [None]:
from genai_gazzete import functions_doc
from docx import Document
from docx.shared import Inches, Pt, Cm
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.enum.dml import MSO_THEME_COLOR

from docx.oxml import OxmlElement
from docx.oxml.ns import qn
from docx.text.paragraph import Paragraph
from docx.enum.section import WD_SECTION

# -----------------------------------------------------
# document creation
# -----------------------------------------------------
import os  # imported here so this cell does not rely on another cell's imports

margin_value = 3  # left/right page margin, in centimetres
OUTPUT_DIR = '/content/drive/MyDrive/LLMs'        # folder that receives the final .docx
IMAGES_DIR = os.path.join(OUTPUT_DIR, 'images')   # folder that receives the generated images
# Create both folders up front: saving an image or the document fails if they are missing.
os.makedirs(IMAGES_DIR, exist_ok=True)

# Style suffix appended to every article title to build the image prompt.
prompt_style = "sci-fi painting style by Simon Stalenhag:0.8 by Ian McQue:0.2, pen and ink, primary pastel colors"
num_images_per_prompt = 1


def style_run(run, font_name, size_pt, italic, bold=None):
  """Apply the newsletter's text styling to a python-docx run.

  Sets font name, size (in points), italic flag and the DARK_2 theme colour.
  `bold` is only written when a value is given, so the style's own setting is
  kept otherwise.
  """
  font = run.font
  font.name = font_name
  font.size = Pt(size_pt)
  font.italic = italic
  if bold is not None:
    font.bold = bold
  font.color.theme_color = MSO_THEME_COLOR.DARK_2


def align_last_paragraph(alignment):
  """Set the alignment of the paragraph most recently added to `document`."""
  document.paragraphs[-1].alignment = alignment


def add_continuous_section(set_margins=False):
  """Append a continuous section break; optionally set the left/right margins on it."""
  section = document.add_section(WD_SECTION.CONTINUOUS)
  if set_margins:
    section.left_margin = Cm(margin_value)
    section.right_margin = Cm(margin_value)
  return section


document = Document()
add_continuous_section(set_margins=True)

# --- header: banner, title, date range ---
document.add_picture('genai_gazzete/banner_top.png', width=Inches(6), height=Inches(1))
align_last_paragraph(WD_ALIGN_PARAGRAPH.CENTER)

document.add_heading("Most Relevant Articles about Generative AI and LLMs in a Nutshell!", level=0)
align_last_paragraph(WD_ALIGN_PARAGRAPH.CENTER)

document.add_heading("Welcome to this AI-powered newsletter (" + fdate.strftime('%d/%m') + " - " + today.strftime('%d/%m/%Y') + ")\n", level=1)
align_last_paragraph(WD_ALIGN_PARAGRAPH.CENTER)

# --- contact line followed by a LinkedIn hyperlink ---
text_welcome = """Have questions or are looking to collaborate? Write me at victor.m.pelaez@outlook.com or connect on """
p = document.add_paragraph(style='Normal')
runner = p.add_run(text_welcome)
functions_doc.add_hyperlink(p, 'LinkedIn', "https://linkedin.com/in/victormpelaez")
style_run(runner, 'Calibri', 9, italic=True)
align_last_paragraph(WD_ALIGN_PARAGRAPH.RIGHT)

# --- short disclaimer pointer (full text goes on the last page) ---
p = document.add_paragraph(style='Normal')
runner = p.add_run("(*) Important Disclaimer: All content is generated by Artificial Intelligence. Details in the last page")
style_run(runner, 'Calibri', 9, italic=True)
align_last_paragraph(WD_ALIGN_PARAGRAPH.RIGHT)

# spacer paragraph before the first article
p = document.add_paragraph()
p.paragraph_format.line_spacing = Inches(0.3)

# --- one block per article: title, link, generated image, summary ---
# NOTE: a multi-column layout (w:cols on the section) was tried here and is currently disabled.
for r in df.index:

  add_continuous_section(set_margins=True)

  title = df.loc[r, "title"]

  # 1. add title
  document.add_heading(f'#{r + 1}.   [{df.loc[r, "source"]}] {title}', level=1)
  # 2. add link
  p = document.add_paragraph()
  functions_doc.add_hyperlink(p, 'Original article', df.loc[r, "url"])

  # 3. create image and add it
  # The separator keeps the last word of the title from being glued to the style text.
  prompt = f'{title}, {prompt_style}'
  images = pipe(prompt,
                width=1024,
                height=512,
                num_images_per_prompt=num_images_per_prompt).images

  image_paths = []
  for idx, im in enumerate(images):
    image_path = os.path.join(IMAGES_DIR, f"image_{r}_{idx}.png")
    im.save(image_path)
    image_paths.append(image_path)
  # only the first generated image goes into the document
  document.add_picture(image_paths[0], width=Inches(4.1), height=Inches(1.4))
  align_last_paragraph(WD_ALIGN_PARAGRAPH.CENTER)

  # 4. add summary
  add_continuous_section()

  p = document.add_paragraph(style='Intense Quote')
  runner = p.add_run(df.loc[r, "summary"])
  # the summary is set explicitly to non-italic, non-bold text
  style_run(runner, 'Calibri', 10, italic=False, bold=False)
  p.paragraph_format.right_indent = Inches(0.25)
  align_last_paragraph(WD_ALIGN_PARAGRAPH.JUSTIFY_LOW)


# 5. add disclaimer
with open('genai_gazzete/disclaimer.txt', encoding='utf-8') as f:
  disclaimer = f.read()

# spacer paragraph before the disclaimer
p = document.add_paragraph()
p.paragraph_format.line_spacing = Inches(0.5)

add_continuous_section()

p = document.add_paragraph(style='Normal')
runner = p.add_run(disclaimer)
style_run(runner, 'Cambria', 9, italic=True)
align_last_paragraph(WD_ALIGN_PARAGRAPH.RIGHT)

file_name = "summarized_articles" + "_" + today.strftime('%m_%d_%Y') + ".docx"
document.save(os.path.join(OUTPUT_DIR, file_name))

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]