<a href="https://colab.research.google.com/github/VictorPelaez/genai_gazzete/blob/main/news.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Example notebook

In [1]:
!pip install --upgrade newsapi-python transformers python-docx diffusers scipy

Collecting newsapi-python
  Downloading newsapi_python-0.2.7-py2.py3-none-any.whl (7.9 kB)
Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m31.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting python-docx
  Downloading python-docx-0.8.11.tar.gz (5.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m69.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting diffusers
  Downloading diffusers-0.19.3-py3-none-any.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m57.4 MB/s[0m eta [36m0:00:00[0m
Collecting scipy
  Downloading scipy-1.11.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (36.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m36.3/36.3 MB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingf

In [2]:
# Fetch the project repository so that the `genai_gazzete` package can be imported by the cells below.
!git clone https://github.com/VictorPelaez/genai_gazzete.git

Cloning into 'genai_gazzete'...
remote: Enumerating objects: 25, done.[K
remote: Counting objects: 100% (25/25), done.[K
remote: Compressing objects: 100% (20/20), done.[K
remote: Total 25 (delta 10), reused 15 (delta 4), pack-reused 0[K
Receiving objects: 100% (25/25), 8.34 KiB | 8.34 MiB/s, done.
Resolving deltas: 100% (10/10), done.


In [3]:
from genai_gazzete import utils
from newsapi import NewsApiClient

# -----------------------------------------------------
# Fetch the articles published since `fdate`
# -----------------------------------------------------

fdate = '2023-08-04'

# The NewsAPI key is taken from the 'DEFAULT' section of whatever utils.readConfig() returns.
config = utils.readConfig()
d_config = dict(config.items('DEFAULT'))
newsapi = NewsApiClient(api_key=d_config['api_key'])

# Search parameters, gathered in one place so they are easy to tweak.
query_params = {
    'q': 'generative ai llms',
    'language': 'en',
    'from_param': fdate,
    'sort_by': 'relevancy',
}
all_articles = newsapi.get_everything(**query_params)

print("#articles: ", len(all_articles["articles"]))

#articles:  25


## SUMMARIZATION MODEL

In [5]:
import pandas as pd
import time
from bs4 import BeautifulSoup
import requests
import re
from transformers import pipeline

import warnings
warnings.filterwarnings('ignore')

# -----------------------------------------------------
# Run Summarization model for all the articles
# -----------------------------------------------------
# For every article returned by the news query: download the page, keep the text of the
# class-less <p>/<i> tags (up to roughly L characters), summarize it and collect one record.
# The records are turned into the `df` DataFrame once, after the loop.

model_name = "sshleifer/distilbart-cnn-12-6" # other models: "sshleifer/bart-large-cnn" "sshleifer/distilbart-xsum-12-1" "google/flan-t5-base"
summarizer = pipeline('summarization', model=model_name, device=0) # T4 GPU in Colab

dash_line = '-' * 99 # separator line for the verbose log

## Data to feed
N = int(round(len(all_articles["articles"])*0.1, 0)) # Number of summaries to show, verbose
L = 3000 # Context size: stop appending page text once this many characters were collected
REQUEST_TIMEOUT = 30 # Seconds to wait for a page before giving up on that article
COLUMNS = ["source", "url", "title", "description", "len_text", "summary"]
records = [] # one dict per article; DataFrame.append was removed in pandas 2.0

for i, a in enumerate(all_articles["articles"]):

  start = time.time()
  print(dash_line)
  print('Example ', i + 1)
  print(dash_line)

  # read url HTML web
  try:
    page = requests.get(a["url"], timeout=REQUEST_TIMEOUT)
    soup = BeautifulSoup(page.content, 'html.parser')
    result = soup.find_all(["p","i"]) # Slashdot is <i>
  except requests.RequestException as err:
    # A single unreachable page must not abort the whole run: treat it as an empty article.
    print('Download failed:', err)
    result = []

  ARTICLE = ""
  for part in result:
    # Only tags without a class attribute are kept. The length check happens before
    # appending, so the final text may end up longer than L.
    if part.get("class") is None and len(ARTICLE) < L:
      ARTICLE = ARTICLE + " " + part.get_text()

  ARTICLE = utils.clear_article(ARTICLE)

  if len(ARTICLE)>0:
    summarized_article = summarizer(ARTICLE)[0]["summary_text"] # Inference summarization model
    summarized_article = re.sub(r'\s([?.!"](?:\s|$))', r'\1', summarized_article) # Remove whitespaces
  else:
    # Placeholder summary for articles without usable text
    # (presumably filtered out by utils.remove_summaries below -- TODO confirm).
    summarized_article = "empty"

  records.append({'source': a['source']['name'],
                  'url': a["url"],
                  'title': a["title"],
                  'description': a["description"],
                  'len_text': len(ARTICLE),
                  'summary': summarized_article})

  # print samples (only the first N articles that produced some text)
  if (i<N) and (len(ARTICLE)>0) :
    print(a['source']['name'], a["url"])
    print('Description:  ', a['description'])
    print('LLM Summary')
    print(summarized_article)

  end = time.time()
  print(end - start) # seconds spent on this article
  print(dash_line)
  print()

# Build the frame in one go; `columns` keeps the expected schema even when no article was processed.
df = pd.DataFrame(records, columns=COLUMNS)
df = utils.remove_summaries(df)
print('Final number of articles: ' + str(len(df. index)))

---------------------------------------------------------------------------------------------------
Example  1
---------------------------------------------------------------------------------------------------
Forbes https://www.forbes.com/sites/moorinsights/2023/08/04/aws-turbocharges-foundation-models-with-smart-ai-agents/
Description:   Vice President of AI & Quantum Computing, Paul Smith-Goodson discusses AWS's recently announced AI capability that adds key functionality to its foundation models.
LLM Summary
 aws recently announced an important ai capability called agents that adds key functionality to its foundation models before discussing agents in detail i will provide another overview of the amazon bedrock foundation models fms that will use these new features. The models are optimized and pretrained for conversations and content creation and because they run on an aws managed infrastructure the models can be scaled on services like ec2 and lambda.
1.757826566696167
---------

Your max_length is set to 142, but your input_length is only 4. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=2)


1.5273330211639404
---------------------------------------------------------------------------------------------------

---------------------------------------------------------------------------------------------------
Example  8
---------------------------------------------------------------------------------------------------
0.4091219902038574
---------------------------------------------------------------------------------------------------

---------------------------------------------------------------------------------------------------
Example  9
---------------------------------------------------------------------------------------------------
1.127666711807251
---------------------------------------------------------------------------------------------------

---------------------------------------------------------------------------------------------------
Example  10
---------------------------------------------------------------------------------------------------


Your max_length is set to 142, but your input_length is only 4. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=2)


1.5096440315246582
---------------------------------------------------------------------------------------------------

---------------------------------------------------------------------------------------------------
Example  11
---------------------------------------------------------------------------------------------------
1.6941845417022705
---------------------------------------------------------------------------------------------------

---------------------------------------------------------------------------------------------------
Example  12
---------------------------------------------------------------------------------------------------
3.315364122390747
---------------------------------------------------------------------------------------------------

---------------------------------------------------------------------------------------------------
Example  13
---------------------------------------------------------------------------------------------------
1.343

In [6]:
# Preview the first three rows of the summaries table.
df.head(3)

Unnamed: 0,source,url,title,description,len_text,summary
0,Forbes,https://www.forbes.com/sites/moorinsights/2023...,AWS Turbocharges Foundation Models With Smart ...,"Vice President of AI & Quantum Computing, Paul...",2962,aws recently announced an important ai capabi...
1,Forbes,https://www.forbes.com/sites/forbestechcouncil...,Getting The Most Out Of AI: Businesses Need Sk...,The success of the prompt engineer will rely o...,3562,The last year has seen a meteoric rise in int...
2,Amazon.com,https://aws.amazon.com/about-aws/whats-new/202...,Announcing support for ml.p5 instances for Ama...,Amazon SageMaker training jobs now support ml....,1412,ml p5 instances powered by n Nvidia h100 chip...


## Image generation

In [7]:
import torch
from diffusers import StableDiffusionPipeline

model_id = "CompVis/stable-diffusion-v1-4"

# Use the GPU when one is available; otherwise fall back to the CPU instead of
# failing when the pipeline is moved to "cuda".
device = "cuda" if torch.cuda.is_available() else "cpu"
# float16 is kept for the GPU path (unchanged); the CPU path uses float32.
torch_dtype = torch.float16 if device == "cuda" else torch.float32

# Load the Stable Diffusion pipeline and move it to the selected device.
pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch_dtype).to(device)

# Optional smoke test: uncomment to generate a single image from a fixed prompt.
# prompt = "Google Assistant Is Getting a Major Upgrade, pastel colors, Victorian style"
# image = pipe(prompt).images[0]
# image.save("image.png")

Downloading (…)ain/model_index.json:   0%|          | 0.00/541 [00:00<?, ?B/s]

Fetching 16 files:   0%|          | 0/16 [00:00<?, ?it/s]

Downloading (…)rocessor_config.json:   0%|          | 0.00/342 [00:00<?, ?B/s]

Downloading (…)_encoder/config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

Downloading (…)cheduler_config.json:   0%|          | 0.00/313 [00:00<?, ?B/s]

Downloading (…)_checker/config.json:   0%|          | 0.00/4.56k [00:00<?, ?B/s]

Downloading (…)nfig-checkpoint.json:   0%|          | 0.00/209 [00:00<?, ?B/s]

Downloading (…)tokenizer/merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/492M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/472 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/806 [00:00<?, ?B/s]

Downloading (…)tokenizer/vocab.json:   0%|          | 0.00/1.06M [00:00<?, ?B/s]

Downloading (…)9ce/unet/config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

Downloading (…)ch_model.safetensors:   0%|          | 0.00/3.44G [00:00<?, ?B/s]

Downloading (…)69ce/vae/config.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

Downloading (…)ch_model.safetensors:   0%|          | 0.00/335M [00:00<?, ?B/s]

Cannot initialize model with low cpu memory usage because `accelerate` was not found in the environment. Defaulting to `low_cpu_mem_usage=False`. It is strongly recommended to install `accelerate` for faster and less memory-intense model loading. You can do so with: 
```
pip install accelerate
```
.


Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.
`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["bos_token_id"]` will be overriden.
`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["eos_token_id"]` will be overriden.


## Document Generation

In [8]:
import os
from google.colab import drive

# Mount Google Drive at /content/drive. The document-generation cell reads its banner
# image from, and saves the final report to, /content/drive/MyDrive/LLMs/.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from genai_gazzete import functions_doc
from docx import Document
from docx.shared import Inches
from docx.enum.text import WD_ALIGN_PARAGRAPH
from datetime import date

# -----------------------------------------------------
# document creation
# -----------------------------------------------------
# Builds a .docx report: a banner image and a date line, then for every row of `df`
# a heading, a link to the original article, a generated image and the summary.

DRIVE_DIR = '/content/drive/MyDrive/LLMs/' # Drive folder with the banner image; the report is saved here too
prompt_style= " sci-fi style, pen and ink, pastel colors" # appended to each title to form the image prompt

today = date.today()
document = Document()
# document.add_heading("Check out the most popular articles about Generative AI and LLMs in a nutshell!", 0)
document.add_picture(DRIVE_DIR + '20230804_130845_0000.png', width=Inches(6), height=Inches(1.2))
last_paragraph = document.paragraphs[-1] # the paragraph that holds the picture just added
last_paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER

# Date line "<fdate> - <today>", italic and right-aligned.
p = document.add_paragraph()
runner = p.add_run(fdate + " - " + str(today))
runner.italic = True
last_paragraph = document.paragraphs[-1]
last_paragraph.alignment = WD_ALIGN_PARAGRAPH.RIGHT

for r in df.index:
  title = df.loc[r, "title"]

  # 1. add title
  document.add_heading('[' + df.loc[r, "source"] + '] ' + title, level=1)
  # 2. add link
  p = document.add_paragraph()
  functions_doc.add_hyperlink(p, 'Original article', df.loc[r, "url"])

  # 3. create image and add it (the image is written to a file, which is then inserted)
  image = pipe(title + prompt_style).images[0]
  image.save("image.png")
  document.add_picture('image.png', width=Inches(2), height=Inches(2))
  last_paragraph = document.paragraphs[-1]
  last_paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER

  # 4. add summary, followed by an empty paragraph used as a spacer
  document.add_paragraph(df.loc[r, "summary"], style='Intense Quote')
  p = document.add_paragraph()
  p.paragraph_format.line_spacing = Inches(0.3)

file_name = "summarized_articles" + "_" + today.strftime('%m_%d_%Y') + ".docx"
document.save(DRIVE_DIR + file_name)