In [2]:
pip install feedparser newspaper3k transformers sentence-transformers torch jinja2


Collecting feedparser
  Downloading feedparser-6.0.11-py3-none-any.whl.metadata (2.4 kB)
Collecting newspaper3k
  Downloading newspaper3k-0.2.8-py3-none-any.whl.metadata (11 kB)
Collecting sgmllib3k (from feedparser)
  Downloading sgmllib3k-1.0.0.tar.gz (5.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting cssselect>=0.9.2 (from newspaper3k)
  Downloading cssselect-1.3.0-py3-none-any.whl.metadata (2.6 kB)
Collecting tldextract>=2.0.1 (from newspaper3k)
  Downloading tldextract-5.1.3-py3-none-any.whl.metadata (11 kB)
Collecting feedfinder2>=0.0.4 (from newspaper3k)
  Downloading feedfinder2-0.0.4.tar.gz (3.3 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting jieba3k>=0.35.1 (from newspaper3k)
  Downloading jieba3k-0.35.1.zip (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m52.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting tinysegmenter==0.3 (from newspap

In [6]:
pip install beautifulsoup4 lxml



In [8]:
pip install lxml[html_clean]


Collecting lxml_html_clean (from lxml[html_clean])
  Downloading lxml_html_clean-0.4.1-py3-none-any.whl.metadata (2.4 kB)
Downloading lxml_html_clean-0.4.1-py3-none-any.whl (14 kB)
Installing collected packages: lxml_html_clean
Successfully installed lxml_html_clean-0.4.1


In [9]:
import feedparser
import newspaper
import os
import datetime
from transformers import pipeline
from sentence_transformers import SentenceTransformer, util
from jinja2 import Template

# Define user personas
personas = [
    {"name": "Alex Parker", "interests": ["AI", "cybersecurity", "blockchain", "startups", "programming"], "sources": ["https://techcrunch.com/feed/", "https://www.wired.com/feed/rss", "https://www.technologyreview.com/feed/"]},
    {"name": "Priya Sharma", "interests": ["global markets", "startups", "fintech", "cryptocurrency", "economics"], "sources": ["https://www.bloomberg.com/feed", "https://www.ft.com/?format=rss", "https://www.coindesk.com/arc/outboundfeeds/rss/"]},
    {"name": "Marco Rossi", "interests": ["football", "F1", "NBA", "Olympic sports", "esports"], "sources": ["https://www.espn.com/espn/rss/news", "http://feeds.bbci.co.uk/sport/rss.xml", "https://www.skysports.com/rss/12040"]},
    {"name": "Lisa Thompson", "interests": ["movies", "celebrity news", "TV shows", "music", "books"], "sources": ["https://variety.com/feed/", "https://www.hollywoodreporter.com/t/feed/", "https://www.billboard.com/feed/"]},
    {"name": "David Martinez", "interests": ["space exploration", "AI", "biotech", "physics", "renewable energy"], "sources": ["https://www.nasa.gov/rss/dyn/breaking_news.rss", "https://www.sciencedaily.com/rss/top/science.xml", "https://arstechnica.com/science/feed/"]}
]

# Summarizer & Embedding models
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
embedder = SentenceTransformer('all-MiniLM-L6-v2')

# Template for markdown
newsletter_template = """# {{name}}'s Personalized Newsletter
**Date:** {{date}}

## Highlights:
{% for article in highlights %}
- [{{ article['title'] }}]({{ article['link'] }}) - *{{ article['summary'] }}*
{% endfor %}

{% for topic, articles in categorized.items() %}
## {{topic}}
{% for article in articles %}
- [{{ article['title'] }}]({{ article['link'] }}) - *{{ article['summary'] }}*
{% endfor %}
{% endfor %}
"""

def fetch_articles(sources):
    articles = []
    for url in sources:
        feed = feedparser.parse(url)
        for entry in feed.entries:
            try:
                art = newspaper.Article(entry.link)
                art.download()
                art.parse()
                summary = summarizer(art.text[:1024], max_length=60, min_length=10, do_sample=False)[0]['summary_text']
                articles.append({"title": entry.title, "link": entry.link, "text": art.text, "summary": summary})
            except Exception:
                continue
    return articles

def select_relevant_articles(articles, interests):
    scores = []
    interest_embeddings = embedder.encode(interests, convert_to_tensor=True)
    for article in articles:
        art_embedding = embedder.encode(article['summary'], convert_to_tensor=True)
        score = util.cos_sim(interest_embeddings, art_embedding).max().item()
        scores.append((article, score))
    scores.sort(key=lambda x: x[1], reverse=True)
    top_articles = [item[0] for item in scores[:10]]
    categorized = {}
    for art in top_articles:
        for interest in interests:
            if interest.lower() in art['summary'].lower():
                categorized.setdefault(interest.title(), []).append(art)
                break
        else:
            categorized.setdefault("Other", []).append(art)
    return categorized, top_articles[:3]

def generate_newsletter(persona):
    articles = fetch_articles(persona['sources'])
    categorized, highlights = select_relevant_articles(articles, persona['interests'])
    template = Template(newsletter_template)
    output = template.render(name=persona['name'], date=str(datetime.date.today()), highlights=highlights, categorized=categorized)
    file_name = persona['name'].replace(" ", "_") + "_Newsletter.md"
    with open(file_name, "w", encoding='utf-8') as f:
        f.write(output)

# Run for all personas
for persona in personas:
    print(f"Generating newsletter for {persona['name']}...")
    generate_newsletter(persona)
print("All newsletters generated!")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cpu


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Generating newsletter for Alex Parker...
Generating newsletter for Priya Sharma...


Your max_length is set to 60, but your input_length is only 36. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=18)
Your max_length is set to 60, but your input_length is only 36. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=18)
Your max_length is set to 60, but your input_length is only 36. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=18)
Your max_length is set to 60, but your input_length is only 36. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=18)


Generating newsletter for Marco Rossi...


Your max_length is set to 60, but your input_length is only 43. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=21)
Your max_length is set to 60, but your input_length is only 49. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=24)
Your max_length is set to 60, but your input_length is only 59. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=29)
Your max_length is set to 60, but your input_length is only 43. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=21)
Your max

Generating newsletter for Lisa Thompson...
Generating newsletter for David Martinez...
All newsletters generated!


In [12]:
!pip install markdown2 pdfkit


Collecting markdown2
  Downloading markdown2-2.5.3-py3-none-any.whl.metadata (2.1 kB)
Collecting pdfkit
  Downloading pdfkit-1.0.0-py3-none-any.whl.metadata (9.3 kB)
Downloading markdown2-2.5.3-py3-none-any.whl (48 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfkit-1.0.0-py3-none-any.whl (12 kB)
Installing collected packages: pdfkit, markdown2
Successfully installed markdown2-2.5.3 pdfkit-1.0.0


In [13]:
# Colab-Ready Script: Convert Markdown Newsletters to PDF

import markdown2
import pdfkit
import os

# Install wkhtmltopdf inside Colab
!apt-get install -y wkhtmltopdf

# Configure PDFKit to use Colab's wkhtmltopdf binary
config = pdfkit.configuration(wkhtmltopdf='/usr/bin/wkhtmltopdf')

# List of personas for file processing
personas_names = [
    "Alex_Parker",
    "Priya_Sharma",
    "Marco_Rossi",
    "Lisa_Thompson",
    "David_Martinez"
]

# PDF conversion settings
pdf_options = {
    'page-size': 'A4',
    'encoding': "UTF-8",
    'margin-top': '0.75in',
    'margin-right': '0.75in',
    'margin-bottom': '0.75in',
    'margin-left': '0.75in'
}

# Convert each markdown file to a PDF
def convert_md_to_pdf(md_file, pdf_file):
    with open(md_file, 'r', encoding='utf-8') as f:
        md_content = f.read()
    html_content = markdown2.markdown(md_content)
    pdfkit.from_string(html_content, pdf_file, options=pdf_options, configuration=config)

# Create PDFs for all personas
for persona in personas_names:
    md_file = f"{persona}_Newsletter.md"
    pdf_file = f"{persona}_Newsletter.pdf"
    if os.path.exists(md_file):
        print(f"Converting {md_file} to PDF...")
        convert_md_to_pdf(md_file, pdf_file)
    else:
        print(f"Markdown file for {persona} not found.")

print("All PDFs generated and saved in the current directory!")


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  avahi-daemon geoclue-2.0 glib-networking glib-networking-common glib-networking-services
  gsettings-desktop-schemas iio-sensor-proxy libavahi-core7 libavahi-glib1 libdaemon0 libevdev2
  libfontenc1 libgudev-1.0-0 libhyphen0 libinput-bin libinput10 libjson-glib-1.0-0
  libjson-glib-1.0-common libmbim-glib4 libmbim-proxy libmd4c0 libmm-glib0 libmtdev1
  libnl-genl-3-200 libnotify4 libnss-mdns libproxy1v5 libqmi-glib5 libqmi-proxy libqt5core5a
  libqt5dbus5 libqt5gui5 libqt5network5 libqt5positioning5 libqt5printsupport5 libqt5qml5
  libqt5qmlmodels5 libqt5quick5 libqt5sensors5 libqt5svg5 libqt5webchannel5 libqt5webkit5
  libqt5widgets5 libsoup2.4-1 libsoup2.4-common libwacom-bin libwacom-common libwacom9 libwoff1
  libxcb-icccm4 libxcb-image0 libxcb-keysyms1 libxcb-render-util0 libxcb-util1 libxcb-xinerama0
  libxcb-xinput0 libxcb-xkb1 l

In [15]:
def calculate_relevance_score(articles, interests):
    interest_embeddings = embedder.encode(interests, convert_to_tensor=True)
    scores = []
    for article in articles:
        art_embedding = embedder.encode(article['summary'], convert_to_tensor=True)
        score = util.cos_sim(interest_embeddings, art_embedding).max().item()
        scores.append(score)
    avg_score = sum(scores) / len(scores) if scores else 0
    return avg_score

In [16]:
def generate_newsletter(persona):
    articles = fetch_articles(persona['sources'])
    categorized, selected_articles = select_relevant_articles(articles, persona['interests'])
    relevance_score = calculate_relevance_score(selected_articles, persona['interests'])
    print(f"Relevance Score for {persona['name']}: {relevance_score:.2f}")

    template = Template(newsletter_template)
    output = template.render(name=persona['name'], date=str(datetime.date.today()), highlights=selected_articles[:3], categorized=categorized)
    file_name = persona['name'].replace(" ", "_") + "_Newsletter.md"
    with open(file_name, "w", encoding='utf-8') as f:
        f.write(output)

# Run for all personas
for persona in personas:
    print(f"Generating newsletter for {persona['name']}...")
    generate_newsletter(persona)
print("All newsletters generated with relevance scores displayed!")


Generating newsletter for Alex Parker...
Relevance Score for Alex Parker: 0.48
Generating newsletter for Priya Sharma...


Your max_length is set to 60, but your input_length is only 36. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=18)
Your max_length is set to 60, but your input_length is only 36. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=18)
Your max_length is set to 60, but your input_length is only 36. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=18)
Your max_length is set to 60, but your input_length is only 36. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=18)


Relevance Score for Priya Sharma: 0.45
Generating newsletter for Marco Rossi...


Your max_length is set to 60, but your input_length is only 43. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=21)
Your max_length is set to 60, but your input_length is only 49. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=24)
Your max_length is set to 60, but your input_length is only 59. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=29)
Your max_length is set to 60, but your input_length is only 43. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=21)
Your max

Relevance Score for Marco Rossi: 0.43
Generating newsletter for Lisa Thompson...
Relevance Score for Lisa Thompson: 0.33
Generating newsletter for David Martinez...
Relevance Score for David Martinez: 0.42
All newsletters generated with relevance scores displayed!
