In [18]:
import feedparser
from datetime import datetime
from bs4 import BeautifulSoup
import requests
import json

f = open('article.html')
template = f.read()
f.close()

BASE_URL = './articles/article'

SUGGESTED_ARTICLE = '''
            <div class="suggested-article">
                <h3>TITLE</h3>
                <p>DESCRIPTION</p>
            </div>
'''

f = open('articles.json')
existing_articles = json.load(f)
f.close()

def create_article(article, file_name):
    f = open("articles/" + file_name + ".html", 'w')
    html = template.replace("TITLE", article["title"])
    html = html.replace("BODY", article["body"])
    html = html.replace("SUGGESTIONS", '\n'.join(article["related"]))
    html = html.replace("LINK", '\n'.join(article["original_link"]))
    f.write(html)
    f.close()

def query(query):
        body =  json.dumps({
              "model": "mistral",
              "prompt": query,
              "stream": False
        })
        response = requests.post('http://localhost:11434/api/generate', data=body)
        return response.json()['response']

def get_tag_content(html, target_class):
    try:
        # Parse the HTML content using BeautifulSoup
        soup = BeautifulSoup(html, 'html.parser')

        # Find the target tag
        target_element = soup.find_all(class_=target_class)

        # Check if the target tag was found
        if target_element:
            # Return the content of the target tag
            return target_element[0].get_text(strip=True)
        else:
            return ""

    except Exception as e:
        print(f"An error occurred: {e}")

def read_rss_feed(feed_url):
    # Parse the RSS feed
    feed = feedparser.parse(feed_url)
    date_format = "%a, %d %b %Y %H:%M:%S %z"

    # Check if the feed was parsed successfully
    if feed.bozo == 0:
        for entry in feed.entries:
            entry.published = datetime.strptime(entry.published, date_format)
        return feed.entries
    else:
        return []

rss_feed_url = "https://www.ansa.it/sito/ansait_rss.xml"
feed = read_rss_feed(rss_feed_url)
articles = []
i = len(existing_articles)
tot_len = len(feed)
for entry in feed:
    if not "podcast" in entry.title:
        response = requests.get(entry.link)
        if response.status_code == 200:
            target_tag = 'article-main'
            content = get_tag_content(response.text, target_tag)
            if content:
                text = query(f"Crea un articolo giornalistico in italiano a partire dalle seguenti informazioni:\n{content}")
                title = query(f"Crea un titolo per il seguente articolo. Il titolo deve essere in italiano\n:{text}").replace('"', '').replace('\'', '')
                articles.append({
                    'title':title,
                    'body':text,
                    'index':i,
                    'original_link': entry.link,
                    'date': entry.published.date().strftime('%Y-%m-%dT%H:%M:%S.%f')
                })
                print(f'{i}/{tot_len}', end='\r')
                i +=1
                if i >+ 3:
                    break
        else:
            print(f"Failed to download HTML. Status code: {response.status_code}")

titles = [{'title':article['title'], 'url':f"{BASE_URL}{article['index']}", 'date': article['date']} for article in articles]
existing_articles += titles
f = open('articles.json', 'w')
f.write(json.dumps(existing_articles))
f.close()
print(titles)
all_titles = "\n-".join([title["title"] for title in titles])
for article in articles:
    related_news = query(f'dal seguente articolo, trova 6 articoli da consigliare tra i seguenti. Separa gli articoli consigliati unsando il carattere \\n. Articolo:\n{article["body"]}\nPossibili articoli:\n-{all_titles}')
    related_news = related_news.split('\n')
    related_news_snippets = []
    for news in related_news:
        if news != article['title']:
            related_news_snippets.append(
                SUGGESTED_ARTICLE.replace('TITLE', news).replace('DESCRIPTION', query(f"scrivi una brevissimo snippet in italiano per un articolo con questo titolo: {news}"))
            )
    article['related'] = related_news_snippets
    create_article(article, f'article{article["index"]}')

[{'title': '\n50 anni di regina del wellness: Kate Moss, licona modella britannica che promuove il benessere', 'url': './articles/article0', 'date': '2024-01-15T00:00:00.000000'}, {'title': 'La bambina più giovane al campo base del Monte Everest: La storia di Zara, la ragazza che superò il limite delle possibilità', 'url': './articles/article1', 'date': '2024-01-15T00:00:00.000000'}, {'title': '\nLa Cassazione stabilisce le responsabilità civili e penali per la strage di Viareggio del 2009: il presidente dellassociazione FS e RFi è stato rinviato in appello a Firenze', 'url': './articles/article2', 'date': '2024-01-15T00:00:00.000000'}, {'title': 'Hamas Publishes Video of Captured Israeli Teenager, Showing Her Dead Body in Disturbing Montage', 'url': './articles/article3', 'date': '2024-01-15T00:00:00.000000'}]
[{'title': '\n50 anni di regina del wellness: Kate Moss, licona modella britannica che promuove il benessere', 'url': './articles/article0', 'date': '2024-01-15T00:00:00.000000'