In [4]:
API_KEY = "2bBZn5pMnk003jcxoOLmZV9lKLflifdjC8mOOSpep0wB9lIaDoAyJQQJ99AKACfhMk5XJ3w3AAABACOGPo7V"
AZURE_ENDPOINT = "https://hackatum-2024.openai.azure.com"

In [84]:
# pip install llama-index llama-index-embeddings-azure-openai llama-index-llms-azure-openai
import os
from llama_index.llms.azure_openai import AzureOpenAI

llm = AzureOpenAI(
    engine="gpt-4o",  # Deployment name in Azure OpenAI
    model="gpt-4o",   # Model name
    api_key=API_KEY,
    azure_endpoint=AZURE_ENDPOINT,
    api_version="2024-08-01-preview",
    temperature=0.0   # Adjust temperature as needed
)

# Example usage
prompt = "What are the benefits of using Azure OpenAI?"
response = llm.complete(prompt)

# Output the response
print("AI Response:", response)

AI Response: Using Azure OpenAI offers several benefits, particularly for businesses and developers looking to leverage advanced AI capabilities. Here are some key advantages:

1. **Access to Advanced AI Models**: Azure OpenAI provides access to powerful language models like GPT-3, which can be used for a variety of applications such as natural language processing, text generation, translation, and more.

2. **Scalability**: Azure's cloud infrastructure allows you to scale your AI applications easily, accommodating varying workloads and ensuring that your applications can handle increased demand without performance issues.

3. **Integration with Azure Services**: Azure OpenAI can be seamlessly integrated with other Azure services, such as Azure Cognitive Services, Azure Machine Learning, and Azure Data Lake, enabling comprehensive solutions that leverage a wide range of AI and data capabilities.

4. **Security and Compliance**: Azure provides robust security features and compliance cer

In [11]:
import os
from llama_index.embeddings.azure_openai import AzureOpenAIEmbedding

embedding_engine = "text-embedding-3-small"
api_version = "2024-08-01-preview"

embedding_model = AzureOpenAIEmbedding(
    engine=embedding_engine,
    api_key=API_KEY,
    azure_endpoint=AZURE_ENDPOINT,
    api_version=api_version
)

text = "Access to Advanced AI Models**: Azure OpenAI provides access to powerful language models like GPT-3, which can be used for a variety of applications such as natural language processing, text generation, translation, and more."

embedding = embedding_model.get_text_embedding(text)

print("Embedding:", embedding)


Embedding: [-0.013780848123133183, -0.011521583423018456, 0.002993027912452817, -0.01367484126240015, 0.004147505387663841, 0.01298579853028059, -0.02096278965473175, 0.011879355646669865, -0.0035147788003087044, -0.033127039670944214, 0.010103746317327023, 0.021598828956484795, -0.019664209336042404, -0.018842659890651703, -0.026581134647130966, 0.01872340217232704, 0.014059115201234818, -0.016470763832330704, 0.008076371625065804, -9.575784497428685e-05, -0.021333811804652214, 0.005933052394539118, -0.010594027116894722, -0.019359441474080086, 0.001626537530682981, 0.008248632773756981, 0.01876315474510193, -0.019094424322247505, -0.009706222452223301, -0.012124495580792427, 0.0316164456307888, 0.005820420570671558, -0.007658971007913351, -0.008997303433716297, 0.01291954517364502, 0.02236737683415413, 0.010958423838019371, 0.0007114031468518078, -0.010295883752405643, 0.00625107204541564, 0.030185356736183167, 0.017848847433924675, 0.01229013130068779, -0.007678847294300795, -0.0008

In [128]:
import feedparser
from typing import List, Dict
from newspaper import Article
import json
import nltk
nltk.download('punkt')

from pydantic import BaseModel, Field
from llama_index.program.openai import OpenAIPydanticProgram
from llama_index.core.prompts import ChatPromptTemplate, ChatMessage

class ArticleExtraction(BaseModel):
    """A model representing the extracted information from an article."""

    summary: str = Field(
        description=(
            "A concise summary capturing the main points of the article, focusing only on the most "
            "relevant information and filtering out unnecessary details."
        )
    )
    keywords: List[str] = Field(
        description=(
            "A list of the most important and relevant keywords or phrases that represent the main "
            "topics of the article. Exclude generic terms or overly broad concepts."
        )
    )
    facts: List[str] = Field(
        description=(
            "Key factual statements from the article that are essential for understanding its main points."
        )
    )
    important_dates: Dict[str, str] = Field(
        default_factory=dict,
        description=(
            "All significant dates mentioned in the article, with corresponding facts. The date format is flexible."
        )
    )

class RSSParser:
    def __init__(self, feed_url: str):
        self.feed_url = feed_url

        self.llm = llm

        schema = ArticleExtraction.schema_json(indent=2)

        self.prompt = ChatPromptTemplate(
            message_templates=[
                ChatMessage(
                    role="system",
                    content=(
                        "You are an expert assistant for extracting insights from articles in JSON format.\n"
                        "You extract data and return it in JSON format, according to the provided JSON schema, from the given article content.\n"
                        "REMEMBER to return extracted data only from the provided article content.\n\n"
                        "The JSON schema is:\n"
                        f"{schema}"
                    ),
                ),
                ChatMessage(
                    role="user",
                    content=(
                        "Article Content:\n"
                        "------\n"
                        "{article_content}\n"
                        "------"
                    ),
                ),
            ]
        )

        self.program = OpenAIPydanticProgram.from_defaults(
            output_cls=ArticleExtraction,
            llm=self.llm,
            prompt=self.prompt,
            verbose=True,
        )

    def fetch_feed(self) -> Dict:
        try:
            feed = feedparser.parse(self.feed_url)
            if feed.bozo:
                raise ValueError(f"Error parsing feed: {feed.bozo_exception}")
            return feed
        except Exception as e:
            raise RuntimeError(f"Failed to load or parse RSS feed: {e}")

    def extract_article_content(self, url: str) -> str:
        """
        Fetch the article from the URL and extract its main content.
        """
        try:
            article = Article(url)
            article.download()
            article.parse()
            return article.text
        except Exception as e:
            print(f"Failed to extract article content from {url}: {e}")
            return ""

    def extract_entries_with_content(self) -> List[Dict]:
        """
        Extract entries from the RSS feed, including the main content of each linked article.
        Enrich the content using LlamaIndex structured data extraction.
        """
        feed = self.fetch_feed()
        entries = []

        for entry in feed.entries:
            content = self.extract_article_content(entry.get("link", ""))
            if len(content) > 500:
                try:
                    # Use the program to extract data
                    response = self.program(article_content=content)
                    extraction_result = response  # Pydantic object
                    print('Extraction result:', extraction_result)
                except Exception as e:
                    print(f"Failed to extract structured data from content: {e}")
                    extraction_result = None

                item = {
                    "title": entry.get("title", "No title"),
                    "rss_summary": entry.get("summary", ""),
                    "link": entry.get("link", ""),
                    "id": entry.get("id", "No ID"),
                    "authors": entry.get("authors", []),
                    "published": entry.get("published", "Not specified"),
                    "published_parsed": entry.get("published_parsed", None),
                    "media_content": entry.get("media_content", []),
                    "content": content,
                    "extracted_data": extraction_result.dict() if extraction_result else {},
                }
                entries.append(item)
            else:
                print(f"Content too short or empty for URL {entry.get('link', '')}")
        return entries

    def get_feed_metadata(self) -> Dict:
        feed = self.fetch_feed()
        return {
            "title": feed.feed.get("title", "Not specified"),
            "link": feed.feed.get("link", ""),
            "description": feed.feed.get("description", "Not specified"),
            "published": feed.feed.get("published", "Not specified")
        }

def save_entries_to_json(entries: List[Dict], filename: str):
    """
    Save the list of entries to a JSON file.

    :param entries: List of entry dictionaries.
    :param filename: Name of the JSON file to save.
    """
    try:
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(entries, f, ensure_ascii=False, indent=4)
        print(f"Entries successfully saved to {filename}")
    except Exception as e:
        print(f"Failed to save entries to JSON file: {e}")


[nltk_data] Downloading package punkt to
[nltk_data]     /home/dostavalovid/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [129]:
feed_url_1 = "https://rss.app/feeds/AY3gpY8fWOkfCCWR.xml"  # Replace with your feed URL
feed_url_2 = "https://www.autobild.de/rss/22590661.xml"
feed_url_3 = "https://rss.app/feeds/MLuDKqkwFtd2tuMr.xml"
eng_feed_url_1 = "https://rss.app/feeds/u6rcvfy6PTSf9vQ4.xml"
eng_feed_url_2 = "https://rss.feedspot.com/uk_car_rss_feeds/"
parser = RSSParser(eng_feed_url_1)

metadata = parser.get_feed_metadata()
print("Feed Metadata:")
print(json.dumps(metadata, indent=4))

entries = parser.extract_entries_with_content()
print('entries:', entries)
save_entries_to_json(entries, "./rss_feed_entries_1.json")


Feed Metadata:
{
    "title": "Vehicles - Google News",
    "link": "https://news.google.com/topics/CAAqJAgKIh5DQkFTRUFvSEwyMHZNR3MwYWhJRlpXNHRSMElvQUFQAQ?hl=en-GB&gl=GB&ceid=GB%3Aen",
    "description": "Read full articles, watch videos, browse thousands of titles and more on the 'Vehicles' topic with Google News.",
    "published": "Not specified"
}
Function call: ArticleExtraction with args: {"summary":"Coventry Live readers express mixed opinions on the A46's speed limit, which drops from 70 to 50 mph near Coventry. Locals often ignore the limit, while newcomers brake for speed cameras. Some readers suggest more average speed cameras to enforce limits, while others argue the current limits are necessary for safety due to road conditions. The debate continues on social media, with some advocating for stricter enforcement and others questioning the logic behind the speed limits.","keywords":["A46 speed limit","Coventry","average speed cameras","road safety","speed enforcement"],"fact

feed: website='https://www.autobild.de/rss/22590661.xml' entries=[]
Feed successfully saved to feed.json
