In [6]:
import xml.etree.ElementTree as ET
import requests
import os

# URL of the XML data
url = "https://buttondown.email/ainews/rss"

# Fetch the XML data
response = requests.get(url)
xml_data = response.text

# Parse the XML
root = ET.fromstring(xml_data)

# Function to recursively extract text from the XML tree
def extract_text(element):
    text = ''
    if element.text:
        text += element.text.strip() + ' '
    for child in element:
        text += extract_text(child)
    if element.tail:
        text += element.tail.strip() + ' '
    return text

# Extract text from the root element
text_content = extract_text(root)

with open("docs/data/ai_news_rss.txt", "w", encoding='utf-8') as file:
    file.write(text_content)

In [9]:
import os
from langchain_community.document_loaders import DirectoryLoader, UnstructuredMarkdownLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter, MarkdownHeaderTextSplitter
from langchain_chroma import Chroma
from langchain_community.embeddings import GPT4AllEmbeddings

DESNTINATION_DIR = "docs/data"
CHROMA_DIR = "docs/chroma"

def split_data(directory):
    """
    Split Markdown files in the specified directory into chunks.

    Args:
        directory (str): The directory containing Markdown files to process.

    Returns:
        list: A list of chunked text data.
    """

    loader = DirectoryLoader(
        directory, 
        glob="*.txt", 
        loader_cls=UnstructuredMarkdownLoader, 
        use_multithreading=True
        )
    pages = loader.load()

    print(f"Loaded {len(pages)} pages")

    txt = ' '.join([d.page_content for d in pages])
    print(f"Text length: {len(txt)}")

    text_splitter = RecursiveCharacterTextSplitter(
        separators=["\n#{1,6}", "\n\n", "\n", "__"],
        keep_separator=True,
        chunk_size=1000,
        chunk_overlap=250,
        add_start_index=True
    )

    chunks = text_splitter.split_documents(pages)
    print(f"Split into {len(chunks)} chunks")

    return chunks

def save_chunks(chunks, chroma_dir):
    """Process chunks into vectors and save them to directory."""
    embedding_function = GPT4AllEmbeddings(device='gpu')
    try:
        Chroma.from_documents(
            documents=chunks, 
            embedding=embedding_function, 
            persist_directory=chroma_dir
        )
        print(f"Chunks saved to {chroma_dir}")
    except Exception as e:
        print(f"Error saving chunks: {e}")

In [10]:
chunks = split_data(DESNTINATION_DIR)
save_chunks(chunks, CHROMA_DIR)

Loaded 1 pages
Text length: 6258805
Split into 8201 chunks
Chunks saved to docs/chroma


In [12]:
# Query the database
db = Chroma(persist_directory=CHROMA_DIR, embedding_function=GPT4AllEmbeddings(device='gpu'))
query_text = db.similarity_search_with_score("langchain", k=3)
print(query_text)

[(Document(page_content='no title found: no description found\n\nno title found: no description found\n\nno title found: no description found\n\nLangChain AI ▷ #announcements (1 messages):\n\nRevamped Documentation Structure: LangChain is seeking feedback on a new documentation structure, which explicitly differentiates between tutorials, how-to guides, and conceptual guides. The structure aims to make it easier for users to find relevant information.\n\nLangChain Framework Introduction: The shared documentation page provides a comprehensive introduction to LangChain, an open-source framework designed to streamline the application lifecycle of large language models (LLMs)—from development and productionization to deployment.\n\nLink mentioned: Introduction | 🦜️🔗 LangChain: LangChain is a framework for developing applications powered by large language models (LLMs).\n\nLangChain AI ▷ #general (43 messages🔥):', metadata={'source': 'docs\\data\\ai_news_rss.txt', 'start_index': 3035664}), 

In [17]:
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.chains import ConversationChain
os.environ["OPENAI_API_KEY"] = "no_need"

model = ChatOpenAI(openai_api_base="http://localhost:3008/v1", model_name="lmstudio-community/Meta-Llama-3-8B-Instruct-GGUF")

conversation = ConversationChain(
    llm=model,
    verbose=False
)

In [36]:
from langchain.chains import MapReduceDocumentsChain, ReduceDocumentsChain
from langchain_text_splitters import CharacterTextSplitter
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.chains.llm import LLMChain
from langchain_core.prompts import PromptTemplate
from langchain import hub

map_prompt = hub.pull("rlm/map-prompt")
map_chain = LLMChain(llm=model, prompt=map_prompt)

# Takes a list of documents, combines them into a single string, and passes this to an LLMChain
combine_documents_chain = StuffDocumentsChain(
    llm_chain=map_chain, document_variable_name="docs"
)

# Combines and iteratively reduces the mapped documents
reduce_documents_chain = ReduceDocumentsChain(
    # This is final chain that is called.
    combine_documents_chain=combine_documents_chain,
    # If documents exceed context for `StuffDocumentsChain`
    collapse_documents_chain=combine_documents_chain,
    # The maximum number of tokens to group documents into.
    token_max=4000,
)

# Combining documents by mapping a chain over them, then combining results
map_reduce_chain = MapReduceDocumentsChain(
    # Map chain
    llm_chain=map_chain,
    # Reduce chain
    reduce_documents_chain=reduce_documents_chain,
    # The variable name in the llm_chain to put the documents in
    document_variable_name="docs",
    # Return the results of the map steps in the output
    return_intermediate_steps=False,
)

text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=1000, chunk_overlap=0
)

query_text = "What is the latest top 5 news about AI?"

results = db.similarity_search_with_score(query_text, k=5)
docs = [doc for doc, _score in results]

split_docs = text_splitter.split_documents(docs)
result = map_reduce_chain.invoke(split_docs)
print(result)

{'input_documents': [Document(page_content='Anticipation for Future AI Developments and Competitor Platforms: The community anticipates new AI models such as the rumored "GPT-5" and potential upcoming Perplexity competitors from OpenAI. Additionally, there are discussions on the distinctions between search engines and knowledge engines, with speculations about how these tech advancements might evolve and integrate with existing platforms.\n\nHere’s why AI search engines really can’t kill Google: A search engine is much more than a search engine, and AI still can’t quite keep up.\n\nImagination Spongebob Squarepants GIF - Imagination Spongebob Squarepants Dreams - Discover & Share GIFs: Click to view the GIF\n\nno title found: no description found\n\nNew OpenAI Model \'Imminent\' and AI Stakes Get Raised (plus Med Gemini, GPT 2 Chatbot and Scale AI): Altman ‘knows the release date’, Politico calls it ‘imminent’ according to Insiders, and then the mystery GPT-2 chatbot [made by the phi t

In [42]:
PROMPT_TEMPLATE = """
<|start_header_id|>user<|end_header_id|>
You are an assistant that will look into news articles and give short summaries.
Use Markdown format and bullet points to give short summaries on multiple news articles.
Use emoji to show the sentiment of the news article. Make it short and clear to be ready to be shared on slack.
You are given the extracted parts of a long document and a request. Provide a concise summary.
Don't make up an answer with no context.
The given request: {request}
DO NOT give irelevant information that is not in the context.
Context: {context} <|eot_id|><|start_header_id|>assistant<|end_header_id|>
---
"""

In [43]:
prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
prompt = prompt_template.format(context=result, request=query_text)

response_text = conversation.predict(input=prompt)

In [44]:
print(response_text)

Here are the top 5 news about AI:

* **GPT-5 Anticipation**: The community anticipates new AI models, including GPT-5, with potential competitors from OpenAI. [Source: Eleuther Discord](https://eleuther.ai/discord)
* **Imminent OpenAI Model Release**: A new OpenAI model is rumored to be released soon, along with updates on Med Gemini, GPT-2 chatbot, and Scale AI. [Source: Eleuther Discord](https://eleuther.ai/discord)
* **Nvidia's AI Chip Dominance**: A former Nvidia employee claims no one will catch up to Nvidia's AI chip lead this decade, sparking discussion about the company's strong position. [Source: r/hardware on Reddit](https://www.reddit.com/r/hardware/comments/lz3h6j/nvidias_ai_chip_dominance/)
* **Potential Billion-Dollar Market for AI Companions**: A tech executive predicts AI girlfriends could become a $1 billion business, sparking discussions on societal implications. [Source: r/singularity on Reddit](https://www.reddit.com/r/singularity/comments/lz3k9f/potential_billiondo

# Test out feedparser

In [None]:
#! pip install feedparser

In [None]:
import feedparser

url = "https://buttondown.email/ainews/rss"
feed = feedparser.parse(url)

In [None]:
from datetime import datetime, timedelta
from bs4 import BeautifulSoup

now = datetime.now()
time_range = timedelta(days=2)
max_articles = 10  # Change this to the maximum number of articles you want to display

# Loop through all entries in the feed
for idx, entry in enumerate(feed.entries):
    entry_date = datetime.strptime(entry.published, "%a, %d %b %Y %H:%M:%S %z")
    entry_date = entry_date.replace(tzinfo=None)
    if now.replace(tzinfo=None) - entry_date <= time_range:
        print("Entry Title:", entry.title)
        print("Entry Link:", entry.link)
        print("Entry Published Date:", entry.published)
        print("Entry Summary:", entry.summary)
        print("\n")
        
    # Check if the maximum number of articles to display has been reached
    if idx + 1 >= max_articles:
        break