In [3]:
!pip3 install -r requirements.txt

Collecting langchain==0.2 (from -r requirements.txt (line 2))
  Downloading langchain-0.2.0-py3-none-any.whl.metadata (13 kB)
Collecting langchain-text-splitters<0.3.0,>=0.2.0 (from langchain==0.2->-r requirements.txt (line 2))
  Downloading langchain_text_splitters-0.2.1-py3-none-any.whl.metadata (2.2 kB)
Downloading langchain-0.2.0-py3-none-any.whl (973 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m973.7/973.7 kB[0m [31m20.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading langchain_text_splitters-0.2.1-py3-none-any.whl (23 kB)
Installing collected packages: langchain-text-splitters, langchain
  Attempting uninstall: langchain
    Found existing installation: langchain 0.1.2
    Uninstalling langchain-0.1.2:
      Successfully uninstalled langchain-0.1.2
Successfully installed langchain-0.2.0 langchain-text-splitters-0.2.1


In [4]:
from dotenv import dotenv_values
import os

# Read key/value pairs from the local .env file into a dict.
SECRETS = dotenv_values(".env")

# Fail fast with an actionable message instead of an opaque KeyError/TypeError
# when the key is absent from .env or is present without a value.
_openai_api_key = SECRETS.get('OPENAI_API_KEY')
if not _openai_api_key:
    raise KeyError("OPENAI_API_KEY is missing or empty in .env")

# The OpenAI-backed objects in this notebook are constructed without an
# explicit key, so the key is exported through the process environment here.
os.environ['OPENAI_API_KEY'] = _openai_api_key

In [140]:
from datetime import date, timedelta
from newsapi import NewsApiClient

newsapi = NewsApiClient(api_key=SECRETS['NEWS_API_KEY'])

# Search window: the seven days ending today.
today = date.today()
last_week = today - timedelta(days=7)

# Both window bounds are sent to the API as YYYY-MM-DD strings.
day_format = "%Y-%m-%d"
search_params = {
    'q': 'Tyson Fury',
    'from_param': last_week.strftime(day_format),
    'to': today.strftime(day_format),
    'sort_by': 'relevancy',
    'language': 'en',
}

latest_news = newsapi.get_everything(**search_params)

In [141]:
# Keep only article entries that are dicts and carry a non-None description.
filtered_data = []
for item in latest_news['articles']:
    if not isinstance(item, dict):
        continue
    if item.get('description') is None:
        continue
    filtered_data.append(item)

In [142]:
# Preview only the first few articles; echoing the entire list floods the
# notebook with output and buries the narrative.
filtered_data[:3]

[{'source': {'id': None, 'name': 'BBC News'},
  'author': 'Kal Sajad',
  'title': 'Hearn v Warren - from sworn enemies to partners',
  'description': "BBC Sport speaks to rivals turned friends Eddie Hearn of Matchroom and Frank Warren of Queensberry before Saturday's five-versus-five card in Riyadh.",
  'url': 'https://www.bbc.com/sport/articles/c97747e5zj7o',
  'urlToImage': 'https://ichef.bbci.co.uk/news/1024/branded_sport/4d50/live/cbd8dda0-1ddd-11ef-a13a-0b8c563da930.jpg',
  'publishedAt': '2024-05-30T07:38:14Z',
  'content': "Just six months ago, nobody would have predicted Hearn and Warren would be playing a friendly and bizarrely captivating game of Jenga at April's news conference.\r\nTheir relationship now is far less f… [+2245 chars]"},
 {'source': {'id': 'espn', 'name': 'ESPN'},
  'author': 'James Regan',
  'title': "Deontay Wilder 'had to regain' his love for boxing, and now is time for business",
  'description': 'Can Deontay Wilder regain his good form and fighting spirit

In [143]:
# Number of articles that survived the description filter.
len(filtered_data)

79

Document loaders are used to load data from a source as `Document` objects. A `Document` is a piece of text and associated metadata. For example, there are document loaders for loading a simple .txt file, for loading the text contents of any web page, or even for loading a transcript of a YouTube video.

Document loaders provide a "load" method for loading data as documents from a configured source. They optionally implement a "lazy load" as well for lazily loading data into memory.

In [144]:
from langchain.docstore.document import Document


def _article_to_document(article):
    """Wrap one article dict in a Document: description as text, url/author/date as metadata."""
    metadata = {
        'source': article['url'],
        'author': article['author'],
        'date': article['publishedAt'],
    }
    return Document(page_content=article['description'], metadata=metadata)


docs = list(map(_article_to_document, filtered_data))

# Sanity check: show the text and metadata of the first document.
first_doc = docs[0]
print(first_doc.page_content)
print(first_doc.metadata)

BBC Sport speaks to rivals turned friends Eddie Hearn of Matchroom and Frank Warren of Queensberry before Saturday's five-versus-five card in Riyadh.
{'source': 'https://www.bbc.com/sport/articles/c97747e5zj7o', 'author': 'Kal Sajad', 'date': '2024-05-30T07:38:14Z'}


Create a chain that provides its sources when the LLM responds to your queries.

In [145]:
from langchain.embeddings.openai import OpenAIEmbeddings

# Embedding client later handed to FAISS.from_documents; show_progress_bar=True
# requests a progress bar while texts are being embedded.
# NOTE(review): no API key is passed here — presumably it is read from the
# OPENAI_API_KEY environment variable set earlier; confirm.
embeddings = OpenAIEmbeddings(show_progress_bar=True)

In [146]:
from langchain.chains import create_qa_with_sources_chain
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.prompts import PromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA

llm = ChatOpenAI()

# LLM chain whose reply carries the answer together with the sources used.
qa_chain = create_qa_with_sources_chain(llm)

# Text layout used to render each retrieved document, including its metadata.
doc_template = "Content: {page_content}\n Source:{source}\n Author:{author}\n Date:{date}"
doc_prompt = PromptTemplate(
    input_variables=['page_content', 'source', 'author', 'date'],
    template=doc_template,
)

# Combines the rendered documents and passes them to qa_chain as "context".
final_qa_chain = StuffDocumentsChain(
    document_prompt=doc_prompt,
    document_variable_name="context",
    llm_chain=qa_chain,
)

# Vector index built from the article documents with the embedding client.
index = FAISS.from_documents(docs, embedding=embeddings)
retriever = index.as_retriever()

# RetrievalQA is used for question-answering against an index
chain = RetrievalQA(
    combine_documents_chain=final_qa_chain,
    retriever=retriever,
)

100%|██████████| 1/1 [00:01<00:00,  1.12s/it]


In [150]:
# The question text, keeping the leading and trailing newline of the original literal.
question = "\nWho wins Tyson Fury or Usyk?\n"

In [151]:
# Chain.run() is deprecated in the pinned langchain 0.2 release; invoke() is
# the supported entry point. RetrievalQA takes the question under the "query"
# key and returns the generated text under the "result" key.
answer = chain.invoke({"query": question})["result"]

100%|██████████| 1/1 [00:00<00:00,  3.52it/s]


In [152]:
# Print the chain's reply as-is. In the recorded run it is a JSON-formatted
# string with "answer" and "sources" keys.
print(answer)

{"answer":"Oleksandr Usyk defeated Tyson Fury in their undisputed heavyweight boxing bout in Saudi Arabia on May 18. Their rematch is scheduled for December 21 in Riyadh.","sources":["https://www.news24.com/sport/knockout/date-set-usyk-fury-heavyweight-rematch-confirmed-for-december-2024-20240530","https://www.boxingnews24.com/2024/05/fury-vs-usyk-rematch-set-for-december-21st-can-tyson-make-the-necessary-changes/"]}
