## NEWSROOM ETL

### Extraction

In [None]:
import hashlib

import pandas as pd
from datasets import load_dataset
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [12]:
# Fetch the raw news dataset by its Hugging Face dataset id.
ds = load_dataset(path="R3troR0b/news-dataset")

In [13]:
# Turn the "train" split into a pandas DataFrame and preview its first rows.
train_split = ds["train"]
news_dataframe = train_split.to_pandas()
news_dataframe.head(5)

Unnamed: 0,label,text
0,The Guardian;Critic wrongly jailed by former p...,"Leila de Lima, one of fiercest critics of the ..."
1,The Guardian;Trump’s killing of Qassem Suleima...,Donald Trump’s decision to sanction the assass...
2,The Guardian;Belgium becomes first EU country ...,Belgium has become the EU first country to ban...
3,The Guardian;Fate of endangered monkey hinges ...,The fate of one of the world’s most threatened...
4,The Guardian;Russian gas flows to Europe via U...,Russian gas has ceased flowing to Europe via U...


In [14]:
# Discard every row whose "text" cannot be used as article content:
# either the value is not a string at all, or it is the empty string.
text_column = news_dataframe["text"]
is_string = text_column.map(lambda value: isinstance(value, str))
unusable_rows = ~is_string | (text_column == "")

# Rows are removed by index label, leaving news_dataframe itself untouched.
filtered_news_dataframe = news_dataframe.drop(
    index=news_dataframe.index[unusable_rows.to_numpy()]
)

In [15]:
# Unpack the ';'-separated "label" field into one column per metadata item.
METADATA_COLUMNS = ["source", "headline", "url", "date"]

# Only the first four pieces are kept; anything produced by additional ';'
# separators is discarded.
# NOTE(review): a headline that itself contains ';' would shift url/date into
# the wrong columns -- confirm the dataset never does this.
metadata = (
    filtered_news_dataframe["label"]
    .str.split(";", expand=True)
    .iloc[:, : len(METADATA_COLUMNS)]
    .copy()
)
metadata.columns = METADATA_COLUMNS

# Remove metadata columns left over from an earlier execution of this cell
# before concatenating, so re-running the cell does not append a second,
# duplicate set of source/headline/url/date columns. On the first run nothing
# matches and errors="ignore" makes the drop a no-op.
filtered_news_dataframe = pd.concat(
    [
        filtered_news_dataframe.drop(columns=METADATA_COLUMNS, errors="ignore"),
        metadata,
    ],
    axis=1,
)
filtered_news_dataframe.head(5)

Unnamed: 0,label,text,source,headline,url,date
0,The Guardian;Critic wrongly jailed by former p...,"Leila de Lima, one of fiercest critics of the ...",The Guardian,Critic wrongly jailed by former president of P...,https://www.theguardian.com/world/2025/jan/01/...,2025-01-01T14:00:16Z
1,The Guardian;Trump’s killing of Qassem Suleima...,Donald Trump’s decision to sanction the assass...,The Guardian,Trump’s killing of Qassem Suleimani led to fal...,https://www.theguardian.com/us-news/2025/jan/0...,2025-01-01T13:10:04Z
2,The Guardian;Belgium becomes first EU country ...,Belgium has become the EU first country to ban...,The Guardian,Belgium becomes first EU country to ban sale o...,https://www.theguardian.com/world/2025/jan/01/...,2025-01-01T12:48:15Z
3,The Guardian;Fate of endangered monkey hinges ...,The fate of one of the world’s most threatened...,The Guardian,Fate of endangered monkey hinges on Brazilian ...,https://www.theguardian.com/world/2025/jan/01/...,2025-01-01T11:41:14Z
4,The Guardian;Russian gas flows to Europe via U...,Russian gas has ceased flowing to Europe via U...,The Guardian,Russian gas flows to Europe via Ukraine cease ...,https://www.theguardian.com/world/2025/jan/01/...,2025-01-01T11:35:59Z


In [16]:
# The packed "label" column is no longer needed once its parts have been split out.
news_dataframe = filtered_news_dataframe.drop(columns=["label"])

In [17]:
# Preview the first five rows of the frame without the "label" column.
news_dataframe.head()

Unnamed: 0,text,source,headline,url,date
0,"Leila de Lima, one of fiercest critics of the ...",The Guardian,Critic wrongly jailed by former president of P...,https://www.theguardian.com/world/2025/jan/01/...,2025-01-01T14:00:16Z
1,Donald Trump’s decision to sanction the assass...,The Guardian,Trump’s killing of Qassem Suleimani led to fal...,https://www.theguardian.com/us-news/2025/jan/0...,2025-01-01T13:10:04Z
2,Belgium has become the EU first country to ban...,The Guardian,Belgium becomes first EU country to ban sale o...,https://www.theguardian.com/world/2025/jan/01/...,2025-01-01T12:48:15Z
3,The fate of one of the world’s most threatened...,The Guardian,Fate of endangered monkey hinges on Brazilian ...,https://www.theguardian.com/world/2025/jan/01/...,2025-01-01T11:41:14Z
4,Russian gas has ceased flowing to Europe via U...,The Guardian,Russian gas flows to Europe via Ukraine cease ...,https://www.theguardian.com/world/2025/jan/01/...,2025-01-01T11:35:59Z


In [18]:
# Draw a fixed-seed random sample of articles and save it, without the index
# column, so the later cells can start from the CSV file.
SAMPLE_SIZE = 200
SAMPLE_SEED = 42

selected_news = news_dataframe.sample(n=SAMPLE_SIZE, random_state=SAMPLE_SEED)
selected_news.to_csv("news.csv", index=False)

In [2]:
# Reload the sampled articles and wrap each row in a Document: the article
# text becomes the page content, the remaining fields become its metadata.
news = pd.read_csv('news.csv')
metadata_fields = ("headline", "source", "url", "date")
news_documents = [
    Document(
        page_content=row["text"],
        metadata={field: row[field] for field in metadata_fields},
    )
    for _, row in news.iterrows()
]


### Transformation (Sentiment Analysis, Text Splitting, and Embedding)

In [None]:

# Build the sentiment-analysis pipeline that scores each article.
# The labels it returns are matched downstream against "1 star" ... "5 stars".
sentiment_analyzer = pipeline(
    task="sentiment-analysis",
    model="nlptown/bert-base-multilingual-uncased-sentiment",
)

Device set to use cpu


In [11]:
# Ordered (substring, sentiment) pairs: the first substring found in the
# model's star label decides the coarse sentiment; no match means "UNKNOWN".
star_sentiments = (
    ("1 star", "NEGATIVE"),
    ("2 stars", "NEGATIVE"),
    ("3 stars", "NEUTRAL"),
    ("4 stars", "POSITIVE"),
    ("5 stars", "POSITIVE"),
)

# Build a new Document per article whose metadata additionally carries the
# coarse sentiment label and the score reported for the model's top prediction.
news_documents_with_sentiment = []
for article in news_documents:
    # truncation=True is forwarded to the pipeline (presumably so over-long
    # articles are cut to the model's input limit -- confirm in the docs).
    prediction = sentiment_analyzer(article.page_content, truncation=True)[0]
    star_label = prediction['label']
    coarse_label = next(
        (mapped for needle, mapped in star_sentiments if needle in star_label),
        "UNKNOWN",
    )
    news_documents_with_sentiment.append(
        Document(
            page_content=article.page_content,
            # Merge into a fresh dict so the source document's metadata is not mutated.
            metadata={
                **article.metadata,
                "sentiment_label": coarse_label,
                "sentiment_score": prediction['score'],
            },
        )
    )
    
    

In [None]:

print("Splitting documents into chunks...")

# chunk_size and chunk_overlap are tuning knobs worth experimenting with; with
# length_function=len both are measured in characters.
splitter_options = {
    "chunk_size": 500,
    "chunk_overlap": 50,
    "length_function": len,
    "is_separator_regex": False,
}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

# Split every sentiment-annotated article into chunks.
chunks = text_splitter.split_documents(news_documents_with_sentiment)

Splitting documents into chunks...


In [13]:
# Sanity check on the first three chunks: show a content preview and confirm
# the sentiment fields and the original headline are present in the metadata.
for number, piece in enumerate(chunks[:3], start=1):
    details = piece.metadata
    report_lines = [
        f"\n--- Chunk {number} ---",
        f"Content (first 200 chars): {piece.page_content[:200]}...",
        f"Metadata: {details}",
        f"Sentiment Label: {details.get('sentiment_label', 'N/A')}",
        f"Sentiment Score: {details.get('sentiment_score', 'N/A')}",
        f"Original Headline: {details.get('headline', 'N/A')}",
    ]
    print("\n".join(report_lines))


--- Chunk 1 ---
Content (first 200 chars): A majority of major movies had female protagonists for the first time in 2024. At the same time, representation of racial minorities saw little improvement....
Metadata: {'headline': 'Women lead in over half of Hollywood movies for first time', 'source': 'Deutsche Welle', 'url': 'https://www.dw.com/en/women-lead-in-over-half-of-hollywood-movies-for-first-time/a-71578297?maca=en-rss-en-all-1573-rdf', 'date': '2025-02-25 02:00:05', 'sentiment_label': 'NEGATIVE', 'sentiment_score': 0.4495375454425812}
Sentiment Label: NEGATIVE
Sentiment Score: 0.4495375454425812
Original Headline: Women lead in over half of Hollywood movies for first time

--- Chunk 2 ---
Content (first 200 chars): A recovery mission is now under way to find 11-year-old Kaliyah along the Thames, Met Police said....
Metadata: {'headline': 'Girl missing in River Thames named as Kaliyah Coa', 'source': 'BBC News', 'url': 'https://www.bbc.com/news/articles/ckg5v4rp4qzo', 'date': 'Tu

In [14]:
# Name of the sentence-embedding model used to vectorise the chunks.
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"

print("Initializing embeddings model...")
embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)
print("Embeddings model initialized.")

Initializing embeddings model...
Embeddings model initialized.


### Load (Load Embeddings into the Vector Store)

In [None]:
print("Loading chunks into ChromaDB...")

# Give every chunk a deterministic id derived from its source URL, its position
# and its text. Without explicit ids the store assigns fresh random ones, so each
# re-run of this cell against the persisted ./chroma_db appends another copy of
# every chunk and retrieval starts returning duplicates. With stable ids a re-run
# overwrites the existing entries instead.
# The position is part of the key so that identical passages occurring more than
# once never produce the same id within a single batch.
# NOTE: a ./chroma_db directory created by earlier runs still contains the
# randomly-keyed copies; delete it once to get rid of them.
chunk_ids = [
    hashlib.sha256(
        f"{chunk.metadata.get('url', '')}|{position}|{chunk.page_content}".encode("utf-8")
    ).hexdigest()
    for position, chunk in enumerate(chunks)
]

vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    ids=chunk_ids,
    persist_directory="./chroma_db",
)
print("ChromaDB loaded. Your RAG knowledge base is ready!")

# NOTE(review): recent Chroma versions reportedly persist automatically and
# deprecate this call -- confirm against the installed version before removing it.
vectorstore.persist()
print("ChromaDB persisted to disk.")

Loading chunks into ChromaDB...
ChromaDB loaded. Your RAG knowledge base is ready!
ChromaDB persisted to disk.


In [16]:
print("\n--- Performing a quick retrieval test ---")

# Any topic covered by the sampled articles in news.csv works as a query.
test_query = "What's new in science or discoveries?"

# Ask the vector store for the three most relevant chunks.
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})
retrieved_docs = retriever.invoke(test_query)

if not retrieved_docs:
    print("No documents retrieved. Check your ETL and data.")
else:
    print(f"Retrieved {len(retrieved_docs)} documents for query: '{test_query}'")
    for rank, hit in enumerate(retrieved_docs, start=1):
        hit_metadata = hit.metadata
        print(f"\n--- Retrieved Document {rank} ---")
        print(f"Content (first 200 chars): {hit.page_content[:200]}...")
        print(f"Metadata: {hit_metadata}")
        # The sentiment label and headline should have survived the round trip
        # through the vector store.
        print(f"Sentiment: {hit_metadata.get('sentiment_label', 'N/A')}")
        print(f"Source Headline: {hit_metadata.get('headline', 'N/A')}")


--- Performing a quick retrieval test ---
Retrieved 3 documents for query: 'What's new in science or discoveries?'

--- Retrieved Document 1 ---
Content (first 200 chars): Physicists John Hopfield and Geoffrey Hinton have been awarded the 2024 Nobel Prize in physics. They were honored for their research on machine learning with artificial neural networks....
Metadata: {'sentiment_label': 'POSITIVE', 'sentiment_score': 0.45703455805778503, 'url': 'https://www.dw.com/en/john-hopfield-and-geoffrey-hinton-receive-nobel-physics-award-for-ai-advances/a-70374538?maca=en-rss-en-all-1573-rdf', 'headline': 'John Hopfield and Geoffrey Hinton receive Nobel physics award for AI advances', 'date': '2024-12-25 16:57:06', 'source': 'Deutsche Welle'}
Sentiment: POSITIVE
Source Headline: John Hopfield and Geoffrey Hinton receive Nobel physics award for AI advances

--- Retrieved Document 2 ---
Content (first 200 chars): Physicists John Hopfield and Geoffrey Hinton have been awarded the 2024 Nobel Prize