In [1]:
!pip install faiss-cpu sentence-transformers transformers

Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_6

In [2]:
!unzip -q /content/summarized_data.zip -d /content/data


In [4]:
import glob

# Get a list of all .txt files in the unzipped data directory
news_files = glob.glob("/content/data/summarized_data/*.txt")

# Initialize a list to store the content of each news file
news_data = []

# Loop through each file, read its content, and append it to the list
for file in news_files:
    with open(file, "r", encoding="utf-8") as f:
        news_data.append(f.read())

print(f"Loaded {len(news_data)} news articles.")


Loaded 53 news articles.


In [5]:
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

# Initialize the embedding model (using a lightweight, efficient model)
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Generate embeddings for all loaded news articles
embeddings = embedding_model.encode(news_data, convert_to_numpy=True)
print("Generated embeddings for all news articles.")

# Determine the dimensionality of the embeddings
embedding_dim = embeddings.shape[1]

# Create a FAISS index for L2 (Euclidean) distance search
faiss_index = faiss.IndexFlatL2(embedding_dim)
faiss_index.add(embeddings)
print(f"FAISS index created with {faiss_index.ntotal} articles.")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Generated embeddings for all news articles.
FAISS index created with 53 articles.


In [6]:
def retrieve_similar_articles(query, top_k=3):
    # Encode the query into the same embedding space
    query_embedding = embedding_model.encode([query], convert_to_numpy=True)
    distances, indices = faiss_index.search(query_embedding, top_k)

    # Return the retrieved articles based on the indices
    return [news_data[i] for i in indices[0]]

# Test the retrieval function
test_query = "Latest developments in renewable energy"
similar_articles = retrieve_similar_articles(test_query)
print("Sample retrieved article snippets:")
for article in similar_articles:
    print(article[:200], "...\n")  # printing first 200 characters of each article


Sample retrieved article snippets:
Title: How the world’s largest tech event is forging a future AI economy
Summary:
By bringing together diverse stakeholders from across the globe, the event is poised to shape the future of the AI eco ...

Title: AI and robots take center stage at ‘world’s largest tech event’
Summary:
“I think what (was) very exciting this year (was) the focus on AI and deep tech,” said Trixie LohMirmand, executive vice ...

Title: Israel and Hamas are fighting a battle of narratives over Sinwar’s death
Summary:
Mohammed Huwais/AFP/Getty Images

‘Truth is in the eye of the beholder’

Gil Siegal, a legal scholar and head o ...



In [42]:
from transformers import pipeline

# Initialize the text generation pipeline with GPT-2 Medium
generator = pipeline(
    "text-generation",
    model="gpt2-medium",
    pad_token_id=50256
)


Device set to use cpu


In [47]:
def generate_response(query):
    # Retrieve the top 2 relevant news articles to reduce prompt length
    context_articles = retrieve_similar_articles(query, top_k=2)

    # Combine the retrieved articles into a context block and trim to 500 characters
    context_text = "\n".join(context_articles)[:500]

    # Debug: Print the context text to verify it's non-empty
    print("Retrieved Context (truncated):\n", context_text)

    # Construct a revised prompt with explicit instructions for summarization
    prompt = (
        "You are an intelligent assistant that summarizes news articles using only the provided text.\n\n"
        "News Information:\n"
        f"{context_text}\n\n"
        "Please provide a concise summary in a few complete sentences of the main trends mentioned in the above news information. "
        "Do not add any information that is not present in the text.\n"
        f"Query: {query}\n"
        "Answer:"
    )

    # Debug: Print the full prompt for verification
    #print("\nConstructed Prompt:\n", prompt)

    # Generate the response using the generative model
    output = generator(
        prompt,
        max_new_tokens=100,      # Reserve tokens for the generated answer
        truncation=True,
        do_sample=True,
        temperature=0.7,
        num_return_sequences=1,
        min_new_tokens=10,
        return_full_text=False   # Return only the newly generated text
    )[0]['generated_text']



    response = output.strip()
    return response

# Test the chatbot with a sample query
user_query = "provide some news about recent technology?"
chat_response = generate_response(user_query)
print("\nChatbot Response:")
print(chat_response)


Retrieved Context (truncated):
 Title: AI and robots take center stage at ‘world’s largest tech event’
Summary:
“I think what (was) very exciting this year (was) the focus on AI and deep tech,” said Trixie LohMirmand, executive vice president of Dubai World Trade Centre and CEO at KAOUN International, which organizes the event. And that gives everybody an opportunity to take (market) share from their competitors, build new markets and grow.”

The show, which debuted in 1981 as the Gulf Computer Exhibition in a single hall at t

Chatbot Response:
“On the whole,” says Trixie LohMirmand,” the news is very interesting, and has a lot of variety.”

““We are excited about developing new products, new services and new ideas that will be important for the future of our industry.” ““““

““Our audience is very active here,” says Trixie LohMirm
