In [None]:
# Mount Google Drive at /content/drive. Later cells read the input CSV from, and
# write the Chroma store to, paths under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install chromadb sentence-transformers tqdm

Collecting chromadb
  Downloading chromadb-1.0.15-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.0 kB)
Collecting pybase64>=1.4.1 (from chromadb)
  Downloading pybase64-1.4.2-cp311-cp311-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl.metadata (8.7 kB)
Collecting posthog<6.0.0,>=2.4.0 (from chromadb)
  Downloading posthog-5.4.0-py3-none-any.whl.metadata (5.7 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.22.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.6 kB)
Collecting opentelemetry-api>=1.2.0 (from chromadb)
  Downloading opentelemetry_api-1.36.0-py3-none-any.whl.metadata (1.5 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opentelemetry_exporter_otlp_proto_grpc-1.36.0-py3-none-any.whl.metadata (2.4 kB)
Collecting opentelemetry-sdk>=1.2.0 (from chromadb)
  Downloading opentelemetry_sdk-1.36.0-py3-none-any.whl.metadata (1.5 k

In [None]:
import os
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
import chromadb
from tqdm import tqdm
import shutil

In [None]:
# 1. Load the cleaned news CSV from Google Drive
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/merged_cleaned.csv')

# Work on the first 10000 rows only. .copy() yields an independent frame so that
# adding columns below does not raise SettingWithCopyWarning.
df_subset = df.head(10000).copy()

# Combine article + highlights into one text column for embedding.
# Missing values are replaced with '' first: NaN + str evaluates to NaN, and a
# NaN "text" would otherwise reach the tokenizer in place of a string.
df_subset['combined'] = (
    df_subset['article'].fillna('') + ' ' + df_subset['highlights'].fillna('')
)

In [None]:
# 2. Load Sentence Embedding Model (MiniLM)
# The plain Hugging Face tokenizer and encoder are loaded here (AutoTokenizer /
# AutoModel), so token vectors are pooled into one sentence vector by hand in
# get_embedding.
model_name = 'sentence-transformers/all-MiniLM-L6-v2'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

In [None]:
# 3. Define Embedding Function
def get_embedding(text):
    """Embed ``text`` with the notebook-level ``tokenizer`` and ``model``.

    The token vectors of the last hidden layer are averaged using the attention
    mask (so padding positions never contribute) and the result is scaled to
    unit L2 length.

    Args:
        text: A string (or a list of strings) to embed. Input longer than 512
            tokens is truncated by the tokenizer.

    Returns:
        numpy.ndarray: one vector for a single string; one row per string when
        a list with more than one element is passed.
    """
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)

    # Run on whatever device the model currently lives on (CPU or GPU).
    device = next(model.parameters()).device
    inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)

    # Attention-mask-weighted mean pooling: padded positions get weight 0.
    hidden = outputs.last_hidden_state
    mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
    pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)

    # Unit-length vectors make L2 distance rank results like cosine similarity.
    # NOTE(review): queries in this notebook use collection.query(query_texts=...),
    # i.e. Chroma's own embedding function, which is understood to return
    # normalised MiniLM vectors — confirm against the Chroma documentation.
    embedding = torch.nn.functional.normalize(pooled, p=2, dim=1)

    # .cpu() is a no-op on CPU and required before .numpy() on GPU.
    return embedding.squeeze().cpu().numpy()

# 4. Generate Embeddings with Progress Bar
# tqdm.pandas() registers `progress_apply` on pandas objects. The call below
# embeds one row at a time; the recorded run took about 29 minutes for 10000 rows.
tqdm.pandas()
df_subset['embedding'] = df_subset['combined'].progress_apply(get_embedding)

100%|██████████| 10000/10000 [28:55<00:00,  5.76it/s]


In [None]:
# 5. Initialize ChromaDB client (persists to the Google Drive folder below)
persist_directory = "/content/drive/MyDrive/Colab Notebooks/chroma_store"

# WARNING: destructive. Any existing store at this path is deleted so the
# collection is rebuilt from scratch each time this cell runs.
if os.path.exists(persist_directory):
    shutil.rmtree(persist_directory)
    print(f"Removed existing directory: {persist_directory}")

# Initialize ChromaDB with the new API
chroma_client = chromadb.PersistentClient(path=persist_directory)


# 6. Create or load Chroma Collection
collection = chroma_client.get_or_create_collection(name="news_collection")

# 7. Insert into ChromaDB in batches
batch_size = 5000  # Choose a batch size smaller than the max (5461)
for i in tqdm(range(0, len(df_subset), batch_size), desc="Adding documents to ChromaDB"):
    batch_df = df_subset.iloc[i:i+batch_size]
    # Chroma takes plain Python lists, so each NumPy vector is converted.
    batch_embeddings = [vector.tolist() for vector in batch_df['embedding']]

    collection.add(
        documents=batch_df['combined'].tolist(),
        embeddings=batch_embeddings,
        ids=batch_df['id'].astype(str).tolist(), # Ensure IDs are strings
        metadatas=[{"source": "rag_csv"} for _ in range(len(batch_df))] # Add some metadata
    )

# 8. Save Chroma to disk - persist happens automatically with PersistentClient
# Report the directory that was actually written to, not a relative placeholder.
print(f"Embeddings and texts saved to ChromaDB at {persist_directory}")

Adding documents to ChromaDB: 100%|██████████| 2/2 [01:40<00:00, 50.39s/it]

✅ Embeddings and texts saved to ChromaDB at ./chroma_store





In [None]:
# Gather every row's embedding vector into one NumPy array and write it to
# embeddings.npy in the current working directory.
embeddings_array = np.asarray(list(df_subset['embedding']))
np.save('embeddings.npy', embeddings_array)

print("Embeddings saved to embeddings.npy")

Embeddings saved to embeddings.npy


In [None]:
# # 9. Perform a similarity search
# query_text = "Artificial intellegence"

# # Query the collection for similar documents
# results = collection.query(
#     query_texts=[query_text],
#     n_results=5 # Get the top 5 most similar results
# )

# # Display the results
# print(f"Query: {query_text}")
# print("-" * 30)

# if results and results['documents']:
#     for i, doc in enumerate(results['documents'][0]):
#         print(f"Result {i+1}:")
#         print(f"  Document: {doc}")
#         # You can also access other information like distance or metadata if needed
#         # print(f"  Distance: {results['distances'][0][i]}")
#         # print(f"  Metadata: {results['metadatas'][0][i]}")
#         print("-" * 10)
# else:
#     print("No results found.")

Query: Artificial intellegence
------------------------------
Result 1:
  Document: 2020 daytoday lives relationships even dinner could controlled run digital versions according futurist john smart within next six years many us could socalled digital twins schedule appointments even conversations others behalf could one day console loved ones die mimicking voice emotions mannerisms thoughts scroll video according futurist john smart within next five years many us could socalled digital twins illustrated schedule appointments make decisions conversations others behalf could even console loved ones die mimicking voice emotions mannerisms thoughts mr smart founder acceleration studies foundation made comments interview business insider digital twins become increasingly like us extensions us smart said ray kurzweil director engineering google believes 30 years humans able upload entire minds computers become digitally immortal event called singularity mr kurzweil also claims biological par

In [None]:
import chromadb

# Initialize ChromaDB client pointing to the persistent directory
# (the same Drive path the store was written to when the collection was built).
persist_directory = "/content/drive/MyDrive/Colab Notebooks/chroma_store"
chroma_client = chromadb.PersistentClient(path=persist_directory)

# Get the existing collection
# get_collection (not get_or_create_collection) is used, so this cell expects the
# collection to exist already — presumably it raises otherwise; verify.
collection = chroma_client.get_collection(name="news_collection")

# You can print the collection to verify it's loaded
print(collection)

Collection(name=news_collection)


In [None]:
!pip install -U langchain-cohere



In [None]:
from google.colab import userdata
import cohere

# Read the API key from Colab's user secrets instead of hard-coding it in the notebook.
COHERE_API_KEY = userdata.get('COHERE_API_KEY')

# Initialize the Cohere client
co = cohere.Client(COHERE_API_KEY)

print("Cohere client initialized.")

Cohere client initialized.


In [None]:
!pip install deep_translator -q

In [None]:
from deep_translator import GoogleTranslator

def generate_english_summary(query_text, collection, cohere_client, model_name="command-r-plus", n_results=5):
    """Retrieve documents for a query and ask Cohere for an English news brief.

    Args:
        query_text: The user's question; used both for retrieval and in the prompt.
        collection: Object exposing ``query(query_texts=..., n_results=...)`` that
            returns a mapping whose 'documents' entry holds one list of document
            texts per query.
        cohere_client: Object exposing ``generate(...)`` whose result provides
            ``generations[0].text``.
        model_name: Model identifier forwarded to ``cohere_client.generate``.
        n_results: Number of documents requested from the collection.

    Returns:
        tuple: ``(summary, None)`` on success. ``(None, "No relevant documents
        found.")`` when retrieval yields no usable text. ``(None, message)`` with
        the exception text when anything inside raises.
    """
    try:
        # Retrieve relevant documents from ChromaDB
        results = collection.query(
            query_texts=[query_text],
            n_results=n_results
        )

        # Treat a missing, None, or empty 'documents' entry as "no hits" rather
        # than letting the indexing below raise and be reported as a generation error.
        result_lists = results.get('documents') or [[]]
        # Drop None/empty documents so the join below cannot fail on them.
        documents = [doc for doc in (result_lists[0] or []) if doc]
        if not documents:
            return None, "No relevant documents found."

        # Combine context
        context = "\n\n".join(documents)

        # Prompt to generate summary in English
        prompt_en = f"""You are a News Analyst. Summarize the following news articles into a neutral, concise news brief with bullet points in English.

Context:
{context}

Question:
{query_text}

Answer:""".strip()

        response_en = cohere_client.generate(
            model=model_name,
            prompt=prompt_en,
            max_tokens=500,
            temperature=0.3,
            stop_sequences=["--"]
        )

        english_summary = response_en.generations[0].text.strip()
        return english_summary, None

    except Exception as e:
        # Deliberate best effort: failures are returned to the caller as text.
        return None, f"An error occurred during English generation: {e}"


In [None]:
def _translate_to_targets(english_text, target_languages):
    """Return ``{'en': english_text, <lang>: translation, ...}`` for each non-English target."""
    responses = {'en': english_text}
    for lang in target_languages:
        if lang == 'en':
            continue
        try:
            responses[lang] = GoogleTranslator(source='en', target=lang).translate(english_text)
        except Exception as e:
            # Best effort: one failing language must not discard the others.
            responses[lang] = f"[Translation failed for {lang}]: {e}"
    return responses


def generate_response(query_text, collection, cohere_client, target_languages, model_name="command-r-plus", n_results=5):
    """Produce the English summary for a query plus its translations.

    Args:
        query_text: The user's question.
        collection: Forwarded to ``generate_english_summary``.
        cohere_client: Forwarded to ``generate_english_summary``.
        target_languages: Iterable of language codes; 'en' entries are not translated.
        model_name: Forwarded to ``generate_english_summary``.
        n_results: Forwarded to ``generate_english_summary``.

    Returns:
        dict: Maps language code to text. 'en' is always present. When no summary
        could be produced, the text is the message reported by
        ``generate_english_summary`` (or "No relevant documents found." if it gave
        none), so a real failure is no longer presented as "no documents".
    """
    # Step 1: Get English summary
    english_summary, error = generate_english_summary(query_text, collection, cohere_client, model_name, n_results)

    # Step 2: If error or no summary, translate the actual reason
    if error or english_summary is None:
        return _translate_to_targets(error or "No relevant documents found.", target_languages)

    # Step 3: Translate English summary
    return _translate_to_targets(english_summary, target_languages)


In [None]:
query = "what is currently going on in the education sector?"
languages = ['en', 'hi', 'mr']  # English, Hindi, Marathi
# Display names for the menu, built once. A code without an entry is shown as-is
# instead of raising KeyError.
language_names = {'en': 'English', 'hi': 'Hindi', 'mr': 'Marathi'}

responses = generate_response(query, collection, co, languages)

# generate_response as defined in this notebook returns language codes only, so
# this branch is just a safeguard.
if 'error' in responses:
    print(responses['error'])
else:
    print("Please select your preferred language:")
    for lang in languages:
        print(f"Enter '{lang}' for {language_names.get(lang, lang)} response.")

    preferred_language = input("Your choice (default is 'en'): ").strip().lower()

    # Default to English if invalid input
    if preferred_language not in responses:
        preferred_language = 'en'
        print("\nInvalid or no input. Showing English response by default.")

    print("\n--- Response ---")
    print(responses[preferred_language])


Please select your preferred language:
Enter 'en' for English response.
Enter 'hi' for Hindi response.
Enter 'mr' for Marathi response.
Your choice (default is 'en'): mr

--- Response ---
- नॉरफोकमधील नवीन माध्यमिक शाळा, जेन ऑस्टेन Academy कॅडमीने त्याऐवजी जास्त काळ शाळेचे दिवस निवडून गृहपाठ बंदी घातली आहे. 
- मायक्रोसॉफ्टमधील माजी प्रोग्राम मॅनेजर पॅट्रिक अवुह यांनी स्थापन केलेल्या घाना येथील अशीसी विद्यापीठाची 10 वी वर्धापन दिन साजरा करीत आहे. विद्यापीठाने उदार कला दृष्टिकोनासह तांत्रिक प्रमुखांना एकत्र केले आहे आणि आफ्रिकेतील पुढील पिढी नेत्यांना चालना देण्याचे उद्दीष्ट आहे.
- ग्रेटर मँचेस्टरमधील एका शिक्षक, कॅरोलिन मोलिनेक्स यांना मॅकडोनाल्ड्समधील जीवशास्त्र पुनरावृत्ती सत्रात विद्यार्थ्यांना गुंतवून ठेवण्यासाठी तिच्या अनोख्या दृष्टिकोनासाठी राष्ट्रीय अध्यापन पुरस्कारासाठी नामांकन देण्यात आले आहे.
- एक भयानक चक्रीवादळ ओक्लाहोमामधील प्राथमिक शाळेत धडकला, शिक्षक आणि विद्यार्थी सुरक्षिततेसाठी बाथरूममध्ये अडकले. व्हिडिओ फुटेज भयानक विद्यार्थ्यांना धीर देण्याच्या विनाश आणि शिक्षकांच्या 

In [None]:
!pip install gTTS pydub simpleaudio



In [None]:
from gtts import gTTS
from pydub import AudioSegment
import io
from IPython.display import Audio, display # Import Audio and display

def speak_multilang(responses: dict):
    """Print every response and show an inline audio player for it.

    Args:
        responses: Maps a language code to the text to print and synthesise.
            The code is passed to gTTS as its ``lang`` argument.

    Each entry is handled independently: if speech synthesis or display raises,
    the error is printed for that language and the loop moves on. The player is
    created with ``autoplay=False``.
    """
    for code in responses:
        spoken_text = responses[code]
        print(f"\n--- {code.upper()} ---")
        print(spoken_text)
        try:
            # Synthesise straight into memory; nothing is written to disk.
            buffer = io.BytesIO()
            gTTS(text=spoken_text, lang=code).write_to_fp(buffer)

            # getvalue() returns the whole buffer regardless of the write position.
            player = Audio(buffer.getvalue(), autoplay=False)
            display(player)
            print(f"[Playing audio in {code}]")

        except Exception as err:
            print(f"[TTS Error in {code}]: {err}")

In [None]:
# "chess" was misspelled as "chaess"; the query text is what gets embedded for
# retrieval, so the typo is corrected here.
query = "recent world chess tournament updates?"
languages = ['en', 'hi', 'mr']

responses = generate_response(query, collection, cohere_client=co, target_languages=languages)

# generate_response as defined in this notebook returns language codes only, so
# this branch is just a safeguard.
if 'error' in responses:
    print(responses['error'])
else:
    # Speak responses in all selected languages
    speak_multilang(responses)



--- EN ---
- Here is a summary of the news articles provided: 

- Charlotte Casiraghi, the Monegasque princess and model showjumper, took a tumble during the Eiffel Jumping competition in Paris. Despite the fall, she quickly got back on her feet and continued the event. 

- Roger Federer defeated Andy Murray to advance to the semifinals of the Australian Open, setting up a match with arch-rival Rafael Nadal. This will be their first meeting in the semifinals in two years, with Nadal holding a winning record over Federer. 

- Lu Lingzi, a 23-year-old graduate student from China, was remembered at a memorial service one week after her death in the Boston Marathon bombings. Lu was described as a passionate pianist, a loving daughter, and an excellent statistics student. 

- Manhattan experienced the phenomenon known as "Manhattanhenge," where the sun aligns perfectly with the city's grid system. Hundreds of people gathered to witness and photograph the event, which occurs four times a ye

[Playing audio in en]

--- HI ---
- यहां समाचार लेखों का सारांश दिया गया है: 

- शार्लोट कैसिरघी, मोनेगास्क राजकुमारी और मॉडल शोजम्पर, पेरिस में एफिल जंपिंग प्रतियोगिता के दौरान एक टम्बल ले गए। गिरावट के बावजूद, वह जल्दी से अपने पैरों पर वापस आ गई और इस कार्यक्रम को जारी रखा। 

- रोजर फेडरर ने एंडी मरे को हराकर ऑस्ट्रेलियन ओपन के सेमीफाइनल को आगे बढ़ाने के लिए, आर्क-प्रतिद्वंद्वी राफेल नडाल के साथ एक मैच स्थापित किया। यह दो साल में सेमीफाइनल में उनकी पहली बैठक होगी, जिसमें नडाल ने फेडरर पर एक विजयी रिकॉर्ड बनाया। 

-चीन के 23 वर्षीय स्नातक छात्र लू लिंगजी को बोस्टन मैराथन बम विस्फोटों में उनकी मृत्यु के एक सप्ताह बाद एक स्मारक सेवा में याद किया गया था। लू को एक भावुक पियानोवादक, एक प्यार करने वाली बेटी और एक उत्कृष्ट सांख्यिकी छात्र के रूप में वर्णित किया गया था। 

- मैनहट्टन ने "मैनहट्टनहेन" के रूप में जानी जाने वाली घटना का अनुभव किया, जहां सूरज शहर के ग्रिड सिस्टम के साथ पूरी तरह से संरेखित करता है। सैकड़ों लोग इस घटना को गवाह और फोटो खिंचवाने के लिए इकट्ठा हुए, जो साल में चार बार ह

[Playing audio in hi]

--- MR ---
- येथे प्रदान केलेल्या बातम्यांचा सारांश येथे आहे: 

- पॅरिसमधील आयफेल जंपिंग स्पर्धेदरम्यान शार्लोट कॅसिरागी, मोनेगास्क राजकुमारी आणि मॉडेल शोज्परने एक गोंधळ उडाला. गडी बाद होण्याचा क्रम असूनही, ती पटकन तिच्या पायावर परत आली आणि कार्यक्रम चालू ठेवला. 

- रॉजर फेडररने अ‍ॅन्डी मरेला ऑस्ट्रेलियन ओपनच्या उपांत्य फेरीत प्रवेश करून कमान-प्रतिस्पर्धी राफेल नदालशी सामना सुरू केला. दोन वर्षांत उपांत्य फेरीत त्यांची पहिली बैठक असेल आणि नदालने फेडररवर विजय मिळविला. 

-चीनमधील 23 वर्षीय पदवीधर विद्यार्थी ल्यु लिंगझी यांना बोस्टन मॅरेथॉन बॉम्बस्फोटात तिच्या मृत्यूनंतर एका आठवड्यानंतर स्मारक सेवेत आठवले. लूचे वर्णन उत्कट पियानो वादक, एक प्रेमळ मुलगी आणि एक उत्कृष्ट आकडेवारी विद्यार्थी म्हणून केले गेले. 

- मॅनहॅटनने "मॅनहॅटनहेंज" म्हणून ओळखल्या जाणार्‍या घटनेचा अनुभव घेतला, जिथे सूर्य शहराच्या ग्रीड सिस्टमशी पूर्णपणे संरेखित होतो. वर्षातून चार वेळा उद्भवणार्‍या कार्यक्रमाची साक्ष देण्यासाठी आणि छायाचित्र काढण्यासाठी शेकडो लोक जमले. 

- इंग्लंडच्या विश्वचषक संघाचा म

[Playing audio in mr]
