In [1]:
import json
import time
import random
import subprocess
import argparse
from ollama import chat
from ollama import ChatResponse
import re
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
import numpy as np

In [2]:
def load_processed_articles(file_path="processed_articles.json"):
    """Read the UTF-8 JSON file at ``file_path`` and return its decoded contents."""
    with open(file_path, mode="r", encoding="utf-8") as handle:
        articles = json.load(handle)
    return articles

def aggregate_topic_text(topic_dict):
    """
    Concatenate the non-empty article texts of one topic into a single string.

    Args:
        topic_dict (dict): Topic record. Its "url_content" entry is expected to
            be a list of [source_url, article_text] pairs.

    Returns:
        str: The stripped article texts joined with newlines, or an empty
        string when the topic has no usable text.

    Malformed entries (not a list/tuple, fewer than two items, or a text that
    is missing / not a string) are skipped instead of raising.
    """
    texts = []
    # `or []` also covers a "url_content" key that is present but null.
    for pair in topic_dict.get("url_content") or []:
        # Each pair is expected to be [source_url, article_text].
        if not isinstance(pair, (list, tuple)) or len(pair) < 2:
            continue
        article_text = pair[1]
        if not isinstance(article_text, str):
            continue
        article_text = article_text.strip()
        if article_text:
            texts.append(article_text)
    return "\n".join(texts)

def create_text_chunks(text, chunk_size=1000, chunk_overlap=200):
    """
    Break ``text`` into chunks with LangChain's RecursiveCharacterTextSplitter,
    configured with the given ``chunk_size`` and ``chunk_overlap``, and return
    whatever the splitter's ``split_text`` produces.
    """
    return RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    ).split_text(text)

# Embedding models already constructed in this session, keyed by model name.
_EMBEDDING_MODEL_CACHE = {}


def _get_embedding_model(model_name="all-MiniLM-L6-v2"):
    """Return a HuggingFaceEmbeddings instance for ``model_name``, building it only once."""
    model = _EMBEDDING_MODEL_CACHE.get(model_name)
    if model is None:
        model = HuggingFaceEmbeddings(model_name=model_name)
        _EMBEDDING_MODEL_CACHE[model_name] = model
    return model


def create_embeddings_for_chunks(chunks):
    """
    Create embeddings for each text chunk using a local HuggingFace model.
    (These embeddings can be used for further retrieval tasks if needed.)

    Args:
        chunks: Iterable of text chunks.

    Returns:
        list: One embedding per chunk, in the same order as ``chunks``; an
        empty list when there are no chunks (the model is not loaded then).
    """
    chunks = list(chunks)
    if not chunks:
        return []
    # The model is cached so it is not re-instantiated for every topic, and
    # all chunks are embedded in one batched call instead of one call each.
    return _get_embedding_model().embed_documents(chunks)

def cosine_similarity(a, b):
    """
    Compute cosine similarity between two vectors.

    Args:
        a, b: Vectors (array-likes accepted by numpy) of matching length.

    Returns:
        dot(a, b) / (||a|| * ||b||), or 0.0 when either vector has zero norm.
        Without that guard the ratio is 0/0, which numpy evaluates to NaN and
        which would then corrupt any ranking based on the similarities.
    """
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        return 0.0
    return np.dot(a, b) / denominator

def get_chat_response(system_prompt: str, user_prompt: str, model: str = "deepseek-r1:8b") -> str:
    """
    Get a chat response from the specified model using both a system and a user prompt.

    The reply is streamed: each piece is echoed to stdout as it arrives, and
    the pieces are concatenated into the returned text.

    Args:
        system_prompt (str): The system message that sets the context and behavior.
        user_prompt (str): The user message with the actual task.
        model (str): The model to use (default: deepseek-r1:8b).

    Returns:
        str: The model output with any <think>...</think> reasoning sections
        removed and surrounding whitespace stripped.
    """
    # With stream=True the call is consumed as an iterable of partial
    # responses, so it is not annotated as a single ChatResponse.
    stream = chat(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ],
        stream=True,
    )

    pieces = []
    for chunk in stream:
        # Guard against a piece without text so printing/joining cannot fail.
        content = chunk["message"]["content"] or ""
        print(content, end="", flush=True)
        pieces.append(content)
    print("\n")

    full_response = "".join(pieces)
    # Remove reasoning sections. The `\Z` alternative also removes a <think>
    # block that was never closed (e.g. generation cut off mid-reasoning), so
    # raw reasoning is not returned as if it were the answer.
    full_response = re.sub(
        r"<think>.*?(?:</think>|\Z)", "", full_response, flags=re.DOTALL
    ).strip()
    return full_response


In [3]:
NO_OF_ARTICLES = 5 # Max number of topic records taken from the start of the loaded list in main().
NO_OF_CHUNKS = 3 # Max number of chunks per topic selected as input for summarization.

def _select_central_chunks(chunks, embeddings, limit):
    """Return up to ``limit`` chunks whose embeddings are most similar to the mean embedding."""
    embeddings_arr = np.array(embeddings)
    centroid = embeddings_arr.mean(axis=0)
    similarities = [cosine_similarity(e, centroid) for e in embeddings_arr]
    count = min(limit, len(chunks))
    # Indices of the `count` most similar chunks, most similar first.
    top_indices = sorted(
        range(len(similarities)), key=lambda i: similarities[i], reverse=True
    )[:count]
    return [chunks[i] for i in top_indices]


def main():
    """
    Summarize the first NO_OF_ARTICLES topics of processed_articles.json.

    For each topic: aggregate its article texts, split them into chunks, embed
    the chunks, keep the NO_OF_CHUNKS chunks closest to the embedding centroid,
    and ask the chat model for a one-paragraph summary with a title. Topics
    without text are skipped. The results are written to
    summarized_articles.json as a list of {"topic_name", "summary"} objects.
    """
    articles = load_processed_articles("processed_articles.json")
    articles = articles[:NO_OF_ARTICLES]
    final_results = []

    system_prompt = (
        "You are a highly factual and SEO-optimized summarizer. "
        "Your task is to produce concise, authoritative, and fully factual summaries. "
        "Avoid hallucinations and ensure the output includes relevant SEO keywords."
    )

    for topic_dict in articles:
        topic_name = topic_dict.get("topic_name", "Unknown Topic")
        print(f"\nProcessing topic: {topic_name}")
        aggregated_text = aggregate_topic_text(topic_dict)
        if not aggregated_text.strip():
            print(f"No text found for topic: {topic_name}")
            continue

        chunks = create_text_chunks(aggregated_text, chunk_size=1000, chunk_overlap=200)
        if not chunks:
            # Nothing to embed; the centroid of an empty set would be NaN.
            print(f"No text chunks produced for topic: {topic_name}")
            continue
        embeddings = create_embeddings_for_chunks(chunks)

        # Make use of embeddings: keep the chunks closest to the centroid.
        selected_chunks = _select_central_chunks(chunks, embeddings, NO_OF_CHUNKS)
        input_text = "\n".join(selected_chunks)

        # The first literal is deliberately NOT an f-string: the braces around
        # "actual title of the blog" are sent to the model verbatim. The blank
        # line separates the instruction from the text to be summarized.
        user_prompt = (
            "Please summarize the following text into a concise single paragraph suitable for a blog post. Also provide a suitable title for the blog post. Mention the title as well in the summary in the format Title : {actual title of the blog}.\n\n"
            f"Text: {input_text}"
        )

        summary = get_chat_response(system_prompt, user_prompt, model="deepseek-r1:8b")

        final_results.append({
            "topic_name": topic_name,
            "summary": summary
        })

    with open("summarized_articles.json", "w", encoding="utf-8") as f:
        json.dump(final_results, f, indent=4, ensure_ascii=False)
    

if __name__ == "__main__":
    # Run the whole pipeline once and report how long it took (wall clock).
    start_time = time.time()
    main()
    end_time = time.time()
    elapsed_seconds = end_time - start_time
    print(f"\nTotal time taken: {elapsed_seconds:.2f} seconds.")



Processing topic: tennis


  hf_embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
  from .autonotebook import tqdm as notebook_tqdm


<think>
Okay, so I need to help summarize this text into a concise single paragraph for a blog post. The user also wants a suitable title mentioned in the summary as Title: {actual title}. Let's see what the text is about.

The text talks about Emma Raducanu having a stalker removed during her match at the Dubai Tennis Championships. He was following her to various events, and she felt distressed seeing him. The man was detained and given a restraining order, though charges were dropped after he agreed to stay away. There's also mention of security measures and a former coach confirming the stalker's presence.

I need to make sure the summary includes key points: Raducanu, the incident during the match, the stalker being removed, his history of following her, charges being dropped, security actions, and the broader issue of safety for female athletes.

Also, SEO optimization is important. So I should include relevant keywords like Emma Raducanu, Dubai Tennis Championships, stalker, sec