**Extract The Plain Text**

In [1]:
import json
from pathlib import Path
import re
from nltk.tokenize import sent_tokenize
from docx import Document

def extract_clean_text_from_docx(docx_path: str):
    """
    Extract text from a Word document (.docx) and merge headings into their paragraphs.

    The document is walked paragraph by paragraph and collapsed into text blocks:

    * A paragraph whose style name starts with "Heading" is held back and
      prefixed (as "<heading>: ") to the next bullet or normal paragraph.
    * Consecutive paragraphs whose text starts with "●" or "-" are joined with
      single spaces into one block; a blank paragraph, a heading or a normal
      paragraph ends the run.
    * Any other non-empty paragraph becomes a block of its own.
    * A heading that is directly followed by another heading, or that is the
      last content in the document, is kept as a block of its own so that its
      text is not lost.

    Runs of whitespace inside a paragraph are collapsed to a single space.
    Each block is then split with ``sent_tokenize``; blocks are numbered from 1
    in document order and every sentence carries the number of its block.

    Args:
        docx_path: Path to the .docx file, passed straight to ``Document``.

    Returns:
        list of dicts { "paragraph_id": int, "sentence": str }
    """
    doc = Document(docx_path)

    paragraphs_text = []    # finished blocks, in document order
    bullet_buffer = []      # bullets of the run currently being collected
    pending_heading = None  # heading waiting for the content that follows it

    def flush_bullets():
        """Close the current bullet run and store it as one block."""
        if bullet_buffer:
            paragraphs_text.append(" ".join(bullet_buffer))
            bullet_buffer.clear()

    def flush_heading():
        """Store a heading that never received content as its own block."""
        nonlocal pending_heading
        if pending_heading:
            paragraphs_text.append(pending_heading)
            pending_heading = None

    def attach_heading(body):
        """Prefix the pending heading (if any) to ``body`` and consume it."""
        nonlocal pending_heading
        if pending_heading:
            body = f"{pending_heading}: {body}"
            pending_heading = None
        return body

    for para in doc.paragraphs:
        text = para.text.strip()
        style = para.style
        # Tolerate a missing style as well as a style without a name.
        style_name = (style.name or "") if style else ""

        if not text:
            # A blank paragraph ends a bullet run but keeps a pending heading.
            flush_bullets()
            continue

        # normalize whitespace
        text = re.sub(r'\s+', ' ', text)

        if style_name.startswith("Heading"):
            flush_bullets()
            # Previously a heading followed by another heading was overwritten
            # and disappeared from the output; emit it before replacing it.
            flush_heading()
            pending_heading = text
            continue

        if text.startswith("●") or text.startswith("-"):
            # Only the first bullet of a run can pick up the heading, because
            # attach_heading() consumes it.
            bullet_buffer.append(attach_heading(text))
        else:
            flush_bullets()
            paragraphs_text.append(attach_heading(text))

    # Leftover bullets first, then a trailing heading, to keep document order.
    flush_bullets()
    flush_heading()

    sentences_with_para = []
    for paragraph_id, para_text in enumerate(paragraphs_text, start=1):
        for sent in sent_tokenize(para_text):
            sentences_with_para.append({
                "paragraph_id": paragraph_id,
                "sentence": sent.strip()
            })

    return sentences_with_para


def reconstruct_clean_text(sentences_with_para):
    """Rebuild paragraph text from sentence records.

    Consecutive records that share the same ``"paragraph_id"`` are joined with
    single spaces into one paragraph; the paragraphs are then joined with a
    blank line ("\\n\\n"). Records are taken in the order given, so an id that
    reappears after a different id starts a new paragraph.

    Args:
        sentences_with_para: iterable of dicts with "paragraph_id" and
            "sentence" keys.

    Returns:
        The reconstructed text, or an empty string when there are no records.
    """
    runs = []        # one list of sentences per paragraph, in order
    last_id = None   # paragraph id of the run currently being extended

    for entry in sentences_with_para:
        entry_id = entry["paragraph_id"]
        if not runs or entry_id != last_id:
            # First record overall, or the id changed: open a new paragraph.
            runs.append([entry["sentence"]])
            last_id = entry_id
        else:
            runs[-1].append(entry["sentence"])

    return "\n\n".join(" ".join(run) for run in runs)


def save_text_to_file(text: str, output_path: str) -> None:
    """Write ``text`` to ``output_path`` as UTF-8 and print a confirmation line.

    Args:
        text: The content to write.
        output_path: Destination file path.
    """
    destination = Path(output_path)
    destination.write_text(text, encoding="utf-8")
    print(f"Cleaned text saved to: {output_path}")


if __name__ == "__main__":
    # Source document and the two artefacts this cell writes.
    # NOTE(review): these are absolute, machine-specific paths; the same
    # folder is hard-coded again in the later cells of this notebook.
    docx_file = r"C:\Users\Shahe\my_server\RAG\rag_sys\political\Transparency and efficiency.docx"
    output_file = r"C:\Users\Shahe\my_server\RAG\rag_sys\political\plain_text.txt"
    json_output_file = Path(r"C:\Users\Shahe\my_server\RAG\rag_sys\political\sentences_data.json")

    # .docx -> sentence records -> paragraph text -> plain-text file
    sentences_data = extract_clean_text_from_docx(docx_file)
    cleaned_text = reconstruct_clean_text(sentences_data)
    save_text_to_file(cleaned_text, output_file)

    # Show only the first 1000 characters so the cell output stays short.
    print("CLEANED TEXT (Preview):\n", cleaned_text[:1000])

    # Persist the sentence records; ensure_ascii=False keeps non-ASCII
    # characters readable in the JSON file.
    sentences_json = json.dumps(sentences_data, ensure_ascii=False, indent=2)
    json_output_file.write_text(sentences_json, encoding="utf-8")

    print(f"sentences_data saved to JSON: {json_output_file}")


Cleaned text saved to: C:\Users\Shahe\my_server\RAG\rag_sys\political\plain_text.txt
CLEANED TEXT (Preview):
 Introduction: There is a delicate balance between transparency and decision-making effectiveness, as transparency can be a double-edged weapon. Although it strengthens democracy and allows citizens and stakeholders to participate in the decision-making process, it may sometimes be hampered by increased complexity and delays.

On the one hand, opponents of transparency consider that they may lose effectiveness to decision-making processes, as they can cause delays in decision-making due to multiple public debates and intervention by external actors. On the other hand, proponents of transparency see it as enhancing legitimacy and accountability and ensuring that decisions are made in the public's interest.

Thus, there seems to be a need for an ideal balance between transparency and decision-making effectiveness, as transparency must be made available to citizens and stakeholders

**Embedding sentences from JSON**

In [2]:
import json
from pathlib import Path
from sentence_transformers import SentenceTransformer

# --- Input / output paths ---
json_file = r"C:\Users\Shahe\my_server\RAG\rag_sys\political\sentences_data.json"  # your input file
output_file = r"C:\Users\Shahe\my_server\RAG\rag_sys\political\sentences_with_embeddings.json"

# --- Load sentences (expects a list of dicts like {"paragraph_id": ..., "sentence": ...}) ---
sentences = json.loads(Path(json_file).read_text(encoding="utf-8"))

# --- Load embedding model ---
model = SentenceTransformer("multi-qa-mpnet-base-dot-v1")

# --- Compute embeddings while preserving paragraph_id ---
# Encode every sentence in a single call instead of one call per sentence, so
# the library can batch the inputs. The result has one row per input sentence,
# in input order.
# NOTE(review): vectors are normalized here, while the chunk- and
# paragraph-level cells of this notebook call encode() without
# normalize_embeddings — confirm which convention the retrieval step expects.
sentence_texts = [record["sentence"] for record in sentences]
embedding_matrix = model.encode(
    sentence_texts,
    normalize_embeddings=True  # optional but good for cosine similarity
)

sentences_with_embeddings = [
    {
        "paragraph_id": record["paragraph_id"],
        "sentence": record["sentence"],
        "embedding": vector.tolist(),  # plain list so it is JSON-serializable
    }
    for record, vector in zip(sentences, embedding_matrix)
]

# --- Save to JSON ---
Path(output_file).write_text(
    json.dumps(sentences_with_embeddings, ensure_ascii=False, indent=2),
    encoding="utf-8"
)

print(f"Saved sentences with embeddings to: {output_file}")


Saved sentences with embeddings to: C:\Users\Shahe\my_server\RAG\rag_sys\political\sentences_with_embeddings.json


**Clustering By Topic**

In [3]:
import json
import numpy as np
from pathlib import Path
from bertopic import BERTopic

# ----------------------------
# 1. Load the sentence records (text + precomputed embedding)
# ----------------------------
embeddings_json_path = Path(
    r"C:\Users\Shahe\my_server\RAG\rag_sys\political\sentences_with_embeddings.json"
)
sentences_data = json.loads(embeddings_json_path.read_text(encoding="utf-8"))

# Three parallel sequences, all in file order.
texts = [item["sentence"] for item in sentences_data]
paragraph_ids = [item["paragraph_id"] for item in sentences_data]
embeddings = np.array([item["embedding"] for item in sentences_data])

print(f"Loaded {len(texts)} sentences with embeddings of shape {embeddings.shape}")

# ----------------------------
# 2. Create and fit BERTopic
# ----------------------------
# The embeddings were computed in an earlier step, so BERTopic gets no
# embedding model of its own and receives the vectors directly.
# NOTE(review): no random seed is set in this cell; presumably topic
# assignments can differ between runs — confirm if reproducibility matters.
topic_model = BERTopic(embedding_model=None, verbose=True)
topics, probs = topic_model.fit_transform(texts, embeddings)

# ----------------------------
# 3. Inspect topics
# ----------------------------
# One line per sentence with its paragraph and assigned topic.
for sentence, assigned_topic, para_id in zip(texts, topics, paragraph_ids):
    print(f"[Paragraph {para_id}] Topic {assigned_topic}: {sentence}")

# Overview table of all topics.
print("\n=== Topic Info ===")
print(topic_model.get_topic_info())  # shows topic id, size, and name keywords

# Keywords of a single topic.
print("\n=== Top words in topic 0 ===")
print(topic_model.get_topic(0))  # list of top words with scores

# ----------------------------
# 4. (Optional) Save topics per sentence to JSON
# ----------------------------
# Embeddings are intentionally left out of this file; int() turns the topic id
# into a plain Python int for json.dumps.
output = [
    {
        "paragraph_id": item["paragraph_id"],
        "sentence": item["sentence"],
        "topic": int(topic_id),
    }
    for item, topic_id in zip(sentences_data, topics)
]

topics_json_path = Path(r"C:\Users\Shahe\my_server\RAG\rag_sys\political\sentences_with_topics.json")
topics_json_path.write_text(
    json.dumps(output, ensure_ascii=False, indent=2),
    encoding="utf-8"
)
print("\nSaved topics to sentences_with_topics.json")


2025-09-18 22:59:17,123 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm


Loaded 108 sentences with embeddings of shape (108, 768)


2025-09-18 22:59:42,470 - BERTopic - Dimensionality - Completed ✓
2025-09-18 22:59:42,487 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-09-18 22:59:42,547 - BERTopic - Cluster - Completed ✓
2025-09-18 22:59:42,570 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-09-18 22:59:42,619 - BERTopic - Representation - Completed ✓


[Paragraph 1] Topic 0: Introduction: There is a delicate balance between transparency and decision-making effectiveness, as transparency can be a double-edged weapon.
[Paragraph 1] Topic 0: Although it strengthens democracy and allows citizens and stakeholders to participate in the decision-making process, it may sometimes be hampered by increased complexity and delays.
[Paragraph 2] Topic 0: On the one hand, opponents of transparency consider that they may lose effectiveness to decision-making processes, as they can cause delays in decision-making due to multiple public debates and intervention by external actors.
[Paragraph 2] Topic 0: On the other hand, proponents of transparency see it as enhancing legitimacy and accountability and ensuring that decisions are made in the public's interest.
[Paragraph 3] Topic 0: Thus, there seems to be a need for an ideal balance between transparency and decision-making effectiveness, as transparency must be made available to citizens and stakehold

**Merge By Topic**

In [1]:
import json
from pathlib import Path
from collections import defaultdict

# ----------------------------
# 1. Load sentences with topics
# ----------------------------
topics_json_path = Path(r"C:\Users\Shahe\my_server\RAG\rag_sys\political\sentences_with_topics.json")
sentences_with_topics = json.loads(topics_json_path.read_text(encoding="utf-8"))

# ----------------------------
# 2. Group by topic
# ----------------------------
# topic id -> list of {"paragraph_id", "sentence"} records, in file order.
# Topics appear in the order in which they are first seen.
topics_dict = defaultdict(list)

for item in sentences_with_topics:
    sentence_info = {"paragraph_id": item["paragraph_id"], "sentence": item["sentence"]}
    topics_dict[item["topic"]].append(sentence_info)

# ----------------------------
# 3. Merge into single text per topic
# ----------------------------
# Each topic becomes one chunk: its sentences joined with single spaces, plus
# the per-sentence records so the paragraph ids stay available.
merged_topics = [
    {
        "topic": int(topic_id),
        "text": " ".join(info["sentence"] for info in sentence_infos),
        "sentences": sentence_infos,  # keep metadata per sentence
    }
    for topic_id, sentence_infos in topics_dict.items()
]

# ----------------------------
# 4. Save to JSON
# ----------------------------
output_path = r"C:\Users\Shahe\my_server\RAG\rag_sys\political\topics_merged.json"
merged_json = json.dumps(merged_topics, ensure_ascii=False, indent=2)
Path(output_path).write_text(merged_json, encoding="utf-8")

print(f"Merged topics saved to {output_path}")


Merged topics saved to C:\Users\Shahe\my_server\RAG\rag_sys\political\topics_merged.json


**Embedding for Each Chunk**

In [2]:
import json
from pathlib import Path
import numpy as np
from sentence_transformers import SentenceTransformer

# ----------------------------
# 1. Load merged topics
# ----------------------------
merged_topics_path = Path(r"C:\Users\Shahe\my_server\RAG\rag_sys\political\topics_merged.json")
merged_topics = json.loads(merged_topics_path.read_text(encoding="utf-8"))

# ----------------------------
# 2. Load embedding model
# ----------------------------
embedding_model = SentenceTransformer("multi-qa-mpnet-base-dot-v1")

# ----------------------------
# 3. Compute embeddings for each chunk text
# ----------------------------
# One vector per topic chunk, computed from the chunk's full merged text.
# NOTE(review): a merged chunk can be much longer than a single sentence;
# presumably the model truncates over-long input — confirm that this is
# acceptable for the larger topics.
chunks_with_embeddings = [
    {
        "topic": topic_chunk["topic"],
        "text": topic_chunk["text"],
        # .tolist() converts the vector to a plain list for JSON
        "embedding": embedding_model.encode(topic_chunk["text"]).tolist(),
        "sentences": topic_chunk["sentences"],
    }
    for topic_chunk in merged_topics
]

# ----------------------------
# 4. Save to JSON
# ----------------------------
output_path = r"C:\Users\Shahe\my_server\RAG\rag_sys\political\new_chunks_with_embeddings.json"
chunks_json = json.dumps(chunks_with_embeddings, ensure_ascii=False, indent=2)
Path(output_path).write_text(chunks_json, encoding="utf-8")

print(f"Chunk embeddings saved to {output_path}")


Chunk embeddings saved to C:\Users\Shahe\my_server\RAG\rag_sys\political\new_chunks_with_embeddings.json


**Paragraph Embedding**

In [1]:
import json
from sentence_transformers import SentenceTransformer
from collections import defaultdict

# Initialize model
embedding_model = SentenceTransformer("multi-qa-mpnet-base-dot-v1")

# Per-sentence embeddings are not written to the output by default, so they
# are no longer computed unless this switch is on (previously they were
# computed for every paragraph and then thrown away).
INCLUDE_SENTENCE_EMBEDDINGS = False

# Load sentences (only "paragraph_id" and "sentence" are read from each record)
with open(r"C:\Users\Shahe\my_server\RAG\rag_sys\political\sentences_with_embeddings.json", 'r', encoding='utf-8') as f:
    sentences_data = json.load(f)

# Group sentences by paragraph_id, keeping file order inside each paragraph
paragraph_map = defaultdict(list)
for item in sentences_data:
    paragraph_map[item['paragraph_id']].append(item['sentence'])

# Precompute one embedding per paragraph (and, optionally, per sentence)
paragraphs = []
for para_id, sentences in paragraph_map.items():
    # The paragraph text is its sentences joined with single spaces.
    para_text = " ".join(sentences)
    para_emb = embedding_model.encode(para_text).tolist()

    paragraph_record = {
        "paragraph_id": para_id,
        "sentences": sentences,  # sentences saved as list
    }
    if INCLUDE_SENTENCE_EMBEDDINGS:
        # One embedding per sentence, in the same order as "sentences".
        paragraph_record["sentence_embeddings"] = embedding_model.encode(sentences).tolist()
    paragraph_record["text"] = para_text
    paragraph_record["paragraph_embedding"] = para_emb

    paragraphs.append(paragraph_record)

# Save to JSON for future fast loading
out_path = r"C:\Users\Shahe\my_server\RAG\rag_sys\political\paragraphs_with_embeddings.json"
with open(out_path, "w", encoding="utf-8") as f:
    json.dump(paragraphs, f, ensure_ascii=False, indent=2)

# Message text is kept as it was; sentence embeddings are only part of the
# file when INCLUDE_SENTENCE_EMBEDDINGS is True.
print(f"Paragraph + sentence embeddings precomputed and saved to {out_path}")


Paragraph + sentence embeddings precomputed and saved to C:\Users\Shahe\my_server\RAG\rag_sys\political\paragraphs_with_embeddings.json
