In [None]:
!pip install python-docx



In [None]:
# ‚úÖ Upload, Read, Chunk, and Print Word Document Content (NO NLTK version)
from google.colab import files
from docx import Document
from transformers import AutoTokenizer

# Step 1: Upload DOCX file
uploaded = files.upload()
file_name = list(uploaded.keys())[0]

# Step 2: Extract clean text
def get_clean_text(file_path):
    doc = Document(file_path)
    paragraphs = [para.text.strip() for para in doc.paragraphs if para.text.strip()]
    return " ".join(paragraphs)

text = get_clean_text(file_name)

# Step 3: Define chunking function using tokenizer only (no nltk)
def split_text_to_chunks(text, chunk_size=512, model_name="sentence-transformers/all-MiniLM-L6-v2"):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokens = tokenizer.tokenize(text)

    chunks = []
    for i in range(0, len(tokens), chunk_size):
        chunk_tokens = tokens[i:i + chunk_size]
        chunk_text = tokenizer.convert_tokens_to_string(chunk_tokens)
        chunks.append(chunk_text.strip())

    return chunks

# Step 4: Create and print chunks
chunks = split_text_to_chunks(text, chunk_size=512)

print(f"\nüì¶ Total Chunks Created: {len(chunks)}\n")
for i, chunk in enumerate(chunks):
    print(f"--- Chunk {i+1} ---")
    print(chunk)
    print("\n" + "-"*80 + "\n")


Saving ITU_assignment_word_format.docx to ITU_assignment_word_format (2).docx


Token indices sequence length is longer than the specified maximum sequence length for this model (121727 > 512). Running this sequence through the model will result in indexing errors



üì¶ Total Chunks Created: 238

--- Chunk 1 ---
3gpp tr 21. 917 v17. 0. 1 ( 2023 - 01 ) technical report 3rd generation partnership project ; technical specification group services and system aspects ; release 17 description ; summary of rel - 17 work items ( release 17 ) the present document has been developed within the 3rd generation partnership project ( 3gpp tm ) and may be further elaborated for the purposes of 3gpp. the present document has not been subject to any approval process by the 3gpp organizational partners and shall not be implemented. this report is provided for future development work within 3gpp only. the organizational partners accept no liability for any use of this specification. specifications and reports for implementation of the 3gpp tm system should be obtained via the 3gpp organizational partners ' publications offices. keywords description, summary, release 17 3gpp postal address 3gpp support office address 650 route des lucioles - sophia antipolis valbonn

In [None]:
# Install sentence-transformers (if not already installed)
!pip install -q sentence-transformers

#Import and load the model
from sentence_transformers import SentenceTransformer

# Load a lightweight and fast embedding model
model = SentenceTransformer("all-MiniLM-L6-v2")

#Convert the chunks into embeddings
# `chunks` should be the list of text chunks from earlier
embeddings = model.encode(chunks, convert_to_numpy=True)

# Check the shape of the resulting embeddings
print(f"Generated {len(embeddings)} embeddings with dimension {embeddings.shape[1]}")

Generated 238 embeddings with dimension 384


In [None]:
# Step 1: Install necessary libraries
!pip install -q faiss-cpu sentence-transformers

# Step 2: Imports
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# Step 3: Use the actual chunks from your parsed DOCX file
# Make sure this already exists from earlier step:
# chunks = split_text_to_chunks(text, chunk_size=512)

# Just check if chunks exist
print(f" Using {len(chunks)} chunks extracted from your document.")

# Step 4: Load embedding model
model = SentenceTransformer("all-MiniLM-L6-v2")

# Step 5: Convert chunks to embeddings
embeddings = model.encode(chunks, convert_to_numpy=True)

# Step 6: Create FAISS index and add embeddings
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)

# Step 7: Ask a question
query = input("Enter your question: ")

# Step 8: Convert query to embedding and search
query_embedding = model.encode([query], convert_to_numpy=True)
top_k = 3
distances, indices = index.search(query_embedding, top_k)

# Step 9: Show matching chunks
print(f"\n Top {top_k} matching chunks for: '{query}'\n")
for rank, idx in enumerate(indices[0]):
    print(f"Result {rank+1} (Vector ID {idx}):\n{chunks[idx]}")
    print("-" * 80)

[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m31.3/31.3 MB[0m [31m43.6 MB/s[0m eta [36m0:00:00[0m
[?25h Using 238 chunks extracted from your document.
Enter your question: What is 6ghz?

 Top 3 matching chunks for: 'What is 6ghz?'

Result 1 (Vector ID 174):
ran4 [ 6 ] r4 - 2208245, " introducing 6ghz licensed operation into 38. 174 ", catt [ 7 ] r4 - 2209537, " cr to 37. 104 on introduction of n104 co - existence requirements ", nokia, nokia shanghai bell [ 8 ] r4 - 2209583, " cr to ts36. 104 the introduction of coexistence requirements of licensed band 6425 - 7125mhz ", zte corporation [ 9 ] r4 - 2210739, " introducing 6ghz licensed operation into 37. 105 ", catt 11. 19. 2 extending current nr operation to 71 ghz summary based on the input provided by qualcomm in rp - 222478. this wid extends nr operation to 71ghz with the introduction of new unlicensed band n263. relevant system para