1. ***Preprocessing:*** <br>
This notebook prepares the raw UMBC ISSS documents for the Retrieval-Augmented Generation (RAG) system. <br>
**Key steps:**
* Extract text from .txt and .docx files.
* Clean the data by removing special characters and unwanted formatting.
* Split the text into sentence-based chunks of at most ~60 words, and attach the previous and next chunk to each one as context-before / context-after metadata.
* Save each processed chunk as its own JSON file for efficient retrieval.
* The processed output is uploaded to GitHub: https://github.com/Aravind6908/datahub/tree/main/LLM

In [1]:
# Install the required libraries

In [2]:
!pip install python-docx



In [3]:
!pip install pywin32



In [14]:
import os
import re
import json
from pathlib import Path
from docx import Document  
from nltk.tokenize import sent_tokenize
import nltk
# Sentence-tokenizer data needed by sent_tokenize. Recent NLTK releases load
# the 'punkt_tab' resource instead of the older pickled 'punkt' models, so
# fetch both to keep the notebook runnable on old and new NLTK versions.
for _nltk_resource in ('punkt', 'punkt_tab'):
    nltk.download(_nltk_resource)

# Folder layout: source files are read from input_directory and the
# generated JSON files are written to output_directory.
input_directory = "./documents"
output_directory = "./pre_processed_documents"

# Make sure the output folder exists before anything is written to it;
# this is a no-op when the folder is already there.
Path(output_directory).mkdir(parents=True, exist_ok=True)

def clean_text(text, preserve_words=None):
    """Normalise raw document text before it is chunked.

    Steps, in order:
      1. collapse every run of whitespace (including newlines) to one space;
      2. remove ``<...>`` markup tags;
      3. remove URLs that start with ``http`` or ``www.``;
      4. drop every character that is not a letter, digit, whitespace or one
         of ``@ . , ! ? ' " ; : - $``;
      5. lowercase every word except those listed in ``preserve_words``.

    Args:
        text: Raw text to clean.
        preserve_words: Iterable of words whose original casing must be kept
            (compared case-insensitively). Defaults to
            ``{"US", "OPT", "I-20", "I-94"}``.

    Returns:
        The cleaned text as a single line with words separated by one space.
    """
    if preserve_words is None:
        preserve_words = {"US", "OPT", "I-20", "I-94"}
    # Compare in upper case on both sides so entries such as "opt" also match.
    preserve_upper = {entry.upper() for entry in preserve_words}

    # Whitespace is collapsed first so that a tag broken across lines is
    # still matched by the tag pattern ('.' does not match a newline).
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'http\S+|www\.\S+', '', text)
    text = re.sub(r'[^A-Za-z0-9@.,!?\'\";:\s\-$]', '', text)

    # Punctuation glued to a word ("OPT," / "I-20.") must not stop the word
    # from being recognised as a preserved term, so it is stripped for the
    # comparison only; the word itself is emitted unchanged.
    edge_punctuation = '.,!?\'";:'
    processed_words = [
        word if word.strip(edge_punctuation).upper() in preserve_upper else word.lower()
        for word in text.split()
    ]
    return ' '.join(processed_words)



# Function to chunk text semantically based on sentences and token limits
def semantic_chunking(text, max_token_size=512):
    """Group the sentences of ``text`` into chunks of limited size.

    The text is split into sentences with ``sent_tokenize`` and consecutive
    sentences are packed into a chunk until adding the next sentence would
    push the chunk past ``max_token_size``. A "token" here is a
    whitespace-separated word (``len(sentence.split())``).

    A single sentence longer than ``max_token_size`` is never split; it
    becomes a chunk of its own that exceeds the limit.

    Args:
        text: Text to split.
        max_token_size: Maximum number of words per chunk.

    Returns:
        A list of chunk strings, each made of whole sentences joined by a
        single space. Empty input yields an empty list.
    """
    chunks = []
    current_chunk = []
    current_tokens = 0

    for sentence in sent_tokenize(text):
        sentence_tokens = len(sentence.split())
        # Flush only when there is something to flush: an over-long first
        # sentence must not produce an empty leading chunk.
        if current_chunk and current_tokens + sentence_tokens > max_token_size:
            chunks.append(' '.join(current_chunk))
            current_chunk = []
            current_tokens = 0
        current_chunk.append(sentence)
        current_tokens += sentence_tokens

    # Emit whatever is left after the last sentence.
    if current_chunk:
        chunks.append(' '.join(current_chunk))
    return chunks

# Function to add context to chunks
def add_context(chunks, context_window=1):
    """Pair every chunk with the text of its neighbouring chunks.

    Args:
        chunks: List of chunk strings, in document order.
        context_window: How many chunks on each side to use as context.

    Returns:
        A list with one dict per chunk, holding the keys ``"chunk"``,
        ``"context_before"`` and ``"context_after"``. The context values are
        the neighbouring chunks joined by a single space, or an empty string
        when there is no neighbour on that side.
    """
    def joined(start, stop):
        # Space-join the chunks in the half-open range [start, stop).
        return ' '.join(chunks[start:stop])

    return [
        {
            "chunk": body,
            "context_before": joined(max(pos - context_window, 0), pos),
            "context_after": joined(pos + 1, pos + 1 + context_window),
        }
        for pos, body in enumerate(chunks)
    ]

# Function to extract text from .docx files
def extract_text_from_docx(file_path):
    """Return the paragraph text of a .docx file, one paragraph per line.

    Only the document's ``paragraphs`` collection is read.
    NOTE(review): text stored elsewhere in the file (for example inside
    tables) is presumably not part of that collection -- confirm against
    the python-docx documentation.

    Args:
        file_path: Path of the .docx file to open.

    Returns:
        The text of all paragraphs joined with newline characters.
    """
    paragraph_texts = (para.text for para in Document(file_path).paragraphs)
    return "\n".join(paragraph_texts)

# Processing each file
# os.listdir returns entries in arbitrary order; sorting makes the processing
# order (and therefore the log below) reproducible across runs and platforms.
for filename in sorted(os.listdir(input_directory)):
    file_path = os.path.join(input_directory, filename)

    # Errors are reported per file so that one unreadable document does not
    # stop the rest of the batch.
    try:
        # Read the raw text according to the file extension.
        if filename.lower().endswith('.txt'):
            with open(file_path, 'r', encoding='utf-8') as file:
                raw_text = file.read()
        elif filename.lower().endswith('.docx'):
            raw_text = extract_text_from_docx(file_path)
        else:
            print(f"Skipping unsupported file type: {filename}")
            continue

        # Clean -> chunk (at most 60 words per chunk) -> attach the previous
        # and next chunk as context.
        cleaned_text = clean_text(raw_text)
        chunks = semantic_chunking(cleaned_text, max_token_size=60)
        contextual_chunks = add_context(chunks, context_window=1)

        # Each chunk is written to its own JSON file named after the source
        # file's stem and the chunk index.
        # NOTE(review): two inputs that share a stem (e.g. a.txt and a.docx)
        # write to the same output names, and files from an earlier run with
        # more chunks are not removed.
        file_stem = Path(filename).stem
        for idx, chunk_data in enumerate(contextual_chunks):
            output_path = os.path.join(output_directory, f"{file_stem}_chunk{idx}.json")
            chunk_data_with_metadata = {
                "DocumentID": filename,
                "ChunkID": idx,
                "ChunkText": chunk_data["chunk"],
                "ContextBefore": chunk_data["context_before"],
                "ContextAfter": chunk_data["context_after"]
            }
            with open(output_path, "w", encoding="utf-8") as output_file:
                json.dump(chunk_data_with_metadata, output_file, indent=4)

        print(f"Processed {filename}: {len(contextual_chunks)} chunks created.")

    except Exception as e:
        print(f"Error processing {filename}: {e}")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\aravi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Processed About_OISS.docx: 3 chunks created.
Processed Applying_to_UMBC.docx: 5 chunks created.
Processed Career_Center.docx: 1 chunks created.
Processed Change_of_Education_or_Degree_Level.docx: 8 chunks created.
Processed Contact_Us.docx: 7 chunks created.
Processed CPT.docx: 16 chunks created.
Processed Economic_Hardship_Work_Authorization.docx: 16 chunks created.
Processed Education_Abroad_for_International_Students.docx: 4 chunks created.
Processed Entering_the_USA.docx: 20 chunks created.
Processed F1_Enrollements_and_Requirements.docx: 35 chunks created.
Processed F1_Visa_Process.docx: 35 chunks created.
Processed Financing_Your_Studies.docx: 9 chunks created.
Processed Graduate_Assistantships.docx: 13 chunks created.
Processed H1-B_Overview.docx: 28 chunks created.
Processed Health_Insurance_and_Immunization_Requirements.docx: 6 chunks created.
Processed I-20_Program_Extension.docx: 6 chunks created.
Processed i20.docx: 15 chunks created.
Processed i20_Access.docx: 6 chunks cre