In [1]:
import json
import os
import uuid
import nltk
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from nltk.tokenize import sent_tokenize
from huggingface_hub import login

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Set environment variable to reduce memory fragmentation
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# Directory holding NLTK data. Defined once so the search path and the download
# target cannot drift apart; override with the NLTK_DATA_DIR environment
# variable on machines where the default (author's workstation) does not exist.
NLTK_DATA_DIR = os.environ.get(
    "NLTK_DATA_DIR", 'C:/Users/DSUCSCL7-31/AppData/Roaming/nltk_data'
)

# Guarded so that re-running this cell does not append the same entry again.
if NLTK_DATA_DIR not in nltk.data.path:
    nltk.data.path.append(NLTK_DATA_DIR)

# Ensure 'punkt' and 'punkt_tab' are available. Each resource is checked on its
# own so that only the one that is actually missing gets downloaded.
for package, resource in (
    ("punkt", "tokenizers/punkt"),
    ("punkt_tab", "tokenizers/punkt_tab/english"),
):
    try:
        nltk.data.find(resource)
    except LookupError as e:
        print(f"Downloading missing NLTK resources: {e}")
        nltk.download(package, download_dir=NLTK_DATA_DIR)

In [3]:
def load_mistral_model(hf_token=None):
    """Load Mistral-7B-Instruct model with 4-bit quantization and CPU offloading.

    Args:
        hf_token: Optional Hugging Face access token. When truthy it is passed
            to ``huggingface_hub.login`` and to both ``from_pretrained`` calls.

    Returns:
        A ``(model, tokenizer)`` tuple.

    Raises:
        Exception: Any error raised while building the configuration or loading
            the tokenizer/model is printed with a cache hint and re-raised
            unchanged.
    """
    model_name = "mistralai/Mistral-7B-Instruct-v0.2"

    # Log in to Hugging Face if token is provided
    if hf_token:
        login(hf_token)

    # Clear GPU memory
    torch.cuda.empty_cache()

    try:
        # Configure 4-bit quantization with CPU offloading
        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            llm_int8_enable_fp32_cpu_offload=True
        )
        # Custom device map: the "" entry places every module on cuda:0 (there
        # is no automatic fallback), except the language-model head, which is
        # explicitly kept on the CPU.
        device_map = {
            "": "cuda:0",
            "lm_head": "cpu"
        }
        # Load tokenizer from the local cache only (never hits the network)
        tokenizer = AutoTokenizer.from_pretrained(
            model_name,
            token=hf_token,
            local_files_only=True
        )
        # Load model with quantization and custom device map.
        # trust_remote_code is intentionally NOT enabled: the Mistral
        # architecture ships with transformers, so there is no reason to allow
        # code from the model repository to execute on this machine.
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            quantization_config=quantization_config,
            device_map=device_map,
            token=hf_token
        )
        print(f"Model loaded from cache: {model_name}")
        return model, tokenizer
    except Exception as e:
        print(f"Error loading model from cache: {e}")
        print("Ensure the model is cached at C:\\Users\\DSUCSCL7-31\\.cache\\huggingface\\hub\\models--mistralai--Mistral-7B-Instruct-v0.2")
        raise


In [4]:
def split_text_into_segments(text, max_tokens=1000, sentence_splitter=None):
    """Split text into segments to fit within token limits.

    Sentences are packed greedily: a segment is closed as soon as adding the
    next sentence would push its estimated token count above ``max_tokens``.
    A single sentence that is larger than the budget on its own is kept whole
    in a segment by itself (it is never cut mid-sentence).

    Args:
        text: The text to split.
        max_tokens: Estimated token budget per segment.
        sentence_splitter: Optional callable mapping a string to a list of
            sentences. Defaults to NLTK's ``sent_tokenize``.

    Returns:
        A list of non-empty segment strings, each made of whole sentences
        joined by single spaces.
    """
    if sentence_splitter is None:
        sentence_splitter = sent_tokenize

    segments = []
    current_segment = []
    current_token_count = 0

    for sentence in sentence_splitter(text):
        token_count = len(sentence.split()) * 1.5  # Rough estimate: 1 word ≈ 1.5 tokens
        # Only flush when something has been collected; otherwise an oversized
        # first sentence would emit an empty segment.
        if current_segment and current_token_count + token_count > max_tokens:
            segments.append(' '.join(current_segment))
            current_segment = []
            current_token_count = 0
        current_segment.append(sentence)
        current_token_count += token_count

    if current_segment:
        segments.append(' '.join(current_segment))

    return segments

In [10]:
def _strip_list_marker(line):
    """Remove a leading 'N.' list marker; other digit-leading text is left intact."""
    marker, dot, rest = line.partition(".")
    if dot and marker.isdigit() and (not rest or rest[0].isspace()):
        return rest.strip()
    return line


def _parse_chunk_response(response):
    """Turn the model's '---'-separated numbered list into a list of chunk strings."""
    chunks = []
    current_chunk = []
    for raw_line in response.split('\n'):
        line = raw_line.strip()
        if line.startswith('---'):
            if current_chunk:
                chunks.append(' '.join(current_chunk))
                current_chunk = []
            continue
        # Keep the text that follows "1. " instead of discarding the whole line.
        line = _strip_list_marker(line)
        if line:
            current_chunk.append(line)

    if current_chunk:
        chunks.append(' '.join(current_chunk))

    # Filter out empty or malformed chunks
    return [chunk for chunk in chunks if len(chunk.split()) > 10]


def llm_chunk_segment(segment, model, tokenizer, max_words=300, max_new_tokens=500):
    """Use Mistral-7B to chunk a text segment into semantically coherent chunks.

    Args:
        segment: Text to be split by the model.
        model: Causal language model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer matching ``model``.
        max_words: Accepted for interface compatibility; the prompt itself
            asks for 200-300 word chunks and does not read this value.
        max_new_tokens: Generation budget. The model has to reproduce the
            segment verbatim, so a budget smaller than the segment's token
            count truncates the output.

    Returns:
        A list of chunk strings, each longer than 10 words. Only the newly
        generated text is parsed; the prompt is never returned as a chunk.
    """
    prompt = f"""
    You are an expert in text processing. Your task is to split the following text into semantically coherent chunks, each approximately 200-300 words. Ensure each chunk covers a single topic or closely related ideas, preserving context. Output the chunks as a numbered list, with each chunk separated by '---'. If a natural boundary (e.g., header or paragraph break) exists, prioritize it. Do not modify the text content; only split it.

    Text:
    {segment}

    Output format:
    1. [First chunk text]
    ---
    2. [Second chunk text]
    ---
    ...
    """

    # Send the inputs to wherever the model lives instead of assuming "cuda".
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=2048).to(model.device)

    # Greedy decoding: the task is to copy text unchanged, and sampling knobs
    # (temperature/top_p) have no effect unless sampling is enabled anyway.
    # pad_token_id is set explicitly to avoid the per-call fallback warning.
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=False,
        pad_token_id=tokenizer.eos_token_id,
    )

    # generate() returns prompt + completion for decoder-only models; drop the
    # prompt tokens so the instructions and source text are not parsed as chunks.
    prompt_length = inputs["input_ids"].shape[1]
    response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    return _parse_chunk_response(response)

In [11]:
def llm_chunk_text(text, model, tokenizer, max_words=300):
    """Chunk the entire text using Mistral-7B.

    The text is first cut into segments, each segment is chunked by the model,
    and any chunk longer than 1.5 * ``max_words`` words is re-split at sentence
    boundaries into pieces of at most ``max_words`` words (a single sentence
    longer than that stays whole).

    Returns:
        A list of ``(chunk_text, word_count)`` tuples in document order.
    """
    overflow_limit = max_words * 1.5  # Allow slight overflow
    results = []

    for segment in split_text_into_segments(text):
        for piece in llm_chunk_segment(segment, model, tokenizer, max_words):
            n_words = len(piece.split())
            if n_words <= overflow_limit:
                results.append((piece, n_words))
                continue

            # Oversized chunk: rebuild it sentence by sentence. The buffer
            # holds the words collected so far, so its length is the count.
            buffer = []
            for sentence in sent_tokenize(piece):
                words = sentence.split()
                if len(buffer) + len(words) > max_words:
                    if buffer:
                        results.append((' '.join(buffer), len(buffer)))
                    buffer = words
                else:
                    buffer.extend(words)
            if buffer:
                results.append((' '.join(buffer), len(buffer)))

    return results


In [7]:
def save_chunks_to_json(chunks, output_dir="chunks", output_file="dr-arunkumar_chunks.json"):
    """Write chunks and their metadata to a JSON file.

    Args:
        chunks: Iterable of ``(chunk_text, word_count)`` pairs.
        output_dir: Directory to write into; created if it does not exist.
        output_file: Name of the JSON file inside ``output_dir``.

    Each pair becomes one record with a freshly generated UUID4 ``chunk_id``,
    an empty ``header``, the text as ``content`` and its ``word_count``.
    """
    os.makedirs(output_dir, exist_ok=True)
    destination = os.path.join(output_dir, output_file)

    records = [
        {
            "chunk_id": str(uuid.uuid4()),
            "header": "",  # No header since LLM determines boundaries
            "content": body,
            "word_count": n_words,
        }
        for body, n_words in chunks
    ]

    with open(destination, 'w', encoding='utf-8') as handle:
        json.dump(records, handle, indent=4, ensure_ascii=False)
    print(f"Chunks saved to {destination}")


In [12]:
# Example usage
if __name__ == "__main__":
    # Read the Hugging Face token from the environment (set HF_TOKEN before
    # starting the kernel). Credentials must never be written into the
    # notebook: a token that was previously hard-coded here should be treated
    # as leaked and revoked in the Hugging Face account settings.
    hf_token = os.getenv("HF_TOKEN")

    # Load model and tokenizer
    model, tokenizer = load_mistral_model(hf_token)

    # Read input text
    with open('extracted_texts/dr-arunkumar.txt', 'r', encoding='utf-8') as file:
        text = file.read()

    # Chunk text using LLM
    chunks = llm_chunk_text(text, model, tokenizer)

    # Save to JSON
    save_chunks_to_json(chunks)

Loading checkpoint shards: 100%|██████████| 3/3 [00:09<00:00,  3.18s/it]
Some parameters are on the meta device because they were offloaded to the cpu.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Model loaded from cache: mistralai/Mistral-7B-Instruct-v0.2


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Chunks saved to chunks\dr-arunkumar_chunks.json
