In [None]:
import pandas as pd
import numpy as np
import torch
import transformers
import ast
import re
import os
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel
from tokenizers.normalizers import BertNormalizer
from bert_score import score as bert_score
from rouge_score import rouge_scorer

In [None]:
def print_gpu_utilization():
    """Report allocated versus total memory, in GB, for each visible CUDA device."""
    bytes_per_gb = 1024**3
    print("\nGPU Memory Usage:")
    for gpu_index in range(torch.cuda.device_count()):
        used_gb = torch.cuda.memory_allocated(gpu_index) / bytes_per_gb
        total_gb = torch.cuda.get_device_properties(gpu_index).total_memory / bytes_per_gb
        print(f"GPU {gpu_index}: {used_gb:.2f} GB / {total_gb:.2f} GB")

In [None]:
def load_vocab_mappings(file_path):
    """Load single-character replacement mappings from a text file.

    Each non-empty line is read as ``<char><separator><replacement>``: the
    first character is the key, the second character is skipped as a
    one-character separator, and the rest of the line is the replacement.

    Args:
        file_path: Path to a UTF-8 encoded mappings file.

    Returns:
        dict mapping each key character to its replacement string. The dict
        is empty when the file contains no non-empty lines.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        # Split on '\n' only (not splitlines()) so unusual Unicode separator
        # characters can still appear as mapping keys.
        lines = f.read().strip().split('\n')

    # Empty lines (including the single '' produced by an empty file) carry no
    # mapping, and indexing line[0] on them would raise IndexError.
    return {line[0]: line[2:] for line in lines if line}

In [None]:
def normalize_text(text, normalizer=None, mappings=None):
    """Clean up text line by line.

    Args:
        text: Input string; a falsy value (None or "") yields "".
        normalizer: Optional object exposing ``normalize_str``; applied to
            every line when provided.
        mappings: Optional dict of character -> replacement. When non-empty,
            mapped characters are replaced, unmapped non-alphanumeric
            characters become spaces, and whitespace runs are collapsed.

    Returns:
        The processed lines joined back together with newlines.
    """
    if not text:
        return ""

    def _apply_mappings(line):
        # Mapped characters are substituted; anything else that is not
        # alphanumeric is turned into a space before whitespace is squeezed.
        replaced = ''.join(
            mappings.get(ch, ch if ch.isalnum() else ' ') for ch in line
        )
        return re.sub(r'\s+', ' ', replaced).strip()

    lines = text.strip().split('\n')
    if normalizer:
        lines = [normalizer.normalize_str(line) for line in lines]
    if mappings:
        lines = [_apply_mappings(line) for line in lines]

    return '\n'.join(lines)

In [None]:
def expand_query(query):
    """Append the query's own key terms to it for extra emphasis.

    Key terms are the lower-cased word tokens longer than two characters that
    are not in a small stop-word list. When no key term is found, the query is
    returned unchanged.
    """
    ignored_words = {'the', 'a', 'an', 'and', 'or', 'but', 'is', 'are', 'was', 'were',
                     'in', 'on', 'at', 'to', 'for', 'with', 'by', 'about', 'like', 'from'}

    tokens = re.findall(r'\b\w+\b', query.lower())
    emphasized = [tok for tok in tokens if len(tok) > 2 and tok not in ignored_words]

    if emphasized:
        # Original query first, followed by each key term (duplicates kept).
        return " ".join([query, *emphasized])
    return query

In [None]:
def preprocess_embeddings(embeddings):
    """L2-normalize document embeddings row by row.

    Rows whose norm is exactly zero cannot be normalized; they are replaced
    with small random values (normal, sigma=0.01) before normalization, and a
    warning is printed.

    The input array is never modified: replacement happens on a private copy.
    Non-floating inputs are converted to float64 first so the random
    replacement values are not truncated to zero.

    Args:
        embeddings: 2-D array-like with one embedding per row.

    Returns:
        A new array of the same shape whose rows have unit L2 norm.
    """
    print("Normalizing document embeddings...")

    source = np.asarray(embeddings)
    if np.issubdtype(source.dtype, np.floating):
        working = source
    else:
        working = source.astype(np.float64)

    norms = np.linalg.norm(working, axis=1, keepdims=True)
    zero_indices = np.flatnonzero(norms[:, 0] == 0)

    if len(zero_indices) > 0:
        print(f"Warning: Found {len(zero_indices)} zero vectors in embeddings. Replacing with small random values.")
        if working is source:
            # Copy before writing so the caller's array stays untouched.
            working = working.copy()
        working[zero_indices] = np.random.normal(
            0, 0.01, (len(zero_indices), working.shape[1])
        )
        # Only the replaced rows need their norms recomputed.
        norms[zero_indices] = np.linalg.norm(
            working[zero_indices], axis=1, keepdims=True
        )

    return working / norms

In [None]:
def get_query_embedding(query, embedding_model, tokenizer, device, pooling='mean_cls'):
    """Embed a query string into a unit-length vector.

    The query is expanded with ``expand_query``, passed through
    ``normalize_text`` (without a normalizer or mappings), tokenized, run
    through ``embedding_model`` and pooled.

    Args:
        query: The user's query text.
        embedding_model: Model whose output exposes ``last_hidden_state``.
        tokenizer: Tokenizer callable returning a batch with an
            ``attention_mask`` entry and a ``.to(device)`` method.
        device: Device the tokenized inputs are moved to (e.g. "cuda:0" or
            "cpu"); it should be the device the model lives on.
        pooling: 'cls' (first token), 'mean' (attention-masked mean), or
            anything else for the 0.7 * mean + 0.3 * CLS blend ('mean_cls').

    Returns:
        1-D numpy array for the single query, scaled to unit L2 norm (left
        unscaled if its norm is zero).
    """
    # Apply query expansion to improve matching
    expanded_query = expand_query(query)

    # Normalize query (simplified: no normalizer or mappings are passed)
    normalized_query = normalize_text(expanded_query)

    # Inputs are placed explicitly with .to(device), so no torch.cuda.device
    # context is needed; that context manager rejects non-CUDA devices and
    # would make this function unusable on CPU.
    tokenized_query = tokenizer(normalized_query, padding=True,
                                truncation=True, max_length=512,
                                return_tensors='pt').to(device)

    with torch.no_grad():
        outputs = embedding_model(**tokenized_query)
        last_hidden_state = outputs.last_hidden_state
        attention_mask = tokenized_query['attention_mask']

        cls_embedding = last_hidden_state[:, 0]

        if pooling == 'cls':
            pooled = cls_embedding
        else:
            # Attention-masked mean so padding tokens do not contribute.
            input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
            sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
            sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
            mean_embedding = sum_embeddings / sum_mask

            if pooling == 'mean':
                pooled = mean_embedding
            else:  # 'mean_cls' (also the fallback for unrecognized values)
                # Combine both methods (0.7 for mean pooling, 0.3 for CLS token)
                pooled = 0.7 * mean_embedding + 0.3 * cls_embedding

    # .float() is a no-op for float32 and lets reduced-precision outputs be
    # converted to numpy.
    query_embedding = pooled.float().cpu().numpy()[0]

    # Normalize to unit length for proper cosine similarity; skip the division
    # for an all-zero vector, which would otherwise produce NaNs.
    norm = np.linalg.norm(query_embedding)
    if norm > 0:
        query_embedding = query_embedding / norm

    return query_embedding

In [None]:
def find_similar_texts(query_embedding, embeddings, documents,
                       top_k=5,
                       distance_metric='hybrid',
                       similarity_threshold=0.6):
    """Rank documents against a query embedding and return the best matches.

    Args:
        query_embedding: 1-D query vector.
        embeddings: 2-D array with one document vector per row.
        documents: Sequence of document texts aligned with the rows of
            ``embeddings``.
        top_k: Maximum number of results. Values <= 0 return an empty list.
        distance_metric: 'cosine', 'dot', or 'hybrid' (cosine similarity
            scaled by a BM25-inspired document-length weight).
        similarity_threshold: Minimum score for a document to qualify. If no
            document qualifies, the overall top ``top_k`` are used instead.

    Returns:
        List of dicts with keys "index", "text", "similarity" and
        "final_score", sorted by "final_score" in descending order.
        "final_score" is the similarity multiplied by 0.9 for documents under
        50 words, 0.95 for documents over 1000 words, and 1.0 otherwise.

    Raises:
        ValueError: If ``distance_metric`` is not one of the supported names.
    """
    if distance_metric == 'cosine':
        similarities = cosine_similarity([query_embedding], embeddings)[0]

    elif distance_metric == 'dot':
        similarities = np.dot(embeddings, query_embedding)

    elif distance_metric == 'hybrid':
        # Hybrid approach: cosine similarity with BM25-inspired length weighting
        cosine_sim = cosine_similarity([query_embedding], embeddings)[0]

        k1 = 1.5
        b = 0.75
        doc_lengths = np.array([len(str(doc).split()) for doc in documents])
        avg_doc_length = np.mean(doc_lengths)

        if avg_doc_length > 0:
            length_norm = (1 - b) + b * (doc_lengths / avg_doc_length)
        else:
            # Every document is empty: use a neutral length factor instead of
            # dividing by zero.
            length_norm = np.ones(len(doc_lengths))
        bm25_weights = (k1 + 1) / (k1 * length_norm + 1)

        similarities = cosine_sim * bm25_weights

    else:
        raise ValueError(f"Unknown distance metric: {distance_metric}")

    # A slice like [-0:] selects everything, so a non-positive top_k has to be
    # handled explicitly.
    if top_k <= 0:
        return []

    similarities = np.asarray(similarities)

    # Keep documents passing the threshold; fall back to all documents when
    # none qualifies.
    qualified_indices = np.flatnonzero(similarities >= similarity_threshold)
    if len(qualified_indices) == 0:
        candidates = np.arange(len(similarities))
    else:
        candidates = qualified_indices

    # Best-first order among the candidates, limited to top_k entries.
    order = np.argsort(similarities[candidates])[::-1][:top_k]
    top_indices = candidates[order]

    results = []
    for idx in top_indices:
        doc_text = documents[idx]

        # Small penalty for very short or very long documents
        doc_length = len(str(doc_text).split())
        if doc_length < 50:
            length_penalty = 0.9
        elif doc_length > 1000:
            length_penalty = 0.95
        else:
            length_penalty = 1.0

        similarity = float(similarities[idx])
        results.append({
            "index": int(idx),  # plain int so results serialize cleanly
            "text": doc_text,
            "similarity": similarity,
            "final_score": similarity * length_penalty
        })

    # Final sort by the combined score
    results.sort(key=lambda x: x["final_score"], reverse=True)

    return results

In [None]:
def highlight_relevant_parts(text, question):
    """Keep only the sentences of ``text`` that mention key terms of ``question``.

    Key terms are the lower-cased word tokens of the question that are longer
    than two characters and not in a small stop-word list. A sentence is kept
    when any key term occurs in it as a substring (case-insensitive). Kept
    sentences stay in their original document order.

    Args:
        text: Document text. Falsy values are returned unchanged.
        question: The user's question.

    Returns:
        - ``text`` unchanged when it is falsy or the question has no key terms;
        - otherwise the kept sentences joined by single spaces (the first
          three sentences if none matches);
        - or the first 1000 characters of ``text`` when that selection is
          shorter than 30% of the original text.
    """
    question_terms = set(re.findall(r'\b\w+\b', question.lower()))
    stop_words = {'the', 'a', 'an', 'and', 'or', 'but', 'is', 'are', 'was', 'were',
                  'in', 'on', 'at', 'to', 'for', 'with', 'by', 'about', 'like', 'from'}
    key_terms = {term for term in question_terms if term not in stop_words and len(term) > 2}

    if not key_terms or not text:
        return text

    # Split text into sentences
    sentences = re.split(r'(?<=[.!?])\s+', text)

    # Select sentences by position rather than by value: looking positions up
    # with list.index() would map repeated sentences to their first occurrence
    # and scramble the document order.
    selected = []
    for position, sentence in enumerate(sentences):
        sentence_lower = sentence.lower()
        if any(term in sentence_lower for term in key_terms):
            selected.append(position)

    if not selected:
        # If no sentence contains a key term, fall back to the first few sentences
        selected = list(range(min(3, len(sentences))))

    highlighted_text = " ".join(sentences[i] for i in selected)

    # If the selection is too short, return more of the original instead
    if len(highlighted_text) < len(text) * 0.3:
        return text[:1000]

    return highlighted_text

In [None]:
def format_context(similar_texts, question, max_context_tokens=3000):
    """Assemble the retrieved documents into a context string for the LLM.

    Each result's text is condensed with ``highlight_relevant_parts`` and
    appended under a "Document N (Similarity: ...)" header until the estimated
    token budget (4 characters per token) would be exceeded. If the very first
    document alone exceeds the budget, a truncated version of it is included;
    in every case nothing is added after the first document that does not fit.

    Args:
        similar_texts: Iterable of result dicts with 'text' and 'similarity'.
        question: The user's question, used to pick the relevant sentences.
        max_context_tokens: Estimated token budget for the document excerpts.

    Returns:
        The assembled context string, starting with a fixed introduction line.
    """
    sections = ["Here is the relevant information from the dataset:\n\n"]
    used_tokens = 0

    for rank, hit in enumerate(similar_texts, start=1):
        raw_text = hit['text']
        score = hit['similarity']

        excerpt = highlight_relevant_parts(raw_text, question)

        # Rough size estimate: about four characters per token.
        excerpt_tokens = len(excerpt) // 4

        if used_tokens + excerpt_tokens <= max_context_tokens:
            sections.append(f"Document {rank} (Similarity: {score:.4f}):\n{excerpt}\n\n")
            used_tokens += excerpt_tokens
            continue

        # Over budget: only the first document is worth including partially.
        if rank == 1:
            char_budget = (max_context_tokens - used_tokens) * 4
            clipped = excerpt[:char_budget] + "..."
            sections.append(f"Document {rank} (Similarity: {score:.4f}):\n{clipped}\n\n")
        break

    return "".join(sections)

In [None]:
def generate_answer(question, context, llm_pipeline, max_new_tokens=512):
    """Generate an answer to ``question`` from ``context`` with the LLM.

    The prompt consists of a fixed system instruction, one optional extra
    instruction chosen from the question type (factual, then comparison, then
    procedural; the first match wins), the context and the question.

    Question types are detected on whole words, so that e.g. "whole" or
    "somewhat" do not count as "who" / "what".

    Args:
        question: The user's question.
        context: Context text placed in the prompt.
        llm_pipeline: Callable invoked as ``llm_pipeline(prompt, **gen_kwargs)``
            that returns a list whose first item has a 'generated_text' entry.
        max_new_tokens: Generation length limit forwarded to the pipeline.

    Returns:
        The generated answer with the echoed prompt (if any) and surrounding
        whitespace removed.
    """
    question_lower = question.lower()

    # Whole-word matching avoids false positives from plain substring tests.
    is_factoid = re.search(
        r'\b(?:who|what|when|where|which|how\s+many|how\s+much)\b', question_lower
    ) is not None
    is_comparison = re.search(
        r'\b(?:compare\w*|difference\w*|(?:dis)?similar\w*|versus|vs)\b', question_lower
    ) is not None
    is_how_to = re.search(r'\bhow\s+to\b|\bsteps\b', question_lower) is not None

    # Build a prompt suited to the question type
    system_instruction = """You are a helpful, accurate assistant. When answering questions:
1. Only provide information supported by the context.
2. Use simple, clear language to explain complex concepts.
3. Be concise but comprehensive.
4. If the context doesn't fully answer the question, acknowledge the limitations of the information."""

    # Add specific instruction based on question type
    if is_factoid:
        system_instruction += "\nFor this factual question, cite the most relevant parts of the context."
    elif is_comparison:
        system_instruction += "\nFor this comparison question, clearly outline the similarities and differences."
    elif is_how_to:
        system_instruction += "\nFor this procedural question, provide clear step-by-step instructions."

    prompt = f"""{system_instruction}

Context:
{context}

Question: {question}

Answer:"""

    response = llm_pipeline(
        prompt,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=0.5,  # Lower temperature for more factual responses
        top_p=0.92,
        top_k=50
    )[0]['generated_text']

    # Strip the prompt only when the pipeline actually echoed it; slicing
    # blindly would cut into the answer when only the completion is returned.
    if response.startswith(prompt):
        response = response[len(prompt):]

    return response.strip()

In [None]:
def _parse_embedding_cell(cell, row_number):
    """Turn one serialized embedding (a CSV cell) into a 1-D float array.

    Accepts Python/JSON-style list literals as well as bare or bracketed
    values separated by commas and/or whitespace (e.g. "[0.1 0.2 0.3]").

    Raises:
        ValueError: If the cell is not a string or cannot be parsed.
    """
    if not isinstance(cell, str):
        raise ValueError(f"Embedding at row {row_number} is not a string: {cell!r}")

    try:
        # Try parsing as list literal
        return np.array(ast.literal_eval(cell), dtype=float)
    except (ValueError, SyntaxError, TypeError):
        pass

    # Fallback: values separated by commas and/or whitespace
    tokens = [tok for tok in re.split(r'[,\s]+', cell.strip().strip('[]')) if tok]
    try:
        if not tokens:
            raise ValueError("no values found")
        return np.array([float(tok) for tok in tokens])
    except ValueError as exc:
        raise ValueError(
            f"Could not parse embedding at row {row_number}: {cell[:60]!r}"
        ) from exc


def initialize_rag_system(
    embeddings_csv_path, 
    text_column="document", 
    embedding_column="embedding",
    llm_model_id="/kaggle/input/llama-3.1/transformers/8b/2",
    top_k=5,
    distance_metric='hybrid',
    embedding_pooling ='mean_cls',
    similarity_threshold=0.55
):
    """Load every component of the RAG system and bundle it in a dict.

    Steps: read the CSV, parse and L2-normalize the stored embeddings, load
    the MatSciBERT tokenizer/model on "cuda:0", load optional vocabulary
    mappings, build a BertNormalizer, and create the text-generation pipeline.

    Args:
        embeddings_csv_path: CSV file holding documents and their embeddings.
        text_column: Name of the column with the document text.
        embedding_column: Name of the column with the serialized embeddings.
        llm_model_id: Model id or path passed to ``transformers.pipeline``.
        top_k: Stored in the returned config (number of documents to retrieve).
        distance_metric: Stored in the returned config.
        embedding_pooling: Stored in the returned config.
        similarity_threshold: Stored in the returned config.

    Returns:
        dict with keys "df", "embeddings", "documents", "tokenizer",
        "embedding_model", "embedding_device", "normalizer", "mappings",
        "llm_pipeline" and "config".

    Raises:
        ValueError: If a required column is missing or an embedding cell
            cannot be parsed.
    """
    # Setup environment for memory optimization
    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
    
    # Check GPU count
    gpu_count = torch.cuda.device_count()
    print(f"Found {gpu_count} GPUs")
    
    # Clear GPU caches
    for i in range(gpu_count):
        with torch.cuda.device(i):
            torch.cuda.empty_cache()
    
    # Load data from CSV
    print(f"Loading embeddings from {embeddings_csv_path}...")
    df = pd.read_csv(embeddings_csv_path)
    
    # Validate columns
    if text_column not in df.columns:
        raise ValueError(f"Text column '{text_column}' not found in CSV")
    
    if embedding_column not in df.columns:
        raise ValueError(f"Embedding column '{embedding_column}' not found in CSV")
    
    # Parse embeddings
    print("Parsing embeddings from string format...")
    embeddings = [
        _parse_embedding_cell(emb_str, row_number)
        for row_number, emb_str in enumerate(df[embedding_column])
    ]
    
    # Convert to numpy array
    embeddings = np.vstack(embeddings)
    print(f"Loaded {len(embeddings)} embeddings with {embeddings.shape[1]} dimensions")
    
    # Preprocess embeddings
    normalized_embeddings = preprocess_embeddings(embeddings)
    
    # Load text embeddings model (on GPU 0)
    print("Loading MatSciBERT for query embeddings...")
    embedding_device = "cuda:0"
    tokenizer = AutoTokenizer.from_pretrained('m3rg-iitd/matscibert')
    # NOTE(review): trust_remote_code=True permits running code shipped with
    # the model repository; confirm it is actually required for this model.
    embedding_model = AutoModel.from_pretrained('m3rg-iitd/matscibert', trust_remote_code=True).to(embedding_device)
    
    # Load vocabulary mappings (optional)
    vocab_path = '/kaggle/input/vocabmappings/vocab_mappings.txt'
    mappings = load_vocab_mappings(vocab_path) if os.path.exists(vocab_path) else {}
    
    # Create normalizer
    normalizer = BertNormalizer(lowercase=False, strip_accents=True, 
                                clean_text=True, handle_chinese_chars=True)
    
    # Configure LLM loading parameters
    print(f"Loading LLM from {llm_model_id}...")
    device_map = "balanced" if gpu_count >= 2 else "auto"
    
    # Load LLM pipeline with proper configuration
    llm_pipeline = transformers.pipeline(
        "text-generation", 
        model=llm_model_id, 
        model_kwargs={
            "torch_dtype": torch.bfloat16,
            "device_map": device_map,
            "offload_folder": "/tmp/offload"
        }
    )
    
    # Return a configuration dictionary for RAG system
    return {
        "df": df,
        "embeddings": normalized_embeddings,
        "documents": df[text_column].tolist(),
        "tokenizer": tokenizer,
        "embedding_model": embedding_model,
        "embedding_device": embedding_device,
        "normalizer": normalizer,
        "mappings": mappings,
        "llm_pipeline": llm_pipeline,
        "config": {
            "top_k": top_k,
            "distance_metric": distance_metric,
            "embedding_pooling": embedding_pooling,
            "similarity_threshold": similarity_threshold
        }
    }

In [None]:
def rag_question_answering(rag_system, question):
    """
    Answer a question with the retrieve-then-generate pipeline.

    The question is embedded, the closest documents are retrieved, a context
    is assembled from them, and the LLM produces the answer.

    Args:
        rag_system: Dictionary returned by initialize_rag_system
        question: The question to answer

    Returns:
        Dictionary with the generated "answer" and the retrieved "top_results"
    """
    print("\nProcessing query...")

    settings = rag_system['config']

    # 1. Embed the question with the configured pooling strategy.
    query_vector = get_query_embedding(
        question,
        rag_system['embedding_model'],
        rag_system['tokenizer'],
        rag_system['embedding_device'],
        pooling=settings.get('embedding_pooling', 'mean_cls'),
    )

    # 2. Retrieve the best-matching documents.
    retrieval_options = {
        'top_k': settings.get('top_k', 5),
        'distance_metric': settings.get('distance_metric', 'hybrid'),
        'similarity_threshold': settings.get('similarity_threshold', 0.6),
    }
    retrieved = find_similar_texts(
        query_vector,
        rag_system['embeddings'],
        rag_system['documents'],
        **retrieval_options,
    )

    # 3. Build the prompt context and 4. generate the answer.
    prompt_context = format_context(retrieved, question)
    generated = generate_answer(question, prompt_context, rag_system['llm_pipeline'])

    return {"answer": generated, "top_results": retrieved}

In [None]:
def demo_rag_system():
    """Run an interactive question-answering demo of the RAG system.

    Initializes the system once, then repeatedly reads a question from
    standard input, prints the generated answer and a preview of the retrieved
    documents. Typing 'exit' (surrounding whitespace and case are ignored),
    or sending EOF / an interrupt at the prompt, ends the loop. Blank input is
    ignored. RuntimeErrors raised while answering are reported and the loop
    continues; for CUDA out-of-memory errors the GPU caches are cleared first.
    """
    # Initialize the RAG system
    rag_system = initialize_rag_system(
        embeddings_csv_path="/kaggle/input/embeddings/chroma_embeddings.csv",
        text_column="document",
        embedding_column="embedding"
    )
    
    # Interactive query loop
    print("\n" + "="*50)
    print("Optimized RAG Question Answering System")
    print("="*50)
    print("Type 'exit' to quit\n")
    
    while True:
        try:
            question = input("\nEnter your question: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or Ctrl-C at the prompt: leave the loop cleanly.
            print("\nExiting.")
            break

        if question.lower() == 'exit':
            break
        if not question:
            # Nothing to answer; do not run the pipeline on empty input.
            continue
        
        print("\nProcessing...")
        try:
            result = rag_question_answering(rag_system, question)
            
            print("\n" + "="*50)
            print("ANSWER:")
            print("="*50)
            print(result["answer"])
            
            print("\n" + "="*50)
            print("TOP RELEVANT DOCUMENTS:")
            print("="*50)
            for i, doc in enumerate(result["top_results"], 1):
                print(f"{i}. Similarity: {doc['similarity']:.4f}, Final Score: {doc['final_score']:.4f}")
                # Limit text display to avoid overwhelming console; str() keeps
                # the preview working for non-string cells (e.g. missing values).
                text_preview = str(doc["text"])
                if len(text_preview) > 300:
                    text_preview = text_preview[:300] + "..."
                print(f"{text_preview}\n")
        except RuntimeError as e:
            if "CUDA out of memory" in str(e):
                print("\nERROR: CUDA out of memory. Try clearing GPU cache and reducing model parameters.")
                
                # Clear GPU caches
                for i in range(torch.cuda.device_count()):
                    with torch.cuda.device(i):
                        torch.cuda.empty_cache()
                        torch.cuda.synchronize()
                
                print_gpu_utilization()
            else:
                print(f"\nERROR: {str(e)}")

In [None]:
# Entry point: start the interactive demo when this code is executed as the
# main program rather than imported as a module.
if __name__ == "__main__":
    demo_rag_system()