In [None]:
import pandas as pd
import numpy as np
import torch
import transformers
import ast
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel
from tokenizers.normalizers import BertNormalizer
import os
import re
from bert_score import score as bert_score
from rouge_score import rouge_scorer

In [None]:
def print_gpu_utilization():
    """Report allocated versus total memory (in GB) for each visible CUDA device.

    Prints a header line followed by one line per device reported by
    ``torch.cuda.device_count()``; only the header is printed when no
    device is visible.
    """
    bytes_per_gb = 1024**3
    print("\nGPU Memory Usage:")
    for gpu_id in range(torch.cuda.device_count()):
        used_gb = torch.cuda.memory_allocated(gpu_id) / bytes_per_gb
        total_gb = torch.cuda.get_device_properties(gpu_id).total_memory / bytes_per_gb
        print(f"GPU {gpu_id}: {used_gb:.2f} GB / {total_gb:.2f} GB")

In [None]:
def _parse_embedding_string(emb_str):
    """Convert one serialized embedding cell into a float array.

    A Python literal (e.g. ``"[0.1, 0.2]"``) is tried first; if that fails the
    cell is treated as comma-separated numbers with optional surrounding
    square brackets (e.g. ``"0.1,0.2"`` or a bracket-truncated ``"[0.1, 0.2"``).
    """
    try:
        return np.atleast_1d(np.asarray(ast.literal_eval(emb_str), dtype=float))
    except (ValueError, SyntaxError, TypeError):
        return np.array([float(x) for x in emb_str.strip('[]').split(',')], dtype=float)


def load_embeddings(embeddings_csv_path, text_column="document", embedding_column="embedding"):
    """Load documents and their pre-computed embeddings from a CSV file.

    Each row is parsed independently, so a file that mixes the two supported
    serializations still yields exactly one embedding per row (the previous
    implementation could append rows twice when the format changed mid-file,
    leaving the matrix misaligned with the DataFrame).

    Args:
        embeddings_csv_path: Path of the CSV file to read.
        text_column: Name of the column holding the document text.
        embedding_column: Name of the column holding the serialized embedding.

    Returns:
        Tuple ``(df, embeddings)`` where ``df`` is the DataFrame read from the
        CSV and ``embeddings`` is a 2-D float array with one row per ``df`` row.

    Raises:
        ValueError: If a required column is missing, the file has no rows,
            a cell cannot be parsed, or the parsed rows do not line up with
            the DataFrame.
    """
    print(f"Loading embeddings from {embeddings_csv_path}...")
    df = pd.read_csv(embeddings_csv_path)

    # Check if the necessary columns exist
    if text_column not in df.columns:
        raise ValueError(f"Text column '{text_column}' not found in CSV. Columns: {df.columns.tolist()}")

    if embedding_column not in df.columns:
        raise ValueError(f"Embedding column '{embedding_column}' not found in CSV. Columns: {df.columns.tolist()}")

    # An empty file previously surfaced as an opaque IndexError from .iloc[0]
    if df.empty:
        raise ValueError(f"No rows found in {embeddings_csv_path}; cannot load embeddings.")

    # Parse embeddings from string to numpy arrays, one row at a time
    print("Parsing embeddings from string format...")
    embeddings = []
    for row_number, emb_str in enumerate(df[embedding_column]):
        try:
            embeddings.append(_parse_embedding_string(emb_str))
        except Exception as e:
            raise ValueError(
                f"Failed to parse embeddings (row {row_number}): {e}. Please check the format."
            ) from e

    # Convert to a single matrix for faster processing; vstack raises
    # ValueError itself when rows have different dimensions.
    embeddings = np.vstack(embeddings)
    if embeddings.shape[0] != len(df):
        raise ValueError(
            f"Parsed {embeddings.shape[0]} embedding rows for {len(df)} documents. Please check the format."
        )
    print(f"Successfully loaded {len(embeddings)} embeddings with {embeddings.shape[1]} dimensions")

    return df, embeddings

In [None]:
def preprocess_embeddings(embeddings, df, text_column):
    """L2-normalize document embeddings and compute document length statistics.

    The input matrix is never modified: all work happens on a float copy, so
    re-running the cell that calls this function gives the same result and
    integer-typed inputs are handled correctly.

    Args:
        embeddings: 2-D array-like with one embedding per row.
        df: DataFrame holding the documents.
        text_column: Name of the column in ``df`` that contains the text.

    Returns:
        Tuple ``(embeddings, doc_lengths, avg_doc_length)`` where
        ``embeddings`` is the row-normalized float copy, ``doc_lengths`` is an
        array of whitespace-separated word counts per document and
        ``avg_doc_length`` is their mean (0.0 when there are no documents).
    """
    print("Normalizing document embeddings...")
    # Float copy: keeps the caller's array intact and prevents the small random
    # replacement values below from being truncated to 0 in an integer array.
    embeddings = np.array(embeddings, dtype=float)

    # Zero vectors cannot be normalized; replace them with small random values
    zero_indices = np.where(np.linalg.norm(embeddings, axis=1) == 0)[0]
    if len(zero_indices) > 0:
        print(f"Warning: Found {len(zero_indices)} zero vectors in embeddings. Replacing with small random values.")
        for idx in zero_indices:
            embeddings[idx] = np.random.normal(0, 0.01, embeddings.shape[1])

    # L2 normalize embeddings (in place on the private copy)
    embeddings /= np.linalg.norm(embeddings, axis=1, keepdims=True)

    # Calculate and store document statistics for weighting
    doc_lengths = np.array([len(str(doc).split()) for doc in df[text_column]])
    avg_doc_length = np.mean(doc_lengths) if doc_lengths.size else 0.0

    print(f"Average document length: {avg_doc_length:.2f} words")

    return embeddings, doc_lengths, avg_doc_length

In [None]:
def initialize_models(use_bert_base=True, llm_model_id="/kaggle/input/llama-3.1/transformers/8b/2"):
    """Initialize the query-embedding model and the text-generation LLM.

    Args:
        use_bert_base: When True, load ``bert-base-uncased`` (tokenizer, model
            and a ``BertNormalizer``) onto ``cuda:0``. When False, no embedding
            model is loaded and the corresponding return values are ``None``.
        llm_model_id: Model id or local path passed to the
            ``transformers`` text-generation pipeline.

    Returns:
        Tuple ``(tokenizer, embedding_model, norm, pipeline, gpu_count,
        embedding_device)``. The first three entries and ``embedding_device``
        are ``None`` when ``use_bert_base`` is False.
    """
    # Set memory optimization parameters before any model touches the GPU.
    # NOTE(review): presumably the CUDA allocator reads this when it is first
    # used, so it is set ahead of the BERT load as well — confirm.
    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

    gpu_count = torch.cuda.device_count()
    print(f"Found {gpu_count} GPUs")

    if gpu_count < 2:
        print("Warning: Less than 2 GPUs detected. Will use available resources.")

    # Clear GPU caches before loading models
    for i in range(gpu_count):
        with torch.cuda.device(i):
            torch.cuda.empty_cache()

    print_gpu_utilization()

    # Defaults so the return statement is valid even when no embedding model
    # is requested (previously this path raised UnboundLocalError).
    tokenizer = None
    embedding_model = None
    norm = None
    embedding_device = None

    # Load embedding model for queries on GPU 0
    if use_bert_base:
        print("Loading BERT Base Uncased for query embeddings on GPU 0...")
        embedding_device = "cuda:0"

        # NOTE(review): trust_remote_code=True is likely unnecessary for
        # bert-base-uncased — confirm and consider dropping it.
        tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
        embedding_model = AutoModel.from_pretrained('bert-base-uncased', trust_remote_code=True).to(embedding_device)

        # Load normalizer with simple settings
        norm = BertNormalizer(lowercase=True, strip_accents=True, clean_text=True)
    else:
        print("Warning: use_bert_base=False; no query embedding model was loaded.")

    print_gpu_utilization()

    # Load the LLM; placement is delegated to the device map chosen below
    print(f"Loading LLM from {llm_model_id}...")

    # Configure device map for model distribution
    if gpu_count >= 2:
        device_map = "balanced"  # Distribute across multiple GPUs
    else:
        device_map = "auto"      # Let Transformers decide

    # Load LLM with proper configuration
    pipeline = transformers.pipeline(
        "text-generation",
        model=llm_model_id,
        model_kwargs={
            "torch_dtype": torch.bfloat16,
            "device_map": device_map,
            "offload_folder": "/tmp/offload"  # Enable offloading if needed
        }
    )

    print_gpu_utilization()
    print("Model initialization complete")

    return tokenizer, embedding_model, norm, pipeline, gpu_count, embedding_device

In [None]:
def normalize_text(text, norm):
    """Normalize text line by line and collapse it into a single spaced string.

    Args:
        text: Input text; ``None`` or an empty value yields ``""``. Non-string
            values are converted with ``str()``.
        norm: Object exposing ``normalize_str(str) -> str`` (the notebook
            passes a ``BertNormalizer``).

    Returns:
        The normalized lines joined by single spaces. Runs of whitespace are
        collapsed and blank lines are dropped, so the result never contains
        consecutive spaces.
    """
    # Handle empty or None input
    if not text:
        return ""

    # Split into lines and normalize each line
    normalized_lines = (norm.normalize_str(line) for line in str(text).strip().split('\n'))

    # Collapse whitespace inside each line
    collapsed_lines = (re.sub(r'\s+', ' ', line).strip() for line in normalized_lines)

    # Skip lines that ended up empty; joining them would leave double spaces
    return ' '.join(line for line in collapsed_lines if line)

In [None]:
def expand_query(query):
    """Append the query's own key terms to it so they carry extra weight.

    Key terms are the lowercase word tokens of ``query`` that are longer than
    two characters and not in a small stop-word list. When no key term is
    found the query is returned unchanged.
    """
    # Simplified stop-word list; this rule-based step could later be swapped
    # for a more sophisticated expansion strategy.
    ignored_words = {'the', 'a', 'an', 'and', 'or', 'but', 'is', 'are', 'was', 'were',
                     'in', 'on', 'at', 'to', 'for', 'with', 'by', 'about', 'like', 'from'}

    emphasized_terms = []
    for token in re.findall(r'\b\w+\b', query.lower()):
        if len(token) <= 2 or token in ignored_words:
            continue
        emphasized_terms.append(token)

    # Nothing worth emphasizing: hand back the original query
    if len(emphasized_terms) == 0:
        return query

    # Original query first, then the key terms repeated for emphasis
    return " ".join([query, " ".join(emphasized_terms)])

In [None]:
def get_query_embedding(query, tokenizer, embedding_model, embedding_device, norm, use_bert_base=True, embedding_pooling='mean_cls'):
    """Embed a query with the BERT model and return a unit-length vector.

    The query is first expanded with ``expand_query`` and normalized with
    ``normalize_text``, then tokenized (truncated to 512 tokens) and run
    through ``embedding_model`` without gradients.

    Args:
        query: The raw query text.
        tokenizer: Tokenizer matching ``embedding_model``.
        embedding_model: Model whose output exposes ``last_hidden_state``.
        embedding_device: Device (string or ``torch.device``) the model lives
            on; the tokenized inputs are moved there.
        norm: Normalizer passed through to ``normalize_text``.
        use_bert_base: Must be True; other embedding back-ends are unsupported.
        embedding_pooling: ``'cls'`` (first token), ``'mean'`` (attention-mask
            weighted mean) or anything else for the default ``'mean_cls'``
            blend (0.7 * mean + 0.3 * CLS).

    Returns:
        1-D numpy array for the first (only) sequence in the batch, scaled to
        unit L2 norm. A zero vector is returned unscaled instead of NaNs.

    Raises:
        ValueError: If ``use_bert_base`` is False.
    """
    import contextlib

    if not use_bert_base:
        raise ValueError("Only BERT Base Uncased embedding is currently supported")

    # Apply query expansion to improve matching
    expanded_query = expand_query(query)

    # Normalize query
    normalized_query = normalize_text(expanded_query, norm)

    # Scope the computation to the device the model actually lives on rather
    # than a hard-coded GPU 0; no CUDA context is entered for non-CUDA devices.
    device = torch.device(embedding_device)
    if device.type == "cuda":
        device_scope = torch.cuda.device(device)
    else:
        device_scope = contextlib.nullcontext()

    with device_scope, torch.no_grad():
        # Tokenize with increased max length for better coverage
        tokenized_query = tokenizer(normalized_query, padding=True,
                                    truncation=True, max_length=512, return_tensors='pt').to(embedding_device)

        outputs = embedding_model(**tokenized_query)

        # assumes last_hidden_state is (batch, seq, hidden) — TODO confirm
        last_hidden_state = outputs.last_hidden_state
        attention_mask = tokenized_query['attention_mask']

        # CLS token embedding (first token)
        cls_embedding = last_hidden_state[:, 0]

        if embedding_pooling == 'cls':
            pooled = cls_embedding
        else:
            # Mean pooling weighted by the attention mask; the clamp avoids
            # dividing by zero for an all-masked sequence.
            input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
            sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
            sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
            mean_embedding = sum_embeddings / sum_mask

            if embedding_pooling == 'mean':
                pooled = mean_embedding
            else:  # 'mean_cls' (default)
                # Combine both methods (0.7 for mean pooling, 0.3 for CLS token)
                pooled = 0.7 * mean_embedding + 0.3 * cls_embedding

        query_embedding = pooled.cpu().numpy()[0]

    # Normalize the embedding vector to unit length for proper cosine similarity
    embedding_norm = np.linalg.norm(query_embedding)
    if embedding_norm > 0:
        query_embedding = query_embedding / embedding_norm

    return query_embedding

In [None]:
def find_similar_texts(query_embedding, embeddings, df, text_column, doc_lengths, avg_doc_length,
                      top_k=5, distance_metric='hybrid', similarity_threshold=0.6):
    """Retrieve the documents most similar to a query embedding.

    Args:
        query_embedding: 1-D query vector.
        embeddings: 2-D matrix with one document embedding per row.
        df: DataFrame holding the documents, row-aligned with ``embeddings``.
        text_column: Column of ``df`` containing the document text.
        doc_lengths: Per-document word counts (used by the hybrid metric).
        avg_doc_length: Mean of ``doc_lengths`` (used by the hybrid metric).
        top_k: Maximum number of documents to return; values <= 0 return ``[]``.
        distance_metric: ``'cosine'``, ``'dot'`` or ``'hybrid'`` (cosine
            similarity scaled by a BM25-style length weight).
        similarity_threshold: Documents scoring below this are dropped, unless
            nothing passes, in which case the plain top ``top_k`` are used.

    Returns:
        List of dicts with keys ``index``, ``text``, ``similarity`` and
        ``final_score``, sorted by ``final_score`` in descending order.

    Raises:
        ValueError: If ``distance_metric`` is not one of the supported values.
    """
    if distance_metric not in ('cosine', 'dot', 'hybrid'):
        raise ValueError(f"Unknown distance metric: {distance_metric}")

    # A non-positive top_k used to slice with [-0:], which returns *every*
    # document; an empty corpus has nothing to rank either.
    if top_k <= 0 or len(embeddings) == 0:
        return []

    if distance_metric == 'cosine':
        similarities = cosine_similarity([query_embedding], embeddings)[0]

    elif distance_metric == 'dot':
        similarities = np.dot(embeddings, query_embedding)

    else:  # 'hybrid'
        cosine_sim = cosine_similarity([query_embedding], embeddings)[0]

        # BM25-inspired length weighting: the weight is 1 for an
        # average-length document, below 1 for longer documents and above 1
        # for shorter ones. k1 and b are BM25 parameters (can be tuned).
        k1 = 1.5
        b = 0.75
        length_norm = (1 - b) + b * (doc_lengths / avg_doc_length)
        bm25_weights = (k1 + 1) / (k1 * length_norm + 1)

        similarities = cosine_sim * bm25_weights

    # Keep documents that pass the threshold; fall back to all documents if
    # none qualifies.
    qualified_indices = np.where(similarities >= similarity_threshold)[0]
    if len(qualified_indices) == 0:
        candidate_indices = np.arange(len(similarities))
    else:
        candidate_indices = qualified_indices

    # Highest-scoring candidates first, at most top_k of them
    ranking = similarities[candidate_indices].argsort()[::-1][:top_k]
    top_indices = candidate_indices[ranking]

    # Column-first lookup avoids materializing a whole row per hit
    texts = df[text_column]

    results = []
    for idx in top_indices:
        doc_text = texts.iloc[idx]

        # Apply a small penalty for very short or very long documents
        doc_length = len(str(doc_text).split())
        if doc_length < 50:
            length_penalty = 0.9
        elif doc_length > 1000:
            length_penalty = 0.95
        else:
            length_penalty = 1.0

        similarity = float(similarities[idx])
        results.append({
            "index": int(idx),
            "text": doc_text,
            "similarity": similarity,
            "final_score": similarity * length_penalty
        })

    # Final sort by the combined score
    results.sort(key=lambda x: x["final_score"], reverse=True)

    return results

In [None]:
def format_context(similar_texts, question, max_references=3):
    """Build the LLM context block from the retrieved documents.

    Each document is scored as ``similarity * (1 + matched / total)`` where
    ``matched`` counts the question's key terms that occur (as substrings) in
    the lowercased document text. The best ``max_references`` documents are
    emitted as numbered ``REFERENCE`` sections.

    Args:
        similar_texts: Iterable of dicts with at least ``'text'`` and
            ``'similarity'`` keys.
        question: The user's question; its key terms drive the re-scoring.
        max_references: Maximum number of references to include (default 3).

    Returns:
        The formatted context string; only the header when ``similar_texts``
        is empty.
    """
    context = "DIRECT CONTEXT FOR ANSWERING:\n\n"

    # Extract key terms from the question
    question_terms = set(re.findall(r'\b\w+\b', question.lower()))
    stop_words = {'the', 'a', 'an', 'and', 'or', 'but', 'is', 'are', 'was', 'were'}
    key_terms = {term for term in question_terms if term not in stop_words and len(term) > 2}
    key_term_count = len(key_terms)

    # Prioritize and weight reference texts
    scored_texts = []
    for result in similar_texts:
        doc_text = result['text']
        # str() guards against non-string cells (e.g. missing values)
        searchable_text = str(doc_text).lower()

        # Score based on key term matches
        term_matches = sum(1 for term in key_terms if term in searchable_text)

        # A question without key terms (e.g. only stop words) gets no boost
        # instead of raising ZeroDivisionError.
        match_ratio = term_matches / key_term_count if key_term_count else 0.0

        score = result['similarity'] * (1 + match_ratio)
        scored_texts.append((doc_text, score))

    # Sort texts by weighted score
    scored_texts.sort(key=lambda x: x[1], reverse=True)

    # Truncate to top most relevant texts
    top_texts = scored_texts[:max_references]

    # Format context with clear markers
    for i, (text, score) in enumerate(top_texts, 1):
        context += f"REFERENCE {i} (Relevance Score: {score:.2f}):\n{text}\n\n"

    return context

In [None]:
def generate_answer(question, context, pipeline):
    """Ask the LLM pipeline to answer ``question`` grounded in ``context``.

    The prompt is the system instruction, the context, a short reminder block
    and the question. The pipeline's ``generated_text`` is expected to start
    with the prompt; everything after it is returned, stripped. A fixed
    apology string is returned when the completion is empty or when the
    pipeline call raises.
    """
    system_instruction = """You are a precise and informative assistant. When answering questions:
1. Directly use phrases and key information from the provided context.
2. Aim to closely match the language and terminology of the source documents.
3. Cite specific details from the context whenever possible.
4. If you cannot find a definitive answer, clearly state that.
5. Prioritize reproducing key information over generating entirely novel text."""

    prompt = f"""{system_instruction}

CONTEXT:
{context}

IMPORTANT INSTRUCTIONS:
- Use the exact wording from the context when relevant
- Directly incorporate phrases from the provided documents
- Ensure your answer closely reflects the source information

QUESTION: {question}

ANSWER:"""

    # Sampling configuration: low temperature and tight top-p/top-k keep the
    # output focused. NOTE(review): a repetition_penalty above 1 normally
    # discourages repeating tokens, which may work against copying context
    # phrases — confirm the intended value.
    generation_settings = {
        "max_new_tokens": 512,
        "do_sample": True,
        "temperature": 0.3,
        "top_p": 0.8,
        "top_k": 30,
        "repetition_penalty": 1.2,
        "batch_size": 1,
    }

    try:
        outputs = pipeline(prompt, **generation_settings)
        # Keep only the text generated after the prompt
        completion = outputs[0]['generated_text'][len(prompt):].strip()
    except Exception as e:
        print(f"Error generating answer: {e}")
        return "I apologize, but I encountered an error while trying to generate an answer."

    if completion:
        return completion

    # Fallback if answer is empty
    return "I apologize, but I couldn't generate a substantive answer based on the available context."

In [None]:
def evaluate_answer(answer, similar_texts):
    """Score ``answer`` against the top retrieved documents.

    BERTScore and ROUGE are computed independently; a failure in either one is
    printed (with traceback) and leaves that section of the result empty.

    Args:
        answer: The generated answer text.
        similar_texts: Retrieved documents (dicts with a ``'text'`` key); only
            the first three are used as references.

    Returns:
        Dict with ``'bert_scores'`` (precision/recall/f1 means) and
        ``'rouge_scores'`` (per-metric precision/recall/fmeasure means); each
        is an empty dict when it could not be computed.
    """
    import traceback

    evaluation = {
        "bert_scores": {},
        "rouge_scores": {}
    }

    # ---- BERTScore -------------------------------------------------------
    try:
        # Keep only non-empty references longer than 10 characters
        bert_refs = [
            text
            for text in (doc['text'] for doc in similar_texts[:3])
            if text and len(text.strip()) > 10
        ]

        if bert_refs and answer and len(answer.strip()) > 0:
            # A lone reference is repeated three times, then capped at three
            if len(bert_refs) == 1:
                bert_refs = bert_refs * 3
            bert_refs = bert_refs[:3]

            if len(bert_refs) > 1:
                # The answer is repeated so candidates and references pair up
                P, R, F1 = bert_score(
                    cands=[answer] * len(bert_refs),
                    refs=bert_refs,
                    lang="en",
                    verbose=False,
                    model_type='bert-base-uncased'
                )
                evaluation["bert_scores"] = {
                    'precision': float(P.mean()),
                    'recall': float(R.mean()),
                    'f1': float(F1.mean())
                }
    except Exception as e:
        print(f"BERTScore calculation error: {e}")
        traceback.print_exc()

    # ---- ROUGE -----------------------------------------------------------
    try:
        rouge_metrics = ['rouge1', 'rouge2', 'rougeL']
        scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

        # Two references per document: the full text and an extractive
        # summary made of its first and last sentences.
        rouge_refs = []
        for doc in similar_texts[:3]:
            full_text = doc['text']
            rouge_refs.append(full_text)

            sentences = re.split(r'(?<=[.!?])\s+', full_text)
            if len(sentences) > 1:
                rouge_refs.append(sentences[0] + " " + sentences[-1])
            else:
                rouge_refs.append(full_text)

        # Score the answer against every usable reference
        per_reference = [
            scorer.score(ref_text, answer)
            for ref_text in rouge_refs
            if ref_text and len(ref_text.strip()) > 10 and answer and len(answer.strip()) > 0
        ]

        # Average each metric component across references
        if per_reference:
            evaluation["rouge_scores"] = {
                metric: {
                    field: np.mean([getattr(result[metric], field) for result in per_reference])
                    for field in ('precision', 'recall', 'fmeasure')
                }
                for metric in rouge_metrics
            }
    except Exception as e:
        print(f"ROUGE score calculation error: {e}")
        traceback.print_exc()

    return evaluation

In [None]:
def answer_question(question, df, embeddings, tokenizer, embedding_model, norm, pipeline, 
                   gpu_count, embedding_device, doc_lengths, avg_doc_length, 
                   text_column="document", top_k=5, use_bert_base=True, 
                   distance_metric='hybrid', embedding_pooling='mean_cls', 
                   similarity_threshold=0.6):
    """
    Run one retrieval-augmented question answering round.

    Steps: embed the question, retrieve similar documents, format them into a
    context block, generate an answer with the LLM and evaluate it against the
    retrieved documents.

    Args:
        question: The user's question
        df: DataFrame containing documents
        embeddings: Document embeddings
        tokenizer: BERT tokenizer
        embedding_model: BERT model for embeddings
        norm: Text normalizer
        pipeline: LLM pipeline
        gpu_count: Number of GPUs whose caches are cleared before generation
        embedding_device: Device for embedding model
        doc_lengths: Array of document lengths
        avg_doc_length: Average document length
        text_column: Name of column containing text
        top_k: Number of top results to retrieve
        use_bert_base: Whether to use BERT Base Uncased
        distance_metric: Similarity metric to use
        embedding_pooling: Pooling strategy
        similarity_threshold: Minimum similarity score

    Returns:
        Dictionary with keys "answer", "top_results" and "evaluation"
    """
    def _release_gpu_memory(device_ids):
        # Empty the CUDA cache on each listed device and wait for it to finish
        for device_id in device_ids:
            with torch.cuda.device(device_id):
                torch.cuda.empty_cache()
                torch.cuda.synchronize()

    print("\nProcessing query...")

    # Only GPU 0 is cleared ahead of the embedding step
    _release_gpu_memory([0])

    print("Getting query embedding on GPU 0...")
    query_embedding = get_query_embedding(
        question, tokenizer, embedding_model, embedding_device, norm,
        use_bert_base, embedding_pooling
    )

    print("Finding similar texts...")
    similar_texts = find_similar_texts(
        query_embedding, embeddings, df, text_column, doc_lengths, avg_doc_length,
        top_k, distance_metric, similarity_threshold
    )

    # Nothing retrieved: answer with a fixed message and empty metrics
    if not similar_texts:
        empty_evaluation = {
            "bert_scores": {},
            "rouge_scores": {}
        }
        return {
            "answer": "I couldn't find any relevant information to answer your question.",
            "top_results": [],
            "evaluation": empty_evaluation
        }

    # Build the context handed to the LLM
    context = format_context(similar_texts, question)

    # Clear every GPU's cache before LLM inference
    _release_gpu_memory(range(gpu_count))

    print_gpu_utilization()
    print("Generating answer with LLM...")

    answer = generate_answer(question, context, pipeline)
    evaluation = evaluate_answer(answer, similar_texts)

    print("Processing complete.")
    print_gpu_utilization()

    outcome = {
        "answer": answer,
        "top_results": similar_texts,
        "evaluation": evaluation
    }
    return outcome

In [None]:
def _print_demo_result(result):
    """Print the answer, evaluation metrics and top documents of one query."""
    print("\n" + "="*50)
    print("ANSWER:")
    print("="*50)
    print(result["answer"])

    print("\n" + "="*50)
    print("ANSWER EVALUATION:")
    print("="*50)

    # BERTScore handling
    bert_scores = result["evaluation"].get("bert_scores", {})
    if bert_scores:
        print("BERTScore:")
        for metric, value in bert_scores.items():
            print(f"  {metric.capitalize()}: {value:.4f}")
    else:
        print("BERTScore: No scores available")

    # ROUGE Scores handling
    rouge_scores = result["evaluation"].get("rouge_scores", {})
    if rouge_scores:
        print("\nROUGE Scores:")
        for metric, scores in rouge_scores.items():
            print(f"  {metric.upper()}:")
            for score_type, value in scores.items():
                print(f"    {score_type.capitalize()}: {value:.4f}")
    else:
        print("ROUGE Scores: No scores available")

    print("\n" + "="*50)
    print("TOP RELEVANT DOCUMENTS:")
    print("="*50)
    for i, doc in enumerate(result["top_results"], 1):
        print(f"{i}. Similarity: {doc['similarity']:.4f}, Final Score: {doc['final_score']:.4f}")
        # Limit text display to avoid overwhelming console
        text_preview = doc["text"]
        if len(text_preview) > 300:
            text_preview = text_preview[:300] + "..."
        print(f"{text_preview}\n")


def demo_rag_system():
    """Run an interactive demonstration of the RAG system.

    Loads the embeddings CSV and the models once, then repeatedly reads a
    question from standard input, answers it and prints the result. Typing
    ``exit`` (or reaching end of input) leaves the loop. Errors while
    answering are reported and the loop continues; a CUDA out-of-memory error
    additionally clears the GPU caches.
    """
    import traceback

    # Local path of the LLM weights (its existence is not verified here)
    model_id = "/kaggle/input/llama-3.1/transformers/8b/2"

    # Initialize the RAG system with optimized parameters
    print("Loading embeddings...")
    df, embeddings = load_embeddings(
        embeddings_csv_path="/kaggle/input/bert-embeddings/bert_embeddings.csv",
        text_column="text_preview",
        embedding_column="embeddings"
    )

    print("Preprocessing embeddings...")
    embeddings, doc_lengths, avg_doc_length = preprocess_embeddings(embeddings, df, "text_preview")

    print("Initializing models...")
    tokenizer, embedding_model, norm, pipeline, gpu_count, embedding_device = initialize_models(
        use_bert_base=True,
        llm_model_id=model_id
    )

    # Interactive query loop
    print("\n" + "="*50)
    print("Optimized RAG Question Answering System")
    print("="*50)
    print("Type 'exit' to quit\n")

    while True:
        try:
            question = input("\nEnter your question: ")
        except EOFError:
            # No more input available (e.g. non-interactive run)
            break

        # Tolerate surrounding whitespace and ignore blank submissions
        question = question.strip()
        if question.lower() == 'exit':
            break
        if not question:
            continue

        print("\nProcessing...")
        try:
            result = answer_question(
                question, df, embeddings, tokenizer, embedding_model, norm, pipeline,
                gpu_count, embedding_device, doc_lengths, avg_doc_length,
                text_column="text_preview", top_k=5, use_bert_base=True,
                distance_metric='hybrid', embedding_pooling='mean_cls',
                similarity_threshold=0.55
            )
            _print_demo_result(result)

        # RuntimeError must be handled before the generic Exception clause;
        # in the opposite order this branch can never be reached.
        except RuntimeError as e:
            if "CUDA out of memory" in str(e):
                print("\nERROR: CUDA out of memory. Try clearing GPU cache and reducing model parameters.")

                # Clear GPU caches
                for i in range(torch.cuda.device_count()):
                    with torch.cuda.device(i):
                        torch.cuda.empty_cache()
                        torch.cuda.synchronize()

                print_gpu_utilization()
            else:
                print(f"\nERROR: {str(e)}")
                traceback.print_exc()

        except Exception as e:
            print(f"\nERROR: {str(e)}")
            traceback.print_exc()

In [None]:
if __name__ == "__main__":
    # Configure the CUDA allocator through the environment, then launch the
    # interactive demo.
    os.environ.update(PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True")
    demo_rag_system()