In [None]:
import os
from google.colab import drive
# Mount Google Drive in the Colab runtime and switch the working directory to
# the project folder, so relative paths used later resolve against it.
drive.mount('/content/drive')
%cd /content/drive/MyDrive/LLM_Nutriplan

In [None]:
# Install the third-party packages imported by the cells below.
# NOTE(review): versions are unpinned; consider `%pip install -q pkg==x.y.z` for reproducible runs.
!pip install langchain langchain-community transformers sentence-transformers pymongo

In [None]:
import re
import torch
from typing import List, Dict, Tuple
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_core.documents import Document
from pymongo import MongoClient
import numpy as np

# BƯỚC 1: HIERARCHICAL PARSER - Phân tích cấu trúc tài liệu

In [None]:
class NutriPlanManualParser:
    """
    Parser specialised for the Nutri Plan Manual markdown layout:
    - Part 1, 2, 3, 4 (## headers)
    - Subsections with ### (numbered: 1., 2., 3.)
    - Sub-subsections with #### (Feature:, Capabilities:, etc.)
    """

    def __init__(self, filepath: str):
        """Read the whole manual at ``filepath`` (UTF-8) into ``self.content``."""
        with open(filepath, 'r', encoding='utf-8') as f:
            self.content = f.read()
        self.sections = []

    def parse(self) -> List[Dict]:
        """
        Split the manual into sections, one per ``##``/``###``/``####`` header.

        Lines that appear before the first matching header are ignored.
        Headers that have no body text of their own are NOT returned as
        sections, but they are still remembered as ancestors of the sections
        that follow, so breadcrumbs stay complete.

        Returns:
            List of section dicts with keys ``level``, ``title``, ``type``,
            ``line_start``, ``line_end``, ``content`` and ``ancestors``
            (outermost-first list of ``{'level', 'title', 'type'}`` dicts for
            every enclosing header, including headers without body text).
        """
        lines = self.content.split('\n')
        sections = []
        # Every header currently enclosing the cursor, whether or not it ends
        # up in `sections`; this is what keeps content-less parents visible.
        header_stack = []
        current_section = None
        buffer = []

        for i, line in enumerate(lines):
            # Detect headers (##, ###, ####)
            header_match = re.match(r'^(#{2,4})\s+(.+)$', line.strip())

            if header_match:
                # Close the section that was open until the previous line
                if current_section is not None:
                    self._close_section(current_section, buffer, i - 1, sections)

                level = len(header_match.group(1))
                title = header_match.group(2).strip()
                section_type = self._detect_section_type(title, level)

                # Headers at the same or a deeper level cannot be parents
                while header_stack and header_stack[-1]['level'] >= level:
                    header_stack.pop()

                current_section = {
                    'level': level,
                    'title': title,
                    'type': section_type,
                    'line_start': i,
                    'line_end': None,
                    'content': '',
                    'ancestors': list(header_stack)
                }
                header_stack.append(
                    {'level': level, 'title': title, 'type': section_type}
                )
                buffer = []

            elif current_section is not None:
                # Body line of the currently open section
                buffer.append(line)

        # Close the final section at end of file
        if current_section is not None:
            self._close_section(current_section, buffer, len(lines) - 1, sections)

        return sections

    @staticmethod
    def _close_section(section: Dict, buffer: List[str], line_end: int,
                       sections: List[Dict]) -> None:
        """Finalize ``section`` and append it to ``sections`` if it has body text."""
        section['content'] = '\n'.join(buffer).strip()
        section['line_end'] = line_end
        if section['content']:
            sections.append(section)

    def _detect_section_type(self, title: str, level: int) -> str:
        """Classify a header by case-insensitive substring matches on its title.

        Checks run in order and the first match wins: ``'part'`` (level-2
        headers only), ``'hub'``, ``'feature'``, ``'instruction'``; anything
        else is ``'general'``.
        """
        title_lower = title.lower()

        # Part-level (## Part X:)
        if level == 2 and 'part' in title_lower:
            return 'part'

        # Feature categories
        if 'hub' in title_lower:
            return 'hub'

        if any(word in title_lower for word in ['feature:', 'planner', 'collection', 'user']):
            return 'feature'

        # Subsections
        if any(word in title_lower for word in ['how to', 'capabilities', 'limitations', 'usage']):
            return 'instruction'

        # Default
        return 'general'

    def build_hierarchy(self, sections: List[Dict]) -> List[Dict]:
        """
        Add ``parent_path``, ``full_path`` and ``parent_type`` to each section.

        When a section carries the ``ancestors`` list produced by ``parse()``,
        that list is used, so parents without body text still appear in the
        breadcrumb. Sections without ``ancestors`` fall back to inferring
        parents from the preceding sections in ``sections`` by header level.

        The dicts are modified in place and the same list is returned.
        """
        stack = []

        for section in sections:
            # Drop entries at the same or a deeper level (they are not parents)
            while stack and stack[-1]['level'] >= section['level']:
                stack.pop()

            ancestors = section.get('ancestors')
            if ancestors is None:
                ancestors = stack

            # Build the breadcrumb path
            parent_titles = [a['title'] for a in ancestors]
            section['parent_path'] = ' > '.join(parent_titles) if parent_titles else ''
            section['full_path'] = (
                section['parent_path'] + ' > ' if section['parent_path'] else ''
            ) + section['title']

            # Type of the direct parent, useful for filtering
            section['parent_type'] = ancestors[-1]['type'] if ancestors else None

            stack.append(section)

        return sections

# BƯỚC 2: SEMANTIC CHUNKER - Chia nhỏ dựa trên ngữ nghĩa

In [None]:
class SmartSemanticChunker:
    """
    Chunker with separator rules chosen for technical documentation.

    Wraps ``RecursiveCharacterTextSplitter`` and attaches section/hierarchy
    metadata to every chunk it produces.
    """

    def __init__(self,
                 chunk_size: int = 800,
                 chunk_overlap: int = 200):
        """
        Args:
            chunk_size: Maximum chunk length, measured with ``len``.
            chunk_overlap: Overlap between consecutive chunks, same unit.
        """
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

        # Separators in priority order for technical docs
        self.separators = [
            "\n\n\n",          # Major section breaks
            "\n\n",            # Paragraph breaks
            "\n* ",            # Bullet points
            "\n- ",            # Dash lists
            "\n1. ", "\n2. ",  # Numbered lists
            ". ",              # Sentences
            ", ",              # Clauses
            " ",               # Words
            ""                 # Characters
        ]

        self.splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
            separators=self.separators,
            length_function=len,
            is_separator_regex=False
        )

    def chunk_section(self, section: Dict) -> List[Document]:
        """
        Split one parsed section into ``Document`` chunks with metadata.

        Args:
            section: Section dict providing ``content``, ``title``, ``level``,
                ``type``, ``parent_path``, ``parent_type`` and ``full_path``.

        Returns:
            One ``Document`` per chunk; an empty list when the section's
            content is empty after stripping.
        """
        content = section['content'].strip()
        if not content:
            return []

        # Preprocessing hook for list structure
        content = self._preserve_list_structure(content)

        # Split into chunks
        texts = self.splitter.split_text(content)

        # Build Documents with rich metadata
        docs = []
        for i, text in enumerate(texts):
            # Key phrases are extracted per chunk, not per section
            key_phrases = self._extract_key_phrases(text)

            doc = Document(
                page_content=text,
                metadata={
                    # Section info
                    'section_title': section['title'],
                    'section_level': section['level'],
                    'section_type': section['type'],

                    # Hierarchy
                    'parent_path': section['parent_path'],
                    'parent_type': section['parent_type'],
                    'full_path': section['full_path'],

                    # Chunk info
                    'chunk_index': i,
                    'total_chunks': len(texts),
                    'chunk_size': len(text),

                    # Searchability
                    'key_phrases': key_phrases,

                    # Source
                    'source': 'nutri_plan_manual',
                    'doc_type': 'user_manual'
                }
            )
            docs.append(doc)

        return docs

    def _preserve_list_structure(self, content: str) -> str:
        """Hook meant to keep bullet points with their context.

        Currently returns ``content`` unchanged, leaving markdown list
        formatting as it is.
        """
        return content

    def _extract_key_phrases(self, text: str) -> List[str]:
        """Extract up to 10 key terms from ``text`` to enhance searchability.

        Collects, in this order, ``**bold**`` spans, double-quoted spans and
        backtick code spans. Duplicates are removed while keeping first-seen
        order, so the result is identical on every run.
        """
        # Extract words in **bold** (markdown)
        bold_phrases = re.findall(r'\*\*(.+?)\*\*', text)

        # Extract quoted terms
        quoted = re.findall(r'"(.+?)"', text)

        # Extract code/path references
        code_refs = re.findall(r'`(.+?)`', text)

        # dict.fromkeys dedupes while preserving insertion order; a set would
        # make both the order and the surviving 10 phrases vary between runs.
        key_phrases = list(dict.fromkeys(bold_phrases + quoted + code_refs))

        return key_phrases[:10]  # Limit to the first 10

# BƯỚC 3: EMBEDDING & STORAGE PIPELINE

In [None]:
class DocumentEmbeddingPipeline:
    """Pipeline that parses, chunks and embeds the manual, stores the chunks in
    MongoDB, and searches them (Atlas Vector Search with a brute-force fallback)."""

    def __init__(self,
                 mongodb_uri: str,
                 database_name: str,
                 collection_name: str = "user_manual_embeddings"):
        """
        Load the embedding model, connect to MongoDB and create indexes.

        Args:
            mongodb_uri: Connection string passed to ``MongoClient``.
            database_name: Database that holds the chunk collection.
            collection_name: Collection used to store chunks and embeddings.
        """
        print("üöÄ Initializing Embedding Pipeline...")

        # Initialize embeddings
        # NOTE(review): the E5 model card recommends "query: " / "passage: "
        # input prefixes; none are added here -- confirm whether that is intended.
        print("   Loading multilingual-e5-small model...")
        self.embeddings = HuggingFaceEmbeddings(
            model_name="intfloat/multilingual-e5-small",
            model_kwargs={'device': 'cuda' if torch.cuda.is_available() else 'cpu'},
            encode_kwargs={'normalize_embeddings': True}
        )
        print(f"   ‚úì Model loaded on: {'CUDA' if torch.cuda.is_available() else 'CPU'}")

        # MongoDB connection
        print("   Connecting to MongoDB...")
        self.client = MongoClient(mongodb_uri)
        self.db = self.client[database_name]
        self.collection = self.db[collection_name]
        print(f"   ‚úì Connected to {database_name}.{collection_name}")

        # Setup indexes
        self._create_indexes()

    def _create_indexes(self):
        """Create text and metadata indexes, then print the Atlas vector index spec."""
        print("\nüìä Setting up database indexes...")

        # Text search index. MongoDB permits only one text index per
        # collection, so both fields go into a single compound text index;
        # two separate create_index calls would make the second one fail.
        try:
            self.collection.create_index([
                ("metadata.section_title", "text"),
                ("metadata.key_phrases", "text")
            ])
            print("   ‚úì Text search indexes created")
        except Exception as e:
            # Best-effort: e.g. an older, differently shaped text index exists
            print(f"   ‚ö†Ô∏è  Text index warning: {e}")

        # Metadata filter indexes
        indexes = [
            "metadata.full_path",
            "metadata.section_type",
            "metadata.section_level",
            "metadata.parent_type"
        ]

        for idx in indexes:
            self.collection.create_index(idx)

        print("   ‚úì Metadata filter indexes created")

        # Vector Search reminder (the Atlas index itself is not created here)
        print("\n" + "="*70)
        print("‚ö†Ô∏è  ATLAS VECTOR SEARCH INDEX REQUIRED FOR PRODUCTION")
        print("="*70)
        print(f"Database: {self.db.name}")
        print(f"Collection: {self.collection.name}")
        print("\nCreate index with this configuration:")
        print("""
{
  "fields": [
    {
      "type": "vector",
      "path": "embedding",
      "numDimensions": 384,
      "similarity": "cosine"
    },
    {
      "type": "filter",
      "path": "metadata.section_type"
    },
    {
      "type": "filter",
      "path": "metadata.parent_type"
    },
    {
      "type": "filter",
      "path": "metadata.full_path"
    }
  ]
}
        """)
        print("="*70 + "\n")

    def process_document(self, filepath: str) -> Dict:
        """
        Main processing pipeline: parse, chunk, embed, then replace the
        collection's contents with the new chunks.

        Args:
            filepath: Path to the markdown manual file

        Returns:
            Statistics dictionary

        Raises:
            ValueError: If the file yields no chunks. This is raised before
                anything is embedded or deleted, so stored data is untouched.
        """
        print("="*70)
        print("üîÑ DOCUMENT PROCESSING PIPELINE")
        print("="*70)

        # Step 1: Parse document
        print("\nüìñ Step 1: Parsing document structure...")
        parser = NutriPlanManualParser(filepath)
        sections = parser.parse()
        sections = parser.build_hierarchy(sections)

        print(f"   ‚úì Parsed {len(sections)} sections")

        # Show section distribution
        section_types = {}
        for s in sections:
            section_types[s['type']] = section_types.get(s['type'], 0) + 1

        print("\n   Section distribution:")
        for stype, count in section_types.items():
            print(f"   - {stype}: {count}")

        # Step 2: Chunk sections
        print("\n‚úÇÔ∏è  Step 2: Creating semantic chunks...")
        chunker = SmartSemanticChunker(chunk_size=800, chunk_overlap=200)
        all_docs = []

        for section in sections:
            docs = chunker.chunk_section(section)
            all_docs.extend(docs)

        print(f"   ‚úì Created {len(all_docs)} chunks")

        # Without chunks, vectors[0] and insert_many([]) below would fail with
        # unrelated errors; stop here with a clear message instead.
        if not all_docs:
            raise ValueError(
                f"No content chunks were produced from '{filepath}'; "
                "the collection was left untouched."
            )

        # Step 3: Generate embeddings
        print("\nüßÆ Step 3: Generating embeddings...")
        texts = [doc.page_content for doc in all_docs]

        print(f"   Processing {len(texts)} texts...")
        vectors = self.embeddings.embed_documents(texts)

        print(f"   ‚úì Generated {len(vectors)} vectors (dim={len(vectors[0])})")

        # Step 4: Store in MongoDB
        print("\nüíæ Step 4: Storing in MongoDB...")

        # Clear existing data (the collection is fully replaced on every run)
        delete_result = self.collection.delete_many({})
        print(f"   Cleared {delete_result.deleted_count} existing documents")

        # Prepare documents
        documents_to_insert = []
        for doc, vector in zip(all_docs, vectors):
            doc_dict = {
                'text': doc.page_content,
                'embedding': vector,
                'metadata': doc.metadata
            }
            documents_to_insert.append(doc_dict)

        # Batch insert
        result = self.collection.insert_many(documents_to_insert)
        print(f"   ‚úì Inserted {len(result.inserted_ids)} documents")

        # Return statistics
        stats = {
            'total_sections': len(sections),
            'section_types': section_types,
            'total_chunks': len(all_docs),
            'total_embeddings': len(vectors),
            'embedding_dim': len(vectors[0]),
            'inserted_documents': len(result.inserted_ids)
        }

        print("\n" + "="*70)
        print("‚úÖ PROCESSING COMPLETE")
        print("="*70)

        return stats

    def semantic_search(self,
                       query: str,
                       top_k: int = 5,
                       filters: Dict = None,
                       use_atlas_search: bool = True) -> List[Dict]:
        """
        Semantic search with optional metadata filters.

        With Atlas search enabled, ``filters`` is passed as the
        ``$vectorSearch`` pre-filter so that up to ``top_k`` matching chunks
        are returned; filtering after the stage's ``limit`` could return
        fewer, or none. If the Atlas query raises for any reason, the method
        falls back to loading the matching documents and ranking them by dot
        product with the query vector.

        Args:
            query: Search query
            top_k: Number of results
            filters: MongoDB filter dict (e.g., {"metadata.section_type": "feature"})
            use_atlas_search: Use Atlas Vector Search or fallback

        Returns:
            List of ``{'text', 'metadata', 'score'}`` dicts.
        """
        # Embed query
        query_vector = self.embeddings.embed_query(query)

        if use_atlas_search:
            try:
                vector_stage = {
                    "index": "vector_index",
                    "path": "embedding",
                    "queryVector": query_vector,
                    "numCandidates": top_k * 10,
                    "limit": top_k
                }

                # Pre-filter inside $vectorSearch. This requires the filtered
                # paths to be declared as "filter" fields in the Atlas index.
                if filters:
                    vector_stage["filter"] = filters

                pipeline = [
                    {"$vectorSearch": vector_stage},
                    {
                        "$project": {
                            "text": 1,
                            "metadata": 1,
                            "score": {"$meta": "vectorSearchScore"}
                        }
                    }
                ]

                results = list(self.collection.aggregate(pipeline))

                return [{
                    'text': r['text'],
                    'metadata': r['metadata'],
                    'score': r['score']
                } for r in results]

            except Exception as e:
                # Deliberate best-effort: any Atlas failure drops to the
                # manual search below
                print(f"‚ö†Ô∏è  Atlas Vector Search failed: {e}")
                print("   Falling back to manual search...")

        # Fallback: manual search over every document matching the filter
        query_dict = filters if filters else {}
        all_docs = list(self.collection.find(query_dict))

        similarities = []
        for doc in all_docs:
            similarity = np.dot(query_vector, doc['embedding'])
            similarities.append({
                'text': doc['text'],
                'metadata': doc['metadata'],
                'score': float(similarity)
            })

        similarities.sort(key=lambda x: x['score'], reverse=True)
        return similarities[:top_k]

    def get_statistics(self) -> Dict:
        """Return the total chunk count and the chunk count per section type."""
        total_docs = self.collection.count_documents({})

        # Aggregate by section type
        pipeline = [
            {
                "$group": {
                    "_id": "$metadata.section_type",
                    "count": {"$sum": 1}
                }
            }
        ]

        type_distribution = {
            item['_id']: item['count']
            for item in self.collection.aggregate(pipeline)
        }

        return {
            'total_chunks': total_docs,
            'type_distribution': type_distribution
        }

    def close(self):
        """Close the MongoDB connection."""
        self.client.close()
        print("‚úì MongoDB connection closed")

# BƯỚC 4: USAGE EXAMPLE

In [None]:
def main():
    """Example usage: connect to MongoDB, run sample searches, print DB stats.

    Reads the connection string from the ``MONGODB_URI`` environment variable.

    Raises:
        RuntimeError: If ``MONGODB_URI`` is unset or empty.
    """

    # Configuration
    MONGODB_URI = os.getenv("MONGODB_URI")
    DATABASE_NAME = "test"
    COLLECTION_NAME = "llm_documents"
    MANUAL_FILE = "user_manual_for_rag.md"  # Path to your manual file

    # Fail fast: passing None to MongoClient would make it use its default
    # host instead of the intended deployment.
    if not MONGODB_URI:
        raise RuntimeError(
            "MONGODB_URI environment variable is not set; "
            "set it to your MongoDB connection string before running."
        )

    # Initialize pipeline
    pipeline = DocumentEmbeddingPipeline(
        mongodb_uri=MONGODB_URI,
        database_name=DATABASE_NAME,
        collection_name=COLLECTION_NAME
    )

    try:
        # Ingestion is disabled in this example. Uncomment the lines below to
        # (re)build the collection from MANUAL_FILE; this replaces its contents.
        # stats = pipeline.process_document(MANUAL_FILE)

        # Print statistics
        # print("\nüìä PROCESSING STATISTICS:")
        # print(f"Total Sections: {stats['total_sections']}")
        # print(f"Total Chunks: {stats['total_chunks']}")
        # print(f"Embedding Dimension: {stats['embedding_dim']}")

        # Test searches
        print("\n" + "="*70)
        print("üîç TESTING SEMANTIC SEARCH")
        print("="*70)

        test_queries = [
            "T√¥i ƒë√£ ƒëƒÉng nh·∫≠p, l√†m sao ƒë·ªÉ t√¥i t·∫°o th·ª±c ƒë∆°n trong ng√†y",
            "What are the guest features?",
            "How to log out of the application?",
            "How does the grocery list work?"
        ]

        for query in test_queries:
            print(f"\n‚ùì Query: '{query}'")
            results = pipeline.semantic_search(query, top_k=3)

            for i, result in enumerate(results, 1):
                print(f"\n  [{i}] Score: {result['score']:.4f}")
                print(f"      Path: {result['metadata']['full_path']}")
                print(f"      Preview: {result['text']}")

        # Database stats
        print("\n" + "="*70)
        db_stats = pipeline.get_statistics()
        print("üìà DATABASE STATISTICS:")
        print(f"Total Chunks: {db_stats['total_chunks']}")
        print("Type Distribution:", db_stats['type_distribution'])

    finally:
        # Always release the MongoDB connection, even if a search fails
        pipeline.close()


# Run the example when this code is executed as the main module.
if __name__ == "__main__":
    main()