In [18]:
import os
from datetime import datetime
from dotenv import load_dotenv
#
# #--------Google Drive Integration--------#
# # from google.colab import drive, userdata
# # This gives Colab access to your files in Google Drive.
# # drive.mount('/content/drive')
# # 'GITHUB_USERNAME' and 'GITHUB_TOKEN' saved as secrets in Colab.
# GITHUB_USERNAME = userdata.get('GITHUB_USERNAME')
# GITHUB_TOKEN = userdata.get('GITHUB_TOKEN')
# REPOSITORY_NAME = 'PyNucleus-Model' # Your repository name
# NOTEBOOK_DRIVE_PATH = "/content/drive/MyDrive/PyNucleus Project/Capstone Project.ipynb"
#
#
# #--------Cursor Integration--------#
# # Load environment variables from .env file
load_dotenv()
#
# # Get GitHub credentials from environment variables
GITHUB_USERNAME = os.getenv('GITHUB_USERNAME')
GITHUB_TOKEN = os.getenv('GITHUB_TOKEN')
#
# # Print to verify the variables are loaded (remove this in production)
print(f"Username: {GITHUB_USERNAME}")
print(f"Token: {GITHUB_TOKEN[:4]}...") # Only print first 4 chars of token for security
#
# Repository information
REPOSITORY_NAME = 'PyNucleus-Model'
NOTEBOOK_REPO_FILENAME = "Capstone Project.ipynb"
LOG_FILENAME = "update_log.txt"

# Pull latest changes from GitHub
print("Pulling latest changes from GitHub...")
!git pull https://{GITHUB_TOKEN}@github.com/{GITHUB_USERNAME}/{REPOSITORY_NAME}.git main

print("Repository is up to date!")

# Log start time
with open("update_log.txt", "a") as f:
    f.write(f" {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}: Log Update\n")

Username: Saytor20
Token: ghp_...
Pulling latest changes from GitHub...
From https://github.com/Saytor20/PyNucleus-Model
 * branch            main       -> FETCH_HEAD
Already up to date.
Repository is up to date!


# **Data Ingestion and Preprocessing for RAG**

In [19]:
#----- Date processing for all documents types -----#
import os
from langchain_unstructured import UnstructuredLoader
from PyPDF2 import PdfReader

# --- Configuration ---
# Folder where you will place all your source files (PDFs, DOCX, TXT, etc.)
INPUT_DIR = 'source_documents'

# Folder where the processed .txt files will be saved
OUTPUT_DIR = 'processed_txt_files'

# --- Main Logic ---
if __name__ == "__main__":
    # Create the input directory if it doesn't exist and give instructions
    if not os.path.exists(INPUT_DIR):
        print(f"📂 Creating directory: '{INPUT_DIR}'")
        os.makedirs(INPUT_DIR)
        print(f" Please place your files (PDF, DOCX, TXT, etc.) in the '{INPUT_DIR}' directory and run the script again.")
        exit()

    # Create the output directory
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    files_to_process = [f for f in os.listdir(INPUT_DIR) if os.path.isfile(os.path.join(INPUT_DIR, f))]

    if not files_to_process:
        print(f"ℹ The '{INPUT_DIR}' directory is empty. Nothing to process.")
        exit()

    print(f"--- 📄 Starting processing for {len(files_to_process)} file(s) in '{INPUT_DIR}' ---")

    for filename in files_to_process:
        # Skip hidden files like .DS_Store
        if filename.startswith('.'):
            continue

        input_path = os.path.join(INPUT_DIR, filename)
        output_filename = os.path.splitext(os.path.basename(filename))[0] + '.txt'
        output_path = os.path.join(OUTPUT_DIR, output_filename)

        print(f" ▶ Processing: {filename}")

        try:
            # Handle PDF files differently
            if filename.lower().endswith('.pdf'):
                # Use PyPDF2 for PDF files
                reader = PdfReader(input_path)
                full_text = ""
                for page in reader.pages:
                    full_text += page.extract_text() + "\n\n"
            else:
                # Use UnstructuredLoader for other file types
                loader = UnstructuredLoader(input_path)
                documents = loader.load()
                full_text = "\n\n".join([doc.page_content for doc in documents])

            # Save the extracted text to a new .txt file
            with open(output_path, "w", encoding="utf-8") as f:
                f.write(full_text)

            print(f"   • Success! Saved to: {output_path}")

        except Exception as e:
            print(f"   • Error processing {filename}: {e}")

    print("\n\n All files processed.")

--- 📄 Starting processing for 4 file(s) in 'source_documents' ---
 ▶ Processing: Manuscript Draft_Can Modular Plants Lower African Industrialization Barriers.docx




   • Success! Saved to: processed_txt_files/Manuscript Draft_Can Modular Plants Lower African Industrialization Barriers.txt
 ▶ Processing: mcp_basics.txt
   • Success! Saved to: processed_txt_files/mcp_basics.txt
 ▶ Processing: feasibility_factors.txt
   • Success! Saved to: processed_txt_files/feasibility_factors.txt
 ▶ Processing: Bist_Madan.pdf
   • Success! Saved to: processed_txt_files/Bist_Madan.txt


 All files processed.


In [20]:
import requests
from bs4 import BeautifulSoup
import os
from urllib.parse import quote

# --- CONFIGURATION ---
# Keywords to search for in Wikipedia
SEARCH_KEYWORDS = [
    "modular design",
    "software architecture",
    "system design",
    "industrial design",
    "supply chain"
]

# Output directory for saved articles
DATA_DIR = "data_sources"

def search_wikipedia(keyword):
    """Search Wikipedia for a keyword and return the first result URL"""
    search_url = f"https://en.wikipedia.org/w/api.php?action=query&list=search&srsearch={quote(keyword)}&format=json"
    response = requests.get(search_url)
    data = response.json()
    
    if data['query']['search']:
        title = data['query']['search'][0]['title']
        return f"https://en.wikipedia.org/wiki/{quote(title)}"
    return None

def scrape_and_save_article(url, keyword):
    """Scrape a Wikipedia article and save it as a text file"""
    print(f"▶️  Searching for: {keyword}")
    
    try:
        # Fetch the article
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()
        
        # Parse with BeautifulSoup
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Get the main content
        content = soup.find('div', {'class': 'mw-parser-output'})
        if not content:
            print(f"❌  Could not find article content for: {keyword}")
            return
        
        # Extract text from paragraphs and headers
        article_text = ""
        for element in content.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
            text = element.get_text().strip()
            if text:
                article_text += text + "\n\n"
        
        # Create output directory if it doesn't exist
        os.makedirs(DATA_DIR, exist_ok=True)
        
        # Save to file
        filename = f"wikipedia_{keyword.replace(' ', '_')}.txt"
        filepath = os.path.join(DATA_DIR, filename)
        
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(article_text)
            
        print(f"✅  Saved article to: {filepath}")
        
    except Exception as e:
        print(f"❌  Error processing {keyword}: {str(e)}")

def main():
    print(f"🔍 Starting Wikipedia article search for {len(SEARCH_KEYWORDS)} keywords...")
    
    for keyword in SEARCH_KEYWORDS:
        article_url = search_wikipedia(keyword)
        if article_url:
            scrape_and_save_article(article_url, keyword)
        else:
            print(f"❌  No article found for: {keyword}")
    
    print("\n✨ Article scraping complete!")

# Run the scraper
if __name__ == "__main__":
    main()

🔍 Starting Wikipedia article search for 5 keywords...
▶️  Searching for: modular design
✅  Saved article to: data_sources/wikipedia_modular_design.txt
▶️  Searching for: software architecture
✅  Saved article to: data_sources/wikipedia_software_architecture.txt
▶️  Searching for: system design
✅  Saved article to: data_sources/wikipedia_system_design.txt
▶️  Searching for: industrial design
✅  Saved article to: data_sources/wikipedia_industrial_design.txt
▶️  Searching for: supply chain
✅  Saved article to: data_sources/wikipedia_supply_chain.txt

✨ Article scraping complete!


In [21]:
import os
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
import json
from datetime import datetime

def load_and_chunk_files():
    """
    Load and chunk files from both data_sources and processed_txt_files directories
    """
    # Initialize text splitter
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=50,
        length_function=len,
        is_separator_regex=False,
        separators=["\n\n", "\n", ". ", " ", ""]
    )
    
    all_documents = []
    
    # Process files from both directories
    directories = ['data_sources', 'processed_txt_files']
    
    for directory in directories:
        if not os.path.exists(directory):
            print(f"⚠️ Directory {directory} not found")
            continue
            
        for filename in os.listdir(directory):
            if filename.endswith('.txt'):
                file_path = os.path.join(directory, filename)
                try:
                    with open(file_path, 'r', encoding='utf-8') as f:
                        text = f.read()
                        # Create Document object with metadata
                        doc = Document(
                            page_content=text,
                            metadata={"source": file_path}
                        )
                        all_documents.append(doc)
                except Exception as e:
                    print(f"Error processing {file_path}: {str(e)}")
    
    print(f"\nLoaded {len(all_documents)} documents for chunking")
    
    # Split documents into chunks
    chunked_documents = text_splitter.split_documents(all_documents)
    print(f"Split into {len(chunked_documents)} chunks")
    
    return chunked_documents

def save_chunked_data(chunked_documents, output_dir="Chuncked_Data"):
    """
    Save chunked documents into three separate files
    """
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)
    
    # 1. Save full content with metadata
    full_content = []
    for i, chunk in enumerate(chunked_documents):
        full_content.append({
            "chunk_id": i,
            "content": chunk.page_content,
            "source": chunk.metadata.get('source', 'N/A'),
            "length": len(chunk.page_content)
        })
    
    with open(os.path.join(output_dir, "chunked_data_full.json"), 'w', encoding='utf-8') as f:
        json.dump(full_content, f, indent=2, ensure_ascii=False)
    
    # 2. Save statistical analysis
    stats = {
        "total_chunks": len(chunked_documents),
        "chunk_lengths": [len(chunk.page_content) for chunk in chunked_documents],
        "sources": list(set(chunk.metadata.get('source', 'N/A') for chunk in chunked_documents)),
        "generated_at": datetime.now().isoformat()
    }
    
    with open(os.path.join(output_dir, "chunked_data_stats.json"), 'w', encoding='utf-8') as f:
        json.dump(stats, f, indent=2)
    
    # 3. Save content-only version (for easy reading)
    with open(os.path.join(output_dir, "chunked_data_content.txt"), 'w', encoding='utf-8') as f:
        for i, chunk in enumerate(chunked_documents):
            f.write(f"=== Chunk {i+1} ===\n")
            f.write(f"Source: {chunk.metadata.get('source', 'N/A')}\n")
            f.write(f"Length: {len(chunk.page_content)} characters\n")
            f.write("\nContent:\n")
            f.write(chunk.page_content)
            f.write("\n\n" + "="*50 + "\n\n")
    
    print(f"\n✅ Successfully saved chunked data to {output_dir}/:")
    print(f"  • chunked_data_full.json - Complete data with metadata")
    print(f"  • chunked_data_stats.json - Statistical analysis")
    print(f"  • chunked_data_content.txt - Human-readable content")

if __name__ == "__main__":
    # Load and chunk the documents
    chunked_docs = load_and_chunk_files()
    
    # Save the chunked data
    save_chunked_data(chunked_docs)


Loaded 9 documents for chunking
Split into 883 chunks

✅ Successfully saved chunked data to Chuncked_Data/:
  • chunked_data_full.json - Complete data with metadata
  • chunked_data_stats.json - Statistical analysis
  • chunked_data_content.txt - Human-readable content


In [22]:
import numpy as np
import os
from collections import Counter
import json
from langchain.schema import Document

def load_chunked_data(json_file='Chuncked_Data/chunked_data_full.json'):
    """
    Load chunked data from JSON file and convert to Document objects.
    
    Args:
        json_file (str): Path to the JSON file containing chunked data
        
    Returns:
        list: List of Document objects
    """
    try:
        with open(json_file, 'r', encoding='utf-8') as f:
            chunked_data = json.load(f)
        
        # Convert JSON data to Document objects
        chunked_documents = [
            Document(
                page_content=chunk['content'],
                metadata={'source': chunk['source']}
            ) for chunk in chunked_data
        ]
        return chunked_documents
    except Exception as e:
        print(f"Error loading chunked data: {str(e)}")
        return None

def analyze_chunked_data(chunked_documents):
    """
    Analyze the chunked documents and provide statistical insights.
    
    Args:
        chunked_documents (list): List of chunked Document objects
    """
    if not chunked_documents:
        print("⚠️ No chunked documents provided for analysis.")
        return
    
    print("\n--- Statistical Analysis & Quality Check ---")
    
    # Calculate the lengths of all chunks
    chunk_lengths = [len(chunk.page_content) for chunk in chunked_documents]
    
    # Calculate and print key statistics
    total_chunks = len(chunk_lengths)
    min_size = np.min(chunk_lengths)
    max_size = np.max(chunk_lengths)
    avg_size = np.mean(chunk_lengths)
    std_dev = np.std(chunk_lengths)
    median_size = np.median(chunk_lengths)
    
    print(f"Total Chunks: {total_chunks}")
    print(f"Minimum Chunk Size: {min_size} characters")
    print(f"Maximum Chunk Size: {max_size} characters")
    print(f"Average Chunk Size: {avg_size:.2f} characters")
    print(f"Median Chunk Size: {median_size:.2f} characters")
    print(f"Standard Deviation of Chunk Size: {std_dev:.2f}")
    
    # --- Source Distribution Analysis ---
    source_counts = Counter([chunk.metadata.get('source', 'N/A') for chunk in chunked_documents])
    print("\n--- Source Distribution ---")
    for source, count in source_counts.most_common():
        print(f"{os.path.basename(source)}: {count} chunks ({count/total_chunks*100:.1f}%)")
    
    # --- Automated Quality Feedback ---
    CHUNK_SIZE = 500  # Target chunk size
    
    # 1. Check for high variation in chunk size
    if std_dev > 150:
        print(f"\n[WARNING] High chunk size variation detected (Std Dev: {std_dev:.2f}).")
        print("  > This suggests documents may have irregular structures (e.g., many short lines or lists).")
        print("  > Resulting chunks may have inconsistent levels of context.")
    
    # 2. Check for and count potentially "orphaned" or very small chunks
    small_chunk_threshold = CHUNK_SIZE * 0.20  # Chunks smaller than 20% of the target size
    small_chunk_count = sum(1 for length in chunk_lengths if length < small_chunk_threshold)
    
    if small_chunk_count > 0:
        print(f"\n[ADVISORY] Found {small_chunk_count} chunks smaller than {small_chunk_threshold} characters.")
        print(f"  > The smallest chunk is {min_size} characters.")
        print("  > These small chunks might lack sufficient context and could clutter search results.")
        print("  > Consider cleaning the source documents or adjusting the chunking separators.")
    
    # 3. Check for very large chunks
    large_chunk_threshold = CHUNK_SIZE * 1.5  # Chunks larger than 150% of the target size
    large_chunk_count = sum(1 for length in chunk_lengths if length > large_chunk_threshold)
    
    if large_chunk_count > 0:
        print(f"\n[ADVISORY] Found {large_chunk_count} chunks larger than {large_chunk_threshold} characters.")
        print(f"  > The largest chunk is {max_size} characters.")
        print("  > These large chunks might contain too much information for effective processing.")
    
    # Add a success message if no issues are flagged
    if std_dev <= 150 and small_chunk_count == 0 and large_chunk_count == 0:
        print("\n[INFO] Chunking statistics appear healthy. Sizes are consistent.")
    
    # --- Sample Chunks Preview ---
    print("\n--- Sample Chunks Preview ---")
    for i, chunk in enumerate(chunked_documents[:3]):  # Print first 3 chunks
        chunk_source = os.path.basename(chunk.metadata.get('source', 'N/A'))
        print(f"\n--- Chunk {i+1} (Source: {chunk_source}, Length: {len(chunk.page_content)} chars) ---")
        print(chunk.page_content[:200] + "..." if len(chunk.page_content) > 200 else chunk.page_content)

def main():
    """
    Main function to run the analysis.
    """
    # Load the chunked data
    chunked_documents = load_chunked_data()
    
    if chunked_documents:
        # Run the analysis
        analyze_chunked_data(chunked_documents)
    else:
        print("⚠️ Failed to load chunked data. Please check the JSON file path.")

if __name__ == "__main__":
    main()


--- Statistical Analysis & Quality Check ---
Total Chunks: 883
Minimum Chunk Size: 4 characters
Maximum Chunk Size: 500 characters
Average Chunk Size: 365.26 characters
Median Chunk Size: 416.00 characters
Standard Deviation of Chunk Size: 125.58

--- Source Distribution ---
Manuscript Draft_Can Modular Plants Lower African Industrialization Barriers.txt: 344 chunks (39.0%)
Bist_Madan.txt: 296 chunks (33.5%)
wikipedia_supply_chain.txt: 74 chunks (8.4%)
wikipedia_industrial_design.txt: 63 chunks (7.1%)
wikipedia_software_architecture.txt: 61 chunks (6.9%)
wikipedia_modular_design.txt: 34 chunks (3.9%)
wikipedia_system_design.txt: 7 chunks (0.8%)
mcp_basics.txt: 2 chunks (0.2%)
feasibility_factors.txt: 2 chunks (0.2%)

[ADVISORY] Found 47 chunks smaller than 100.0 characters.
  > The smallest chunk is 4 characters.
  > These small chunks might lack sufficient context and could clutter search results.
  > Consider cleaning the source documents or adjusting the chunking separators.

--- S

In [25]:
import os
import torch
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.schema import Document
import json
from datetime import datetime

class VectorDBManager:
    def __init__(self, vector_store_dir="vector_store", output_dir="vectordb_outputs"):
        """
        Initialize the VectorDB Manager.
        
        Args:
            vector_store_dir (str): Directory to store vector database files
            output_dir (str): Directory to store output logs and analysis
        """
        self.vector_store_dir = vector_store_dir
        self.output_dir = output_dir
        self.embeddings = None
        self.vector_db = None
        self.faiss_index_path = os.path.join(vector_store_dir, 'pynucleus_mcp_faiss_index')
        
        # Create necessary directories
        os.makedirs(vector_store_dir, exist_ok=True)
        os.makedirs(output_dir, exist_ok=True)
        
        # Initialize output file
        self.output_file = os.path.join(output_dir, f'vectordb_analysis_{datetime.now().strftime("%Y%m%d_%H%M%S")}.txt')
        self._write_to_output("=== VectorDB Analysis Report ===\n")
        self._write_to_output(f"Analysis started at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")

    def _write_to_output(self, text, print_to_console=True):
        """
        Write text to output file and optionally print to console.
        
        Args:
            text (str): Text to write
            print_to_console (bool): Whether to also print to console
        """
        with open(self.output_file, 'a', encoding='utf-8') as f:
            f.write(text + "\n")
        if print_to_console:
            print(text)

    def _explain_score(self, score):
        """
        Explain the meaning of a similarity score.
        
        Args:
            score (float): Similarity score
            
        Returns:
            str: Explanation of the score
        """
        if score < 0.5:
            return "Low similarity - Content may not be very relevant"
        elif score < 0.7:
            return "Moderate similarity - Content is somewhat relevant"
        elif score < 0.85:
            return "Good similarity - Content is relevant"
        else:
            return "High similarity - Content is very relevant"

    def setup_embeddings(self):
        """
        Set up the embedding model using sentence-transformers.
        """
        self._write_to_output("\n=== Setting up Embedding Model ===")
        
        # Determine device (GPU/CPU)
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self._write_to_output(f"Using device: {device} for HuggingFaceEmbeddings")

        # Configure model parameters
        model_kwargs = {'device': device}
        encode_kwargs = {'normalize_embeddings': True}  # Important for cosine similarity

        # Initialize the embedding model
        self.embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2",
            model_kwargs=model_kwargs,
            encode_kwargs=encode_kwargs
        )

        # Test the embedding model
        sample_text = "Modular Chemical Plants offer faster deployment."
        sample_embedding = self.embeddings.embed_query(sample_text)
        self._write_to_output(f"Sample embedding dimension: {len(sample_embedding)}")
        self._write_to_output("Embedding model setup complete.")
        
        return self.embeddings

    def load_chunked_documents(self, json_file='Chuncked_Data/chunked_data_full.json'):
        """
        Load chunked documents from JSON file.
        
        Args:
            json_file (str): Path to the JSON file containing chunked data
            
        Returns:
            list: List of Document objects
        """
        self._write_to_output("\n=== Loading Chunked Documents ===")
        try:
            with open(json_file, 'r', encoding='utf-8') as f:
                chunked_data = json.load(f)
            
            # Convert JSON data to Document objects
            chunked_documents = [
                Document(
                    page_content=chunk['content'],
                    metadata={'source': chunk['source']}
                ) for chunk in chunked_data
            ]
            self._write_to_output(f"Successfully loaded {len(chunked_documents)} documents from {json_file}")
            return chunked_documents
        except Exception as e:
            self._write_to_output(f"Error loading chunked data: {str(e)}")
            return None

    def create_vector_db(self, chunked_documents):
        """
        Create FAISS vector database from chunked documents.
        
        Args:
            chunked_documents (list): List of Document objects
        """
        self._write_to_output("\n=== Creating Vector Database ===")
        
        if not self.embeddings:
            self._write_to_output("Embeddings not initialized. Setting up embeddings first...")
            self.setup_embeddings()
        
        try:
            self.vector_db = FAISS.from_documents(chunked_documents, self.embeddings)
            self._write_to_output("FAISS index created successfully.")
            
            # Save the FAISS index
            self.vector_db.save_local(self.faiss_index_path)
            self._write_to_output(f"FAISS index saved to: {self.faiss_index_path}")
            
            # Add statistics about the vector database
            self._write_to_output("\nVector Database Statistics:")
            self._write_to_output(f"Total documents indexed: {len(chunked_documents)}")
            self._write_to_output(f"Embedding dimension: {len(self.embeddings.embed_query('test'))}")
            
            return self.vector_db
        except Exception as e:
            self._write_to_output(f"Error creating FAISS index: {str(e)}")
            return None

    def load_vector_db(self):
        """
        Load existing FAISS vector database.
        """
        self._write_to_output("\n=== Loading Vector Database ===")
        
        if not self.embeddings:
            self._write_to_output("Embeddings not initialized. Setting up embeddings first...")
            self.setup_embeddings()
        
        try:
            self.vector_db = FAISS.load_local(
                self.faiss_index_path,
                self.embeddings,
                allow_dangerous_deserialization=True
            )
            self._write_to_output("FAISS index loaded successfully.")
            return self.vector_db
        except Exception as e:
            self._write_to_output(f"Error loading FAISS index: {str(e)}")
            return None

    def test_semantic_search(self, query, k=2):
        """
        Test semantic search functionality.
        
        Args:
            query (str): Search query
            k (int): Number of results to return
        """
        self._write_to_output(f"\n=== Testing Semantic Search ===")
        self._write_to_output(f"Query: {query}")
        
        if not self.vector_db:
            self._write_to_output("VectorDB not available. Loading or creating it first...")
            if not self.load_vector_db():
                self._write_to_output("Failed to load VectorDB. Please create it first.")
                return None
        
        try:
            retrieved_docs_with_scores = self.vector_db.similarity_search_with_score(query, k=k)
            
            if retrieved_docs_with_scores:
                self._write_to_output(f"\nRetrieved {len(retrieved_docs_with_scores)} documents:")
                for i, (doc, score) in enumerate(retrieved_docs_with_scores):
                    score_explanation = self._explain_score(score)
                    self._write_to_output(f"\n--- Document {i+1} ---")
                    self._write_to_output(f"Similarity Score: {score:.4f}")
                    self._write_to_output(f"Score Explanation: {score_explanation}")
                    self._write_to_output(f"Source: {doc.metadata.get('source')}")
                    self._write_to_output(f"Content: {doc.page_content[:200]}..." if len(doc.page_content) > 200 else doc.page_content)
            else:
                self._write_to_output("No documents retrieved for the query.")
            
            return retrieved_docs_with_scores
        except Exception as e:
            self._write_to_output(f"Error during similarity search: {str(e)}")
            return None

def main():
    """
    Main function to demonstrate VectorDB setup and testing.
    """
    # Initialize VectorDB Manager
    vdb_manager = VectorDBManager()
    
    # Set up embeddings
    vdb_manager.setup_embeddings()
    
    # Load chunked documents
    chunked_docs = vdb_manager.load_chunked_documents()
    
    if chunked_docs:
        # Create vector database
        vdb_manager.create_vector_db(chunked_docs)
        
        # Test semantic search
        test_queries = [
            "What are the advantages of modular chemical plants?",
            "How does modular design improve scalability?",
            "What are the key factors in system design?",
            "Explain the concept of modular design in manufacturing",
            "What are the challenges in implementing modular systems?"
        ]
        
        vdb_manager._write_to_output("\n=== Running Test Queries ===")
        for query in test_queries:
            vdb_manager.test_semantic_search(query)
        
        vdb_manager._write_to_output("\n=== Analysis Complete ===")
        vdb_manager._write_to_output(f"Full analysis report saved to: {vdb_manager.output_file}")

if __name__ == "__main__":
    main() 

INFO: Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2


=== VectorDB Analysis Report ===

Analysis started at: 2025-06-04 14:07:50


=== Setting up Embedding Model ===
Using device: cpu for HuggingFaceEmbeddings
Sample embedding dimension: 384
Embedding model setup complete.

=== Loading Chunked Documents ===
Successfully loaded 883 documents from Chuncked_Data/chunked_data_full.json

=== Creating Vector Database ===
FAISS index created successfully.
FAISS index saved to: vector_store/pynucleus_mcp_faiss_index

Vector Database Statistics:
Total documents indexed: 883
Embedding dimension: 384

=== Running Test Queries ===

=== Testing Semantic Search ===
Query: What are the advantages of modular chemical plants?

Retrieved 2 documents:

--- Document 1 ---
Similarity Score: 0.5153
Score Explanation: Moderate similarity - Content is somewhat relevant
Source: processed_txt_files/Manuscript Draft_Can Modular Plants Lower African Industrialization Barriers.txt
Content: To provide a clearer comparative context, Table 1 summarizes the key character

# This is the last cell of the code

In [23]:
# Log end time
with open("update_log.txt", "a") as f:
    f.write(f"\n {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} changes made and pushed to origin main\n")

# Simple GitHub update function
def update_github():
    !git add .
    !git commit -m "Update: Adding all files to repository"
    !git push origin main
    print("All files pushed to GitHub successfully!")

# To use it, just run:
update_github()

[main 9d3d6d6] Update: Adding all files to repository
 2 files changed, 4 insertions(+), 1 deletion(-)
Enumerating objects: 9, done.
Counting objects: 100% (9/9), done.
Delta compression using up to 8 threads
Compressing objects: 100% (5/5), done.
Writing objects: 100% (5/5), 497 bytes | 497.00 KiB/s, done.
Total 5 (delta 4), reused 0 (delta 0), pack-reused 0 (from 0)
remote: Resolving deltas: 100% (4/4), completed with 4 local objects.[K
To https://github.com/Saytor20/PyNucleus-Model.git
   9ef67a4..9d3d6d6  main -> main
All files pushed to GitHub successfully!
