In [None]:
# import os
# from datetime import datetime
# from dotenv import load_dotenv
# #
# # #--------Google Drive Integration--------#
# # # from google.colab import drive, userdata
# # # This gives Colab access to your files in Google Drive.
# # # drive.mount('/content/drive')
# # # 'GITHUB_USERNAME' and 'GITHUB_TOKEN' saved as secrets in Colab.
# # GITHUB_USERNAME = userdata.get('GITHUB_USERNAME')
# # GITHUB_TOKEN = userdata.get('GITHUB_TOKEN')
# # REPOSITORY_NAME = 'PyNucleus-Model' # Your repository name
# # NOTEBOOK_DRIVE_PATH = "/content/drive/MyDrive/PyNucleus Project/Capstone Project.ipynb"
# #
# #
# # #--------Cursor Integration--------#
# # # Load environment variables from .env file
# load_dotenv()
# #
# # # Get GitHub credentials from environment variables
# GITHUB_USERNAME = os.getenv('GITHUB_USERNAME')
# GITHUB_TOKEN = os.getenv('GITHUB_TOKEN')
# #
# # # Print to verify the variables are loaded (remove this in production)
# print(f"Username: {GITHUB_USERNAME}")
# print(f"Token: {GITHUB_TOKEN[:4]}...") # Only print first 4 chars of token for security
# #
# # Repository information
# REPOSITORY_NAME = 'PyNucleus-Model'
# NOTEBOOK_REPO_FILENAME = "Capstone Project.ipynb"
# LOG_FILENAME = "update_log.txt"

# NOTE(review): this step is currently disabled (every line is commented out).
# If it is re-enabled, be aware that the pull URL below embeds GITHUB_TOKEN
# directly in the command line, so the token may show up in cell output or
# git error text — TODO confirm, and prefer a git credential helper instead.
# # Pull latest changes from GitHub
# print("Pulling latest changes from GitHub...")
# !git pull https://{GITHUB_TOKEN}@github.com/{GITHUB_USERNAME}/{REPOSITORY_NAME}.git main

# print("Repository is up to date!")

# # Log start time
# with open("update_log.txt", "a") as f:
#     f.write(f" {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}: Log Update\n")

# **Data Ingestion and Preprocessing for RAG**

In [26]:
import sys
import os
import importlib

# Refresh any already-imported project RAG modules so that the
# `from ... import ...` statements that follow bind to the code currently on disk.
# NOTE(review): core_modules.rag.performance_analyzer is imported below but is
# not part of this list — confirm whether it should be reloaded as well.
modules_to_reload = [
    'core_modules.rag.wiki_scraper',
    'core_modules.rag.document_processor',
    'core_modules.rag.data_chunking',
    'core_modules.rag.vector_store'
]

for module_name in modules_to_reload:
    # Only modules already present in sys.modules can be reloaded; on a fresh
    # kernel none of them are cached yet and this loop does nothing.
    if module_name in sys.modules:
        importlib.reload(sys.modules[module_name])

# Make the current working directory importable. The membership check keeps
# this idempotent: previously every re-run of the cell appended the same path
# again, so sys.path grew by one duplicate entry per execution.
project_root = os.path.abspath('.')
if project_root not in sys.path:
    sys.path.append(project_root)

# Project module imports
from core_modules.rag.document_processor import process_documents
from core_modules.rag.wiki_scraper import scrape_wikipedia_articles
from core_modules.rag.data_chunking import load_and_chunk_files, save_chunked_data
from core_modules.rag.vector_store import FAISSDBManager, _load_docs 
from core_modules.rag.performance_analyzer import PerformanceAnalyzer

# Smoke-test that a single symbol can be imported from the scraper module.
# The success messages live in the `else` branch so that "All imports ready!"
# is only reported when the import really succeeded; previously it was printed
# unconditionally, even right after the import-error message.
print("üîß Testing imports...")
try:
    from core_modules.rag.wiki_scraper import scrape_wikipedia_article
except ImportError as e:
    # Best-effort check: report the failure but do not abort the cell.
    print(f"‚ùå Import error: {e}")
else:
    print("‚úÖ scrape_wikipedia_article imported successfully")
    print("üöÄ All imports ready!\n")


# ---------------------------------------------------------------------------
# Inputs for the evaluation stage, gathered in one place.
# GROUND_TRUTH is handed to FAISSDBManager.evaluate(); presumably each query
# maps to the source file expected among its top hits — verify against
# core_modules.rag.vector_store.
# ---------------------------------------------------------------------------
JSON_PATH = "converted_chunked_data/chunked_data_full.json"
GROUND_TRUTH = dict.fromkeys(
    (
        "what are the benefits of modular design",
        "how does modular design work in vehicles",
    ),
    "web_sources/wikipedia_modular_design.txt",
)

# Stage 1 — convert the source documents (PDF, DOCX, etc.).
print("Step 1: Processing source documents...")
process_documents()

# Stage 2 — scrape the Wikipedia articles.
print("\nStep 2: Scraping Wikipedia articles...")
scrape_wikipedia_articles()

# Stage 3 — load and chunk the documents, then save the chunked data.
print("\nStep 3: Processing and chunking documents...")
chunked_docs = load_and_chunk_files()
save_chunked_data(chunked_docs)

# Stage 4 — build the FAISS vector store from the chunk file and evaluate it.
print("\nStep 4: Building and evaluating FAISS vector store...")
f_mgr = FAISSDBManager()
f_docs = _load_docs(JSON_PATH, f_mgr.log)
f_mgr.build(f_docs)
f_mgr.evaluate(GROUND_TRUTH)
print(f"\nFAISS log ‚Üí {f_mgr.log_path}")

üîß Testing imports...
‚úÖ scrape_wikipedia_article imported successfully
üöÄ All imports ready!

Step 1: Processing source documents...
--- üìÑ Starting processing for 5 file(s) in 'source_documents' ---
 ‚ñ∂ Processing: Manuscript Draft_Can Modular Plants Lower African Industrialization Barriers.docx




   ‚Ä¢ Success! Saved to: converted_to_txt/Manuscript Draft_Can Modular Plants Lower African Industrialization Barriers.txt
 ‚ñ∂ Processing: mcp_basics.txt
   ‚Ä¢ Success! Saved to: converted_to_txt/mcp_basics.txt
 ‚ñ∂ Processing: feasibility_factors.txt
   ‚Ä¢ Success! Saved to: converted_to_txt/feasibility_factors.txt
 ‚ñ∂ Processing: Bist_Madan.pdf




   ‚Ä¢ Success! Saved to: converted_to_txt/Bist_Madan.txt
 ‚ñ∂ Processing: sample_document.txt
   ‚Ä¢ Success! Saved to: converted_to_txt/sample_document.txt

 All files processed.

Step 2: Scraping Wikipedia articles...
üîç Starting Wikipedia article search for 5 keywords...
‚ñ∂Ô∏è  Searching for: modular design
‚úÖ  Saved article to: web_sources/wikipedia_modular_design.txt
‚ñ∂Ô∏è  Searching for: software architecture
‚úÖ  Saved article to: web_sources/wikipedia_software_architecture.txt
‚ñ∂Ô∏è  Searching for: system design
‚úÖ  Saved article to: web_sources/wikipedia_system_design.txt
‚ñ∂Ô∏è  Searching for: industrial design
‚úÖ  Saved article to: web_sources/wikipedia_industrial_design.txt
‚ñ∂Ô∏è  Searching for: supply chain


INFO: Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2


‚úÖ  Saved article to: web_sources/wikipedia_supply_chain.txt

‚ú® Article scraping complete!

Step 3: Processing and chunking documents...
üì∞ Found 5 Wikipedia articles
üìÑ Found 5 converted documents
üìã Total documents loaded: 10
‚úÇÔ∏è Split into 867 chunks

‚úÖ Successfully saved chunked data to converted_chunked_data/:
  ‚Ä¢ chunked_data_full.json - Complete data with metadata
  ‚Ä¢ chunked_data_stats.json - Statistical analysis
  ‚Ä¢ chunked_data_content.txt - Human-readable content


Step 4: Building and evaluating FAISS vector store...
=== FAISS VectorDB Analysis ===
Started: 2025-06-10 01:25:18
Loaded 867 documents from converted_chunked_data/chunked_data_full.json
Embedding device ‚Üí cpu   | dim=384
Docs indexed : 867
Index file   : vector_db/pynucleus_mcp.faiss
Embeds .pkl  : vector_db/embeddings.pkl

-- Files in vector_db/ --
  ¬∑ embeddings.pkl
  ¬∑ pynucleus_mcp.faiss

=== Evaluation (Recall@3) ===
Q: what are the benefits of modular design  ‚úì   top-score=0.4110
Q

In [1]:
# DWSIM simulation via the helper functions exposed by `dwsim_workflow`.
from dwsim_workflow import (
    run_dwsim_simulation,
    quick_dwsim_demo,
)

# Single call that runs the entire DWSIM demo workflow.
# The recorded output below shows it printing setup hints when the DWSIM
# runtime cannot be loaded ("No module named 'System'").
quick_dwsim_demo()

# Alternative: run a custom simulation instead of the demo:
# csv_path = run_dwsim_simulation("my_plant.dwsim", "results/my_streams.csv")


üöÄ Running DWSIM Quick Demo...
üîß Starting DWSIM simulation workflow...
‚ùå Unexpected error: No module named 'System'

üí° To use DWSIM integration:
   1. Install DWSIM on your system
   2. Set DWSIM_DLL_PATH environment variable
   3. Place a .dwsim file in examples/ directory
   4. Run: run_dwsim_simulation('your_file.dwsim')


# This is the last cell of the code

In [None]:
# # Log end time
# with open("update_log.txt", "a") as f:
#     f.write(f"\n {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} changes made and pushed to origin main\n")

# # Simple GitHub update function
# def update_github():
#     !git add .
#     !git commit -m "Update: Adding all files to repository"
#     !git push origin main
#     print("All files pushed to GitHub successfully!")

# # To use it, just run:
# update_github()