In [None]:
# import os
# from datetime import datetime
# from dotenv import load_dotenv
# #
# # #--------Google Drive Integration--------#
# # # from google.colab import drive, userdata
# # # This gives Colab access to your files in Google Drive.
# # # drive.mount('/content/drive')
# # # 'GITHUB_USERNAME' and 'GITHUB_TOKEN' saved as secrets in Colab.
# # GITHUB_USERNAME = userdata.get('GITHUB_USERNAME')
# # GITHUB_TOKEN = userdata.get('GITHUB_TOKEN')
# # REPOSITORY_NAME = 'PyNucleus-Model' # Your repository name
# # NOTEBOOK_DRIVE_PATH = "/content/drive/MyDrive/PyNucleus Project/Capstone Project.ipynb"
# #
# #
# # #--------Cursor Integration--------#
# # # Load environment variables from .env file
# load_dotenv()
# #
# # # Get GitHub credentials from environment variables
# GITHUB_USERNAME = os.getenv('GITHUB_USERNAME')
# GITHUB_TOKEN = os.getenv('GITHUB_TOKEN')
# #
# # # Print to verify the variables are loaded (remove this in production)
# print(f"Username: {GITHUB_USERNAME}")
# print(f"Token: {GITHUB_TOKEN[:4]}...") # Only print first 4 chars of token for security
# #
# # Repository information
# REPOSITORY_NAME = 'PyNucleus-Model'
# NOTEBOOK_REPO_FILENAME = "Capstone Project.ipynb"
# LOG_FILENAME = "update_log.txt"

# # Pull latest changes from GitHub
# print("Pulling latest changes from GitHub...")
# !git pull https://{GITHUB_TOKEN}@github.com/{GITHUB_USERNAME}/{REPOSITORY_NAME}.git main

# print("Repository is up to date!")

# # Log start time
# with open("update_log.txt", "a") as f:
#     f.write(f" {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}: Log Update\n")

# **Data Ingestion and Preprocessing for RAG**

In [None]:
import sys
import os
import importlib

# Make the notebook's working directory importable BEFORE the first
# `core_modules` import. Previously the path was appended only after
# `core_modules.rag.config` had already been imported, so the append could
# never help that import on a fresh kernel.
PROJECT_ROOT = os.path.abspath('.')
if PROJECT_ROOT not in sys.path:  # re-running the cell must not stack duplicates
    sys.path.append(PROJECT_ROOT)

from core_modules.rag import config

# Clear any cached imports to ensure we get the latest versions.
# Only modules that are already loaded are reloaded, so on a fresh kernel this
# loop is a no-op and the imports below load everything for the first time.
modules_to_reload = [
    'core_modules.rag.wiki_scraper',
    'core_modules.rag.document_processor',
    'core_modules.rag.data_chunking',
    'core_modules.rag.vector_store'
]

for module_name in modules_to_reload:
    if module_name in sys.modules:
        importlib.reload(sys.modules[module_name])

# Project module imports (done after the reload so the names bound here refer
# to the freshly reloaded module contents).
from core_modules.rag.document_processor import process_documents
from core_modules.rag.wiki_scraper import scrape_wikipedia_articles
from core_modules.rag.data_chunking import load_and_chunk_files, save_chunked_data
from core_modules.rag.vector_store import FAISSDBManager, _load_docs
from core_modules.rag.performance_analyzer import PerformanceAnalyzer

# Test the import to make sure it works
print("🔧 Testing imports...")
try:
    from core_modules.rag.wiki_scraper import scrape_wikipedia_article
except ImportError as e:
    print(f"❌ Import error: {e}")
else:
    print("✅ scrape_wikipedia_article imported successfully")
    # Only claim readiness when the check above actually passed.
    print("🚀 All imports ready!\n")

# End-to-end RAG ingestion pipeline. The four steps below run strictly in
# order; each one is a call into `core_modules.rag` imported above.

# Step 1: Process source documents (PDF, DOCX, etc.)
# Called with no arguments, so input/output locations are decided inside
# `process_documents` (presumably from `config` — TODO confirm).
print("Step 1: Processing source documents...")
process_documents()

# Step 2: Scrape Wikipedia articles
# Also argument-free; which articles are fetched is not visible from this cell.
print("\nStep 2: Scraping Wikipedia articles...")
scrape_wikipedia_articles()

# Step 3: Process and chunk all documents
# `chunked_docs` is whatever `load_and_chunk_files` returns; it is handed
# straight to `save_chunked_data` without being inspected here.
print("\nStep 3: Processing and chunking documents...")
chunked_docs = load_and_chunk_files()
save_chunked_data(chunked_docs)

# Step 4: Build and evaluate the FAISS vector store
print("\nStep 4: Building and evaluating FAISS vector store...")   

# Evaluation data and the chunk JSON location both come from the shared config.
GROUND_TRUTH = config.GROUND_TRUTH_DATA
# `_load_docs` is given a plain string, hence the str() conversion.
# NOTE(review): presumably this is the file written by `save_chunked_data`
# in step 3 — confirm against core_modules.rag.data_chunking.
JSON_PATH = str(config.FULL_JSON_PATH)

f_mgr = FAISSDBManager()
# The manager's `log` attribute is passed into the loader as its second argument.
f_docs = _load_docs(JSON_PATH, f_mgr.log)
f_mgr.build(f_docs)
f_mgr.evaluate(GROUND_TRUTH)
print(f"\nFAISS log → {f_mgr.log_path}")


In [None]:
# Test diverse queries
# Manual smoke test: run a handful of free-text queries against the FAISS
# index and print the best-scoring chunks for each one.
print("🔍 Testing diverse queries...\n")

# Tunables for this cell. The heading printed for each query is derived from
# TOP_K, so it can no longer drift out of sync with the `k` passed to search().
TOP_K = 3            # number of hits requested and shown per query
PREVIEW_CHARS = 200  # leading characters of each hit's text to display

test_queries = [
    "What are the key challenges in implementing modular chemical plants?",
    "How does supply chain management affect modular design?",
    "What are the economic benefits of modular construction?",
    "How does software architecture relate to modular design?",
    "What are the environmental impacts of modular manufacturing?"
]

# Create a new FAISS manager instance
# NOTE(review): this rebuilds the index from the chunk JSON even if the
# pipeline cell above already built one in this session — confirm the rebuild
# is intentional, since it rebinds `f_mgr` / `f_docs`.
f_mgr = FAISSDBManager()
f_docs = _load_docs(str(config.FULL_JSON_PATH), f_mgr.log)

# Build the index
f_mgr.build(f_docs)

# Test each query
print("=== Query Results ===\n")
for query in test_queries:
    print(f"\n📝 Query: {query}")
    results = f_mgr.search(query, k=TOP_K)

    print(f"\nTop {TOP_K} Results:")
    # Each result is unpacked as a (document, score) pair.
    for i, (doc, score) in enumerate(results, 1):
        print(f"\n{i}. Score: {score:.4f}")
        print(f"   Source: {doc.metadata.get('source', 'Unknown')}")
        print(f"   Content: {doc.page_content[:PREVIEW_CHARS]}...")

# Analyze chunking statistics
# Summarises the chunks held in `f_docs`: how many there are, their mean text
# length, and how they are distributed across source documents.
print("\n=== Chunking Statistics ===")
total_chunks = len(f_docs)
print(f"Total Chunks: {total_chunks}")

# Guard the mean: an empty document list used to raise ZeroDivisionError here.
if total_chunks:
    total_chars = sum(len(doc.page_content) for doc in f_docs)
    print(f"Average Chunk Size: {total_chars / total_chunks:.1f} characters")
else:
    print("Average Chunk Size: n/a (no chunks loaded)")

# Distribution of chunks per source
# A missing, None or empty 'source' is bucketed under 'Unknown', and the same
# tally drives both the source count and the per-source listing so the two
# figures always agree.
source_counts = {}
for doc in f_docs:
    source = doc.metadata.get('source') or 'Unknown'
    source_counts[source] = source_counts.get(source, 0) + 1

print(f"Number of Sources: {len(source_counts)}")

print("\nChunks per Source:")
for source, count in sorted(source_counts.items(), key=lambda x: x[1], reverse=True):
    # Show only the last '/'-separated component; str() tolerates non-string
    # source values (e.g. path objects).
    print(f"  • {str(source).split('/')[-1]}: {count} chunks")

In [38]:
# DWSIM Simulation - Simple Function Calls
# Both helpers come from the `dwsim_workflow` module (not shown in this
# notebook). `run_dwsim_simulation` is imported only for the optional custom
# run that is left commented out at the bottom of this cell.
from dwsim_workflow import run_dwsim_simulation, quick_dwsim_demo

# One-line demo - runs the entire DWSIM workflow
# NOTE: in the saved output of this cell the demo reported
# "No module named 'System'" and printed setup hints (install DWSIM, set the
# DWSIM_DLL_PATH environment variable, place a .dwsim file in examples/).
quick_dwsim_demo()

# Or run a custom simulation:
# csv_path = run_dwsim_simulation("my_plant.dwsim", "results/my_streams.csv")


🚀 Running DWSIM Quick Demo...
🔧 Starting DWSIM simulation workflow...
❌ Unexpected error: No module named 'System'

💡 To use DWSIM integration:
   1. Install DWSIM on your system
   2. Set DWSIM_DLL_PATH environment variable
   3. Place a .dwsim file in examples/ directory
   4. Run: run_dwsim_simulation('your_file.dwsim')


# This is the last cell of the code

In [36]:
import subprocess
from datetime import datetime

# Log end time
# The entry is appended before `update_github()` runs `git add .`, so the
# updated log is staged together with the rest of the working tree
# (assuming update_log.txt is not git-ignored).
with open("update_log.txt", "a") as f:
    f.write(f"\n {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} changes made and pushed to origin main\n")


# Simple GitHub update function
def update_github(commit_message="Update: Adding all files to repository"):
    """Stage everything, commit, and push the current branch to ``origin main``.

    Each git step is checked, so the success message is printed only when the
    push really succeeded (the previous version printed it unconditionally).

    Args:
        commit_message: Message for the commit. The default is the message
            this notebook has always used.

    Returns:
        bool: True if the push succeeded, False if any git step failed.
    """

    def run_git(*args):
        """Run one git sub-command, echo its output, and return its exit code."""
        try:
            completed = subprocess.run(
                ["git", *args], capture_output=True, text=True, check=False
            )
        except FileNotFoundError:
            print("❌ git executable not found on PATH.")
            return 127
        output = (completed.stdout + completed.stderr).strip()
        if output:
            print(output)
        return completed.returncode

    # NOTE(review): `git add .` stages every untracked file under the current
    # directory — confirm secrets such as a local .env are covered by .gitignore.
    if run_git("add", ".") != 0:
        print("❌ git add failed; nothing was committed or pushed.")
        return False

    # `git diff --cached --quiet` exits 0 when nothing is staged. In that case
    # `git commit` would fail, so skip it and still push any earlier commits.
    if run_git("diff", "--cached", "--quiet") == 0:
        print("Nothing new to commit; pushing existing local commits (if any)...")
    elif run_git("commit", "-m", commit_message) != 0:
        print("❌ git commit failed; nothing was pushed.")
        return False

    if run_git("push", "origin", "main") != 0:
        print("❌ git push failed; local commits were NOT pushed to GitHub.")
        return False

    print("All files pushed to GitHub successfully!")
    return True


# To use it, just run:
# (result is bound to a name so the boolean is not echoed as the cell output)
push_ok = update_github()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[main 135c545] Update: Adding all files to repository
 4 files changed, 27 insertions(+), 76 deletions(-)
 rename DIRECTORY_RENAME_SUMMARY.md => project_info/DIRECTORY_RENAME_SUMMARY.md (100%)
 rename PROJECT_STRUCTURE.md => project_info/PROJECT_STRUCTURE.md (100%)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Enumerating objects: 8, done.
Counting objects: 100% (8/8), done.
Delta compression using up to 8 threads
Compressing objects: 100% (5/5), done.
Writing objects: 100% (5/5), 779 bytes | 779.00 KiB/s, done.
Total 5 (delta 3), reused 0 (delta 0), pack-reused 0 (from 0)
remote: Resolving deltas: 100% (3/3), completed with 3 local objects.[K
To https://github.com/Saytor20/PyNucleus-Model.git
   7699006..135c545  main -> main
All files pushed to GitHub successfully!
