In [198]:
# import os
# from datetime import datetime
# from dotenv import load_dotenv
# #
# # #--------Google Drive Integration--------#
# # # from google.colab import drive, userdata
# # # This gives Colab access to your files in Google Drive.
# # # drive.mount('/content/drive')
# # # 'GITHUB_USERNAME' and 'GITHUB_TOKEN' saved as secrets in Colab.
# # GITHUB_USERNAME = userdata.get('GITHUB_USERNAME')
# # GITHUB_TOKEN = userdata.get('GITHUB_TOKEN')
# # REPOSITORY_NAME = 'PyNucleus-Model' # Your repository name
# # NOTEBOOK_DRIVE_PATH = "/content/drive/MyDrive/PyNucleus Project/Capstone Project.ipynb"
# #
# #
# # #--------Cursor Integration--------#
# # # Load environment variables from .env file
# load_dotenv()
# #
# # # Get GitHub credentials from environment variables
# GITHUB_USERNAME = os.getenv('GITHUB_USERNAME')
# GITHUB_TOKEN = os.getenv('GITHUB_TOKEN')
# #
# # # Print to verify the variables are loaded (remove this in production)
# print(f"Username: {GITHUB_USERNAME}")
# print(f"Token: {GITHUB_TOKEN[:4]}...") # Only print first 4 chars of token for security
# #
# # Repository information
# REPOSITORY_NAME = 'PyNucleus-Model'
# NOTEBOOK_REPO_FILENAME = "Capstone Project.ipynb"
# LOG_FILENAME = "update_log.txt"

# # Pull latest changes from GitHub
# print("Pulling latest changes from GitHub...")
# !git pull https://{GITHUB_TOKEN}@github.com/{GITHUB_USERNAME}/{REPOSITORY_NAME}.git main

# print("Repository is up to date!")

# # Log start time
# with open("update_log.txt", "a") as f:
#     f.write(f" {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}: Log Update\n")

In [199]:
# PyNucleus Model - Setup and Imports
import sys
from pathlib import Path

# Put the local "src" directory (resolved against the current working
# directory) at the front of the import path, but only if it is not there yet.
src_path = str(Path().resolve().joinpath("src"))
if src_path not in sys.path:
    sys.path.insert(0, src_path)

# PyNucleus pipeline and integration classes used by the cells below.
from pynucleus.pipeline import (
    RAGPipeline,
    DWSIMPipeline,
    ResultsExporter,
    PipelineUtils,
)
from pynucleus.integration.config_manager import ConfigManager
from pynucleus.integration.dwsim_rag_integrator import DWSIMRAGIntegrator
from pynucleus.integration.llm_output_generator import LLMOutputGenerator

# Banner listing the imported components, one line per entry.
print(
    " PyNucleus Model - Pipeline Ready!",
    " Available Components:",
    "   • RAGPipeline - Document processing and retrieval",
    "   • DWSIMPipeline - Chemical process simulation",
    "   • ResultsExporter - CSV export functionality",
    "   • PipelineUtils - Complete pipeline orchestration",
    sep="\n",
)

 PyNucleus Model - Pipeline Ready!
 Available Components:
   • RAGPipeline - Document processing and retrieval
   • DWSIMPipeline - Chemical process simulation
   • ResultsExporter - CSV export functionality
   • PipelineUtils - Complete pipeline orchestration


# **PyNucleus Model - Complete Pipeline**

This notebook contains the complete PyNucleus model pipeline with separate sections for:
1. **Data Ingestion and Preprocessing for RAG** 
2. **DWSIM Integration and Simulation**
3. **Results Export to CSV**

In [200]:
# Initialize Pipeline Components
# `pipeline` is the shared entry point that every later cell calls into.
pipeline = PipelineUtils(results_dir="data/05_output/results")

# Quick reference of the calls available on `pipeline`, one line per entry.
print(
    "\n🔧 Pipeline Initialized!",
    "Available Functions:",
    "   • pipeline.run_complete_pipeline() - Run everything",
    "   • pipeline.run_rag_only() - RAG pipeline only",
    "   • pipeline.run_dwsim_only() - DWSIM simulations only",
    "   • pipeline.quick_test() - Verify status",
    "   • pipeline.view_results_summary() - View results",
    "   • pipeline.print_pipeline_status() - Detailed status",
    "   • pipeline.clean_all_results() - Clean all data",
    sep="\n",
)


🔧 Setting up RAG imports...
✅ RAG imports ready!
🔧 Setting up DWSIM imports...
✅ DWSIM modules imported successfully
📁 Results directory: data/05_output/results
🔧 Pipeline Utils initialized with results dir: data/05_output/results

🔧 Pipeline Initialized!
Available Functions:
   • pipeline.run_complete_pipeline() - Run everything
   • pipeline.run_rag_only() - RAG pipeline only
   • pipeline.run_dwsim_only() - DWSIM simulations only
   • pipeline.quick_test() - Verify status
   • pipeline.view_results_summary() - View results
   • pipeline.print_pipeline_status() - Detailed status
   • pipeline.clean_all_results() - Clean all data


In [201]:
# ========================================
# SECTION 1: COMPLETE PIPELINE - Run Everything (Basic Mode)
#
# This section is the basic pipeline, which covers:
#   - Runs standard RAG + DWSIM + CSV Export
#   - Output: Basic CSV files with simulation results
#   - For: Regular users who need standard functionality
# ========================================

# Execute every stage (RAG + DWSIM + Export) in a single call.
results = pipeline.run_complete_pipeline()

# A falsy return value is reported as a failed run; otherwise print a summary
# built from the 'duration', 'rag_data', 'dwsim_data' and 'exported_files' entries.
if not results:
    print("❌ Pipeline execution failed")
else:
    print(f"\n🎉 Pipeline completed in {results['duration']:.1f} seconds!")
    print(f"📊 RAG Results: {len(results['rag_data'])} queries processed")
    print(f"🔬 DWSIM Results: {len(results['dwsim_data'])} simulations completed")
    print(f"📁 Exported Files: {len(results['exported_files'])} CSV files created")

🚀 Running complete PyNucleus pipeline...
🗑️ RAG results cleared.
🗑️ DWSIM results cleared.
📚 Starting RAG Pipeline...
Step 1: Processing source documents...
--- 📄 Starting processing for 5 file(s) in '/Users/mohammadalmusaiteer/PyNucleus-Model/data/01_raw/source_documents' ---




 ▶ Processing: Manuscript Draft_Can Modular Plants Lower African Industrialization Barriers.docx
   • Success! Saved to: /Users/mohammadalmusaiteer/PyNucleus-Model/data/02_processed/converted_to_txt/Manuscript Draft_Can Modular Plants Lower African Industrialization Barriers.txt
 ▶ Processing: mcp_basics.txt
   • Success! Saved to: /Users/mohammadalmusaiteer/PyNucleus-Model/data/02_processed/converted_to_txt/mcp_basics.txt
 ▶ Processing: feasibility_factors.txt
   • Success! Saved to: /Users/mohammadalmusaiteer/PyNucleus-Model/data/02_processed/converted_to_txt/feasibility_factors.txt
 ▶ Processing: Bist_Madan.pdf


Processing files: 100%|██████████| 5/5 [00:00<00:00, 11.77it/s]

   • Success! Saved to: /Users/mohammadalmusaiteer/PyNucleus-Model/data/02_processed/converted_to_txt/Bist_Madan.txt
 ▶ Processing: sample_document.txt
   • Success! Saved to: /Users/mohammadalmusaiteer/PyNucleus-Model/data/02_processed/converted_to_txt/sample_document.txt

 All files processed.

Step 2: Scraping Wikipedia articles...
🔍 Starting Wikipedia article search for 5 keywords...
▶️  Searching for: modular design





✅  Saved article to: /Users/mohammadalmusaiteer/PyNucleus-Model/data/01_raw/web_sources/wikipedia_modular_design.txt
▶️  Searching for: software architecture
✅  Saved article to: /Users/mohammadalmusaiteer/PyNucleus-Model/data/01_raw/web_sources/wikipedia_software_architecture.txt
▶️  Searching for: system design
✅  Saved article to: /Users/mohammadalmusaiteer/PyNucleus-Model/data/01_raw/web_sources/wikipedia_system_design.txt
▶️  Searching for: industrial design
✅  Saved article to: /Users/mohammadalmusaiteer/PyNucleus-Model/data/01_raw/web_sources/wikipedia_industrial_design.txt
▶️  Searching for: supply chain


INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2


✅  Saved article to: /Users/mohammadalmusaiteer/PyNucleus-Model/data/01_raw/web_sources/wikipedia_supply_chain.txt

✨ Article scraping complete!

Step 3: Processing and chunking documents...
📰 Found 5 Wikipedia articles
📄 Found 5 converted documents
📋 Total documents loaded: 10
✂️ Split into 846 chunks

✅ Successfully saved chunked data to /Users/mohammadalmusaiteer/PyNucleus-Model/data/03_intermediate/converted_chunked_data/:
  • chunked_data_full.json - Complete data with metadata
  • chunked_data_stats.json - Statistical analysis
  • chunked_data_content.txt - Human-readable content


Step 4: Building FAISS vector store...
=== FAISS VectorDB Analysis ===
Started: 2025-06-10 23:11:51
Loaded 846 documents from /Users/mohammadalmusaiteer/PyNucleus-Model/data/03_intermediate/converted_chunked_data/chunked_data_full.json
Embedding device → cpu   | dim=384
Docs indexed : 846
Index file   : /Users/mohammadalmusaiteer/PyNucleus-Model/data/04_models/chunk_reports/pynucleus_mcp.faiss
Embeds .

In [202]:
# # ========================================
# # OPTIONAL: INDIVIDUAL PIPELINE COMPONENTS
# # ========================================

# # Option 1: Run only RAG Pipeline
# print("📚 RAG Only Pipeline:")
# rag_results = pipeline.run_rag_only()
# if rag_results:
#     print(f"   ✅ {len(rag_results['rag_data'])} RAG queries processed")

# print("\n" + "="*50 + "\n")

# # Option 2: Run only DWSIM Simulations  
# print("🔬 DWSIM Only Pipeline:")
# dwsim_results = pipeline.run_dwsim_only()
# if dwsim_results:
#     print(f"   ✅ {len(dwsim_results['dwsim_data'])} DWSIM simulations completed")

In [203]:
# # ========================================
# # OPTIONAL: View Results & Status (After Running Pipelines)
# # ========================================

# # View pipeline status
# pipeline.print_pipeline_status()

# print("\n" + "="*50 + "\n")

# # View results summary
# pipeline.view_results_summary()

# print("\n" + "="*50 + "\n")

# # Quick test
# test_results = pipeline.quick_test()
# print(f"✅ Quick test completed! Found {test_results['csv_files_count']} CSV files")

In [204]:
# ========================================
# OPTIONAL: CLEANUP AND RESET
# ========================================
# Uncomment the lines below if you need to clean up results:

# Clean up and reset (optional - removes all previous results)
# pipeline.clean_all_results()

# print("✅ PyNucleus Pipeline is ready!")
# print("📋 Usage Guide:")
# print("   • Section 1: Run complete pipeline (RAG + DWSIM + Export)")
# print("   • Section 2: Individual components (commented out)")
# print("   • Section 3: Utility functions (status, summary, test)")
# print("   • Enhanced Pipeline: Advanced configuration, integration, LLM output")
# print("   • Enhanced Features: Configuration, Integration, LLM Output")
# print("\n🔄 Run any cell multiple times to re-execute components")
# print("📁 All results automatically saved as CSV files in data/05_output/results/")

In [205]:
# ========================================
# OPTION A: Run Individual Components (Alternative to Section 1)
# ========================================
# Uncomment ONLY the lines you want to run:

# Option A1: Complete pipeline (same as Section 1)
# results = pipeline.run_complete_pipeline()

# Option A2: Individual components only
# rag_results = pipeline.run_rag_only()        # RAG documents only
# dwsim_results = pipeline.run_dwsim_only()    # DWSIM simulations only

In [206]:
# ========================================
# ENHANCED PIPELINE - Initialize Advanced Features
#
# PREREQUISITE: Run Section 1 (Complete Pipeline) first!
# This section adds advanced capabilities ON TOP OF the basic pipeline:
#   • Financial analysis with ROI calculations
#   • LLM-ready reports and summaries
#   • DWSIM-RAG integration with enhanced analytics
#   • Custom configuration templates
#
# Outcome: sets `enhanced_available` to True when `config_manager`,
# `dwsim_rag_integrator` and `llm_generator` were all created, False otherwise.
# ========================================
import importlib
import sys
import traceback


def _fresh_module(module_name):
    """Return `module_name` freshly executed.

    A module already cached in ``sys.modules`` is reloaded so that edits made
    on disk since the kernel started are picked up; a module that has not been
    imported yet is simply imported.
    """
    cached = sys.modules.get(module_name)
    if cached is None:
        return importlib.import_module(module_name)
    return importlib.reload(cached)


print("🔧 Initializing Enhanced Pipeline Components...")

try:
    # importlib.reload() re-executes a module but does NOT update names that
    # were bound earlier with `from module import Name`. Rebind the classes
    # from the reloaded modules; otherwise the objects created below would
    # still be instances of the stale, pre-reload classes and the reload
    # would have no visible effect.
    ConfigManager = _fresh_module(
        'pynucleus.integration.config_manager'
    ).ConfigManager
    DWSIMRAGIntegrator = _fresh_module(
        'pynucleus.integration.dwsim_rag_integrator'
    ).DWSIMRAGIntegrator
    LLMOutputGenerator = _fresh_module(
        'pynucleus.integration.llm_output_generator'
    ).LLMOutputGenerator

    config_manager = ConfigManager(config_dir="configs")
    dwsim_rag_integrator = DWSIMRAGIntegrator(
        # Pass None when `pipeline` does not expose a `rag_pipeline` attribute.
        rag_pipeline=getattr(pipeline, 'rag_pipeline', None),
        results_dir="data/05_output/results",
    )
    llm_generator = LLMOutputGenerator(results_dir="data/05_output/llm_reports")

    print("✅ Enhanced Pipeline Ready: Configuration, Integration, LLM Output")
    print(f"✅ LLM reports will be saved to: {llm_generator.results_dir}/")
    enhanced_available = True

except Exception as e:
    # Deliberate best-effort: the enhanced features are optional, so report
    # the failure with a full traceback and let the later cells skip
    # themselves via `enhanced_available` instead of aborting the notebook.
    print(f"⚠️ Enhanced features not available: {e}")
    traceback.print_exc()
    enhanced_available = False

🔧 Initializing Enhanced Pipeline Components...
✅ Enhanced Pipeline Ready: Configuration, Integration, LLM Output
✅ LLM reports will be saved to: data/05_output/llm_reports/


In [207]:
# ENHANCED PIPELINE STEP 2: Configuration Templates
# Proceed only when `enhanced_available` exists and is truthy; otherwise point
# the user at the initialization cell.
if not locals().get('enhanced_available', False):
    print("❌ Enhanced configuration not available")
    print("⚠️ Run Cell 10 (Enhanced Pipeline Initialization) first!")
else:
    # Create a JSON and a CSV template through the config manager.
    json_template = config_manager.create_template_json(
        "simulation_config_template.json", verbose=True
    )
    csv_template = config_manager.create_template_csv(
        "simulation_config_template.csv", verbose=True
    )

    print("✅ Configuration templates created:")
    print(f"   JSON: {json_template}")
    print(f"   CSV: {csv_template}")

✅ Pydantic template created: configs/simulation_config_template.json
✅ Template created: configs/simulation_config_template.csv
✅ Configuration templates created:
   JSON: configs/simulation_config_template.json
   CSV: configs/simulation_config_template.csv


In [208]:
# ENHANCED PIPELINE STEP 3: DWSIM-RAG Integration with Enhanced Analytics
# Proceed only when `enhanced_available` exists and is truthy.
if not locals().get('enhanced_available', False):
    print("❌ Enhanced integration not available")
    print("⚠️ Run Cell 10 (Enhanced Pipeline Initialization) first!")
else:
    dwsim_results = pipeline.dwsim_pipeline.get_results()

    if not dwsim_results:
        print("⚠️ No DWSIM results available")
    else:
        # Combine the simulation results with RAG analysis, then export them.
        integrated_results = dwsim_rag_integrator.integrate_simulation_results(
            dwsim_results,
            perform_rag_analysis=True,
        )
        integrated_export_file = dwsim_rag_integrator.export_integrated_results()

        # Report only headline metrics, taken from the first integrated result.
        if integrated_results:
            sample = integrated_results[0]
            print("✅ Enhanced Analysis Complete:")
            print(f"   Simulations: {len(integrated_results)}")
            print(f"   Performance: {sample['performance_metrics']['overall_performance']}")
            print(f"   Efficiency: {sample['performance_metrics']['efficiency_rating']}")
            print(f"   Results: {integrated_export_file}")

✅ Enhanced 5 simulations with RAG insights
✅ Enhanced Analysis Complete:
   Simulations: 5
   Performance: Good
   Efficiency: High
   Results: data/05_output/results/integrated_dwsim_rag_results_20250610_231158.json


In [209]:
# ENHANCED PIPELINE STEP 4: LLM-Ready Output with Enhanced Feed Conditions
# Two prerequisites are checked in order: the enhanced components must be
# initialized, and `integrated_results` must exist from the integration step.
if not locals().get('enhanced_available', False):
    print("❌ Enhanced LLM output not available")
    print("⚠️ Run Cell 10 (Enhanced Pipeline Initialization) first!")
elif 'integrated_results' not in locals():
    print("❌ Run Cell 12 (DWSIM-RAG Integration) first to generate integrated_results")
else:
    print("🔄 Running enhanced LLM output generation with detailed feed conditions...")

    # One LLM summary file per integrated simulation result.
    llm_ready_files = [
        llm_generator.export_llm_ready_text(result)
        for result in integrated_results
    ]

    # Export the financial analysis and fetch the metrics shown below.
    financial_file = llm_generator.export_financial_analysis(integrated_results)
    metrics = llm_generator._calculate_key_metrics(integrated_results)

    print("✅ Analysis Reports Generated:")
    print(f"   LLM Summaries: {len(llm_ready_files)} files created")
    for llm_file in llm_ready_files:
        print(f"     • {llm_file}")
    print(f"   Financial Analysis: {financial_file}")

    print("\n💰 Key Financial Metrics:")
    print(f"   Recovery Rate: {metrics['avg_recovery']:.1f}%")
    print(f"   Daily Revenue: ${metrics['estimated_revenue']:,.2f}")
    print(f"   Daily Profit: ${metrics['net_profit']:,.2f}")
    print(f"   ROI: {metrics['roi']:.1f}%")

🔄 Running enhanced LLM output generation with detailed feed conditions...
✅ Analysis Reports Generated:
   LLM Summaries: 5 files created
     • data/05_output/llm_reports/distillation_ethanol_water_summary.md
     • data/05_output/llm_reports/reactor_methane_combustion_summary.md
     • data/05_output/llm_reports/heat_exchanger_steam_summary.md
     • data/05_output/llm_reports/absorber_co2_capture_summary.md
     • data/05_output/llm_reports/crystallizer_salt_summary.md
   Financial Analysis: data/05_output/llm_reports/financial_analysis_20250610_231158.csv

💰 Key Financial Metrics:
   Recovery Rate: 82.5%
   Daily Revenue: $148,500.00
   Daily Profit: $58,500.00
   ROI: 6.5%


In [210]:
#  PIPELINE STEP 5: Custom Simulations (Advanced Configuration)
# Proceed only when `enhanced_available` exists and is truthy.
if not locals().get('enhanced_available', False):
    print("❌ Enhanced configuration not available")
    print("⚠️ Run Cell 10 (Enhanced Pipeline Initialization) first!")
else:
    import json
    from pathlib import Path

    # A single demo simulation entry for the custom configuration file.
    custom_simulations = [
        {
            "name": "optimized_ethanol_plant",
            "type": "distillation",
            "components": ["water", "ethanol"],
            "description": "Optimized ethanol plant with enhanced parameters",
            "parameters": {
                "temperature": 82.0,
                "pressure": 101325,
                "flow_rate": 1500,
                "reflux_ratio": 3.0,
            },
            "expected_outputs": {
                "conversion": 0.97,
                "selectivity": 0.99,
                "yield": 0.96,
            },
        }
    ]

    # Write the configuration as JSON under configs/, creating the directory
    # if it does not exist yet.
    config_file = Path("configs") / "custom_simulations_demo.json"
    config_file.parent.mkdir(exist_ok=True)
    with config_file.open('w') as config_fh:
        json.dump({"simulations": custom_simulations}, config_fh, indent=2)

    print("✅ Custom Configuration Demo:")
    print(f"   Configuration: {config_file}")
    print(f"   Simulations: {len(custom_simulations)}")
    print("   Status: Ready for processing")

✅ Custom Configuration Demo:
   Configuration: configs/custom_simulations_demo.json
   Simulations: 1
   Status: Ready for processing


In [211]:
# # ========================================
# # VERSION CONTROL (Optional - For Maintainers Only)
# # ========================================
# # Uncomment the lines below if you need to update the repository:

# from datetime import datetime

# # Log end time
# with open("update_log.txt", "a") as f:
#     f.write(f"\n {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} changes made and pushed to origin main\n")

# # Simple GitHub update function
# def update_github():
#     print(" Starting GitHub update...")
#     !git add .
#     print("📦 Files added to staging")
#     !git commit -m "Update: $(date +'%Y-%m-%d %H:%M:%S')"
#     print("💾 Changes committed")
#     !git push origin main
#     print("✅ Changes pushed to GitHub successfully!")

# # To use it, just run:
# update_github()