In [18]:
import os
from datetime import datetime
from dotenv import load_dotenv
#
# #--------Google Drive Integration--------#
# # from google.colab import drive, userdata
# # This gives Colab access to your files in Google Drive.
# # drive.mount('/content/drive')
# # 'GITHUB_USERNAME' and 'GITHUB_TOKEN' saved as secrets in Colab.
# GITHUB_USERNAME = userdata.get('GITHUB_USERNAME')
# GITHUB_TOKEN = userdata.get('GITHUB_TOKEN')
# REPOSITORY_NAME = 'PyNucleus-Model' # Your repository name
# NOTEBOOK_DRIVE_PATH = "/content/drive/MyDrive/PyNucleus Project/Capstone Project.ipynb"
#
#
# #--------Cursor Integration--------#
# # Load environment variables from .env file
load_dotenv()

# Get GitHub credentials from environment variables (populated from .env above)
GITHUB_USERNAME = os.getenv('GITHUB_USERNAME')
GITHUB_TOKEN = os.getenv('GITHUB_TOKEN')

# Fail early with a readable message: os.getenv returns None for unset
# variables, which would otherwise surface as a TypeError when slicing the
# token below and as a malformed URL in the later `git pull`.
if not GITHUB_USERNAME or not GITHUB_TOKEN:
    raise RuntimeError(
        "GITHUB_USERNAME and GITHUB_TOKEN must be set in the environment or in a .env file."
    )

# Print to verify the variables are loaded (remove this in production)
print(f"Username: {GITHUB_USERNAME}")
print(f"Token: {GITHUB_TOKEN[:4]}...")  # Only print first 4 chars of token for security
#
# Repository information
REPOSITORY_NAME = 'PyNucleus-Model'
NOTEBOOK_REPO_FILENAME = "Capstone Project.ipynb"
LOG_FILENAME = "update_log.txt"

# Pull latest changes from GitHub
print("Pulling latest changes from GitHub...")
!git pull https://{GITHUB_TOKEN}@github.com/{GITHUB_USERNAME}/{REPOSITORY_NAME}.git main

print("Repository is up to date!")

# Log start time
with open("update_log.txt", "a") as f:
    f.write(f" {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}: Log Update\n")

Username: Saytor20
Token: ghp_...
Pulling latest changes from GitHub...
From https://github.com/Saytor20/PyNucleus-Model
 * branch            main       -> FETCH_HEAD
Already up to date.
Repository is up to date!


# **Data Ingestion and Preprocessing for RAG**

In [28]:
# Import all the modular components
from document_processor import process_documents
from wiki_scraper import scrape_wikipedia_articles
from data_processor import load_and_chunk_files, save_chunked_data
from faiss_manager import FAISSDBManager, _load_docs

# Runs the four ingestion stages in order. Each stage is a call into a project
# module imported above; the stages communicate through files on disk and
# through the notebook-level names defined here (chunked_docs, f_mgr, f_docs).

# Step 1: Process source documents (PDF, DOCX, etc.)
# Per the recorded output, this reads 'source_documents' and saves .txt files
# under processed_txt_files/.
print("Step 1: Processing source documents...")
process_documents()

# Step 2: Scrape Wikipedia articles
# Per the recorded output, articles are saved under data_sources/.
print("\nStep 2: Scraping Wikipedia articles...")
scrape_wikipedia_articles()

# Step 3: Process and chunk all documents
# The chunk list is kept in `chunked_docs` and also persisted; the recorded
# output shows the files being written to Chuncked_Data/.
print("\nStep 3: Processing and chunking documents...")
chunked_docs = load_and_chunk_files()
save_chunked_data(chunked_docs)

# Step 4: Build and evaluate the FAISS vector store
print("\nStep 4: Building and evaluating FAISS vector store...")
# Evaluation queries mapped to the value passed to FAISSDBManager.evaluate.
# NOTE(review): "dummy_1"/"dummy_2" look like placeholders — the recorded run
# reports Recall@3 of 0/2, so this evaluation is not meaningful until real
# expected identifiers are supplied. Confirm the expected value format against
# FAISSDBManager.evaluate.
GROUND_TRUTH = {
    "advantages of modular chemical plants": "dummy_1",
    "scalability of modular design": "dummy_2",
}
# The spelling "Chuncked_Data" matches the directory name shown in the recorded
# output of Step 3; changing it here alone would break the load below.
JSON_PATH = "Chuncked_Data/chunked_data_full.json"

f_mgr = FAISSDBManager()
# NOTE(review): `_load_docs` is underscore-prefixed (private by convention) in
# faiss_manager; `f_mgr.log` is presumably a logging callable — verify.
f_docs = _load_docs(JSON_PATH, f_mgr.log)
f_mgr.build(f_docs)
f_mgr.evaluate(GROUND_TRUTH)
print(f"\nFAISS log → {f_mgr.log_path}")

Step 1: Processing source documents...
--- 📄 Starting processing for 4 file(s) in 'source_documents' ---
 ▶ Processing: Manuscript Draft_Can Modular Plants Lower African Industrialization Barriers.docx




   • Success! Saved to: processed_txt_files/Manuscript Draft_Can Modular Plants Lower African Industrialization Barriers.txt
 ▶ Processing: mcp_basics.txt
   • Success! Saved to: processed_txt_files/mcp_basics.txt
 ▶ Processing: feasibility_factors.txt
   • Success! Saved to: processed_txt_files/feasibility_factors.txt
 ▶ Processing: Bist_Madan.pdf
   • Success! Saved to: processed_txt_files/Bist_Madan.txt


 All files processed.

Step 2: Scraping Wikipedia articles...
🔍 Starting Wikipedia article search for 5 keywords...
▶️  Searching for: modular design
✅  Saved article to: data_sources/wikipedia_modular_design.txt
▶️  Searching for: software architecture
✅  Saved article to: data_sources/wikipedia_software_architecture.txt
▶️  Searching for: system design
✅  Saved article to: data_sources/wikipedia_system_design.txt
▶️  Searching for: industrial design
✅  Saved article to: data_sources/wikipedia_industrial_design.txt
▶️  Searching for: supply chain


INFO: Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2


✅  Saved article to: data_sources/wikipedia_supply_chain.txt

✨ Article scraping complete!

Step 3: Processing and chunking documents...

Loaded 9 documents for chunking
Split into 883 chunks

✅ Successfully saved chunked data to Chuncked_Data/:
  • chunked_data_full.json - Complete data with metadata
  • chunked_data_stats.json - Statistical analysis
  • chunked_data_content.txt - Human-readable content

Step 4: Building and evaluating FAISS vector store...
=== FAISS VectorDB Analysis ===
Started: 2025-06-04 16:22:22
Loaded 883 documents from Chuncked_Data/chunked_data_full.json
Embedding device → cpu   | dim=384
Docs indexed : 883
Index file   : faiss_store/pynucleus_mcp.faiss
Embeds .pkl  : faiss_store/embeddings.pkl

-- Files in faiss_store/ --
  · embeddings.pkl
  · pynucleus_mcp.faiss

=== Evaluation (Recall@3) ===
Q: advantages of modular chemical plants          ✗   top-score=0.4061
Q: scalability of modular design                  ✗   top-score=0.5283

Recall@3: 0/2  →  0.0%



# Final cell: log the update and push changes to GitHub

In [None]:
# Log end time
with open("update_log.txt", "a") as f:
    f.write(f"\n {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} changes made and pushed to origin main\n")

# Simple GitHub update function
def update_github():
    !git add .
    !git commit -m "Update: Adding all files to repository"
    !git push origin main
    print("All files pushed to GitHub successfully!")

# To use it, just run:
update_github()