In [18]:
import os
from datetime import datetime
from dotenv import load_dotenv
#
# #--------Google Drive Integration--------#
# # from google.colab import drive, userdata
# # This gives Colab access to your files in Google Drive.
# # drive.mount('/content/drive')
# # 'GITHUB_USERNAME' and 'GITHUB_TOKEN' saved as secrets in Colab.
# GITHUB_USERNAME = userdata.get('GITHUB_USERNAME')
# GITHUB_TOKEN = userdata.get('GITHUB_TOKEN')
# REPOSITORY_NAME = 'PyNucleus-Model' # Your repository name
# NOTEBOOK_DRIVE_PATH = "/content/drive/MyDrive/PyNucleus Project/Capstone Project.ipynb"
#
#
# #--------Cursor Integration--------#
# # Load environment variables from .env file
load_dotenv()
#
# # Get GitHub credentials from environment variables
GITHUB_USERNAME = os.getenv('GITHUB_USERNAME')
GITHUB_TOKEN = os.getenv('GITHUB_TOKEN')
#
# # Print to verify the variables are loaded (remove this in production)
print(f"Username: {GITHUB_USERNAME}")
print(f"Token: {GITHUB_TOKEN[:4]}...") # Only print first 4 chars of token for security
#
# Repository information
REPOSITORY_NAME = 'PyNucleus-Model'
NOTEBOOK_REPO_FILENAME = "Capstone Project.ipynb"
LOG_FILENAME = "update_log.txt"

# Pull latest changes from GitHub
print("Pulling latest changes from GitHub...")
!git pull https://{GITHUB_TOKEN}@github.com/{GITHUB_USERNAME}/{REPOSITORY_NAME}.git main

print("Repository is up to date!")

# Log start time
with open("update_log.txt", "a") as f:
    f.write(f" {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}: Log Update\n")

Username: Saytor20
Token: ghp_...
Pulling latest changes from GitHub...
From https://github.com/Saytor20/PyNucleus-Model
 * branch            main       -> FETCH_HEAD
Already up to date.
Repository is up to date!


# **Data Ingestion and Preprocessing for RAG**

In [42]:
#----- Data processing for all document types -----#
import os
from langchain_unstructured import UnstructuredLoader
from PyPDF2 import PdfReader

# --- Configuration ---
# Folder where you will place all your source files (PDFs, DOCX, TXT, etc.)
INPUT_DIR = 'source_documents'

# Folder where the processed .txt files will be saved
OUTPUT_DIR = 'processed_txt_files'


def _extract_text(input_path):
    """Return the plain text of one source file.

    Files ending in .pdf (case-insensitive) are read page by page with PyPDF2;
    every other file type is handed to UnstructuredLoader.
    """
    if input_path.lower().endswith('.pdf'):
        reader = PdfReader(input_path)
        # Defensive: treat a page that yields no text (None) as empty rather
        # than letting `None + str` abort the whole document.
        return "".join((page.extract_text() or "") + "\n\n" for page in reader.pages)

    documents = UnstructuredLoader(input_path).load()
    return "\n\n".join(doc.page_content for doc in documents)


def process_source_documents(input_dir=INPUT_DIR, output_dir=OUTPUT_DIR):
    """Convert every file in `input_dir` to a UTF-8 .txt file in `output_dir`.

    Args:
        input_dir: Folder holding the source files. It is created (and the
            function returns) when it does not exist yet.
        output_dir: Folder that receives one `<stem>.txt` per source file.

    Returns:
        None. Progress and per-file errors are printed; a failure on one file
        does not stop the remaining files.
    """
    # Create the input directory if it doesn't exist and give instructions.
    # Returning (instead of calling exit()) keeps the notebook kernel alive.
    if not os.path.exists(input_dir):
        print(f"📂 Creating directory: '{input_dir}'")
        os.makedirs(input_dir)
        print(f" Please place your files (PDF, DOCX, TXT, etc.) in the '{input_dir}' directory and run the script again.")
        return

    # Create the output directory
    os.makedirs(output_dir, exist_ok=True)

    # Hidden files (e.g. .DS_Store) are filtered here so the reported count
    # matches what is actually processed; sorting makes the order reproducible.
    files_to_process = sorted(
        f for f in os.listdir(input_dir)
        if not f.startswith('.') and os.path.isfile(os.path.join(input_dir, f))
    )

    if not files_to_process:
        print(f"ℹ The '{input_dir}' directory is empty. Nothing to process.")
        return

    print(f"--- 📄 Starting processing for {len(files_to_process)} file(s) in '{input_dir}' ---")

    for filename in files_to_process:
        input_path = os.path.join(input_dir, filename)
        output_filename = os.path.splitext(os.path.basename(filename))[0] + '.txt'
        output_path = os.path.join(output_dir, output_filename)

        print(f" ▶ Processing: {filename}")

        try:
            full_text = _extract_text(input_path)

            # Save the extracted text to a new .txt file
            with open(output_path, "w", encoding="utf-8") as f:
                f.write(full_text)

            print(f"   • Success! Saved to: {output_path}")

        except Exception as e:
            # Best effort: report the failure and continue with the next file.
            print(f"   • Error processing {filename}: {e}")

    print("\n\n All files processed.")


# --- Main Logic ---
if __name__ == "__main__":
    process_source_documents()

--- 📄 Starting processing for 4 file(s) in 'source_documents' ---
 ▶ Processing: Manuscript Draft_Can Modular Plants Lower African Industrialization Barriers.docx




   • Success! Saved to: processed_txt_files/Manuscript Draft_Can Modular Plants Lower African Industrialization Barriers.txt
 ▶ Processing: mcp_basics.txt
   • Success! Saved to: processed_txt_files/mcp_basics.txt
 ▶ Processing: feasibility_factors.txt
   • Success! Saved to: processed_txt_files/feasibility_factors.txt
 ▶ Processing: Bist_Madan.pdf
   • Success! Saved to: processed_txt_files/Bist_Madan.txt


 All files processed.


In [43]:
"--- Wikipedia Data Scraping ---"
import requests
from bs4 import BeautifulSoup
import os
from urllib.parse import quote

# --- CONFIGURATION ---
# Search phrases for Wikipedia. main() looks each one up and saves the first
# search hit; the phrase (spaces replaced by underscores) becomes part of the
# saved file name.
SEARCH_KEYWORDS = [
    "modular design",
    "software architecture",
    "system design",
    "industrial design",
    "supply chain"
]

# Output directory for saved articles (created on demand by scrape_and_save_article).
DATA_DIR = "data_sources"

def search_wikipedia(keyword):
    """Search Wikipedia for a keyword and return the first result URL.

    Args:
        keyword: Free-text search phrase.

    Returns:
        The article URL of the top search hit, or None when the search has no
        results, the API response has no search section, or the request fails
        (the failure is printed so the caller's loop can continue).
    """
    api_url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "list": "search",
        "srsearch": keyword,  # requests URL-encodes the value
        "format": "json",
    }
    # Send an explicit User-Agent, as the article fetch in this cell does.
    headers = {"User-Agent": "PyNucleus-Model notebook (python-requests)"}

    try:
        # A timeout keeps one stalled lookup from hanging the whole run.
        response = requests.get(api_url, params=params, headers=headers, timeout=15)
        response.raise_for_status()
        data = response.json()
    except (requests.RequestException, ValueError) as e:
        print(f"❌  Wikipedia search failed for {keyword}: {e}")
        return None

    # Error payloads carry no 'query' key; treat them like "no results".
    results = data.get('query', {}).get('search', [])
    if not results:
        return None

    title = results[0]['title']
    return f"https://en.wikipedia.org/wiki/{quote(title)}"

def scrape_and_save_article(url, keyword):
    """Download one Wikipedia article and store its text under DATA_DIR.

    The text of every paragraph and heading inside the article's
    `mw-parser-output` container is written, blank-line separated, to
    `wikipedia_<keyword with underscores>.txt`. Any failure is printed
    rather than raised.
    """
    print(f"▶️  Searching for: {keyword}")

    try:
        # Request the page with a browser-style User-Agent
        browser_headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        page = requests.get(url, headers=browser_headers, timeout=15)
        page.raise_for_status()

        # Locate the article body in the parsed HTML
        parsed = BeautifulSoup(page.content, 'html.parser')
        body = parsed.find('div', {'class': 'mw-parser-output'})
        if not body:
            print(f"❌  Could not find article content for: {keyword}")
            return

        # Keep the non-empty text of paragraphs and headings, each followed by a blank line
        wanted_tags = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']
        stripped = (node.get_text().strip() for node in body.find_all(wanted_tags))
        article_text = "".join(piece + "\n\n" for piece in stripped if piece)

        # Make sure the destination folder exists, then write the file
        os.makedirs(DATA_DIR, exist_ok=True)
        filepath = os.path.join(DATA_DIR, f"wikipedia_{keyword.replace(' ', '_')}.txt")
        with open(filepath, 'w', encoding='utf-8') as out:
            out.write(article_text)

        print(f"✅  Saved article to: {filepath}")

    except Exception as e:
        print(f"❌  Error processing {keyword}: {str(e)}")

def main():
    """Look up every configured keyword on Wikipedia and save the top article for each."""
    print(f"🔍 Starting Wikipedia article search for {len(SEARCH_KEYWORDS)} keywords...")

    for term in SEARCH_KEYWORDS:
        found_url = search_wikipedia(term)
        if not found_url:
            # Nothing to download for this keyword; move on to the next one.
            print(f"❌  No article found for: {term}")
            continue
        scrape_and_save_article(found_url, term)

    print("\n✨ Article scraping complete!")


# Run the scraper
if __name__ == "__main__":
    main()

🔍 Starting Wikipedia article search for 5 keywords...
▶️  Searching for: modular design
✅  Saved article to: data_sources/wikipedia_modular_design.txt
▶️  Searching for: software architecture
✅  Saved article to: data_sources/wikipedia_software_architecture.txt
▶️  Searching for: system design
✅  Saved article to: data_sources/wikipedia_system_design.txt
▶️  Searching for: industrial design
✅  Saved article to: data_sources/wikipedia_industrial_design.txt
▶️  Searching for: supply chain
✅  Saved article to: data_sources/wikipedia_supply_chain.txt

✨ Article scraping complete!


In [44]:
# ---- Document Chunking and Analysis ----#
import os
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
import json
from datetime import datetime

def load_and_chunk_files(directories=None, chunk_size=500, chunk_overlap=50):
    """Load every .txt file from the given directories and split them into chunks.

    Args:
        directories: Iterable of folders to scan (non-recursively). Defaults to
            ['data_sources', 'processed_txt_files'].
        chunk_size: Maximum chunk length passed to the text splitter.
        chunk_overlap: Overlap between consecutive chunks passed to the splitter.

    Returns:
        The list of chunked documents produced by the splitter. Each source
        document carries its file path in `metadata["source"]`.
    """
    if directories is None:
        directories = ['data_sources', 'processed_txt_files']

    # Initialize text splitter
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        is_separator_regex=False,
        separators=["\n\n", "\n", ". ", " ", ""]
    )

    all_documents = []

    for directory in directories:
        # isdir (not exists) so a stray file with that name is reported, not listed.
        if not os.path.isdir(directory):
            print(f"⚠️ Directory {directory} not found")
            continue

        # Sorted so document order, and therefore chunk ids, are reproducible.
        for filename in sorted(os.listdir(directory)):
            if not filename.endswith('.txt'):
                continue

            file_path = os.path.join(directory, filename)
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    text = f.read()
            except (OSError, UnicodeError) as e:
                # Unreadable or non-UTF-8 file: report it and keep going.
                print(f"Error processing {file_path}: {str(e)}")
                continue

            # Create Document object with metadata
            all_documents.append(
                Document(page_content=text, metadata={"source": file_path})
            )

    print(f"\nLoaded {len(all_documents)} documents for chunking")

    # Split documents into chunks
    chunked_documents = text_splitter.split_documents(all_documents)
    print(f"Split into {len(chunked_documents)} chunks")

    return chunked_documents

def save_chunked_data(chunked_documents, output_dir="Chuncked_Data"):
    """Write the chunks to `output_dir` in three formats.

    Files produced:
        chunked_data_full.json   - one record per chunk (id, content, source, length)
        chunked_data_stats.json  - chunk count, per-chunk lengths, distinct sources, timestamp
        chunked_data_content.txt - human-readable dump of every chunk
    """
    os.makedirs(output_dir, exist_ok=True)

    def _target(name):
        return os.path.join(output_dir, name)

    def _source_of(chunk):
        return chunk.metadata.get('source', 'N/A')

    lengths = [len(chunk.page_content) for chunk in chunked_documents]

    # 1. Full content with metadata
    records = [
        {
            "chunk_id": idx,
            "content": chunk.page_content,
            "source": _source_of(chunk),
            "length": size,
        }
        for idx, (chunk, size) in enumerate(zip(chunked_documents, lengths))
    ]
    with open(_target("chunked_data_full.json"), 'w', encoding='utf-8') as fh:
        json.dump(records, fh, indent=2, ensure_ascii=False)

    # 2. Statistical summary
    summary = {
        "total_chunks": len(chunked_documents),
        "chunk_lengths": lengths,
        "sources": list({_source_of(chunk) for chunk in chunked_documents}),
        "generated_at": datetime.now().isoformat(),
    }
    with open(_target("chunked_data_stats.json"), 'w', encoding='utf-8') as fh:
        json.dump(summary, fh, indent=2)

    # 3. Content-only version for easy reading
    divider = "\n\n" + "=" * 50 + "\n\n"
    with open(_target("chunked_data_content.txt"), 'w', encoding='utf-8') as fh:
        for number, (chunk, size) in enumerate(zip(chunked_documents, lengths), start=1):
            header = (
                f"=== Chunk {number} ===\n"
                f"Source: {_source_of(chunk)}\n"
                f"Length: {size} characters\n"
                "\nContent:\n"
            )
            fh.write(header)
            fh.write(chunk.page_content + divider)

    print(f"\n✅ Successfully saved chunked data to {output_dir}/:")
    print("  • chunked_data_full.json - Complete data with metadata")
    print("  • chunked_data_stats.json - Statistical analysis")
    print("  • chunked_data_content.txt - Human-readable content")

if __name__ == "__main__":
    # Read the .txt files from disk and split them into chunks.
    # `chunked_docs` stays available as a notebook-level variable afterwards.
    chunked_docs = load_and_chunk_files()
    
    # Write the JSON, stats and text outputs (default folder: "Chuncked_Data").
    save_chunked_data(chunked_docs)


Loaded 9 documents for chunking
Split into 883 chunks

✅ Successfully saved chunked data to Chuncked_Data/:
  • chunked_data_full.json - Complete data with metadata
  • chunked_data_stats.json - Statistical analysis
  • chunked_data_content.txt - Human-readable content


Vector DB

In [None]:
# %%  FAISS Vector Store (flat-folder, detailed logging, no log-wipe)
import os, json, pickle, shutil, torch
from datetime import datetime
from typing import Dict, List
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.schema import Document

# ---------- helpers ----------
def _mkdir_clean(path: str):
    if os.path.isdir(path):
        shutil.rmtree(path)
    os.makedirs(path, exist_ok=True)

def _mkdir_if_missing(path: str):            # <- preserve logs
    if not os.path.isdir(path):
        os.makedirs(path, exist_ok=True)

def _timestamp(): return datetime.now().strftime("%Y%m%d_%H%M%S")
def _now():        return datetime.now().strftime("%Y-%m-%d %H:%M:%S")

def _logfile(out_dir: str, prefix: str):
    _mkdir_if_missing(out_dir)
    path = os.path.join(out_dir, f"{prefix}_{_timestamp()}.txt")
    def write(msg: str, echo=True):
        with open(path, "a", encoding="utf-8") as f:
            f.write(msg + "\n")
        if echo: print(msg)
    return write, path

def _load_docs(json_path: str, log) -> List[Document]:
    """Read chunk records from `json_path` and wrap them as Documents.

    Each record must provide "content" and "source". If anything goes wrong
    (missing file, invalid JSON, missing key, ...), the error is logged through
    `log` and three built-in placeholder documents are returned instead.
    """
    try:
        with open(json_path, "r", encoding="utf-8") as fh:
            records = json.load(fh)
        log(f"Loaded {len(records)} documents from {json_path}")

        loaded = []
        for record in records:
            loaded.append(
                Document(
                    page_content=record["content"],
                    metadata={"source": record["source"]},
                )
            )
        return loaded
    except Exception as e:
        log(f"⚠️  {e} – falling back to 3 dummy docs.")
        placeholders = (
            ("Modular chemical plants reduce construction time.", "dummy_1"),
            ("Scalability is a key advantage of modular design.", "dummy_2"),
            ("Challenges include supply-chain coordination.", "dummy_3"),
        )
        fallback = []
        for text, label in placeholders:
            fallback.append(Document(page_content=text, metadata={"source": label}))
        return fallback

# ---------- manager ----------
class FAISSDBManager:
    """Build, persist, query and evaluate a FAISS vector store, logging to a file.

    Creating a manager wipes and recreates `vec_dir`, and opens a new
    timestamped log file under `out_dir` (existing logs there are kept).
    """

    def __init__(self, vec_dir="faiss_store", out_dir="vectordb_outputs"):
        """
        Args:
            vec_dir: Directory for the FAISS index and pickled embeddings;
                removed and recreated on every instantiation.
            out_dir: Directory that receives the `faiss_analysis_*.txt` log.
        """
        _mkdir_clean(vec_dir)      # re-create store each run
        self.vec_dir, self.out_dir = vec_dir, out_dir
        self.index_path = os.path.join(vec_dir, "pynucleus_mcp.faiss")
        self.embed_path = os.path.join(vec_dir, "embeddings.pkl")
        self.log, self.log_path = _logfile(out_dir, "faiss_analysis")
        self.log("=== FAISS VectorDB Analysis ===")
        self.log(f"Started: {_now()}")
        self.db, self.embeddings = None, None

    def _emb(self):
        """Return the embedding model, creating it on first use (CUDA if available, else CPU)."""
        if self.embeddings is None:
            dev = "cuda" if torch.cuda.is_available() else "cpu"
            self.embeddings = HuggingFaceEmbeddings(
                model_name="sentence-transformers/all-MiniLM-L6-v2",
                model_kwargs={"device": dev},
                encode_kwargs={"normalize_embeddings": True})
            self.log(f"Embedding device → {dev}   | dim={len(self.embeddings.embed_query('hi'))}")
        return self.embeddings

    def build(self, docs):
        """Embed `docs`, build the FAISS index, and persist index and embeddings object."""
        emb = self._emb()
        self.db = FAISS.from_documents(docs, emb)
        self.db.save_local(self.index_path)
        # Context manager so the pickle file handle is closed deterministically.
        with open(self.embed_path, "wb") as fh:
            pickle.dump(emb, fh)
        self.log(f"Docs indexed : {len(docs)}")
        self.log(f"Index file   : {self.index_path}")
        self.log(f"Embeds .pkl  : {self.embed_path}")

        # List everything in vec_dir so you can verify
        self.log(f"\n-- Files in {self.vec_dir}/ --")
        for f in os.listdir(self.vec_dir):
            self.log(f"  · {f}")

    def load(self):
        """Load the persisted index into `self.db` unless one is already in memory."""
        if self.db is not None:
            return

        if self.embeddings is None:
            if os.path.isfile(self.embed_path):
                # NOTE(security): unpickling runs arbitrary code; only load
                # embedding pickles this notebook wrote itself.
                with open(self.embed_path, "rb") as fh:
                    self.embeddings = pickle.load(fh)
            else:
                # No pickle on disk: rebuild the model instead of handing
                # None to FAISS.load_local.
                self._emb()

        self.db = FAISS.load_local(self.index_path,
                                   self.embeddings,
                                   allow_dangerous_deserialization=True)

    def search(self, q: str, k=3):
        """Return the `k` nearest documents to `q` as (document, score) pairs."""
        self.load()
        return self.db.similarity_search_with_score(q, k)

    def evaluate(self, gt: Dict[str, str], k=3):
        """Log Recall@k for a query -> expected-answer mapping.

        A query counts as a hit when the expected string occurs in the content
        of any of the top-`k` results or equals a result's "source" metadata.

        Args:
            gt: Mapping of query text to the expected content snippet or source.
            k: Number of results retrieved per query.
        """
        # Header reflects the actual k rather than a fixed "3".
        self.log(f"\n=== Evaluation (Recall@{k}) ===")
        if not gt:
            # Avoid dividing by zero below when there is nothing to score.
            self.log("No ground-truth queries supplied; skipping recall computation.")
            return

        hits = 0
        for q, expect in gt.items():
            res = self.search(q, k)
            best = res[0][1] if res else float("inf")
            good = any(
                expect in d.page_content or expect == d.metadata.get("source")
                for d, _ in res
            )
            hits += good
            self.log(f"Q: {q[:45]:<45}  {'✓' if good else '✗'}   top-score={best:.4f}")
        self.log(f"\nRecall@{k}: {hits}/{len(gt)}  →  {hits/len(gt):.1%}")

# ---------- quick demo ----------
# Query -> expected answer. evaluate() counts a hit when the expected string
# appears in a retrieved chunk's text or equals its "source" metadata.
# NOTE(review): "dummy_1"/"dummy_2" are the source labels of the fallback
# documents in _load_docs, so these expectations presumably only match when the
# JSON file fails to load — confirm whether real sources should be listed here.
GROUND_TRUTH = {
    "advantages of modular chemical plants": "dummy_1",
    "scalability of modular design": "dummy_2",
}
# Chunk file written by the chunking cell (save_chunked_data default folder).
JSON_PATH = "Chuncked_Data/chunked_data_full.json"

# Instantiating the manager wipes and recreates the vector store directory.
f_mgr = FAISSDBManager()
f_docs = _load_docs(JSON_PATH, f_mgr.log)
f_mgr.build(f_docs)
f_mgr.evaluate(GROUND_TRUTH)
print(f"\nFAISS log → {f_mgr.log_path}")


# This is the last cell of the code

In [None]:
# Log end time
# NOTE: this cell does not import `datetime`; it relies on an earlier cell having run.
# The entry is appended before update_github() is called below, so it is part of
# what `git add .` stages.
with open("update_log.txt", "a") as f:
    f.write(f"\n {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} changes made and pushed to origin main\n")

# Simple GitHub update function
def update_github():
    """Stage, commit and push every change in the working tree to origin/main.

    Each git command's output is printed. The success message is printed only
    when the push actually succeeded; on any failure a short explanation is
    printed instead and the function returns early.

    Returns:
        None.
    """
    import subprocess  # local import: this cell has no import section of its own

    def _git(*args):
        """Run one git command, echo its output, and return its exit code."""
        try:
            result = subprocess.run(["git", *args], capture_output=True, text=True)
        except FileNotFoundError:
            print("git executable not found.")
            return 127
        for stream in (result.stdout, result.stderr):
            if stream:
                print(stream, end="" if stream.endswith("\n") else "\n")
        return result.returncode

    # NOTE(review): `git add .` stages everything that is not ignored; make sure
    # the .env file holding the GitHub token is listed in .gitignore.
    if _git("add", ".") != 0:
        print("git add failed; nothing was pushed.")
        return

    # `git diff --cached --quiet` exits non-zero only when something is staged,
    # so an unchanged tree skips the commit instead of failing on it.
    if _git("diff", "--cached", "--quiet") != 0:
        if _git("commit", "-m", "Update: Adding all files to repository") != 0:
            print("git commit failed; nothing was pushed.")
            return

    if _git("push", "origin", "main") != 0:
        print("git push failed; see the output above.")
        return

    print("All files pushed to GitHub successfully!")

# Run the sync now: this stages, commits and pushes the working tree to origin main.
update_github()