In [5]:
import os
from datetime import datetime
from dotenv import load_dotenv
#
# #--------Google Drive Integration--------#
# # from google.colab import drive, userdata
# # This gives Colab access to your files in Google Drive.
# # drive.mount('/content/drive')
# # 'GITHUB_USERNAME' and 'GITHUB_TOKEN' saved as secrets in Colab.
# GITHUB_USERNAME = userdata.get('GITHUB_USERNAME')
# GITHUB_TOKEN = userdata.get('GITHUB_TOKEN')
# REPOSITORY_NAME = 'PyNucleus-Model' # Your repository name
# NOTEBOOK_DRIVE_PATH = "/content/drive/MyDrive/PyNucleus Project/Capstone Project.ipynb"
#
#
# #--------Cursor Integration--------#
# # Load environment variables from .env file
load_dotenv()
#
# # Get GitHub credentials from environment variables
GITHUB_USERNAME = os.getenv('GITHUB_USERNAME')
GITHUB_TOKEN = os.getenv('GITHUB_TOKEN')
#
# # Print to verify the variables are loaded (remove this in production)
print(f"Username: {GITHUB_USERNAME}")
print(f"Token: {GITHUB_TOKEN[:4]}...") # Only print first 4 chars of token for security
#
# Repository information
REPOSITORY_NAME = 'PyNucleus-Model'
NOTEBOOK_REPO_FILENAME = "Capstone Project.ipynb"
LOG_FILENAME = "update_log.txt"

# Pull latest changes from GitHub
print("Pulling latest changes from GitHub...")
!git pull https://{GITHUB_TOKEN}@github.com/{GITHUB_USERNAME}/{REPOSITORY_NAME}.git main

print("Repository is up to date!")

# Log start time
with open("update_log.txt", "a") as f:
    f.write(f"Session started at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")

Username: Saytor20
Token: ghp_...
Pulling latest changes from GitHub...
From https://github.com/Saytor20/PyNucleus-Model
 * branch            main       -> FETCH_HEAD
Already up to date.
Repository is up to date!


# **Data Ingestion and Preprocessing for RAG**

In [6]:
#----- Date processing for all documents types -----#
import os
from langchain_unstructured import UnstructuredLoader
from PyPDF2 import PdfReader

# --- Configuration ---
# Folder where you will place all your source files (PDFs, DOCX, TXT, etc.)
INPUT_DIR = 'source_documents'

# Folder where the processed .txt files will be saved
OUTPUT_DIR = 'processed_txt_files'

# --- Main Logic ---
if __name__ == "__main__":
    # Create the input directory if it doesn't exist and give instructions
    if not os.path.exists(INPUT_DIR):
        print(f"📂 Creating directory: '{INPUT_DIR}'")
        os.makedirs(INPUT_DIR)
        print(f" Please place your files (PDF, DOCX, TXT, etc.) in the '{INPUT_DIR}' directory and run the script again.")
        exit()

    # Create the output directory
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    files_to_process = [f for f in os.listdir(INPUT_DIR) if os.path.isfile(os.path.join(INPUT_DIR, f))]

    if not files_to_process:
        print(f"ℹ The '{INPUT_DIR}' directory is empty. Nothing to process.")
        exit()

    print(f"--- 📄 Starting processing for {len(files_to_process)} file(s) in '{INPUT_DIR}' ---")

    for filename in files_to_process:
        # Skip hidden files like .DS_Store
        if filename.startswith('.'):
            continue

        input_path = os.path.join(INPUT_DIR, filename)
        output_filename = os.path.splitext(os.path.basename(filename))[0] + '.txt'
        output_path = os.path.join(OUTPUT_DIR, output_filename)

        print(f" ▶ Processing: {filename}")

        try:
            # Handle PDF files differently
            if filename.lower().endswith('.pdf'):
                # Use PyPDF2 for PDF files
                reader = PdfReader(input_path)
                full_text = ""
                for page in reader.pages:
                    full_text += page.extract_text() + "\n\n"
            else:
                # Use UnstructuredLoader for other file types
                loader = UnstructuredLoader(input_path)
                documents = loader.load()
                full_text = "\n\n".join([doc.page_content for doc in documents])

            # Save the extracted text to a new .txt file
            with open(output_path, "w", encoding="utf-8") as f:
                f.write(full_text)

            print(f"   • Success! Saved to: {output_path}")

        except Exception as e:
            print(f"   • Error processing {filename}: {e}")

    print("\n\n All files processed.")

--- 📄 Starting processing for 4 file(s) in 'source_documents' ---
 ▶ Processing: Manuscript Draft_Can Modular Plants Lower African Industrialization Barriers.docx




   • Success! Saved to: processed_txt_files/Manuscript Draft_Can Modular Plants Lower African Industrialization Barriers.txt
 ▶ Processing: mcp_basics.txt
   • Success! Saved to: processed_txt_files/mcp_basics.txt
 ▶ Processing: feasibility_factors.txt
   • Success! Saved to: processed_txt_files/feasibility_factors.txt
 ▶ Processing: Bist_Madan.pdf
   • Success! Saved to: processed_txt_files/Bist_Madan.txt


 All files processed.


In [None]:
#!/usr/bin/env python3
# test_crawl_modular_plants.py
# ────────────────────────────────────────────────────────────────────────
# 1. Reads any existing articles.csv to avoid re-scraping
# 2. Asks how many NEW articles you want
# 3. Google-searches each keyword (≤ TEST_RESULTS_PER_KW hits)
# 4. Downloads each candidate (HTML or PDF) → extracts text
# 5. Saves rows (title, url, text) to articles.csv
# -----------------------------------------------------------------------

import io, time, pathlib, requests, pandas as pd
from typing import List
from requests.exceptions import HTTPError

import trafilatura
from pdfminer.high_level import extract_text as pdf_text
from googlesearch import search                         # works with both forks

# ── CONFIG (testing-mode: small numbers) ───────────────────────────────
KEYWORDS               = [
    "modular chemical plants cost analysis 2024",
    "MCP feasibility study",
    "pre-fabricated chemical modules logistics",
]
TEST_RESULTS_PER_KW    = 10      # small batch for testing
MIN_LENGTH             = 50      # keep short so we see something fast
PAUSE_BETWEEN_DL       = 1       # polite download delay (sec)

OUTPUT_DIR   = pathlib.Path("data_sources")
OUTPUT_FILE  = OUTPUT_DIR / "articles.csv"
HEADERS      = {"User-Agent": "Mozilla/5.0 (testing-crawler)"}
TIMEOUT      = 15   # seconds

OUTPUT_DIR.mkdir(exist_ok=True)

# ── STEP 1: load existing URLs ─────────────────────────────────────────
print("\n[1] Loading existing article list …")
if OUTPUT_FILE.exists():
    seen_urls = set(pd.read_csv(OUTPUT_FILE, usecols=["url"])["url"])
    print(f"    → {len(seen_urls):,} URLs already stored – will skip.")
else:
    seen_urls = set()
    print("    → No previous CSV – starting fresh.")

# ── Ask user for target count ──────────────────────────────────────────
try:
    target = int(input("\nHow many NEW articles to fetch (testing)? ").strip())
except ValueError:
    target = 0
if target <= 0:
    print("Nothing requested – exiting.")
    exit(0)

# ── Helpers ────────────────────────────────────────────────────────────
def google_search(q: str, limit: int) -> List[str]:
    """Return up to `limit` URLs using whichever googlesearch signature works."""
    try:
        hits = search(q, num_results=limit, lang="en")
    except TypeError:
        hits = search(q, lang="en", num=limit, stop=limit)
    clean = [
        u for u in hits
        if u.startswith("http")
        and not any(bad in u for bad in ("google.", "/search?", "facebook.com"))
    ]
    return clean[:limit]

def extract_article(url: str, resp: requests.Response) -> str:
    ctype = resp.headers.get("content-type", "").lower()
    if "pdf" in ctype or url.lower().endswith(".pdf"):
        return pdf_text(io.BytesIO(resp.content))
    # ensure a reasonable encoding
    if resp.encoding is None:
        resp.encoding = resp.apparent_encoding
    return trafilatura.extract(resp.text) or ""

# ── STEP 2: search & scrape ────────────────────────────────────────────
new_rows = []
processed = set()
print("\n[2] Searching & scraping …")

for kw in KEYWORDS:
    if len(new_rows) >= target:
        break
    print(f"  • Keyword: {kw!r}")

    for url in google_search(kw, TEST_RESULTS_PER_KW):
        if len(new_rows) >= target:
            break
        if url in seen_urls or url in processed:
            continue

        print(f"    → Fetching: {url}")
        processed.add(url)

        try:
            r = requests.get(url, headers=HEADERS, timeout=TIMEOUT)
            r.raise_for_status()
            text = extract_article(url, r)

            if len(text) < MIN_LENGTH:
                print("      Skipped – too short.")
                continue

            meta = trafilatura.extract_metadata(r.text) if "text" in dir(r) else None
            title = (meta.title if meta and meta.title else url.split("/")[-1])[:120]
            new_rows.append({"title": title, "url": url, "text": text})
            print(f"      ✔ saved ({len(text):,} chars)")

        except HTTPError as e:
            print(f"      HTTP error: {e}")
        except Exception as e:
            print(f"      Fail: {e}")

        time.sleep(PAUSE_BETWEEN_DL)

# ── STEP 3: save CSV ───────────────────────────────────────────────────
print("\n[3] Saving …")
if new_rows:
    df_new = pd.DataFrame(new_rows)
    if OUTPUT_FILE.exists():
        df_new.to_csv(OUTPUT_FILE, mode="a", header=False, index=False, encoding="utf-8")
    else:
        df_new.to_csv(OUTPUT_FILE, index=False, encoding="utf-8")
    print(f"    Added {len(df_new)} new rows. CSV now has "
          f"{len(seen_urls) + len(df_new):,} total.")
else:
    print("    No new qualifying articles extracted.")


In [43]:
# --- 4. Inspect the Results ---
import numpy as np
# --- NEW: Statistical Analysis of Chunks ---
print("\n--- Statistical Analysis & Quality Check ---")

# Calculate the lengths of all chunks
chunk_lengths = [len(chunk.page_content) for chunk in chunked_documents]

# Calculate and print key statistics
total_chunks = len(chunk_lengths)
min_size = np.min(chunk_lengths)
max_size = np.max(chunk_lengths)
avg_size = np.mean(chunk_lengths)
std_dev = np.std(chunk_lengths)

print(f"Total Chunks: {total_chunks}")
print(f"Minimum Chunk Size: {min_size} characters")
print(f"Maximum Chunk Size: {max_size} characters")
print(f"Average Chunk Size: {avg_size:.2f} characters")
print(f"Standard Deviation of Chunk Size: {std_dev:.2f}")

# --- Automated Quality Feedback ---

# 1. Check for high variation in chunk size
# A high standard deviation suggests inconsistent chunking.
if std_dev > 150:
    print(f"\n[WARNING] High chunk size variation detected (Std Dev: {std_dev:.2f}).")
    print("  > This suggests documents may have irregular structures (e.g., many short lines or lists).")
    print("  > Resulting chunks may have inconsistent levels of context.")

# 2. Check for and count potentially "orphaned" or very small chunks
small_chunk_threshold = CHUNK_SIZE * 0.20 # Chunks smaller than 20% of the target size
small_chunk_count = sum(1 for length in chunk_lengths if length < small_chunk_threshold)

if small_chunk_count > 0:
    # This check is more specific than just looking at the absolute minimum.
    print(f"\n[ADVISORY] Found {small_chunk_count} chunks smaller than {small_chunk_threshold} characters.")
    print(f"  > The smallest chunk is {min_size} characters.")
    print("  > These small chunks might lack sufficient context and could clutter search results.")
    print("  > Consider cleaning the source documents or adjusting the chunking separators.")

# Add a success message if no issues are flagged
if std_dev <= 150 and small_chunk_count == 0:
    print("\n[INFO] Chunking statistics appear healthy. Sizes are consistent.")


# --- Manual Inspection of Sample Chunks ---
# (This part remains the same)
print("\n--- Sample Chunk Preview ---")
# Print the first few chunks to get a feel for their content and structure
for i, chunk in enumerate(chunked_documents[:3]): # Print first 3 chunks
    chunk_source = os.path.basename(chunk.metadata.get('source', 'N/A'))
    print(f"\n--- Chunk {i+1} (Source: {chunk_source}, Length: {len(chunk.page_content)} chars) ---")
    print(chunk.page_content)


print("\n\nData ingestion and preprocessing is complete. The 'chunked_documents' are ready for the next stage (embedding).")


--- Statistical Analysis & Quality Check ---
Total Chunks: 4
Minimum Chunk Size: 126 characters
Maximum Chunk Size: 472 characters
Average Chunk Size: 322.00 characters
Standard Deviation of Chunk Size: 143.16

[INFO] Chunking statistics appear healthy. Sizes are consistent.

--- Sample Chunk Preview ---

--- Chunk 1 (Source: mcp_basics.txt, Length: 472 chars) ---
Modular Chemical Plants (MCPs) represent a paradigm shift in chemical process engineering. They involve constructing plants from standardized, pre-fabricated modules built off-site. This approach significantly reduces on-site construction time and costs compared to traditional stick-built plants. Key advantages include faster deployment, scalability, and potentially lower capital expenditure. However, module transportation and site integration require careful planning

--- Chunk 2 (Source: mcp_basics.txt, Length: 126 chars) ---
. MCPs are particularly suited for remote locations or projects with uncertain market demands, all

# This is the last cell of the code

In [4]:
# Log end time
with open("update_log.txt", "a") as f:
    f.write(f"Session ended at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")

# Simple GitHub update function
def update_github():
    !git add .
    !git commit -m "Update: Adding all files to repository"
    !git push origin main
    print("All files pushed to GitHub successfully!")

# To use it, just run:
update_github()

NameError: name 'datetime' is not defined