In [1]:
import os
from datetime import datetime
from dotenv import load_dotenv

#--------Google Drive Integration--------#
# from google.colab import drive, userdata
# This gives Colab access to your files in Google Drive.
# drive.mount('/content/drive')
# 'GITHUB_USERNAME' and 'GITHUB_TOKEN' saved as secrets in Colab.
# GITHUB_USERNAME = userdata.get('GITHUB_USERNAME')
# GITHUB_TOKEN = userdata.get('GITHUB_TOKEN')
# REPOSITORY_NAME = 'PyNucleus-Model' # Your repository name
# NOTEBOOK_DRIVE_PATH = "/content/drive/MyDrive/PyNucleus Project/Capstone Project.ipynb"


#--------Cursor Integration--------#
# Load environment variables from .env file
load_dotenv()

# Get GitHub credentials from environment variables
GITHUB_USERNAME = os.getenv('GITHUB_USERNAME')
GITHUB_TOKEN = os.getenv('GITHUB_TOKEN')

# Print to verify the variables are loaded (remove this in production)
print(f"Username: {GITHUB_USERNAME}")
print(f"Token: {GITHUB_TOKEN[:4]}...") # Only print first 4 chars of token for security

# Repository information
REPOSITORY_NAME = 'PyNucleus-Model'
NOTEBOOK_REPO_FILENAME = "Capstone Project.ipynb"
LOG_FILENAME = "update_log.txt"

# Pull latest changes from GitHub
print("Pulling latest changes from GitHub...")
!git pull https://{GITHUB_TOKEN}@github.com/{GITHUB_USERNAME}/{REPOSITORY_NAME}.git main

print("Repository is up to date!")

Username: Saytor20
Token: ghp_...
Pulling latest changes from GitHub...
From https://github.com/Saytor20/PyNucleus-Model
 * branch            main       -> FETCH_HEAD
Already up to date.
Repository is up to date!


# **Data Ingestion and Preprocessing for RAG**

In [2]:
#----- Date processing for all documents types -----#
import os
from langchain_community.document_loaders import UnstructuredFileLoader

# --- Configuration ---
# Folder where you will place all your source files (PDFs, DOCX, TXT, etc.)
INPUT_DIR = 'source_documents'

# Folder where the processed .txt files will be saved
OUTPUT_DIR = 'processed_txt_files'

# --- Main Logic ---
if __name__ == "__main__":
    # Create the input directory if it doesn't exist and give instructions
    if not os.path.exists(INPUT_DIR):
        print(f"📂 Creating directory: '{INPUT_DIR}'")
        os.makedirs(INPUT_DIR)
        print(f" Please place your files (PDF, DOCX, TXT, etc.) in the '{INPUT_DIR}' directory and run the script again.")
        exit()

    # Create the output directory
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    
    files_to_process = [f for f in os.listdir(INPUT_DIR) if os.path.isfile(os.path.join(INPUT_DIR, f))]

    if not files_to_process:
        print(f"ℹ The '{INPUT_DIR}' directory is empty. Nothing to process.")
        exit()

    print(f"--- 📄 Starting processing for {len(files_to_process)} file(s) in '{INPUT_DIR}' ---")

    for filename in files_to_process:
        input_path = os.path.join(INPUT_DIR, filename)
        
        # Create a clean output filename by changing the extension to .txt
        output_filename = os.path.splitext(os.path.basename(filename))[0] + '.txt'
        output_path = os.path.join(OUTPUT_DIR, output_filename)
        
        print(f"  ▶ Processing: {filename}")

        try:
            # The UnstructuredFileLoader automatically handles different file types.
            # The "paged" mode is faster and works for most text-based files.
            # It can be set to "elements" or "ocr" for more complex, scanned documents.
            loader = UnstructuredFileLoader(input_path, mode="paged")
            
            # The .load() method does all the work of extracting the text
            documents = loader.load()
            
            # Combine the page content into a single block of text
            full_text = "\n".join([doc.page_content for doc in documents])

            # Save the extracted text to a new .txt file
            with open(output_path, "w", encoding="utf-8") as f:
                f.write(full_text)
                
            print(f"    • Success! Saved to: {output_path}")

        except Exception as e:
            print(f"    • Error processing {filename}: {e}")
            
    print("\n\n All files processed.")

--- 📄 Starting processing for 4 file(s) in 'source_documents' ---
  ▶ Processing: 1-s2.0-S0925527302003742-main.pdf


  loader = UnstructuredFileLoader(input_path, mode="paged")
  from .autonotebook import tqdm as notebook_tqdm
`mode='paged'` is deprecated in favor of the 'by_page' chunking strategy. Learn more about chunking here: https://docs.unstructured.io/open-source/core-functionality/chunking


    • Success! Saved to: processed_txt_files/1-s2.0-S0925527302003742-main.txt
  ▶ Processing: Manuscript Draft_Can Modular Plants Lower African Industrialization Barriers.docx


`mode='paged'` is deprecated in favor of the 'by_page' chunking strategy. Learn more about chunking here: https://docs.unstructured.io/open-source/core-functionality/chunking
libmagic is unavailable but assists in filetype detection. Please consider installing libmagic for better results.
`mode='paged'` is deprecated in favor of the 'by_page' chunking strategy. Learn more about chunking here: https://docs.unstructured.io/open-source/core-functionality/chunking
libmagic is unavailable but assists in filetype detection. Please consider installing libmagic for better results.
`mode='paged'` is deprecated in favor of the 'by_page' chunking strategy. Learn more about chunking here: https://docs.unstructured.io/open-source/core-functionality/chunking


    • Success! Saved to: processed_txt_files/Manuscript Draft_Can Modular Plants Lower African Industrialization Barriers.txt
  ▶ Processing: mcp_basics.txt
    • Success! Saved to: processed_txt_files/mcp_basics.txt
  ▶ Processing: feasibility_factors.txt
    • Success! Saved to: processed_txt_files/feasibility_factors.txt


 All files processed.


In [8]:
# =========================================================================
# A SIMPLER CRAWLER USING REQUESTS (REPLACES THE SCRAPY VERSION)
# =========================================================================
import requests
from googlesearch import search
import trafilatura
from pdfminer.high_level import extract_text
import pandas as pd
import pathlib
import io
import time

# --- CONFIGURATION ---
KEYWORDS = [
    "modular chemical plants cost analysis 2024",
    "MCP feasibility study",
    "pre-fabricated chemical modules logistics",
]
SEARCH_RESULTS_PER_KW = 30 # How many results to fetch per keyword
MIN_LENGTH = 50            # Minimum character length to save an article
OUTPUT_DIR = pathlib.Path("data_source")
OUTPUT_FILE = OUTPUT_DIR / "articles.csv"

# Create the output directory if it doesn't exist
OUTPUT_DIR.mkdir(exist_ok=True)

# --- CRAWLING LOGIC ---

# 1. Get all URLs from Google Search
# -----------------------------------
all_urls = set() # Use a set to automatically handle duplicates
print(" Phase 1: Searching Google for URLs ".center(80, "="))
for kw in KEYWORDS:
    print(f"Searching for keyword: '{kw}'...")
    try:
        # We add a short delay to be respectful to Google's servers
        for url in search(query=kw, num=SEARCH_RESULTS_PER_KW, stop=SEARCH_RESULTS_PER_KW, pause=2.0, lang="en"):
            if not any(bad in url for bad in ("google.", "/search?", "facebook.com")):
                all_urls.add(url)
    except Exception as e:
        print(f"An error occurred during Google search: {e}")
        print("Continuing with next keyword...")

print(f"\nFound {len(all_urls)} unique candidate URLs.")


# 2. Download and Extract Content from each URL
# ---------------------------------------------
results = []
processed_urls = set()
print("\n Phase 2: Downloading and Extracting Content ".center(80, "="))

for i, url in enumerate(all_urls):
    if url in processed_urls:
        continue

    print(f"Processing ({i+1}/{len(all_urls)}): {url}")
    try:
        # Download the content with a timeout
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
        response = requests.get(url, headers=headers, timeout=20)
        response.raise_for_status() # Raise an exception for bad status codes (4xx or 5xx)

        text = ""
        title = ""
        
        # Check if content is PDF or HTML
        content_type = response.headers.get("content-type", "").lower()
        if "pdf" in content_type or url.lower().endswith('.pdf'):
            # It's a PDF
            text = extract_text(io.BytesIO(response.content))
            title = url.split('/')[-1] # Use filename as title for PDFs
        else:
            # It's HTML
            # Use trafilatura to get the main text and metadata
            article_text = trafilatura.extract(response.text)
            if article_text:
                text = article_text
                metadata = trafilatura.extract_metadata(response.text)
                if metadata and metadata.title:
                    title = metadata.title

        # Check if the extracted text is long enough
        if text and len(text) >= MIN_LENGTH:
            results.append({
                'title': title,
                'url': url,
                'text': text
            })
            print(f"  -> Success: Extracted {len(text):,} characters.")
        else:
            print("  -> Skipped: Content was too short or extraction failed.")

    except requests.exceptions.RequestException as e:
        print(f"  -> Failed to download: {e}")
    except Exception as e:
        print(f"  -> An unexpected error occurred: {e}")
    
    processed_urls.add(url)
    time.sleep(1) # Wait 1 second between requests to be polite


# 3. Save the results to a CSV file
# -----------------------------------
print("\n Phase 3: Saving Results ".center(80, "="))
if results:
    df = pd.DataFrame(results)
    df.to_csv(OUTPUT_FILE, index=False, encoding='utf-8')
    print(f" Success! Saved {len(df)} articles to '{OUTPUT_FILE}'")
else:
    print(" No articles meeting the criteria were found or extracted.")

Searching for keyword: 'modular chemical plants cost analysis 2024'...
An error occurred during Google search: search() got an unexpected keyword argument 'query'
Continuing with next keyword...
Searching for keyword: 'MCP feasibility study'...
An error occurred during Google search: search() got an unexpected keyword argument 'query'
Continuing with next keyword...
Searching for keyword: 'pre-fabricated chemical modules logistics'...
An error occurred during Google search: search() got an unexpected keyword argument 'query'
Continuing with next keyword...

Found 0 unique candidate URLs.
 No articles meeting the criteria were found or extracted.


In [43]:
# --- 4. Inspect the Results ---
import numpy as np
# --- NEW: Statistical Analysis of Chunks ---
print("\n--- Statistical Analysis & Quality Check ---")

# Calculate the lengths of all chunks
chunk_lengths = [len(chunk.page_content) for chunk in chunked_documents]

# Calculate and print key statistics
total_chunks = len(chunk_lengths)
min_size = np.min(chunk_lengths)
max_size = np.max(chunk_lengths)
avg_size = np.mean(chunk_lengths)
std_dev = np.std(chunk_lengths)

print(f"Total Chunks: {total_chunks}")
print(f"Minimum Chunk Size: {min_size} characters")
print(f"Maximum Chunk Size: {max_size} characters")
print(f"Average Chunk Size: {avg_size:.2f} characters")
print(f"Standard Deviation of Chunk Size: {std_dev:.2f}")

# --- Automated Quality Feedback ---

# 1. Check for high variation in chunk size
# A high standard deviation suggests inconsistent chunking.
if std_dev > 150:
    print(f"\n[WARNING] High chunk size variation detected (Std Dev: {std_dev:.2f}).")
    print("  > This suggests documents may have irregular structures (e.g., many short lines or lists).")
    print("  > Resulting chunks may have inconsistent levels of context.")

# 2. Check for and count potentially "orphaned" or very small chunks
small_chunk_threshold = CHUNK_SIZE * 0.20 # Chunks smaller than 20% of the target size
small_chunk_count = sum(1 for length in chunk_lengths if length < small_chunk_threshold)

if small_chunk_count > 0:
    # This check is more specific than just looking at the absolute minimum.
    print(f"\n[ADVISORY] Found {small_chunk_count} chunks smaller than {small_chunk_threshold} characters.")
    print(f"  > The smallest chunk is {min_size} characters.")
    print("  > These small chunks might lack sufficient context and could clutter search results.")
    print("  > Consider cleaning the source documents or adjusting the chunking separators.")

# Add a success message if no issues are flagged
if std_dev <= 150 and small_chunk_count == 0:
    print("\n[INFO] Chunking statistics appear healthy. Sizes are consistent.")


# --- Manual Inspection of Sample Chunks ---
# (This part remains the same)
print("\n--- Sample Chunk Preview ---")
# Print the first few chunks to get a feel for their content and structure
for i, chunk in enumerate(chunked_documents[:3]): # Print first 3 chunks
    chunk_source = os.path.basename(chunk.metadata.get('source', 'N/A'))
    print(f"\n--- Chunk {i+1} (Source: {chunk_source}, Length: {len(chunk.page_content)} chars) ---")
    print(chunk.page_content)


print("\n\nData ingestion and preprocessing is complete. The 'chunked_documents' are ready for the next stage (embedding).")


--- Statistical Analysis & Quality Check ---
Total Chunks: 4
Minimum Chunk Size: 126 characters
Maximum Chunk Size: 472 characters
Average Chunk Size: 322.00 characters
Standard Deviation of Chunk Size: 143.16

[INFO] Chunking statistics appear healthy. Sizes are consistent.

--- Sample Chunk Preview ---

--- Chunk 1 (Source: mcp_basics.txt, Length: 472 chars) ---
Modular Chemical Plants (MCPs) represent a paradigm shift in chemical process engineering. They involve constructing plants from standardized, pre-fabricated modules built off-site. This approach significantly reduces on-site construction time and costs compared to traditional stick-built plants. Key advantages include faster deployment, scalability, and potentially lower capital expenditure. However, module transportation and site integration require careful planning

--- Chunk 2 (Source: mcp_basics.txt, Length: 126 chars) ---
. MCPs are particularly suited for remote locations or projects with uncertain market demands, all

# **Data Scrapping**

# This is the last cell of the code

In [10]:
# Simple GitHub update function
def update_github():
    !git add .
    !git commit -m "Update: Adding all files to repository"
    !git push origin main
    print("All files pushed to GitHub successfully!")

# To use it, just run:
update_github()

[main a01bcca] Update: Adding all files to repository
 1 file changed, 146 insertions(+), 146 deletions(-)
Enumerating objects: 5, done.
Counting objects: 100% (5/5), done.
Delta compression using up to 8 threads
Compressing objects: 100% (3/3), done.
Writing objects: 100% (3/3), 3.04 KiB | 3.04 MiB/s, done.
Total 3 (delta 2), reused 0 (delta 0), pack-reused 0 (from 0)
remote: Resolving deltas: 100% (2/2), completed with 2 local objects.[K
To https://github.com/Saytor20/PyNucleus-Model.git
   8c7c64b..a01bcca  main -> main
All files pushed to GitHub successfully!
