In [8]:
import os
import subprocess
from datetime import datetime
from dotenv import load_dotenv

#--------Google Drive Integration--------#
# from google.colab import drive, userdata
# This gives Colab access to your files in Google Drive.
# drive.mount('/content/drive')
# 'GITHUB_USERNAME' and 'GITHUB_TOKEN' saved as secrets in Colab.
# GITHUB_USERNAME = userdata.get('GITHUB_USERNAME')
# GITHUB_TOKEN = userdata.get('GITHUB_TOKEN')
# REPOSITORY_NAME = 'PyNucleus-Model' # Your repository name
# NOTEBOOK_DRIVE_PATH = "/content/drive/MyDrive/PyNucleus Project/Capstone Project.ipynb"


#--------Cursor Integration--------#
# Load environment variables from the .env file so the GitHub credentials
# do not have to be written into the notebook itself.
load_dotenv()

# GitHub credentials from the environment; os.getenv returns None when a
# variable is not set.
GITHUB_USERNAME = os.getenv('GITHUB_USERNAME')
GITHUB_TOKEN = os.getenv('GITHUB_TOKEN')

# Confirm the variables were loaded. Only the first 4 characters of the token
# are shown, and a missing token is reported instead of raising TypeError
# (slicing None fails).
print(f"Username: {GITHUB_USERNAME}")
print(f"Token: {(GITHUB_TOKEN[:4] + '...') if GITHUB_TOKEN else '<not set>'}")

# Repository information
REPOSITORY_NAME = 'PyNucleus-Model'
NOTEBOOK_REPO_FILENAME = "Capstone Project.ipynb"
LOG_FILENAME = "update_log.txt"

# Pull latest changes from GitHub
if not (GITHUB_USERNAME and GITHUB_TOKEN):
    # Without both values the authenticated URL below would be malformed
    # (e.g. ".../None/..."), so skip the pull and say why.
    print("[WARNING] GITHUB_USERNAME and/or GITHUB_TOKEN is not set; skipping git pull.")
else:
    print("Pulling latest changes from GitHub...")
    # An argument list (no shell) keeps the token out of any shell parsing.
    # stderr is merged into stdout so git's messages keep their original order.
    pull_result = subprocess.run(
        [
            "git",
            "pull",
            f"https://{GITHUB_TOKEN}@github.com/{GITHUB_USERNAME}/{REPOSITORY_NAME}.git",
            "main",
        ],
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
    )
    # Redact the token in case git echoes the URL in an error message, so it
    # never ends up in the saved notebook output.
    pull_output = (pull_result.stdout or "").replace(GITHUB_TOKEN, "***").strip()
    if pull_output:
        print(pull_output)

    # Only claim success when git actually exited cleanly.
    if pull_result.returncode == 0:
        print("Repository is up to date!")
    else:
        print(f"[ERROR] git pull failed (exit code {pull_result.returncode}); the local copy may be out of date.")

Username: Saytor20
Token: ghp_...
Pulling latest changes from GitHub...
From https://github.com/Saytor20/PyNucleus-Model
 * branch            main       -> FETCH_HEAD
Already up to date.
Repository is up to date!


In [9]:
# # Clone the Repository
# repo_path = f'/content/{REPOSITORY_NAME}'
# !git clone https://{GITHUB_TOKEN}@github.com/{GITHUB_USERNAME}/{REPOSITORY_NAME}.git {repo_path}

# # Change the current working directory to the repository
# # All subsequent commands will run from inside the repo folder.
# os.chdir(repo_path)

# # Add a new line to your log file with the current date and time.
# log_message = f"Notebook saved on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
# with open(LOG_FILENAME, "a") as f:
#     f.write(log_message + "\n")
# print(f"Updated '{LOG_FILENAME}'")

# # Copy the latest version of notebook from Drive into the cloned repo.
# !cp "{NOTEBOOK_DRIVE_PATH}" "{NOTEBOOK_REPO_FILENAME}"
# print(f"Copied '{NOTEBOOK_REPO_FILENAME}' from Google Drive.")

# # Git identity for committing
# !git config user.name "{GITHUB_USERNAME}"
# !git config user.email "{GITHUB_USERNAME}@users.noreply.github.com"

# **Data Ingestion and Preprocessing for RAG**

In [19]:
# Data Ingestion and Preprocessing for RAG
import os
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Folder holding the source text files (the notebook's current working directory)
DATA_DIR = os.getcwd()

# Load the files matching *.txt in DATA_DIR, reading each one with TextLoader
print(f"Loading documents from: {DATA_DIR}")
loader = DirectoryLoader(
    DATA_DIR,
    glob="*.txt",
    loader_cls=TextLoader,
    show_progress=True,
)
documents = loader.load()

# Report how many documents were read and the file name each one came from
print(f"\nLoaded {len(documents)} documents.")
for doc_number, doc in enumerate(documents, start=1):
    source_name = os.path.basename(doc.metadata.get('source', 'N/A'))
    print(f"  - Document {doc_number} Source: {source_name}")

Loading documents from: /Users/mohammadalmusaiteer/PyNucleus-Model


100%|██████████| 2/2 [00:00<00:00, 491.31it/s]


Loaded 2 documents.
  - Document 1 Source: mcp_basics.txt
  - Document 2 Source: feasibility_factors.txt





In [25]:
# 2. Text Chunking

# Chunking parameters; lengths are measured with len(), i.e. in characters.
CHUNK_SIZE = 500
CHUNK_OVERLAP = 50

# Build the splitter. The separator list runs from the coarsest boundary
# (blank line) down to the finest (empty string).
# NOTE(review): the sample chunk preview in this notebook shows a chunk that
# begins with ". " — confirm that keeping the separator at the start of the
# next chunk is acceptable for retrieval.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ". ", " ", ""],
    length_function=len,
    chunk_overlap=CHUNK_OVERLAP,
    chunk_size=CHUNK_SIZE,
)

# Run the splitter over every loaded document
chunked_documents = text_splitter.split_documents(documents)

print(f"\nSplit {len(documents)} documents into {len(chunked_documents)} chunks.")



Split 2 documents into 4 chunks.


In [29]:
# --- 4. Inspect the Results ---
import numpy as np

# Tunable limits for the automated quality feedback below.
STD_DEV_WARNING_THRESHOLD = 150  # std dev (characters) above which chunk sizes are flagged as inconsistent
SMALL_CHUNK_FRACTION = 0.20      # chunks shorter than this fraction of CHUNK_SIZE count as "small"
SAMPLE_PREVIEW_COUNT = 3         # how many chunks to print for manual inspection

# --- Statistical Analysis of Chunks ---
print("\n--- Statistical Analysis & Quality Check ---")

# Length (in characters) of every chunk
chunk_lengths = [len(chunk.page_content) for chunk in chunked_documents]
total_chunks = len(chunk_lengths)

if total_chunks == 0:
    # np.min / np.max raise ValueError on an empty sequence, so report the
    # situation explicitly instead of failing with an opaque reduction error.
    print("Total Chunks: 0")
    print("\n[WARNING] No chunks were produced. Check that documents were loaded and are not empty.")
else:
    # Key statistics
    min_size = np.min(chunk_lengths)
    max_size = np.max(chunk_lengths)
    avg_size = np.mean(chunk_lengths)
    std_dev = np.std(chunk_lengths)

    print(f"Total Chunks: {total_chunks}")
    print(f"Minimum Chunk Size: {min_size} characters")
    print(f"Maximum Chunk Size: {max_size} characters")
    print(f"Average Chunk Size: {avg_size:.2f} characters")
    print(f"Standard Deviation of Chunk Size: {std_dev:.2f}")

    # --- Automated Quality Feedback ---

    # 1. Check for high variation in chunk size.
    # A high standard deviation suggests inconsistent chunking.
    high_variation = std_dev > STD_DEV_WARNING_THRESHOLD
    if high_variation:
        print(f"\n[WARNING] High chunk size variation detected (Std Dev: {std_dev:.2f}).")
        print("  > This suggests documents may have irregular structures (e.g., many short lines or lists).")
        print("  > Resulting chunks may have inconsistent levels of context.")

    # 2. Count potentially "orphaned" or very small chunks.
    small_chunk_threshold = CHUNK_SIZE * SMALL_CHUNK_FRACTION
    small_chunk_count = sum(1 for length in chunk_lengths if length < small_chunk_threshold)

    if small_chunk_count > 0:
        # Counting against a threshold is more specific than looking only at the absolute minimum.
        print(f"\n[ADVISORY] Found {small_chunk_count} chunks smaller than {small_chunk_threshold} characters.")
        print(f"  > The smallest chunk is {min_size} characters.")
        print("  > These small chunks might lack sufficient context and could clutter search results.")
        print("  > Consider cleaning the source documents or adjusting the chunking separators.")

    # Success message only when neither check above flagged anything
    if not high_variation and small_chunk_count == 0:
        print("\n[INFO] Chunking statistics appear healthy. Sizes are consistent.")

    # --- Manual Inspection of Sample Chunks ---
    print("\n--- Sample Chunk Preview ---")
    # Print the first few chunks to get a feel for their content and structure
    for i, chunk in enumerate(chunked_documents[:SAMPLE_PREVIEW_COUNT]):
        chunk_source = os.path.basename(chunk.metadata.get('source', 'N/A'))
        print(f"\n--- Chunk {i+1} (Source: {chunk_source}, Length: {len(chunk.page_content)} chars) ---")
        print(chunk.page_content)

    # Only announce readiness when there is actually something to embed
    print("\n\nData ingestion and preprocessing is complete. The 'chunked_documents' are ready for the next stage (embedding).")


--- Statistical Analysis & Quality Check ---
Total Chunks: 4
Minimum Chunk Size: 126 characters
Maximum Chunk Size: 472 characters
Average Chunk Size: 322.00 characters
Standard Deviation of Chunk Size: 143.16

[INFO] Chunking statistics appear healthy. Sizes are consistent.

--- Sample Chunk Preview ---

--- Chunk 1 (Source: mcp_basics.txt, Length: 472 chars) ---
Modular Chemical Plants (MCPs) represent a paradigm shift in chemical process engineering. They involve constructing plants from standardized, pre-fabricated modules built off-site. This approach significantly reduces on-site construction time and costs compared to traditional stick-built plants. Key advantages include faster deployment, scalability, and potentially lower capital expenditure. However, module transportation and site integration require careful planning

--- Chunk 2 (Source: mcp_basics.txt, Length: 126 chars) ---
. MCPs are particularly suited for remote locations or projects with uncertain market demands, all

# **Data Scraping**

# This is the last cell of the code

In [16]:
# Simple GitHub update function
def update_github(commit_message="Update: Adding all files to repository"):
    """Stage every change in the working tree, commit it, and push to origin/main.

    The combined output of each git command is printed. The success message is
    printed only when every step exits with status 0; on the first failing
    step an error line is printed and the remaining steps are skipped.

    Args:
        commit_message: Message used for the commit. The default is the
            message this function previously hard-coded.

    Returns:
        None.
    """
    # Local import so the function works regardless of which cells have run.
    import subprocess

    def _git(*args):
        """Run `git <args>`, print its combined stdout/stderr, return the exit code."""
        completed = subprocess.run(
            ["git", *args],
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,  # merge streams so messages keep their order
            text=True,
        )
        output = (completed.stdout or "").strip()
        if output:
            print(output)
        return completed.returncode

    # NOTE(review): `git add .` stages everything that is not ignored — confirm
    # the .env file holding GITHUB_TOKEN is listed in .gitignore.
    if _git("add", ".") != 0:
        print("[ERROR] 'git add' failed; nothing was committed or pushed.")
        return

    # `git diff --cached --quiet` exits 0 when nothing is staged and 1 when
    # there are staged changes; `git commit` itself fails on an empty index.
    if _git("diff", "--cached", "--quiet") == 0:
        print("No changes to commit.")
    elif _git("commit", "-m", commit_message) != 0:
        print("[ERROR] 'git commit' failed; nothing was pushed.")
        return

    if _git("push", "origin", "main") != 0:
        print("[ERROR] 'git push' failed; local commits were not pushed to GitHub.")
        return

    print("All files pushed to GitHub successfully!")

# Run the update now: stages, commits, and pushes the current working tree.
update_github()

[main 2702bc3] Update: Adding all files to repository
 3 files changed, 28 insertions(+), 40 deletions(-)
 create mode 100644 feasibility_factors.txt
 create mode 100644 mcp_basics.txt
Enumerating objects: 7, done.
Counting objects: 100% (7/7), done.
Delta compression using up to 8 threads
Compressing objects: 100% (5/5), done.
Writing objects: 100% (5/5), 1.74 KiB | 1.74 MiB/s, done.
Total 5 (delta 1), reused 0 (delta 0), pack-reused 0 (from 0)
remote: Resolving deltas: 100% (1/1), completed with 1 local object.[K
To https://github.com/Saytor20/PyNucleus-Model.git
   d8c23ed..2702bc3  main -> main
All files pushed to GitHub successfully!
