In [8]:
# Setup: mount Google Drive and define the paths and GitHub settings used by the cells below.
from google.colab import drive,userdata
import os
from datetime import datetime

# Mount Google Drive so the notebook file and the repository clone are reachable.
drive.mount("/content/drive")

# File names used when the notebook is published to the repository.
log_filename = "update_log.txt"
notebook_filename_in_repo = "Capstone Project.ipynb"
notebook_drive_path = os.path.join(
    "/content/drive/MyDrive/PyNucleus Project", notebook_filename_in_repo
)

# GitHub details (username and token are read through google.colab's userdata,
# so no credential is written into the notebook itself)
GITHUB_USERNAME = userdata.get("GITHUB_USERNAME")
GITHUB_TOKEN = userdata.get("GITHUB_TOKEN")
REPOSITORY_NAME = "PyNucleus-Model"
REPO_PATH = os.path.join("/content/drive/MyDrive", REPOSITORY_NAME)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [9]:
# --- CLONE REPOSITORY OR SYNC IF IT EXISTS ---
if not os.path.exists(REPO_PATH):
    print(f"Cloning {REPOSITORY_NAME} repository...")
    repository_url = f"https://{GITHUB_TOKEN}@github.com/{GITHUB_USERNAME}/{REPOSITORY_NAME}.git"
    !git clone {repository_url} {REPO_PATH}
    %cd {REPO_PATH}
else:
    print("Repository already exists. Syncing with GitHub...")
    %cd "{REPO_PATH}"
    !git pull

# --- 3. NAVIGATE INTO REPO & CONFIGURE ---
%cd {REPO_PATH}

# --- CONFIGURE GIT USER HERE ---
print("Configuring Git user...")
!git config user.name "{GITHUB_USERNAME}"
!git config user.email "{GITHUB_USERNAME}@users.noreply.github.com"

print(f"\n Setup complete. Current directory: {os.getcwd()}")


Cloning PyNucleus-Model repository...
Cloning into '/content/drive/MyDrive/PyNucleus-Model'...
remote: Enumerating objects: 36, done.[K
remote: Counting objects: 100% (36/36), done.[K
remote: Compressing objects: 100% (27/27), done.[K
remote: Total 36 (delta 17), reused 24 (delta 8), pack-reused 0 (from 0)[K
Receiving objects: 100% (36/36), 15.80 KiB | 735.00 KiB/s, done.
Resolving deltas: 100% (17/17), done.
/content/drive/MyDrive/PyNucleus-Model
Configuring Git user...

✅ Setup complete. Current directory: /content/drive/MyDrive/PyNucleus-Model


In [5]:
# --- Installation Cell (Run this ONCE per session) ---
print("Installing all required packages...")

# Core LangChain and Community Packages
!pip install -q langchain langchain-core langchain-community langchain-text-splitters

# Document Loading & Processing (Unstructured handles many file types including OCR)
!pip install -q "unstructured[local-inference]"

# LLM & ML Libraries
!pip install -q transformers accelerate bitsandbytes torch sentence-transformers

# Vector Stores
!pip install -q chromadb faiss-cpu # faiss-gpu if you have a Pro Colab with a good GPU

# Data Handling & Utilities
!pip install -q pandas numpy tqdm PyYAML

print("All packages installed successfully.")

Installing all required packages...
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m27.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.9/50.9 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.6/17.6 MB[0m [31m79.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.4/16.4 MB[0m [31m92.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.3/244.3

In [6]:
# --- Core Utilities & File Handling ---
import getpass
import yaml # Make sure PyYAML is installed
from pathlib import Path

# --- Data Handling & Progress Bars ---
import pandas as pd
import numpy as np
from tqdm import tqdm

# --- Document Loading ---
# UnstructuredFileLoader is now in langchain_community
from langchain_community.document_loaders import UnstructuredFileLoader
# For future testing of more specific loaders:
# from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader

# --- OCR Libraries ---
# No direct imports needed here if using UnstructuredFileLoader with local-inference,
# as it handles OCR internally.
# You would only import these for a manual OCR process:
# import pytesseract
# from pdf2image import convert_from_path
# from PIL import Image

# --- Core ML & LLM Libraries ---
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import bitsandbytes
import accelerate # Often used with transformers

# --- Vector Stores & Embeddings ---
# Using ChromaDB as the primary vector store.
import chromadb
from langchain_community.vectorstores import Chroma # Chroma is now in langchain_community
from langchain_community.embeddings import HuggingFaceEmbeddings # Embeddings are also in langchain_community
# For future testing of a high-performance alternative:
# import faiss
# from langchain_community.vectorstores import FAISS

# --- RAG/Agent Frameworks ---
# Core LangChain components
import langchain
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
# For future testing of an alternative RAG-focused toolkit:
# from llama_index.core import SimpleDirectoryReader, VectorStoreIndex



# **Data Ingestion and Preprocessing for RAG**

# Final cell: log the save time, copy the notebook into the repository, and push both files to GitHub

In [7]:

# --- PUBLISH THE NOTEBOOK AND LOG FILE TO GITHUB ---
# Purpose: append a timestamp to the log file, copy the notebook from Google
# Drive into the repository clone, then stage, commit and push both files.
#
# Every path is resolved against REPO_PATH and every git call uses
# `-C REPO_PATH`, so the cell does not depend on the kernel's current working
# directory. Each step raises on failure, so the final success message is only
# printed when the push actually went through.
import shutil
import subprocess


def _run_git(*git_args):
    """Run `git <git_args>` inside REPO_PATH and echo its output.

    Raises:
        subprocess.CalledProcessError: if git exits with a non-zero status.
    """
    completed = subprocess.run(
        ["git", "-C", REPO_PATH, *git_args],
        capture_output=True,
        text=True,
    )
    # Child-process output is captured and printed so it appears in the cell output.
    print(completed.stdout + completed.stderr, end="")
    completed.check_returncode()


# One timestamp shared by the log line and the commit message so they agree.
saved_at = datetime.now()
log_path = os.path.join(REPO_PATH, log_filename)
notebook_repo_path = os.path.join(REPO_PATH, notebook_filename_in_repo)

# --- CREATE/UPDATE THE LOG FILE ---
# This will add a new line to your log file with the current date and time.
log_message = f"Notebook saved on: {saved_at.strftime('%Y-%m-%d %H:%M:%S')}"
with open(log_path, "a") as f:
    f.write(log_message + "\n")
print(f"Updated '{log_filename}'")

# --- COPY YOUR NOTEBOOK FROM DRIVE INTO THE REPO ---
print(f"Copying '{notebook_filename_in_repo}' from Google Drive...")
# shutil.copyfile raises if the source notebook is missing, unlike `!cp`,
# whose failure would go unnoticed by the rest of the cell.
shutil.copyfile(notebook_drive_path, notebook_repo_path)

# --- ADD, COMMIT, AND PUSH BOTH FILES ---
print("Staging files for commit...")
# Add BOTH the notebook and the log file to Git ("--" ends option parsing so
# the file names are never interpreted as flags).
_run_git("add", "--", notebook_filename_in_repo, log_filename)

# Create a commit message. The log file gains a line on every run, so there is
# always at least one staged change to commit.
commit_message = f"Update project notebook and log file - {saved_at.strftime('%Y-%m-%d')}"
print(f"Committing with message: '{commit_message}'")
_run_git("commit", "-m", commit_message)

print("\nPushing changes to GitHub...")
_run_git("push", "origin", "main")

print("\n SUCCESS! Your notebook and log file have been updated on GitHub.")

Updated 'update_log.txt'
Copying 'Capstone Project.ipynb' from Google Drive...
Staging files for commit...
Committing with message: 'Update project notebook and log file - 2025-06-03'
[main ed2079f] Update project notebook and log file - 2025-06-03
 2 files changed, 2 insertions(+), 1 deletion(-)
 rewrite Capstone Project.ipynb (97%)

Pushing changes to GitHub...
Enumerating objects: 7, done.
Counting objects: 100% (7/7), done.
Delta compression using up to 2 threads
Compressing objects: 100% (4/4), done.
Writing objects: 100% (4/4), 4.34 KiB | 4.34 MiB/s, done.
Total 4 (delta 2), reused 0 (delta 0), pack-reused 0
remote: Resolving deltas: 100% (2/2), completed with 2 local objects.[K
To https://github.com/Saytor20/PyNucleus-Model.git
   4289821..ed2079f  main -> main

🎉 SUCCESS! Your notebook and log file have been updated on GitHub.
