In [26]:
# Commented out IPython magic to ensure Python compatibility.
from google.colab import drive,userdata
import os
from datetime import datetime

# Make Google Drive available to this Colab runtime.
drive.mount('/content/drive')

# --- Notebook + log locations ---
# File name the notebook gets inside the Git repository, and where the
# live copy of that notebook sits in Google Drive.
notebook_filename_in_repo = "Capstone Project.ipynb"
notebook_drive_path = "/content/drive/MyDrive/PyNucleus Project/" + notebook_filename_in_repo
# Text file that receives one line per save.
log_filename = "update_log.txt"

# --- GitHub details ---
# Both values are read from Colab's secrets store (userdata), not hard-coded.
GITHUB_USERNAME, GITHUB_TOKEN = (
    userdata.get(secret_name)
    for secret_name in ('GITHUB_USERNAME', 'GITHUB_TOKEN')
)
REPOSITORY_NAME = 'PyNucleus-Model'
# Local working copy of the repository, kept in Google Drive.
REPO_PATH = "/content/drive/MyDrive/" + REPOSITORY_NAME


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [27]:
# --- CLONE REPOSITORY OR SYNC IF IT EXISTS ---
if not os.path.exists(REPO_PATH):
    print(f"Cloning {REPOSITORY_NAME} repository into your Google Drive...")
    # Clone directly into the full REPO_PATH
    !git clone {repository_url} {REPO_PATH}
else:
    print("Repository already exists in Drive. Syncing with GitHub...")
    # If it exists, we don't need to do anything here,
    # because the commands after this block will handle it.

# --- NAVIGATE INTO REPO, PULL, & CONFIGURE ---
# This ensures we are in the right place, regardless of if we just cloned or not.
%cd {REPO_PATH}
print("Pulling latest changes to ensure we are up-to-date...")
!git pull

print("\nConfiguring Git user for this session...")
!git config user.name "{GITHUB_USERNAME}"
!git config user.email "{GITHUB_USERNAME}@users.noreply.github.com"

print(f"\n Setup complete. You are ready to work. Current directory: {os.getcwd()}")

Repository already exists in Drive. Syncing with GitHub...
/content/drive/MyDrive/PyNucleus-Model
Pulling latest changes to ensure we are up-to-date...
Already up to date.

Configuring Git user for this session...

 Setup complete. You are ready to work. Current directory: /content/drive/MyDrive/PyNucleus-Model


In [28]:
# NOTE: this whole cell is intentionally disabled; every command below is
# commented out. To install the dependencies, remove the leading "# " from
# the print and pip lines, run the cell once, then comment them out again.
# NOTE(review): consider `%pip install` with pinned versions instead of
# `!pip install` so the install targets the kernel's environment.
# NOTE(review): the final print runs unconditionally, so it reports success
# even when an install was interrupted or failed.

# # --- Installation Cell (Run this ONCE per session) ---
# print("Installing all required packages...")

# # Core LangChain and Community Packages
# !pip install -q langchain langchain-core langchain-community langchain-text-splitters

# # Document Loading & Processing (Unstructured handles many file types including OCR)
# !pip install -q "unstructured[local-inference]"

# # LLM & ML Libraries
# !pip install -q transformers accelerate bitsandbytes torch sentence-transformers

# # Vector Stores
# !pip install -q chromadb faiss-cpu # faiss-gpu if you have a Pro Colab with a good GPU

# # Data Handling & Utilities
# !pip install -q pandas numpy tqdm PyYAML

# print("All packages installed successfully.")

Installing all required packages...
[31mERROR: Operation cancelled by user[0m[31m
[0mAll packages installed successfully.


In [29]:
# --- Core Utilities & File Handling ---
import getpass
import yaml # Make sure PyYAML is installed
from pathlib import Path

# --- Data Handling & Progress Bars ---
import pandas as pd
import numpy as np
from tqdm import tqdm

# --- Document Loading ---
# UnstructuredFileLoader is now in langchain_community
from langchain_community.document_loaders import UnstructuredFileLoader
# For future testing of more specific loaders:
# from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader

# --- OCR Libraries ---
# No direct imports needed here if using UnstructuredFileLoader with local-inference,
# as it handles OCR internally.
# You would only import these for a manual OCR process:
# import pytesseract
# from pdf2image import convert_from_path
# from PIL import Image

# --- Core ML & LLM Libraries ---
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import bitsandbytes
import accelerate # Often used with transformers

# --- Vector Stores & Embeddings ---
# Using ChromaDB as the primary vector store.
import chromadb
from langchain_community.vectorstores import Chroma # Chroma is now in langchain_community
from langchain_community.embeddings import HuggingFaceEmbeddings # Embeddings are also in langchain_community
# For future testing of a high-performance alternative:
# import faiss
# from langchain_community.vectorstores import FAISS

# --- RAG/Agent Frameworks ---
# Core LangChain components
import langchain
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
# For future testing of an alternative RAG-focused toolkit:
# from llama_index.core import SimpleDirectoryReader, VectorStoreIndex

# **Data Ingestion and Preprocessing for RAG**

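# The next cell is the last code cell of the notebook: it updates the log file, copies the notebook into the repository, and pushes both to GitHub.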

In [30]:

# --- CREATE/UPDATE THE LOG FILE ---
# This will add a new line to your log file with the current date and time.
log_message = f"Notebook saved on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
with open(log_filename, "a") as f:
    f.write(log_message + "\n")
print(f"Updated '{log_filename}'")

# --- COPY YOUR NOTEBOOK FROM DRIVE INTO THE REPO ---
print(f"Copying '{notebook_filename_in_repo}' from Google Drive...")
!cp "{notebook_drive_path}" "{notebook_filename_in_repo}"

# --- ADD, COMMIT, AND PUSH BOTH FILES ---
print("Staging files for commit...")
# Add BOTH the notebook and the log file to Git
!git add "{log_filename}"
!git add "{notebook_filename_in_repo}"

# Create a commit message
commit_message = f"Update project notebook and log file - {datetime.now().strftime('%Y-%m-%d')}"
print(f"Committing with message: '{commit_message}'")
!git commit -m "{commit_message}"

print("\nPushing changes to GitHub...")
!git push origin main

print("\n SUCCESS! Your notebook and log file have been updated on GitHub.")

Updated 'update_log.txt'
Copying 'Capstone Project.ipynb' from Google Drive...
Staging files for commit...
Committing with message: 'Update project notebook and log file - 2025-06-03'
[main e881e41] Update project notebook and log file - 2025-06-03
 1 file changed, 1 insertion(+)

Pushing changes to GitHub...
Enumerating objects: 5, done.
Counting objects: 100% (5/5), done.
Delta compression using up to 2 threads
Compressing objects: 100% (3/3), done.
Writing objects: 100% (3/3), 327 bytes | 54.00 KiB/s, done.
Total 3 (delta 2), reused 0 (delta 0), pack-reused 0
remote: Resolving deltas: 100% (2/2), completed with 2 local objects.[K
To https://github.com/Saytor20/PyNucleus-Model.git
   b8bdc5a..e881e41  main -> main

 SUCCESS! Your notebook and log file have been updated on GitHub.
