In [9]:
# Commented out IPython magic to ensure Python compatibility.
from google.colab import drive,userdata
import os

drive.mount('/content/drive', force_remount=True)
notebook_drive_path = "/content/drive/MyDrive/PyNucleus Project/Capstone Project.ipynb"
notebook_filename_in_repo = "Capstone Project.ipynb"
log_filename = "update_log.txt"

# GitHub details
GITHUB_USERNAME = userdata.get('GITHUB_USERNAME')
GITHUB_TOKEN = userdata.get('GITHUB_TOKEN')
REPOSITORY_NAME = 'PyNucleus-Model'
REPO_PATH = f"/content/{REPOSITORY_NAME}" # The path where the repo will be cloned

# --- CLONE REPOSITORY OR SYNC IF IT EXISTS ---
if not os.path.exists(REPO_PATH):
    print(f"Cloning {REPOSITORY_NAME} repository...")
    repository_url = f"https://{GITHUB_TOKEN}@github.com/{GITHUB_USERNAME}/{REPOSITORY_NAME}.git"
    !git clone {repository_url} {REPO_PATH}
    %cd {REPO_PATH}
else:
    print("Repository already exists. Syncing with GitHub...")
    %cd {REPO_PATH}
    # --- THIS IS THE CRUCIAL NEW STEP ---
    # Fetch the latest changes from GitHub and merge them into your local copy
    !git pull

# --- STEP 3: NAVIGATE INTO REPOSITORY & COPY YOUR NOTEBOOK ---

# The '%cd' command changes the current directory in Colab
%cd {REPO_PATH}

# --- CONFIGURE GIT USER HERE ---
print("Configuring Git user...")
!git config user.name "{GITHUB_USERNAME}"
!git config user.email "{GITHUB_USERNAME}@users.noreply.github.com"

# print("Setup complete. You are ready to work.")

Mounted at /content/drive
Repository already exists. Syncing with GitHub...
/content/PyNucleus-Model
Already up to date.
/content/PyNucleus-Model
Configuring Git user...


In [None]:
# --- Installation Cell (Run this ONCE per session) ---
print("Installing all required packages...")

# Core LangChain and Community Packages
!pip install -q langchain langchain-core langchain-community langchain-text-splitters

# Document Loading & Processing (Unstructured handles many file types including OCR)
!pip install -q "unstructured[local-inference]"

# LLM & ML Libraries
!pip install -q transformers accelerate bitsandbytes torch sentence-transformers

# Vector Stores
!pip install -q chromadb faiss-cpu # faiss-gpu if you have a Pro Colab with a good GPU

# Data Handling & Utilities
!pip install -q pandas numpy tqdm PyYAML

print("All packages installed successfully.")

Installing all required packages...


In [None]:
# --- Core Utilities & File Handling ---
import getpass
import yaml # Installed by the PyYAML package; the importable module is named `yaml`, not `PyYAML`
from pathlib import Path

# --- Data Handling & Progress Bars ---
import pandas as pd
import numpy as np
from tqdm import tqdm

# --- Document Loading ---
# UnstructuredFileLoader is now in langchain_community
from langchain_community.document_loaders import UnstructuredFileLoader
# For future testing of more specific loaders:
# from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader

# --- OCR Libraries ---
# No direct imports needed here if using UnstructuredFileLoader with local-inference,
# as it handles OCR internally.
# You would only import these for a manual OCR process:
# import pytesseract
# from pdf2image import convert_from_path
# from PIL import Image

# --- Core ML & LLM Libraries ---
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import bitsandbytes
import accelerate # Often used with transformers

# --- Vector Stores & Embeddings ---
# Using ChromaDB as the primary vector store.
import chromadb
from langchain_community.vectorstores import Chroma # Chroma is now in langchain_community
from langchain_community.embeddings import HuggingFaceEmbeddings # Embeddings are also in langchain_community
# For future testing of a high-performance alternative:
# import faiss
# from langchain_community.vectorstores import FAISS

# --- RAG/Agent Frameworks ---
# Core LangChain components
import langchain
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
# For future testing of an alternative RAG-focused toolkit:
# from llama_index.core import SimpleDirectoryReader, VectorStoreIndex

# **Data Ingestion and Preprocessing for RAG**

# This is the last cell of the code

In [None]:
import shutil
import subprocess
from datetime import datetime
from pathlib import Path

# Publish step: append a timestamp to the log file, copy the notebook from
# Google Drive into the cloned repository, then commit and push both files.
# Note: We use the variables defined in the first cell (REPO_PATH,
# notebook_drive_path, notebook_filename_in_repo, log_filename).


def _run_git(*git_args):
    """Run ``git <git_args>`` inside REPO_PATH and echo its output.

    Raises:
        subprocess.CalledProcessError: if git exits with a non-zero status, so
            the cell stops instead of continuing after a failed step.
    """
    completed = subprocess.run(
        ["git", *git_args],  # argument list: no shell quoting of paths/messages needed
        cwd=REPO_PATH,       # do not depend on the kernel's current directory
        capture_output=True,
        text=True,
    )
    print(completed.stdout, end="")
    print(completed.stderr, end="")
    completed.check_returncode()


repo_dir = Path(REPO_PATH)
# One timestamp feeds both the log line and the commit message.
saved_at = datetime.now()

# --- CREATE/UPDATE THE LOG FILE ---
# This will add a new line to your log file with the current date and time.
log_message = f"Notebook saved on: {saved_at.strftime('%Y-%m-%d %H:%M:%S')}"
with open(repo_dir / log_filename, "a") as f:
    f.write(log_message + "\n")
print(f"Updated '{log_filename}'")

# --- COPY YOUR NOTEBOOK FROM DRIVE INTO THE REPO ---
print(f"Copying '{notebook_filename_in_repo}' from Google Drive...")
# shutil.copy raises (e.g. FileNotFoundError) if the Drive file is missing,
# instead of letting a stale notebook be committed.
shutil.copy(notebook_drive_path, repo_dir / notebook_filename_in_repo)

# --- ADD, COMMIT, AND PUSH BOTH FILES ---
print("Staging files for commit...")
# Add BOTH the notebook and the log file to Git
_run_git("add", "--", notebook_filename_in_repo, log_filename)

# Create a commit message
commit_message = f"Update project notebook and log file - {saved_at.strftime('%Y-%m-%d')}"
print(f"Committing with message: '{commit_message}'")
_run_git("commit", "-m", commit_message)

# Push the changes to your 'main' branch on GitHub
print("Pushing changes to GitHub...")
_run_git("push", "origin", "main")

# Only reached when every step above succeeded.
print("SUCCESS! Your notebook and log file have been updated on GitHub.")