In [None]:
import os
import shutil
import subprocess
from datetime import datetime

from google.colab import drive, userdata

# --- Repository settings ---
REPOSITORY_NAME = 'PyNucleus-Model'  # GitHub repository that receives the notebook

# File names as they appear inside the GitHub repository.
NOTEBOOK_REPO_FILENAME = "Capstone Project.ipynb"
LOG_FILENAME = "update_log.txt"

# Location of the master notebook on Google Drive (under the mount point used below).
NOTEBOOK_DRIVE_PATH = "/content/drive/MyDrive/PyNucleus Project/Capstone Project.ipynb"

# Mount Google Drive so its files are reachable from this Colab runtime.
drive.mount('/content/drive')

# GitHub credentials are read from the Colab secrets
# named 'GITHUB_USERNAME' and 'GITHUB_TOKEN'.
GITHUB_USERNAME = userdata.get('GITHUB_USERNAME')
GITHUB_TOKEN = userdata.get('GITHUB_TOKEN')


In [None]:

def _run_git(*args):
    """Run one git command, echo its output, and raise if it fails.

    Args:
        *args: Arguments passed to ``git`` (e.g. ``"config", "user.name", name``).

    Raises:
        RuntimeError: If git exits with a non-zero status. The message names only
            the subcommand, never the full command line, because the clone URL
            carries GITHUB_TOKEN.
    """
    result = subprocess.run(["git", *args], capture_output=True, text=True)
    output = (result.stdout + result.stderr).strip()
    if GITHUB_TOKEN:
        # Notebook outputs get saved (and pushed), so never echo the token.
        output = output.replace(GITHUB_TOKEN, "***")
    if output:
        print(output)
    if result.returncode != 0:
        raise RuntimeError(f"git {args[0]} failed with exit status {result.returncode}")


# Clone the repository, unless an earlier run of this cell already did so.
# (A second `git clone` into the same directory would fail.)
repo_path = f'/content/{REPOSITORY_NAME}'
if os.path.isdir(os.path.join(repo_path, ".git")):
    print(f"Reusing existing clone at '{repo_path}'")
else:
    # The token is embedded in the remote URL, so the later `git push` in this
    # notebook can authenticate against the same remote.
    _run_git(
        "clone",
        f"https://{GITHUB_TOKEN}@github.com/{GITHUB_USERNAME}/{REPOSITORY_NAME}.git",
        repo_path,
    )

# Change the current working directory to the repository.
# All subsequent commands run from inside the repo folder.
os.chdir(repo_path)

# Copy the latest version of the notebook from Drive into the cloned repo.
# shutil.copyfile raises if the source is missing, so the messages below are
# only printed after a real copy.
shutil.copyfile(NOTEBOOK_DRIVE_PATH, NOTEBOOK_REPO_FILENAME)
print(f"Copied '{NOTEBOOK_REPO_FILENAME}' from Google Drive.")

# Append a timestamped line to the log file. This happens after the copy so the
# log never records a save that did not take place.
log_message = f"Notebook saved on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
with open(LOG_FILENAME, "a", encoding="utf-8") as f:
    f.write(log_message + "\n")
print(f"Updated '{LOG_FILENAME}'")

# Git identity used for committing (stored in this clone's local config).
_run_git("config", "user.name", GITHUB_USERNAME)
_run_git("config", "user.email", f"{GITHUB_USERNAME}@users.noreply.github.com")

In [None]:
# --- Installation Cell (Run this ONCE per session) ---
print("Installing all required packages...")

!pip install -q langchain langchain-core langchain-community langchain-text-splitters

!pip install -q "unstructured[local-inference]"

# LLM & ML Libraries
!pip install -q transformers accelerate bitsandbytes torch sentence-transformers

# Vector Stores
!pip install -q chromadb faiss-cpu # faiss-gpu if you have a Pro Colab with a good GPU

# Data Handling & Utilities
!pip install -q pandas numpy tqdm PyYAML

print("All packages installed successfully.")

In [None]:
# --- Core Utilities & File Handling ---
import getpass
import yaml # Make sure PyYAML is installed
from pathlib import Path

# --- Data Handling & Progress Bars ---
import pandas as pd
import numpy as np
from tqdm import tqdm

# --- Document Loading ---
# UnstructuredFileLoader is now in langchain_community
from langchain_community.document_loaders import UnstructuredFileLoader
# For future testing of more specific loaders:
# from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader

# --- OCR Libraries ---
# No direct imports needed here if using UnstructuredFileLoader with local-inference,
# as it handles OCR internally.
# You would only import these for a manual OCR process:
# import pytesseract
# from pdf2image import convert_from_path
# from PIL import Image

# --- Core ML & LLM Libraries ---
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import bitsandbytes
import accelerate # Often used with transformers

# --- Vector Stores & Embeddings ---
# Using ChromaDB as the primary vector store.
import chromadb
from langchain_community.vectorstores import Chroma # Chroma is now in langchain_community
from langchain_community.embeddings import HuggingFaceEmbeddings # Embeddings are also in langchain_community
# For future testing of a high-performance alternative:
# import faiss
# from langchain_community.vectorstores import FAISS

# --- RAG/Agent Frameworks ---
# Core LangChain components
import langchain
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
# For future testing of an alternative RAG-focused toolkit:
# from llama_index.core import SimpleDirectoryReader, VectorStoreIndex

# **Data Ingestion and Preprocessing for RAG**

# **Data Scraping**

# This is the last cell of the code

In [None]:
def _git_step(*args):
    """Run one git command in the current directory and stop the cell on failure.

    Args:
        *args: Arguments passed to ``git`` (e.g. ``"push", "origin", "main"``).

    Raises:
        RuntimeError: If git exits with a non-zero status.
    """
    result = subprocess.run(["git", *args], capture_output=True, text=True)
    output = (result.stdout + result.stderr).strip()
    if GITHUB_TOKEN:
        # Notebook outputs get saved, so make sure the token is never echoed.
        output = output.replace(GITHUB_TOKEN, "***")
    if output:
        print(output)
    if result.returncode != 0:
        raise RuntimeError(f"git {args[0]} failed with exit status {result.returncode}")


# Add the notebook and the log file to the staging area.
_git_step("add", NOTEBOOK_REPO_FILENAME)
_git_step("add", LOG_FILENAME)

# Commit the changes with a dated message. `git diff --cached --quiet` exits
# with 0 when nothing is staged; in that case the commit is skipped so that
# re-running this cell does not fail with "nothing to commit".
commit_message = f"Update notebook and log file on {datetime.now().strftime('%Y-%m-%d')}"
nothing_staged = subprocess.run(["git", "diff", "--cached", "--quiet"]).returncode == 0
if nothing_staged:
    print("No staged changes; skipping commit.")
else:
    _git_step("commit", "-m", commit_message)

# --- Push to GitHub ---
# Push the committed changes to the 'main' branch of the repository.
print("\nPushing changes to GitHub...")
_git_step("push", "origin", "main")

# Only reached when every git step above succeeded.
print(f"\n Successfully saved '{NOTEBOOK_REPO_FILENAME}' to your GitHub repository!")
