In [1]:
import os
import json
import re
from datetime import datetime
from pathlib import Path
from shutil import copyfile
from urllib.parse import quote_plus

import nltk
from pydrive2.auth import GoogleAuth
from pydrive2.drive import GoogleDrive
from kaggle_secrets import UserSecretsClient
!pip install --quiet langchain langchain-community unstructured[docx]

from sentence_transformers import SentenceTransformer
from transformers import pipeline
# from langchain_unstructured import UnstructuredLoader as UnstructuredFileLoader
from langchain_community.document_loaders import UnstructuredFileLoader

from langchain.text_splitter import RecursiveCharacterTextSplitter
from typing import List, Dict, Any, Optional
from dataclasses import dataclass, asdict
import logging

# --- Optional dependencies ---------------------------------------------------
# Each guard below tries an import and records the outcome in a module-level
# *_AVAILABLE flag. Code further down checks the flag before touching the
# guarded names, because those names are only bound when the import succeeds.

# python-docx: binds `Document` (used by DocumentChunker.extract_text_from_docx).
try:
    from docx import Document
    DOCX_AVAILABLE = True
except ImportError:
    DOCX_AVAILABLE = False
    print("python-docx not available. Install with: pip install python-docx")

# scikit-learn: binds `TfidfVectorizer` and `ENGLISH_STOP_WORDS`.
try:
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
    SKLEARN_AVAILABLE = True
except ImportError:
    SKLEARN_AVAILABLE = False
    print("scikit-learn not available. Install with: pip install scikit-learn")

# spaCy: the flag is True only when both the package imports AND the English
# model loads; `nlp` stays unbound in every other case.
try:
    import spacy
    # Try to load English model
    try:
        nlp = spacy.load("en_core_web_sm")
        SPACY_AVAILABLE = True
    except OSError:
        # Package is installed but the model data could not be loaded.
        SPACY_AVAILABLE = False
        print("spaCy English model not available. Install with: python -m spacy download en_core_web_sm")
except ImportError:
    SPACY_AVAILABLE = False
    print("spaCy not available. Install with: pip install spacy")

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m16.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m61.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.2/45.2 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.0/253.0 kB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m167.6/167.6 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m80.9 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m59.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.5/212.

2025-07-24 15:09:09.079828: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753369749.449540      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753369749.555701      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
# # === NLTK Setup ===
# required_nltk_packages = [
#     "punkt", "averaged_perceptron_tagger", "maxent_ne_chunker", "words", "stopwords"
# ]

# def nltk_package_path(package):
#     return {
#         "punkt": "tokenizers/punkt",
#         "averaged_perceptron_tagger": "taggers/averaged_perceptron_tagger",
#         "maxent_ne_chunker": "chunkers/maxent_ne_chunker",
#         "words": "corpora/words",
#         "stopwords": "corpora/stopwords"
#     }.get(package, package)

# for package in required_nltk_packages:
#     try:
#         nltk.data.find(f"{nltk_package_path(package)}")
#     except LookupError:
#         nltk.download(package)

# # === Models ===
# embedding_model = SentenceTransformer("BAAI/bge-small-en-v1.5")
# llm = pipeline("text2text-generation", model="google/flan-t5-large", device=0, max_new_tokens=512)

In [3]:
# === Config ===
# Google Drive folder whose (non-trashed) children are listed and downloaded
# by check_for_new_files(). The commented-out ID is an alternative folder.
# WATCH_FOLDER_ID = "17sSWn0cfX-jEFmgG4Hl-P9_NtG9kqS6D"
WATCH_FOLDER_ID="1tJhw1KbempeSjoHfkJ-Ig6Ikn-bKL20Q" #test folder
# Local directory that receives the downloaded .docx exports; created eagerly
# (including missing parents) so later writes cannot fail on a missing folder.
DOWNLOAD_PATH = Path("/tmp/gdrive_docs")
DOWNLOAD_PATH.mkdir(parents=True, exist_ok=True)

# === Auth ===
# The credentials file is copied from the input dataset path to the working
# directory because SaveCredentialsFile() below writes back to the same path.
# NOTE(review): presumably /kaggle/input is read-only, hence the copy - confirm.
CRED_INPUT = "/kaggle/input/google-cred/credentials.json"
CRED_WORKING = "/kaggle/working/credentials.json"
copyfile(CRED_INPUT, CRED_WORKING)

gauth = GoogleAuth()
gauth.LoadCredentialsFile(CRED_WORKING)
if gauth.credentials is None:
    # No stored credentials: start the interactive local-webserver flow.
    # NOTE(review): this needs a browser; it may not be usable in a headless
    # notebook session - confirm.
    gauth.LocalWebserverAuth()
elif gauth.access_token_expired:
    # Stored credentials exist but the access token expired: refresh it.
    gauth.Refresh()
else:
    # Stored credentials are still valid: authorize with them as-is.
    gauth.Authorize()
# Persist whatever credentials are now active, then build the Drive client
# used by check_for_new_files().
gauth.SaveCredentialsFile(CRED_WORKING)
drive = GoogleDrive(gauth)


In [4]:
# === HELPERS ===
def sanitize_filename(title):
    """Return ``title`` with every character from ``\\ / * ? : " < > |`` replaced by ``_``.

    All other characters (including spaces) are kept unchanged.
    """
    unsafe_chars = re.compile(r'[\\/*?:"<>|]')
    return unsafe_chars.sub("_", title)

def detect_format_style(text):
    """Classify ``text`` as ``"bolded_qs"`` or ``"narrative"``.

    Counts non-overlapping occurrences of a line that starts with an
    upper-case letter, ends in ``?`` and is enclosed by newlines. More than
    three such lines yields ``"bolded_qs"``; anything else is ``"narrative"``.
    """
    question_line = r"\n[A-Z][^\n]{4,100}\?\s*\n"
    hits = 0
    for _ in re.finditer(question_line, text):
        hits += 1
    if hits > 3:
        return "bolded_qs"
    return "narrative"

def llm_extract_qa_pairs(text):
    """Ask the module-level ``llm`` callable to pull Q/A pairs out of ``text``.

    The stripped prompt is sent to ``llm``; the ``generated_text`` of its first
    result is scanned for ``Q: ... A: ...`` blocks. Returns a list of
    ``{"question", "answer"}`` dicts with whitespace trimmed. Any exception
    (including ``llm`` not being defined) is printed and turned into ``[]``.
    """
    instructions = """Extract question and answer pairs from the grant text below. Use format:
Q: [Question]
A: [Answer]

Text:
"""
    prompt = instructions + f"{text}"
    try:
        generated = llm(prompt.strip())[0]["generated_text"]
        extracted = []
        # Each answer runs lazily up to the next "Q:" line or end of output.
        for q, a in re.findall(r"Q:\s*(.+?)\s*A:\s*(.+?)(?=\nQ:|\Z)", generated, re.DOTALL):
            extracted.append({"question": q.strip(), "answer": a.strip()})
        return extracted
    except Exception as e:
        print(f"⚠️ LLM Q&A extraction failed: {e}")
        return []

def extract_qa_pairs(text):
    """Split ``text`` into question/answer pairs using line-level heuristics.

    A "question" is the start of a line consisting of 4-100 characters that
    contain no colon, followed by ``?`` or ``:``. The answer is everything
    between the end of that question and the start of the next one (or the end
    of the text).

    The previous implementation split on a zero-width lookahead, which left the
    question text at the beginning of every answer. Locating the questions
    with ``finditer`` and slicing between them avoids that.

    Args:
        text: Raw document text.

    Returns:
        A list of ``{"question": str, "answer": str}`` dicts. Pairs whose
        answer is 10 characters or shorter, or whose question is 5 characters
        or shorter (after stripping), are dropped.
    """
    q_pattern = re.compile(r"^([^:\n]{4,100}[?:])", re.MULTILINE)
    matches = list(q_pattern.finditer(text))

    qa_pairs = []
    for idx, match in enumerate(matches):
        # The answer ends where the next question begins.
        answer_end = matches[idx + 1].start() if idx + 1 < len(matches) else len(text)
        question = match.group(1).strip()
        answer = text[match.end():answer_end].strip()
        if len(answer) > 10 and len(question) > 5:
            qa_pairs.append({"question": question, "answer": answer})
    return qa_pairs

def categorize_chunk(text: str, question: str = "") -> list:
    """Assign rule-based category labels to an answer chunk.

    Every rule is a case-insensitive substring or regex test on the answer
    ``text`` and/or its ``question``; a chunk may receive several labels.

    Fix: the history and URL regexes were written as ``\\\\b`` / ``\\\\S`` inside
    raw strings, which match a literal backslash, so "Organizational History"
    and "Supplementary Materials" could never be assigned. They now use real
    word-boundary and non-whitespace escapes.

    Args:
        text: The answer/body text of the chunk.
        question: The question the chunk answers, if any.

    Returns:
        The matching labels in rule order, or ``["Miscellaneous"]`` when no
        rule matched.
    """
    categories = []
    lower_text = text.lower()
    lower_q = question.lower()
    if "problem" in lower_q or "we address" in lower_text:
        categories.append("Contact Information Problem")
    if "mission" in lower_q or "mission" in lower_text:
        categories.append("Mission Statement")
    # Short chunks are treated as summaries regardless of their wording.
    if len(text) < 400:
        categories.append("Project Summary")
    if "goal" in lower_q or "vision" in lower_q:
        categories.append("Goals, Vision, or Objectives")
    if "approach" in lower_q or "methodology" in lower_text:
        categories.append("Our Solution or Approach")
    if "impact" in lower_q or "results" in lower_q:
        categories.append("Impact Results or Outcomes")
    if "who benefits" in lower_q or "target population" in lower_q:
        categories.append("Beneficiaries")
    if "unique" in lower_text:
        categories.append("Unique Value Proposition")
    if "timeline" in lower_q:
        categories.append("Plan and Timeline")
    if "budget" in lower_q or "$" in text:
        categories.append("Budget and Funding")
    if "sustainability" in lower_text:
        categories.append("Sustainability or Strategy")
    if "team" in lower_q or "lived experience" in lower_text:
        categories.append("Team Members and Descriptions")
    # Whole-word match so that e.g. "prehistory" does not count.
    if re.search(r"\bfounded\b|\bhistory\b|\baccelerator\b", lower_text):
        categories.append("Organizational History")
    if re.search(r"https?://\S+", lower_text):
        categories.append("Supplementary Materials")
    if not categories:
        categories.append("Miscellaneous")
    return categories

def annotate_chunk(chunk_text: str) -> dict:
    """Ask the module-level ``llm`` callable to label ``chunk_text``.

    The model is prompted to answer with a JSON object. The span from the
    first ``{`` to the last ``}`` of the generated text is parsed with
    ``json.loads`` and returned as-is. On any failure (no JSON found, invalid
    JSON, ``llm`` raising or being undefined) the error is printed and a
    fallback dict with category ``"Miscellaneous"`` is returned.
    """
    instructions = """You are a grant document assistant.
Analyze the following text and return the most likely category it fits into.
Possible categories include: Mission Statement, Problem, Team, Impact, Sustainability, etc.
Return only JSON in the format:
{"question": "...", "answer": "...", "category": "..."}

TEXT:
"""
    prompt = instructions + f"{chunk_text}"
    try:
        generated = llm(prompt.strip())[0]['generated_text']
        json_span = re.search(r"\{.*\}", generated, re.DOTALL)
        if json_span is None:
            raise ValueError("No JSON found in model output")
        return json.loads(json_span.group())
    except Exception as e:
        print(f"⚠️ LLM error: {e}")
        return {"question": None, "answer": None, "category": "Miscellaneous"}

In [9]:
# === Doc Chunker ===

@dataclass
class DocumentChunk:
    """One header-delimited section of a document.

    Instances are built by ``DocumentChunker.process_document`` and turned
    into plain dicts with ``dataclasses.asdict`` when written to JSON.
    """
    # Position of the chunk within its document; process_document passes i + 1,
    # so numbering starts at 1.
    chunk_id: int
    # Text of the header (or question line) that opened this section.
    header: str
    # Lines inside the section that were classified as questions.
    questions: List[str]
    # Remaining non-empty section lines, joined with newlines.
    content: str
    # Topic labels chosen for the section (process_document requests at most 3).
    topics: List[str]
    # Number of whitespace-separated words in `content`.
    word_count: int
    # Quality heuristic from calculate_confidence_score, capped at 1.0.
    confidence_score: float = 0.0

class DocumentChunker:
    def __init__(self):
        self.setup_logging()
        
        # Patterns for different document types
        self.patterns = {
            'grant_application': {
                'header_patterns': [
                    r'\*\*([^*]+)\*\*',  # **Header**
                    r'^([A-Z][^a-z]*[A-Z])$',  # ALL CAPS
                    r'^([A-Z][A-Za-z\s]+)$',  # Title Case
                ],
                'question_patterns': [
                    r'^.+\?$',  # Ends with question mark
                    r'^\*?Please .+',  # Starts with "Please"
                    r'^How .+',  # Starts with "How"
                    r'^What .+',  # Starts with "What"
                    r'^Describe .+',  # Starts with "Describe"
                ],
                'section_markers': [
                'project summary', 'contact information', 'mission statement', 'mission','fit', 'alignment', 'grant', 'goals', 'vision', 'objectives', 'objective','solution', 'approach', 'impact', 'results', 'outcomes', 'outcome','beneficiaries', 'beneficiary', 'unique value proposition', 'value proposition','plan', 'timeline', 'budget', 'funding', 'sustainability', 'strategy','team members', 'team', 'descriptions', 'supplementary materials','organizational history', 'history', 'background', 'program', 'description','partnership', 'miscellaneous']
            }
        }
        
        # Topic keywords for classification
        self.topic_keywords = {
            'education': ['education', 'learning', 'student', 'curriculum', 'teaching', 'school'],
            'entrepreneurship': ['entrepreneur', 'startup', 'business', 'venture', 'innovation'],
            'solar': ['solar', 'renewable', 'energy', 'installation', 'panel', 'clean energy'],
            'workforce': ['job', 'employment', 'training', 'skills', 'workforce', 'career'],
            'community': ['community', 'neighborhood', 'local', 'resident', 'housing'],
            'BIPOC': ['BIPOC', 'youth', 'color', 'underestimated', 'marginalized'],
            'technology': ['technology', 'platform', 'digital', 'AI', 'tech'],
            'partnership': ['partner', 'collaboration', 'alliance', 'cooperation'],
            'funding': ['funding', 'grant', 'budget', 'revenue', 'cost', 'financial'],
            'metrics': ['metric', 'outcome', 'result', 'measure', 'data', 'statistics']
        }

    def setup_logging(self):
        logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
        self.logger = logging.getLogger(__name__)

    def extract_text_from_docx(self, file_path: str) -> str:
        """Extract text from DOCX file preserving some formatting"""
        if not DOCX_AVAILABLE:
            raise ImportError("python-docx required for DOCX files")
            
        doc = Document(file_path)
        text_parts = []
        
        for paragraph in doc.paragraphs:
            # Check if paragraph is bold (header-like)
            if paragraph.runs and any(run.bold for run in paragraph.runs):
                text_parts.append(f"**{paragraph.text}**")
            else:
                text_parts.append(paragraph.text)
                
        return '\n'.join(text_parts)

    def detect_document_type(self, text: str) -> str:
        """Detect document type based on content patterns"""
        text_lower = text.lower()
        
        # Check for grant application indicators
        grant_indicators = ['grant', 'funding', 'organization', 'mission', 'program description']
        grant_score = sum(1 for indicator in grant_indicators if indicator in text_lower)
        
        if grant_score >= 3:
            return 'grant_application'
        
        return 'generic'

    def extract_headers(self, text: str, doc_type: str = 'generic') -> List[Dict]:
        """Extract headers using multiple pattern matching approaches"""
        patterns = self.patterns.get(doc_type, self.patterns['grant_application'])
        headers = []
        
        lines = text.split('\n')
        
        for i, line in enumerate(lines):
            line = line.strip()
            if not line:
                continue
                
            # Try each header pattern
            for pattern in patterns['header_patterns']:
                match = re.match(pattern, line)
                if match:
                    header_text = match.group(1) if match.groups() else line
                    headers.append({
                        'text': header_text.strip('*').strip(),
                        'line_number': i,
                        'pattern_type': 'header'
                    })
                    break
            
            # Check for question patterns
            for pattern in patterns['question_patterns']:
                if re.match(pattern, line, re.IGNORECASE):
                    headers.append({
                        'text': line,
                        'line_number': i,
                        'pattern_type': 'question'
                    })
                    break

        return headers

    def chunk_by_headers(self, text: str, headers: List[Dict]) -> List[Dict]:
        """Split text into chunks based on detected headers"""
        lines = text.split('\n')
        chunks = []
        
        for i, header in enumerate(headers):
            start_line = header['line_number']
            end_line = headers[i + 1]['line_number'] if i + 1 < len(headers) else len(lines)
            
            # Extract content between headers
            content_lines = lines[start_line + 1:end_line]
            content = '\n'.join(content_lines).strip()
            
            # Separate questions from content
            questions = []
            content_parts = []
            
            for line in content_lines:
                line = line.strip()
                if line.endswith('?') and len(line.split()) <= 20:  # Likely a question
                    questions.append(line)
                elif line:
                    content_parts.append(line)
            
            if content or questions:
                chunk = {
                    'header': header['text'],
                    'questions': questions,
                    'content': '\n'.join(content_parts).strip(),
                    'start_line': start_line,
                    'end_line': end_line,
                    'pattern_type': header['pattern_type']
                }
                chunks.append(chunk)
        
        return chunks

    def extract_topics_tfidf(self, text: str, max_features: int = 3) -> List[str]:
        """Extract top 1-3 topics using TF-IDF"""
        if not SKLEARN_AVAILABLE:
            return self.extract_topics_keyword_matching(text)
        
        try:
            # Clean text
            text_clean = re.sub(r'[^\w\s]', ' ', text.lower())
            text_clean = re.sub(r'\s+', ' ', text_clean).strip()
            
            if len(text_clean.split()) < 10:  # Too short for TF-IDF
                return self.extract_topics_keyword_matching(text)
            
            vectorizer = TfidfVectorizer(
                max_features=max_features*2, #gets more candidates to filter
                stop_words='english',
                ngram_range=(1, 2),
                min_df=1,
                max_df=0.8
            )
            
            tfidf_matrix = vectorizer.fit_transform([text_clean])
            feature_names = vectorizer.get_feature_names_out()
            scores = tfidf_matrix.toarray()[0]
            
            # Get top terms
            topic_scores = list(zip(feature_names, scores))
            topic_scores.sort(key=lambda x: x[1], reverse=True)
            
            topics = [term for term, score in topic_scores if score > 0][:max_features]
            
            # Enhance with keyword matching if fewer than max
#            keyword_topics = self.extract_topics_keyword_matching(text)
            if len(topics) < max_features:
                keyword_topics = self.extract_topics_keyword_matching(text)
            # Combine and deduplicate
#            all_topics = list(set(topics + keyword_topics))
            for topic in keyword_topics:
                if topic not in topics and len(topics)<max_features:
                    topics.append(topic)
            return topics[:max_features]
            
        except Exception as e:
            self.logger.warning(f"TF-IDF topic extraction failed: {e}")
            return self.extract_topics_keyword_matching(text)

    def extract_topics_keyword_matching(self, text: str, max_topics: int = 3) -> List[str]:
        """Extract top 1-3 topics using keyword matching"""
        text_lower = text.lower()
        found_topics = []
        specific_terms = []
        topic_priority = [
        'education', 'entrepreneurship', 'solar', 'workforce', 'BIPOC',
        'community', 'technology', 'partnership', 'funding', 'metrics'
        ]
        for topic in topic_priority:
            if len(found_topics)>=max_topics:
                break
            keywords=self.topic_keywords[topic]
            if any(keyword in text_lower for keyword in keywords):
                found_topics.append(topic)
#        for topic, keywords in self.topic_keywords.items():
#            if any(keyword in text_lower for keyword in keywords):
#                found_topics.append(topic)
#        
        # Add specific terms found in text
        if len(found_topics)<max_topics:
            for word in text_lower.split():
                word = re.sub(r'[^\w]', '', word)
                if (len(word) > 3 and
                word not in ENGLISH_STOP_WORDS if SKLEARN_AVAILABLE else True and
                word.isalpha()):
                    specific_terms.append(word)
        
        # Get most frequent specific terms
        from collections import Counter
        term_counts = Counter(specific_terms)
        common_terms = [term for term, count in term_counts.most_common(3) if count > 1]
        for term in common_terms:
            if len(found_topics)>=max_topics:
                break
            if term not in found_topics:
                found_topics.append(term)
        return found_topics[:max_topics]

    def extract_topics_spacy(self, text: str) -> List[str]:
        """Extract topics using spaCy NER and noun phrases"""
        if not SPACY_AVAILABLE:
            return self.extract_topics_keyword_matching(text, max_topics=3)
        
        try:
            doc = nlp(text[:1000])  # Limit text length for performance
            
            # Extract named entities
            entities = [ent.text.lower() for ent in doc.ents 
                       if ent.label_ in ['ORG', 'PRODUCT', 'EVENT', 'WORK_OF_ART']]
            
            # Extract noun phrases
            noun_phrases = [chunk.text.lower() for chunk in doc.noun_chunks 
                           if len(chunk.text.split()) <= 3]
            
            # Combine with keyword matching
            keyword_topics = self.extract_topics_keyword_matching(text, max_topics=3)
            
            all_topics = list(set(entities + noun_phrases + keyword_topics))
            return [topic for topic in all_topics if len(topic) > 2][:3]
            
        except Exception as e:
            self.logger.warning(f"spaCy topic extraction failed: {e}")
            return self.extract_topics_keyword_matching(text, max_topics=3)

    def calculate_confidence_score(self, chunk: Dict) -> float:
        """Calculate confidence score for chunk quality"""
        score = 0.0
        
        # Header quality
        if chunk['header']:
            score += 0.3
            if len(chunk['header'].split()) > 1:
                score += 0.1
        
        # Content quality
        if chunk['content']:
            word_count = len(chunk['content'].split())
            if word_count > 20:
                score += 0.3
            if word_count > 100:
                score += 0.1
        
        # Questions present
        if chunk['questions']:
            score += 0.2
        
        # Topic relevance
        if len(chunk.get('topics', [])) >= 1:
            score += 0.1
            if len(chunk.get('topics',[]))>=2:
                score+=.05
        
        return min(score, 1.0)

    def process_document(self, file_path: str, output_path: Optional[str] = None) -> List[DocumentChunk]:
        """Main method to process a document and extract chunks"""
        self.logger.info(f"Processing document: {file_path}")
        
        # Read file
        file_path = Path(file_path)
        if file_path.suffix.lower() == '.docx':
            text = self.extract_text_from_docx(str(file_path))
        else:
            with open(file_path, 'r', encoding='utf-8') as f:
                text = f.read()
        
        # Detect document type
        doc_type = self.detect_document_type(text)
        self.logger.info(f"Detected document type: {doc_type}")
        
        # Extract headers
        headers = self.extract_headers(text, doc_type)
        self.logger.info(f"Found {len(headers)} headers/sections")
        
        # Create chunks
        raw_chunks = self.chunk_by_headers(text, headers)
        
        # Process chunks
        processed_chunks = []
        for i, chunk_data in enumerate(raw_chunks):
            # Extract topics
            full_text = f"{chunk_data['header']} {' '.join(chunk_data['questions'])} {chunk_data['content']}"
            
            # Try multiple topic extraction methods
            topics = self.extract_topics_tfidf(full_text, max_features=3)
            if not topics:
                topics = self.extract_topics_keyword_matching(full_text, max_topics=3)
            
            # Create DocumentChunk
            chunk = DocumentChunk(
                chunk_id=i + 1,
                header=chunk_data['header'],
                questions=chunk_data['questions'],
                content=chunk_data['content'],
                topics=topics,
                word_count=len(chunk_data['content'].split()) if chunk_data['content'] else 0,
                confidence_score=self.calculate_confidence_score(chunk_data)
            )
            
            processed_chunks.append(chunk)
        
        # Save to JSON if output path provided
        if output_path:
            self.save_chunks_to_json(processed_chunks, output_path, file_path.name)
        
        return processed_chunks

    def save_chunks_to_json(self, chunks: List[DocumentChunk], output_path: str, document_name: str):
        """Save chunks to JSON file"""
        output_data = {
            "document_title": document_name,
            "total_chunks": len(chunks),
            "chunks": [asdict(chunk) for chunk in chunks]
        }
        
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(output_data, f, indent=2, ensure_ascii=False)
        
        self.logger.info(f"Saved {len(chunks)} chunks to {output_path}")

    def batch_process(self, input_dir: str, output_dir: str):
        """Process multiple documents in a directory"""
        input_path = Path(input_dir)
        output_path = Path(output_dir)
        output_path.mkdir(exist_ok=True)
        
        supported_extensions = ['.txt', '.docx']
        files = [f for f in input_path.glob('*') if f.suffix.lower() in supported_extensions]
        
        self.logger.info(f"Found {len(files)} files to process")
        
        for file_path in files:
            try:
                output_file = output_path / f"{file_path.stem}_chunks.json"
                chunks = self.process_document(str(file_path), str(output_file))
                self.logger.info(f"Successfully processed {file_path.name}: {len(chunks)} chunks")
            except Exception as e:
                self.logger.error(f"Failed to process {file_path.name}: {e}")

# def main():
#     """Example usage"""
#     chunker = DocumentChunker()
    
#     # Process single document
#     # chunks = chunker.process_document("document.docx", "output_chunks.json")
    
#     # Process multiple documents
#     # chunker.batch_process("input_documents/", "output_chunks/")
    
#     # Example with the provided documents
#     sample_text = '''
#     **Organization Background**
#     Please provide a brief description of your organization's mission and history.
    
#     Cambio Labs was established in 2021 by Sebastián Martín, a social entrepreneur and educator, 
#     in response to educational and employment inequities faced by low-income BIPOC youth.
    
#     **Program Description**
#     Describe the program, its purpose and how it will be implemented.
    
#     Cambio Energy is an initiative to create access to clean energy jobs, utilities savings 
#     through community solar projects, and green entrepreneurship accelerators.
#     '''
    
#     # Save sample text to file for testing
#     with open('sample_doc.txt', 'w') as f:
#         f.write(sample_text)
    
#     # Process the sample
#     chunks = chunker.process_document('sample_doc.txt', 'sample_output.json')
    
#     # Print results
#     for chunk in chunks:
#         print(f"\nChunk {chunk.chunk_id}: {chunk.header}")
#         print(f"Topics: {chunk.topics}")
#         print(f"Word count: {chunk.word_count}")
#         print(f"Confidence: {chunk.confidence_score:.2f}")

# if __name__ == "__main__":
#     main()


In [11]:
# === FILE CHECKER ===
def check_for_new_files():
    """Download every file in the watched Drive folder and chunk it.

    For each non-trashed child of ``WATCH_FOLDER_ID`` the file is exported as
    DOCX into ``DOWNLOAD_PATH``, loaded with ``UnstructuredFileLoader`` and run
    through ``DocumentChunker.process_document``. The chunks are written to
    ``/kaggle/working/<sanitized title>_chunks.json`` and a preview of the
    first three chunks is printed.

    Errors raised while chunking are caught and printed per file. The download
    and the loader call sit outside that ``try``, so a failure there propagates
    and stops the whole run.

    The commented-out sections are an earlier Q&A / embedding / database
    pipeline. ``file_id``, ``modified_time``, ``raw_text`` and ``doc_id`` are
    only consumed by that disabled code.
    """
    print(f"\U0001F50D Checking for new files at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    chunker = DocumentChunker()
    # Drive query: direct children of the watched folder that are not in the trash.
    file_list = drive.ListFile({'q': f"'{WATCH_FOLDER_ID}' in parents and trashed=false"}).GetList()

    for file in file_list:
        title = file['title']
        file_id = file['id']
        modified_time = file['modifiedDate']
        # The title is made filesystem-safe before it is used as a file name.
        sanitized_title = sanitize_filename(title)
        file_path = DOWNLOAD_PATH / f"{sanitized_title}.docx"

        # Disabled: skip files whose modifiedDate matches what was already stored.
        # existing = structured_col.find_one({"metadata.title": title})
        # if existing and existing.get("metadata", {}).get("modifiedDate") == modified_time:
        #     print(f"✅ Already processed: {title}")
        #     continue

        print(f"⬇ Downloading: {title}")
        # Every file is requested with the DOCX mimetype, whatever its type.
        # NOTE(review): presumably the folder only holds Google Docs / Word
        # files; verify how other file types behave here.
        file.GetContentFile(str(file_path), mimetype='application/vnd.openxmlformats-officedocument.wordprocessingml.document')

        # raw_text is taken from the first loaded page only; pages[0] raises
        # IndexError if the loader returns an empty list.
        loader = UnstructuredFileLoader(str(file_path))
        pages = loader.load()
        raw_text = pages[0].page_content
        doc_id = sanitized_title.replace(" ", "_").lower()

        #USING CHUNKER
        try:
            print(f"Processing Chunks for : {title}")
            chunks_output_path = Path("/kaggle/working") / f"{sanitized_title}_chunks.json"
            chunks= chunker.process_document(str(file_path),str(chunks_output_path))
            print(f"✅ Successfully created {len(chunks)} chunks")
            # Preview only the first three chunks to keep the cell output short.
            for i, chunk in enumerate(chunks[:3]):
                print(f"Chunk{chunk.chunk_id}:{chunk.header}")
                print(f"    Topics: {chunk.topics}")
                print(f"    Word count: {chunk.word_count}")
                print(f"    Confidence: {chunk.confidence_score:.2f}")
                print()
            if len(chunks) > 3:
                print(f"  ... and {len(chunks) - 3} more chunks")
        except Exception as e:
            # Best effort: report the failure and move on to the next file.
            print(f"❌ Error processing chunks for {title}: {e}")
        # Disabled below: Q&A extraction, categorisation, embedding and storage.
        # qa_pairs = extract_qa_pairs(raw_text)

        # doc_entry = {
        #     "doc_id": doc_id,
        #     "metadata": {
        #         "title": title,
        #         "doc_id": doc_id,
        #         "file_id": file_id,
        #         "modifiedDate": modified_time,
        #         "word_count": len(raw_text.split()),
        #     },
        #     "chunks": []
        # }

        # flat_chunks = []

        # for i, pair in enumerate(qa_pairs):
        #     q, a = pair['question'], pair['answer']
        #     categories = categorize_chunk(a, q)
        #     if categories == ["Miscellaneous"]:
        #         annotations = annotate_chunk(a)
        #         categories = [annotations["category"]] if annotations["category"] else ["Miscellaneous"]
        #     for cat in categories:
        #         chunk_id = f"{doc_id}_chunk_{i}_{cat.replace(' ', '_')}"
        #         try:
        #             embedding = embedding_model.encode(a).tolist()
        #         except Exception as e:
        #             print(f"⚠️ Embedding error: {e}")
        #             embedding = None

        #         chunk_data = {
        #             "chunk_id": chunk_id,
        #             "text": a,
        #             "embedding": embedding,
        #             "question": q,
        #             "answer": a,
        #             "word_count": len(a.split()),
        #             "metadata": {
        #                 "doc_id": doc_id,
        #                 "title": title,
        #                 "file_id": file_id,
        #                 "modifiedDate": modified_time,
        #                 "category": cat
        #             }
        #         }
        #         doc_entry["chunks"].append(chunk_data)
        #         flat_chunks.append(chunk_data)

        # structured_col.delete_many({"metadata.title": title})
        # flat_col.delete_many({"metadata.doc_id": doc_id})

        # structured_col.insert_one(doc_entry)
        # if flat_chunks:
        #     flat_col.insert_many(flat_chunks)
        # with open(f"/kaggle/working/{doc_id}_structured.json", "w") as f:
        #     json.dump(doc_entry, f, indent=2)
        # with open(f"/kaggle/working/{doc_id}_flat.json", "w") as f:
        #     json.dump(flat_chunks, f, indent=2)


        # print(f"✅ Processed and inserted: {title}")

        # Disabled: remove the downloaded file once it has been processed.
        # try:
        #     os.remove(file_path)
        # except Exception as e:
        #     print(f"⚠️ Failed to delete file: {e}")

    print("✅ Check complete.\n")

# Run a single pass over the watched folder when this cell/script is executed
# directly; the polling loop below is disabled.
if __name__ == "__main__":
    check_for_new_files()
# === LOOP ===
# while True:
#     check_for_new_files()
#     print("⏲️ Sleeping for 5 minutes...\n")
#     time.sleep(300)

🔍 Checking for new files at 2025-07-24 15:14:50
⬇ Downloading: Copy of DATA - 2024 Con Edison - Focus Grant Application
Processing Chunks for : Copy of DATA - 2024 Con Edison - Focus Grant Application
✅ Successfully created 19 chunks
Chunk1:Organization Background
    Topics: []
    Word count: 11
    Confidence: 0.40

Chunk2:Guidance: Please limit your response to approximately 300 words.
    Topics: ['education', 'entrepreneurship', 'solar']
    Word count: 206
    Confidence: 0.80

Chunk3:Program Description
    Topics: ['funding', 'program']
    Word count: 29
    Confidence: 0.70

  ... and 16 more chunks
⬇ Downloading: Copy of DATA - VELA 2022 - May, Microgrant
Processing Chunks for : Copy of DATA - VELA 2022 - May, Microgrant
✅ Successfully created 13 chunks
Chunk1:What is the name of your idea?
    Topics: ['education', 'entrepreneurship']
    Word count: 8
    Confidence: 0.40

Chunk2:Please describe the idea. What is the idea trying to do, and for whom?
    Topics: ['educatio

In [7]:
# # === Main ===
# def check_for_new_files():
#     print(f"🔍 Checking for new files at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
#     file_list = drive.ListFile({'q': f"'{WATCH_FOLDER_ID}' in parents and trashed=false"}).GetList()

#     for file in file_list:
#         title = file['title']
#         file_id = file['id']
#         modified_time = file['modifiedDate']

#         # Make filename safe
#         doc_id = sanitize_filename(title.replace(" ", "_").lower())

#         # Skip if structured JSON already exists
#         if Path(f"/kaggle/working/{doc_id}_structured.json").exists():
#             print(f"✅ Already processed locally: {title}")
#             continue

#         print(f"⬇ Downloading: {title}")
#         file_path = DOWNLOAD_PATH / f"{sanitize_filename(title)}.docx"
#         file.GetContentFile(str(file_path), mimetype='application/vnd.openxmlformats-officedocument.wordprocessingml.document')

#         loader = UnstructuredFileLoader(str(file_path))
#         pages = loader.load()
#         raw_text = pages[0].page_content
#         word_count = len(raw_text.split())

#         flat_chunks = []

#         for i, page in enumerate(pages):
#             chunks = chunk_text(page.page_content)
#             for j, chunk in enumerate(chunks):
#                 qa = generate_qa_pair(chunk)
#                 try:
#                     embedding = embedding_model.encode(chunk).tolist()
#                 except Exception as e:
#                     print(f"⚠️ Embedding error: {e}")
#                     embedding = None

#                 chunk_data = {
#                     "chunk_id": f"{doc_id}_chunk_{i}_{j}",
#                     "text": chunk,
#                     "embedding": embedding,
#                     "question": qa["question"],
#                     "answer": qa["answer"],
#                     "word_count": len(chunk.split()),
#                     "metadata": {
#                         "doc_id": doc_id,
#                         "title": title,
#                         "file_id": file_id,
#                         "modifiedDate": modified_time,
#                         "category": qa["category"]
#                     }
#                 }
#                 flat_chunks.append(chunk_data)

#         doc_entry = {
#             "doc_id": doc_id,
#             "metadata": {
#                 "title": title,
#                 "doc_id": doc_id,
#                 "file_id": file_id,
#                 "modifiedDate": modified_time,
#                 "word_count": word_count
#             },
#             "chunks": flat_chunks
#         }

#         # Save flat and structured JSONs in /kaggle/working
#         with open(f"/kaggle/working/{doc_id}_structured.json", "w") as f:
#             json.dump(doc_entry, f, indent=2)
#         with open(f"/kaggle/working/{doc_id}_flat.json", "w") as f:
#             json.dump(flat_chunks, f, indent=2)

#         try:
#             os.remove(file_path)
#         except Exception as e:
#             print(f"⚠️ Failed to delete file: {e}")

#     print("✅ Check complete.\n")

# if __name__ == "__main__":
#     check_for_new_files()


In [19]:
import os
import shutil
from pathlib import Path
import zipfile

# Directory that holds the *_chunks.json files written by check_for_new_files().
WORKING_DIR = Path("/kaggle/working")
# Disabled: separate folders for the earlier structured/flat outputs.
# STRUCTURED_DIR = WORKING_DIR / "full_rules_structured"
# FLAT_DIR = WORKING_DIR / "full_rules_flat"
# Single collection folder for every chunk file; zipped further down.
All_Files = WORKING_DIR / "ALL_FILES"

# Create the collection folder if it does not exist yet
# STRUCTURED_DIR.mkdir(exist_ok=True)
# FLAT_DIR.mkdir(exist_ok=True)
All_Files.mkdir(exist_ok=True)
# Move chunk JSON files from the working directory into the collection folder.
# The glob is not recursive, so files already inside ALL_FILES are not revisited.
for file in WORKING_DIR.glob("*.json"):
     if file.name.endswith("_chunks.json"):
        shutil.move(str(file), All_Files / file.name)
    # Disabled: routing for the earlier structured/flat outputs.
    # if file.name.endswith("_structured.json"):
    #     shutil.move(str(file), STRUCTURED_DIR / file.name)
    # elif file.name.endswith("_flat.json"):
    #     shutil.move(str(file), FLAT_DIR / file.name)

# Function to zip a folder
def zip_folder(folder_path: Path, zip_path: Path):
    """Write everything under ``folder_path`` (recursively) into a deflate-compressed zip.

    Entry names inside the archive are relative to ``folder_path``. An existing
    file at ``zip_path`` is overwritten.
    """
    archive = zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED)
    with archive:
        for entry in folder_path.rglob('*'):
            relative_name = entry.relative_to(folder_path)
            archive.write(entry, arcname=relative_name)

# Zip the collection folder into /kaggle/working/ALL_FILES.zip.
# Disabled: archives for the earlier structured/flat outputs.
# zip_folder(STRUCTURED_DIR, WORKING_DIR / "full_rules_structured.zip")
# zip_folder(FLAT_DIR, WORKING_DIR / "full_rules_flat.zip")
zip_folder(All_Files, WORKING_DIR / "ALL_FILES.zip")

print("Folders zipped to: ALL_FILES.zip")
# print(" - full_rules_structured.zip")
# print(" - full_rules_flat.zip")


Folders zipped to: ALL_FILES.zip
