In [None]:
# cell_1_setup.py
!pip install -q pinecone-client sentence-transformers langchain-community langchain-text-splitters pypdf pandas tqdm

import os
import pandas as pd
from pinecone import Pinecone, ServerlessSpec
from sentence_transformers import SentenceTransformer
from langchain_community.document_loaders import PyPDFLoader, DataFrameLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
import torch

# --- CONFIGURATION ---
# The API key is read from the PINECONE_API_KEY environment variable when it is
# set, so a real key never has to be pasted into (and saved with) the notebook.
# The placeholder below is only a fallback for quick manual edits.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY", "YOUR_PINECONE_API_KEY")
PINECONE_INDEX_NAME = "rancho-cordova"
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

# Initialize
pc = Pinecone(api_key=PINECONE_API_KEY)

# Load the embedding model before touching the index so the index dimension
# can be taken from the model instead of a separately maintained constant.
embed_model = SentenceTransformer('all-MiniLM-L6-v2', device=DEVICE)

# Create the serverless index only if it does not exist yet.
if PINECONE_INDEX_NAME not in pc.list_indexes().names():
    pc.create_index(
        name=PINECONE_INDEX_NAME,
        dimension=embed_model.get_sentence_embedding_dimension(),
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

index = pc.Index(PINECONE_INDEX_NAME)

def _agent_tag_for(file_path):
    """Return "energy" if the path mentions energy/SMUD, otherwise "customer"."""
    lowered = file_path.lower()
    return "energy" if "energy" in lowered or "smud" in lowered else "customer"


def _load_chunks(file_path):
    """Load a PDF or CSV into documents; return None for unsupported types."""
    lowered = file_path.lower()  # match extensions case-insensitively

    if lowered.endswith('.pdf'):
        pages = PyPDFLoader(file_path).load()
        splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=100)
        return splitter.split_documents(pages)

    if lowered.endswith('.csv'):
        df = pd.read_csv(file_path)
        if df.empty:
            # Nothing to index; also avoids the row-wise apply on a frame
            # with no rows.
            return []
        # One document per row: all non-null cells joined into a single string.
        df['text_context'] = df.apply(
            lambda row: " | ".join(row.dropna().astype(str)), axis=1
        )
        return DataFrameLoader(df, page_content_column="text_context").load()

    return None


def process_files(file_list, *, batch_size=100):
    """Embed the given PDF/CSV files and upsert the vectors into ``index``.

    PDFs are split into overlapping text chunks; CSVs produce one document
    per row. Each file's chunks are embedded in a single batched call to
    ``embed_model`` and uploaded in groups of ``batch_size`` vectors.
    Vector ids have the form ``<basename>_<chunk index>`` and the metadata
    carries the chunk text, the source basename and an agent tag.

    Args:
        file_list: Sized collection of file paths (``str`` or ``os.PathLike``).
        batch_size: Number of vectors sent per upsert call. Must be >= 1.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.

    Errors raised while handling a single file are printed and the loop
    continues with the next file. An error from the final flush propagates.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    pending = []  # vectors accumulated across files, waiting to be upserted
    print(f"📂 Processing {len(file_list)} files on {DEVICE}...")

    for file_path in file_list:
        try:
            file_path = os.fspath(file_path)  # accept pathlib.Path as well as str
            print(f"   ➜ Reading: {file_path}")

            chunks = _load_chunks(file_path)
            if chunks is None:
                print(f"      Skipped (unsupported file type): {file_path}")
                continue
            if not chunks:
                continue

            # Per-file values, computed once instead of once per chunk.
            source_name = os.path.basename(file_path)
            agent_tag = _agent_tag_for(file_path)

            # Encode the whole file in one call so the model can batch the
            # work internally instead of running one forward pass per chunk.
            texts = [chunk.page_content for chunk in chunks]
            embeddings = embed_model.encode(texts)

            for idx, (text, embedding) in enumerate(zip(texts, embeddings)):
                pending.append({
                    "id": f"{source_name}_{idx}",
                    "values": embedding.tolist(),
                    "metadata": {
                        "text": text,
                        "source": source_name,
                        # Tag content for specific agents
                        "agent": agent_tag,
                    },
                })

                if len(pending) >= batch_size:
                    index.upsert(vectors=pending)
                    pending = []
                    print("      Uploaded batch...")

        except Exception as e:
            # Best-effort ingestion: report the failing file and keep going.
            print(f"❌ Error processing {file_path}: {e}")

    # Flush whatever is left over from the last partial batch.
    if pending:
        index.upsert(vectors=pending)
    print("✅ Upload Complete!")

# UPLOAD FILES TO COLAB SIDEBAR, THEN RUN:
# files = ["your_file.csv", "your_doc.pdf"]
# process_files(files)