### 💻 **<i>Notebook 01:</i> Data Ingestion and Vector Store Setup**

**For running on Google Colab only, run the code cell below:**

In [None]:
import os
from dotenv import load_dotenv
# --- API key loading ---
# On Colab the keys are read from Colab Secrets and copied into os.environ;
# anywhere else the cell falls back to a local .env file.
# Only OPENAI_API_KEY is mandatory; the other secrets are copied when present.
COLAB_SECRET_NAMES = (
    'OPENAI_API_KEY',
    'GOOGLE_API_KEY',
    'GOOGLE_CSE_ID',
    'LANGCHAIN_API_KEY',
)

try:
    # This import ONLY works on the Colab web interface
    from google.colab import userdata
except ImportError:
    userdata = None

if userdata is not None:
    print("Loading API keys from Colab Secrets...")

    # --- Key Retrieval and Injection into os.environ ---
    missing_secrets = []
    for secret_name in COLAB_SECRET_NAMES:
        try:
            secret_value = userdata.get(secret_name)
        except Exception as exc:
            # NOTE(review): userdata.get appears to raise (rather than return
            # None) when a secret is absent or notebook access is not granted
            # - confirm the exact exception types against the Colab docs.
            print(f"Could not read '{secret_name}' from Colab Secrets ({type(exc).__name__}).")
            secret_value = None

        # os.environ only accepts strings, so never assign None / empty values.
        if secret_value:
            os.environ[secret_name] = secret_value
        else:
            missing_secrets.append(secret_name)

    os.environ['LANGCHAIN_PROJECT'] = "retrogaming-qa-bot"

    # --- Verification ---
    if not os.environ.get('OPENAI_API_KEY'):
        raise ValueError("OPENAI_API_KEY was not found in Colab Secrets.")

    if missing_secrets:
        print(f"Optional secrets not set in Colab Secrets: {', '.join(missing_secrets)}")
    else:
        print("‚úÖ All API keys were successfully loaded from Colab Secrets.")

else:
    # Fallback for local environments (local VS Code)
    print("Loading API keys from local .env file (VS Code/Local Mode).")
    load_dotenv()

# --- Common Configurations ---
CHROMA_PATH = "./chroma_db_retrogaming"
EMBEDDING_MODEL = "text-embedding-ada-002"
LLM_MODEL = "gpt-3.5-turbo"

# Here, the rest of the notebook variables are defined (llm, embeddings, etc.)
# ... (should continue defining LLM/RAG configuration variables)

A carregar chaves API do Colab Secrets...
‚úÖ Todas as chaves API foram carregadas com sucesso a partir do Colab Secrets.


##### **00 - Imports**

In [3]:
import hashlib
import os
import re
import warnings

import pandas as pd
from dotenv import load_dotenv
from langchain_community.document_loaders import YoutubeLoader, WebBaseLoader
from langchain_community.vectorstores import Chroma
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api._errors import TranscriptsDisabled, NoTranscriptFound

##### **01 - Configuration, API Key Loading, YouTube Videos for Processing**

In [4]:
# --- Configuration ---
# Step 1: load environment variables from a .env file (if one is present).
load_dotenv()

# Step 2: read the OpenAI key from the environment and stop right away
# when it is absent or empty, since later cells cannot run without it.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
if OPENAI_API_KEY is None or OPENAI_API_KEY == "":
    raise ValueError("OPENAI_API_KEY not found. Please set it in your .env file.")

# --- Source material ---
# YouTube videos whose transcripts are ingested. Edit this list to change
# which videos end up in the knowledge base.
YOUTUBE_URLS = [
    "https://www.youtube.com/watch?v=zW1vpDQ9Ijs",
    "https://www.youtube.com/watch?v=nXsHc0IUzLY",
    "https://www.youtube.com/watch?v=i1KrbTkU1sw",
    "https://www.youtube.com/watch?v=ktVc3So9XyY",
    "https://www.youtube.com/watch?v=_IOdts-CszU",
    "https://www.youtube.com/watch?v=xfegbR8UWqU",
    "https://www.youtube.com/watch?v=XDDIBEdINAE",
    "https://www.youtube.com/watch?v=cXEfpRsVUCk",
    "https://www.youtube.com/watch?v=eztgBP-K_1k",
    "https://www.youtube.com/watch?v=Ky-BuWEnS2A",
    "https://www.youtube.com/watch?v=6KHEgkCN9yE",
    "https://www.youtube.com/watch?v=eduuJml97Fc",
    "https://www.youtube.com/watch?v=dLkkF4iXBBM",
    "https://www.youtube.com/watch?v=eKV-phyaLFI",
    "https://www.youtube.com/watch?v=ZV-yb2WMrQk",
    "https://www.youtube.com/watch?v=ArstSQv0BVM",
    "https://www.youtube.com/watch?v=oUcAsShb0gk",
    "https://www.youtube.com/watch?v=rDeqmBIWTdk",
    "https://www.youtube.com/watch?v=49SMKnMHNtU",
    "https://www.youtube.com/watch?v=fXbtkg9-150",
    "https://www.youtube.com/watch?v=GxjbvS8Jd_0",
    "https://www.youtube.com/watch?v=cjzRlfOAEnU",
    "https://www.youtube.com/watch?v=qefseBgp3Ns",
    "https://www.youtube.com/watch?v=uvMDjf_Mmv4",
    "https://www.youtube.com/watch?v=Fix6u4pksrg",
    "https://www.youtube.com/watch?v=n-9DHfiS48A",
    "https://www.youtube.com/watch?v=jvujTeMJ8Lg",
    "https://www.youtube.com/watch?v=nkR5lc4tD5A",
    "https://www.youtube.com/watch?v=Sue4VhQgEH0",
    "https://www.youtube.com/watch?v=bE0SXJ7Vba8",
    "https://www.youtube.com/watch?v=sR1_-HveQQY",
    "https://www.youtube.com/watch?v=28u6RoYiCWI",
    "https://www.youtube.com/watch?v=VD3LPvnadZY",
    "https://www.youtube.com/watch?v=YY03OM7qfZ4",
    "https://www.youtube.com/watch?v=7j7F-e-sels",
    "https://www.youtube.com/watch?v=SiuNUlBz6yQ",
    "https://www.youtube.com/watch?v=6IBDG_GJKFw",
    "https://www.youtube.com/watch?v=UGeLqcwAjws",
]

# Documentation and landing pages of the emulators covered by the bot.
WEB_URLS = [
    "https://pcsx2.net/docs/usage/general/",
    "https://pcsx2.net/docs",
    "https://pcsx2.net/docs/category/setup",
    "https://pcsx2.net/docs/setup/requirements",
    "https://pcsx2.net/docs/setup/bios",
    "https://pcsx2.net/docs/setup/discs",
    "https://pcsx2.net/docs/setup/running",
    "https://pcsx2.net/docs/category/configuration",
    "https://pcsx2.net/docs/configuration/general",
    "https://pcsx2.net/docs/configuration/controllers",
    "https://pcsx2.net/docs/configuration/memcards",
    "https://pcsx2.net/docs/category/troubleshooting",
    "https://pcsx2.net/docs/troubleshooting/general",
    "https://pcsx2.net/docs/troubleshooting/performance",
    "https://pcsx2.net/docs/troubleshooting/identify",
    "https://pcsx2.net/docs/troubleshooting/windows",
    "https://pcsx2.net/docs/troubleshooting/linux",
    "https://pt.dolphin-emu.org/?cr=pt",
    "https://dolphin-emu.org/download/",
    "https://dolphin-emu.org/docs/guides/",
    "https://dolphin-emu.org/compat/",
    "https://www.retroarch.com/",
    "https://www.retroarch.com/?page=platforms",
    "https://docs.libretro.com/start/understanding/",
    "https://docs.libretro.com/start/installation/",
    "https://www.mesen.ca/",
    "https://www.snes9x.com/",
    "https://www.snes9x.com/downloads.php",
    "https://mgba.io/",
    "https://mgba.io/downloads.html",
    "https://mgba.io/faq.html",
]

# --- RAG Parameters ---
# Chunk length and overlap handed to the text splitter, plus the on-disk
# location of the Chroma store and the OpenAI embedding model name.
CHUNK_SIZE, CHUNK_OVERLAP = 1000, 150
CHROMA_PATH = "./chroma_db_retrogaming"
EMBEDDING_MODEL = "text-embedding-ada-002"

print("‚úÖ Configuration loaded. API Key is ready for use.")

‚úÖ Configuration loaded. API Key is ready for use.


##### **02 - Setup and Custom Transcript Function**

In [None]:
import re # For extracting the video ID
from langchain_core.documents import Document
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api._errors import TranscriptsDisabled, NoTranscriptFound

# Initialize the API client once, at module level. get_transcript_text() below
# calls ytt_api.fetch(), so every transcript request goes through this object.
ytt_api = YouTubeTranscriptApi()

# Function to extract Video ID from the URL
def extract_video_id(url):
    """Extract the video ID from a YouTube URL.

    Supported shapes:
      * ``...watch?v=<id>`` (the ``v`` parameter may appear anywhere in the query)
      * ``youtu.be/<id>``
      * ``youtube.com/embed/<id>``, ``/shorts/<id>`` and ``/live/<id>``

    Args:
        url: The URL to inspect. Surrounding whitespace is ignored.

    Returns:
        The ID string, or ``None`` when ``url`` is not a string or no ID
        can be found.
    """
    if not isinstance(url, str):
        return None
    url = url.strip()

    # Anchor 'v=' to the start of a query parameter. A bare lookbehind on 'v='
    # would also match inside other parameters such as 'rev=1'.
    query_match = re.search(r'(?:^|[?&])v=([a-zA-Z0-9_-]+)', url)
    if query_match:
        return query_match.group(1)

    # Path-based forms: short links, embeds, shorts and live URLs.
    path_match = re.search(
        r'(?:youtu\.be/|youtube(?:-nocookie)?\.com/(?:embed|shorts|live)/)([a-zA-Z0-9_-]+)',
        url,
    )
    return path_match.group(1) if path_match else None

# Function to fetch transcript with language fallback
def get_transcript_text(video_id, languages=('en', 'es', 'pt')):
    """Fetch a video's transcript and return it as one space-joined string.

    Args:
        video_id: The YouTube video ID to fetch.
        languages: Language codes in descending order of preference. A single
            code may be passed as a plain string.

    Returns:
        The text of all transcript snippets joined by single spaces, with
        leading and trailing whitespace removed.

    Raises:
        Whatever ``ytt_api.fetch`` raises; errors are left to propagate so the
        calling loop can decide how to report them.
    """
    # A lone string would otherwise be iterated character by character.
    if isinstance(languages, str):
        languages = [languages]

    # Hand the API a fresh list on every call so nothing downstream can
    # mutate a default shared between calls.
    transcript = ytt_api.fetch(video_id, languages=list(languages))

    return ' '.join(snippet.text for snippet in transcript.snippets).strip()

print("‚úÖ Setup and Custom Transcript functions defined. Ready for data loading loop.")

‚úÖ Setup and Custom Transcript functions defined. Ready for data loading loop.


##### **03 - Custom LangChain Data Loading Loop**

**Option 1: On a first run, execute the code cell below to generate the CSV file.<br>If a CSV file already exists, use Option 2.**

In [None]:
# --- First_Runner: unified loading of YouTube transcripts and web pages ---
# The goal is to create a single list of records (YouTube and Web) for the CSV.

TRANSCRIPT_CSV_PATH = "./retrogaming_knowledge_cached.csv"


def load_all_data_and_create_csv(csv_path=TRANSCRIPT_CSV_PATH):
    """Download every configured source and cache the raw text in one CSV.

    Transcripts are fetched for each entry of YOUTUBE_URLS and page text for
    each entry of WEB_URLS. A source that fails is reported and skipped, so
    one bad URL does not discard the others.

    Args:
        csv_path: Destination of the CSV file.

    Returns:
        A pandas DataFrame with the columns ``source``, ``source_type``,
        ``video_id`` and ``content`` (one row per successfully loaded source).
    """
    all_docs_raw = []

    # 1. Load YouTube Transcripts
    print(f"\n[1/3] Starting transcript loading for {len(YOUTUBE_URLS)} videos...")
    for url in YOUTUBE_URLS:
        video_id = extract_video_id(url)
        if not video_id:
            print(f"‚ùå Failed to extract ID from URL: {url}")
            continue

        try:
            transcript_text = get_transcript_text(video_id)
        except (NoTranscriptFound, TranscriptsDisabled):
            print(f"‚ùå Failed (No Transcript): ID {video_id}")
            continue
        except Exception as e:
            print(f"‚ùå Failed (Generic Error): ID {video_id} - {e}")
            continue

        all_docs_raw.append({
            'source': url,
            'source_type': 'youtube_transcript',
            'video_id': video_id,
            'content': transcript_text
        })
        print(f"‚úÖ Loaded transcript for ID {video_id}.")

    # 2. Load Web Pages
    # Each page gets its own loader call so a single unreachable page only
    # costs that page instead of the whole batch.
    print(f"\n[2/3] Starting web page loading for {len(WEB_URLS)} pages...")
    loaded_pages = 0
    for page_url in WEB_URLS:
        try:
            page_docs = WebBaseLoader(page_url).load()
        except Exception as e:
            print(f"‚ùå Failed to load web pages: {page_url} - {e}")
            continue

        for doc in page_docs:
            all_docs_raw.append({
                'source': doc.metadata.get('source', page_url),
                'source_type': 'web_page',
                'video_id': 'N/A',
                'content': doc.page_content
            })
        loaded_pages += len(page_docs)
    print(f"‚úÖ Loaded {loaded_pages} web page documents.")

    # 3. Save to Unified CSV
    # Explicit columns keep the header row even when nothing was loaded, so
    # the file stays readable by pandas.
    df = pd.DataFrame(
        all_docs_raw,
        columns=['source', 'source_type', 'video_id', 'content'],
    )
    df.to_csv(csv_path, index=False)

    print(f"\n[3/3] ‚úÖ Complete. Unified data from {len(df)} sources saved to {csv_path}.")

    # Returns the DataFrame for the next step (although 'Second_Runner' is the default)
    return df


# Build the cache only when it is missing, so "Run All" never re-downloads
# everything. Delete the CSV file to force a rebuild.
if os.path.exists(TRANSCRIPT_CSV_PATH):
    print(f"Cached CSV already present at {TRANSCRIPT_CSV_PATH}; skipping download.")
else:
    df_all = load_all_data_and_create_csv()

**Option 2: Loads the local CSV file previously created**

In [None]:
#--- Second_Runner (Reads the CSV and creates LangChain Documents) ---

TRANSCRIPT_CSV_PATH = "./retrogaming_knowledge_cached.csv"


def load_cached_frame(csv_path):
    """Read the cached knowledge CSV with every cell kept as literal text.

    pandas' default parsing would turn the stored ``'N/A'`` video_id into NaN
    and could coerce an all-digit video ID into a number. Reading with
    ``dtype=str`` and ``keep_default_na=False`` keeps the values exactly as
    they were written; empty cells come back as ``''``.

    Args:
        csv_path: Path of the CSV file to read.

    Returns:
        A pandas DataFrame whose columns all hold strings.
    """
    return pd.read_csv(csv_path, dtype=str, keep_default_na=False)


all_docs = [] # Final list of LangChain Documents

if os.path.exists(TRANSCRIPT_CSV_PATH):
    print(f"\n‚úÖ Loading all documents from the cached CSV: {TRANSCRIPT_CSV_PATH}")
    df_cached = load_cached_frame(TRANSCRIPT_CSV_PATH)

    # Rows without any content are skipped: there is nothing to embed.
    all_docs = [
        Document(
            page_content=record['content'],
            metadata={
                'source': record['source'],
                'source_type': record['source_type'],
                'video_id': record['video_id']
            }
        )
        for record in df_cached.to_dict('records')
        if record['content']
    ]

    print(f"\nTotal LangChain documents loaded: {len(all_docs)}")

else:
    print("‚ùå CRITICAL ERROR: The cached data file was not found. Run 'load_all_data_and_create_csv()' and try again.")
    # Do not raise an error to allow the user to manually run the loading block.


‚úÖ A carregar todos os documentos do CSV em cache: ./retrogaming_knowledge_cached.csv

Total de documentos LangChain carregados: 64


##### **04 - Split Documents (Chunking)**

In [None]:
# --- Chunking ---
# RecursiveCharacterTextSplitter is expected to come from the notebook's
# import cell (langchain_text_splitters).

print("Starting document chunking...")

# CHUNK_SIZE and CHUNK_OVERLAP are set in the configuration cell. Length is
# measured with len() and separators are treated as plain strings, not regexes.
text_splitter = RecursiveCharacterTextSplitter(
    is_separator_regex=False,
    length_function=len,
    chunk_overlap=CHUNK_OVERLAP,
    chunk_size=CHUNK_SIZE,
)

# Break every loaded document into chunks and report the totals.
chunks = text_splitter.split_documents(all_docs)
print(f"‚úÖ Split {len(all_docs)} documents into {len(chunks)} total chunks.")

Starting document chunking...
‚úÖ Split 64 documents into 582 total chunks.


##### **05 - Create Embeddings and Store (Vector Store)**

In [None]:
# --- Embeddings and vector store ---
# OpenAIEmbeddings and Chroma are expected to come from the notebook's import
# cell, together with hashlib and warnings.

if not chunks:
    raise ValueError("No chunks to embed. Load the cached CSV and run the chunking cell first.")

# 3. Create the Embeddings Model and Vector Store
print(f"Starting embedding generation and storage in {CHROMA_PATH}...")
# The OpenAIEmbeddings class automatically uses the OPENAI_API_KEY from the environment
embeddings = OpenAIEmbeddings(model=EMBEDDING_MODEL)

# Deterministic IDs: the same chunk at the same position always maps to the
# same ID, so re-running this cell against an existing store should overwrite
# entries instead of appending duplicates.
# NOTE(review): relies on the Chroma integration upserting by ID - confirm
# for the installed langchain/chromadb versions.
chunk_ids = [
    hashlib.sha256(
        f"{position}|{chunk.metadata.get('source', '')}|{chunk.page_content}".encode("utf-8")
    ).hexdigest()
    for position, chunk in enumerate(chunks)
]

# Create a persistent ChromaDB instance
vector_db = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    ids=chunk_ids,
    persist_directory=CHROMA_PATH
)

# Explicit persist() is deprecated in recent releases (data is written
# automatically when persist_directory is set) and may be missing entirely.
# Call it only when available, and silence the deprecation notice.
if hasattr(vector_db, "persist"):
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", DeprecationWarning)
        vector_db.persist()

print(f"‚úÖ Successfully created and persisted vector store with {vector_db._collection.count()} embeddings.")
print(f"   Vector store saved to: {CHROMA_PATH}")
print("\nNotebook 01 is complete. You can now proceed to Notebook 02!")

Starting embedding generation and storage in ./chroma_db_retrogaming...
‚úÖ Successfully created and persisted vector store with 582 embeddings.
   Vector store saved to: ./chroma_db_retrogaming

Notebook 01 is complete. You can now proceed to Notebook 02!


  vector_db.persist()
