In [12]:
# LangChain
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts import ChatPromptTemplate

# Other modules
from pydantic import BaseModel, Field
import os
import tempfile
import streamlit as st
import pandas as pd
from dotenv import load_dotenv


In [13]:
# Load environment variables
load_dotenv()

True

## Defining our LLM

In [19]:
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY") # Loading the OpenAI API Key

In [20]:
# Creating the LLM object
llm = ChatOpenAI(
    model="gpt-4o-mini",
    api_key=OPENAI_API_KEY
)

## Setting the PDF data

### Loading the PDF data & transforming into chunks

In [25]:
loader = PyPDFLoader("data\Pensum.pdf")

data = loader.load() # Loading the data

# Creating our splitter object to chunk the data
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=200,
    length_function=len,
    separators=["\n\n", "\n", " "]
)

chunks = text_splitter.split_documents(data) # Creating the chunks

In [30]:
chunks[0].page_content # ex: One of the chunks created

'UNIVERSIDAD TECNOLÓGICA DE PANAMÁFUTP-SG-JRHA-04-1SECRETARÍA GENERALPLAN DE ESTUDIOFACULTAD:INGENIERÍA DE SISTEMAS COMPUTACIONALESCARRERA:LICENCIATURA EN INGENIERIA DE SISTEMAS Y COMPUTACIONNUM.COD. REQUISITOSASIG.ASIG.ASIGNATURA(COD-ASIG.)CLAS.LAB.CRED.I AÑOPRIMER SEMESTRE17987             505CÁLCULO   I20855             22$$3DIBUJO ASISTIDO POR COMPUTADORAS30741**             324DESARROLLO LÓGICO Y ALGORÍTMOS48353**             32$$4TECNOLOGÍA DE INFORMACIÓN  Y COMUNICAC.50742             303REDACCIÓN DE INFORMES Y EXPRESIÓN ORAL60744             505INGLÉS CONVERSACIONALI AÑOSEGUNDO SEMESTRE77988 7987            505CÁLCULO II88322 7987            404CÁLCULO III90743** 0741            32$$4HERRAMIENTAS DE PROGRAMACIÓN APLICADA I108361**             31$$3SISTEMAS COLABORATIVOS110608             23$$3INTRODUCCIÓN A LA QUÍMICA121392             223ESTADÍSTICA APLICADA A TIC138461             313INGENIERIA AMBIENTALII AÑOVERANO141263             303GEOGRAFÍA DE PANAMÁ151274             3

## Create embeddings

In [33]:
def create_embeddings():
    embeddings = OpenAIEmbeddings(
        model="text-embedding-3-small",
        api_key=OPENAI_API_KEY,
    )
    return embeddings

embedding_function = create_embeddings()

### Creating the vector database

In [39]:
import uuid

def create_vectorstore(chunks, embedding_function, vectorstore_path):
    # Create a list for unique ids for each document based on the content
    ids = [str(uuid.uuid5(uuid.NAMESPACE_DNS, doc.page_content)) for doc in chunks]

    # Ensure that only unique ids are stored
    unique_ids = set()
    unique_chunks = []

    for id, chunk in zip(ids, chunks):
        if id not in unique_ids:
            unique_ids.add(id)
            unique_chunks.append(chunks)
    
    # Create a new Chroma database
    vectorstore = Chroma.from_documents(
        documents=unique_chunks,
        ids=list(unique_ids),
        embedding=embedding_function,
        persist_directory=vectorstore_path
    )

    vectorstore.persist()

    return vectorstore

In [40]:
vectorstore = create_vectorstore(
    chunks=chunks,
    embedding_function=embedding_function,
    vectorstore_path="vectorstore_chroma"
)

AttributeError: 'list' object has no attribute 'page_content'