In [1]:
import os
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI
load_dotenv()

True

# GENERACIÓN DE LA BASE DE DATOS VECTORIAL

In [2]:
from qdrant_client import QdrantClient
import os

# Connect to Qdrant cloud cluster.
# The endpoint and API key are read from environment variables; load_dotenv()
# is called in the first cell, so they may come from a local .env file.
url = os.getenv("QDRANT_URL")  
api_key = os.getenv("QDRANT_KEY")
qdrant = QdrantClient(url=url, api_key=api_key)

# The credentials needed to access Qdrant are provided in the project report (Memoria).

# Define the collection names and vector size.
# One collection per experiment variant: content-only vs. weighted embeddings,
# each built from the text with and without stopwords.
collection_names = [
    "OnlyContent_withStopwords", 
    "OnlyContent_withoutStopwords", 
    "Weighted_withStopwords", 
    "Weighted_withoutStopwords"
]
# NOTE(review): presumably the output dimension of the embedding model used
# in the later cells - confirm before switching models.
vector_size = 768

# Function to create a collection
def create_collection_if_not_exists(collection_name):
    """Create the Qdrant collection `collection_name` unless it already exists.

    Relies on the module-level `qdrant` client and `vector_size`. New
    collections are configured with vectors of `vector_size` dimensions and
    the "Euclid" distance.

    Args:
        collection_name: Name of the collection to check for and create.

    Returns:
        None. The outcome (created / already exists / error) is printed.
        Any exception raised by the client is caught and reported, not
        re-raised.
    """
    try:
        already_present = qdrant.collection_exists(collection_name=collection_name)
        if already_present:
            print(f"La colección '{collection_name}' ya existe.")
            return

        vector_settings = {
            "size": vector_size,
            "distance": "Euclid",
        }
        qdrant.create_collection(
            collection_name=collection_name,
            vectors_config=vector_settings,
        )
        print(f"Colección '{collection_name}' creada correctamente.")
    except Exception as exc:
        print(f"Error al configurar Qdrant para la colección '{collection_name}': {exc}")

# Create all collections: make sure every name in collection_names exists.
# Failures are printed by create_collection_if_not_exists rather than raised,
# so this loop always runs to completion.
for name in collection_names:
    create_collection_if_not_exists(name)


La colección 'OnlyContent_withStopwords' ya existe.
La colección 'OnlyContent_withoutStopwords' ya existe.
La colección 'Weighted_withStopwords' ya existe.
La colección 'Weighted_withoutStopwords' ya existe.


In [3]:
import pandas as pd

# Define the file paths (relative to the notebook's working directory).
# Index 0: cleaned data with stopwords kept; index 1: stopwords removed.
# Built with os.path.join rather than "DATOS\BBDD..." literals: in a normal
# (non-raw) string "\B" is an invalid escape sequence, which Python reports
# with a warning and plans to reject, and the hard-coded backslash only works
# as a separator on Windows. On Windows the resulting strings are unchanged.
file_paths = [
    os.path.join("DATOS", "BBDD_limpia_sin_quitar_stopwords.csv"),
    os.path.join("DATOS", "BBDD_limpia_quitados_stopwords.csv"),
]

# Function to load a dataset
def load_dataset(file_path):
    """Read a CSV file into a pandas DataFrame.

    The file is decoded as ISO-8859-1. A status message is printed for every
    outcome.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        The loaded DataFrame, or None when the file is missing or any other
        error occurs while reading it (the error is printed, not raised).
    """
    try:
        loaded = pd.read_csv(file_path, encoding='ISO-8859-1')
        print(f"Datos cargados correctamente desde {file_path}.")
    except FileNotFoundError as missing:
        print(f"Error al cargar el archivo CSV en {file_path}: {missing}")
        loaded = None
    except Exception as other:
        print(f"Error desconocido al cargar el archivo CSV en {file_path}: {other}")
        loaded = None
    return loaded

# Load both datasets, in the same order as file_paths.
# load_dataset returns None when a file cannot be read, so an entry here may
# be None if loading failed (the error is printed by load_dataset).
datasets = [load_dataset(path) for path in file_paths]


Datos cargados correctamente desde C:\Users\Adriana\OneDrive\Documents\Master_DS\TFM\LIMPIEZA DE DATOS\BBDD_limpia_sin_quitar_stopwords.csv.
Datos cargados correctamente desde C:\Users\Adriana\OneDrive\Documents\Master_DS\TFM\LIMPIEZA DE DATOS\BBDD_limpia_quitados_stopwords.csv.


In [4]:
# datasets[0] is the file with stopwords kept, datasets[1] the one with
# stopwords removed (order of file_paths).
# "content" frames keep only the 'Contenido' column; "all" frames keep every
# column so the remaining ones can be carried along as metadata.
docs_content_stopwords = datasets[0][['Contenido']] 
docs_content_NOstopwords = datasets[1][['Contenido']] 
docs_all_stopwords = datasets[0] 
docs_all_NOstopwords = datasets[1] 

In [5]:
from langchain_community.document_loaders import DataFrameLoader
from langchain.schema import Document

# Turn the content-only frame (stopwords kept) into LangChain documents,
# using the 'Contenido' column as each document's page content.
loader_content_stopwords = DataFrameLoader(docs_content_stopwords, page_content_column="Contenido")
documents_content_stopwords = loader_content_stopwords.load()

# Same conversion for the content-only frame with stopwords removed.
loader_content_NOstopwords = DataFrameLoader(docs_content_NOstopwords, page_content_column="Contenido")
documents_content_NOstopwords = loader_content_NOstopwords.load()

def dataframe_to_documents_with_metadata(df, content_column):
    """Convert every row of `df` into a `Document`.

    Args:
        df: DataFrame holding one document per row.
        content_column: Name of the column whose value becomes the
            document's `page_content`.

    Returns:
        A list of `Document` objects in row order. Each document's metadata
        is a dict of all the row's other columns (everything except
        `content_column`).
    """
    return [
        Document(
            page_content=record[content_column],
            metadata=record.drop(labels=[content_column]).to_dict(),
        )
        for _, record in df.iterrows()
    ]
    
# Documents built from the full frames: 'Contenido' is the page content and
# every other column is stored as metadata.
# With stopwords kept:
documents_all_stopwords = dataframe_to_documents_with_metadata(docs_all_stopwords, content_column = 'Contenido')

# With stopwords removed:
documents_all_NOstopwords = dataframe_to_documents_with_metadata(docs_all_NOstopwords, content_column = 'Contenido')

In [6]:
# Plain page-content strings of the content-only documents (stopwords kept);
# these are the texts embedded and uploaded by the next cell.
document_texts = [doc.page_content for doc in documents_content_stopwords]

In [7]:
from sentence_transformers import SentenceTransformer
from qdrant_client.models import PointStruct

# Load the SentenceTransformer model
embeddings_model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')

# Connect to Qdrant cloud cluster
url = os.getenv("QDRANT_URL")
api_key = os.getenv("QDRANT_KEY")
qdrant = QdrantClient(url=url, api_key=api_key)

collection_name = "OnlyContent_withStopwords"

# Reset before encoding: if encode() fails, the upload step below must not
# pick up embeddings left in the kernel by a previously executed cell and
# store them in this collection.
document_embeddings = None

# Generate embeddings
try:
    document_embeddings = embeddings_model.encode(document_texts)
    print("Embeddings generados correctamente.")
except Exception as e:
    print(f"Error al generar embeddings: {e}")

# Upload only when this run actually produced embeddings; the encoding error
# (if any) has already been reported above.
if document_embeddings is not None:
    try:
        # Create points for Qdrant: the point id is the document's position,
        # and the original text is stored in the payload under "text".
        points = [
            PointStruct(
                id=i,
                vector=embedding.tolist(),  # numpy array -> plain list
                payload={"text": text},
            )
            for i, (text, embedding) in enumerate(zip(document_texts, document_embeddings))
        ]
        # Upload points to Qdrant
        qdrant.upsert(
            collection_name=collection_name,
            points=points
        )
        print("Datos almacenados en Qdrant correctamente.")
    except Exception as e:
        print(f"Error al almacenar los datos en Qdrant: {e}")


  from tqdm.autonotebook import tqdm, trange


Embeddings generados correctamente.
Datos almacenados en Qdrant correctamente.


In [8]:
# Plain page-content strings of the content-only documents with stopwords
# removed. This rebinds document_texts, replacing the list used by the
# previous upload cell.
document_texts = [doc.page_content for doc in documents_content_NOstopwords]

In [9]:
# Load the SentenceTransformer model
embeddings_model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')

# Connect to Qdrant cloud cluster
url = os.getenv("QDRANT_URL")
api_key = os.getenv("QDRANT_KEY")
qdrant = QdrantClient(url=url, api_key=api_key)

collection_name = "OnlyContent_withoutStopwords"

# Reset before encoding: without this, a failed encode() would leave the
# embeddings of the previous cell in document_embeddings and the upload step
# would store them in this collection paired with the wrong texts.
document_embeddings = None

# Generate embeddings
try:
    document_embeddings = embeddings_model.encode(document_texts)
    print("Embeddings generados correctamente.")
except Exception as e:
    print(f"Error al generar embeddings: {e}")

# Upload only when this run actually produced embeddings; the encoding error
# (if any) has already been reported above.
if document_embeddings is not None:
    try:
        # Create points for Qdrant: the point id is the document's position,
        # and the original text is stored in the payload under "text".
        points = [
            PointStruct(
                id=i,
                vector=embedding.tolist(),  # numpy array -> plain list
                payload={"text": text},
            )
            for i, (text, embedding) in enumerate(zip(document_texts, document_embeddings))
        ]
        # Upload points to Qdrant
        qdrant.upsert(
            collection_name=collection_name,
            points=points
        )
        print("Datos almacenados en Qdrant correctamente.")
    except Exception as e:
        print(f"Error al almacenar los datos en Qdrant: {e}")


Embeddings generados correctamente.
Datos almacenados en Qdrant correctamente.


In [10]:
# Shallow copy of the full documents (stopwords kept). Despite the name, the
# entries are Document objects with metadata, not plain strings.
document_texts = list(documents_all_stopwords)

In [11]:
from sentence_transformers import SentenceTransformer
from qdrant_client.models import PointStruct
import numpy as np

# Load the SentenceTransformer model
# NOTE(review): the same model and client are re-created in every upload
# cell; this cell does not rely on the instances built earlier.
embeddings_model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')

# Connect to Qdrant cloud cluster (credentials come from environment variables)
url = os.getenv("QDRANT_URL")
api_key = os.getenv("QDRANT_KEY")
qdrant = QdrantClient(url=url, api_key=api_key)

# Target collection for the weighted embeddings of the text with stopwords kept
collection_name = "Weighted_withStopwords"

# Function to combine embeddings with weights
def weighted_embed_document(document, embedding_model, content_weight=0.7, title_weight=0.2, author_weight=0.1):
    """Embed a document as a weighted sum of its content, title and author.

    Each part is encoded separately with `embedding_model.encode`, scaled by
    its weight, and the three scaled vectors are added together.

    Args:
        document: Object exposing `page_content` and a `metadata` mapping.
            The title and author are looked up under the metadata keys
            'Title' and 'Author'; a missing key is encoded as ''.
        embedding_model: Object with an `encode(text)` method.
        content_weight: Multiplier for the content embedding.
        title_weight: Multiplier for the title embedding.
        author_weight: Multiplier for the author embedding.

    Returns:
        The combined embedding (same type as `encode` returns after scaling
        and addition).
    """
    encode = embedding_model.encode

    blended = encode(document.page_content) * content_weight
    blended = blended + encode(document.metadata.get('Title', '')) * title_weight
    blended = blended + encode(document.metadata.get('Author', '')) * author_weight
    return blended
    
# Generate embeddings
# Function to embed all documents
def embed_documents(documents, embedding_model, content_weight=0.7, title_weight=0.2, author_weight=0.1):
    """Compute the weighted embedding of every document.

    Args:
        documents: Iterable of documents accepted by `weighted_embed_document`.
        embedding_model: Model forwarded to `weighted_embed_document`.
        content_weight: Weight forwarded for the content embedding.
        title_weight: Weight forwarded for the title embedding.
        author_weight: Weight forwarded for the author embedding.

    Returns:
        A list with one combined embedding per document, in input order.
    """
    return [
        weighted_embed_document(item, embedding_model, content_weight, title_weight, author_weight)
        for item in documents
    ]

# Reset before embedding: if embed_documents() fails, the upload step below
# must not pick up embeddings left in the kernel by a previously executed
# cell and store them in this collection.
document_embeddings = None

# Generate the weighted embeddings (document_texts holds Document objects here)
try:
    document_embeddings = embed_documents(document_texts, embeddings_model)
    print("Embeddings generados correctamente.")
except Exception as e:
    print(f"Error al generar embeddings: {e}")

# Upload only when this run actually produced embeddings; the embedding error
# (if any) has already been reported above.
if document_embeddings is not None:
    try:
        # Create points for Qdrant
        points = [
            PointStruct(
                id=i,  # Unique identifier for each point (document position)
                vector=embedding.tolist(),  # Convert numpy.ndarray to list
                payload={
                    "content": document.page_content,  # Main content
                    "title": document.metadata.get('Title', ''),  # Title metadata
                    "author": document.metadata.get('Author', '')  # Author metadata
                }
            )
            for i, (embedding, document) in enumerate(zip(document_embeddings, document_texts))
        ]

        # Upload points to Qdrant
        qdrant.upsert(
            collection_name=collection_name,
            points=points
        )
        print("Datos almacenados en Qdrant correctamente.")
    except Exception as e:
        print(f"Error al almacenar los datos en Qdrant: {e}")

Embeddings generados correctamente.
Datos almacenados en Qdrant correctamente.


In [12]:
# Shallow copy of the full documents with stopwords removed. Despite the
# name, the entries are Document objects with metadata, not plain strings.
document_texts = list(documents_all_NOstopwords)

In [13]:
# Load the SentenceTransformer model
# NOTE(review): the same model and client are re-created in every upload
# cell; this cell does not rely on the instances built earlier.
embeddings_model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')

# Connect to Qdrant cloud cluster (credentials come from environment variables)
url = os.getenv("QDRANT_URL")
api_key = os.getenv("QDRANT_KEY")
qdrant = QdrantClient(url=url, api_key=api_key)

# Target collection for the weighted embeddings of the text with stopwords removed
collection_name = "Weighted_withoutStopwords"

# Function to combine embeddings with weights
# NOTE(review): this repeats the definition from the Weighted_withStopwords
# cell; the later definition replaces the earlier one in the kernel.
def weighted_embed_document(document, embedding_model, content_weight=0.7, title_weight=0.2, author_weight=0.1):
    """Embed a document as a weighted sum of its content, title and author.

    Each part is encoded separately with `embedding_model.encode`, scaled by
    its weight, and the three scaled vectors are added together.

    Args:
        document: Object exposing `page_content` and a `metadata` mapping.
            The title and author are looked up under the metadata keys
            'Title' and 'Author'; a missing key is encoded as ''.
        embedding_model: Object with an `encode(text)` method.
        content_weight: Multiplier for the content embedding.
        title_weight: Multiplier for the title embedding.
        author_weight: Multiplier for the author embedding.

    Returns:
        The combined embedding (same type as `encode` returns after scaling
        and addition).
    """
    encode = embedding_model.encode

    blended = encode(document.page_content) * content_weight
    blended = blended + encode(document.metadata.get('Title', '')) * title_weight
    blended = blended + encode(document.metadata.get('Author', '')) * author_weight
    return blended
    
# Generate embeddings
# Function to embed all documents
# NOTE(review): this repeats the definition from the Weighted_withStopwords
# cell; the later definition replaces the earlier one in the kernel.
def embed_documents(documents, embedding_model, content_weight=0.7, title_weight=0.2, author_weight=0.1):
    """Compute the weighted embedding of every document.

    Args:
        documents: Iterable of documents accepted by `weighted_embed_document`.
        embedding_model: Model forwarded to `weighted_embed_document`.
        content_weight: Weight forwarded for the content embedding.
        title_weight: Weight forwarded for the title embedding.
        author_weight: Weight forwarded for the author embedding.

    Returns:
        A list with one combined embedding per document, in input order.
    """
    return [
        weighted_embed_document(item, embedding_model, content_weight, title_weight, author_weight)
        for item in documents
    ]

# Reset before embedding: without this, a failed embed_documents() would
# leave the embeddings of the previous cell in document_embeddings and the
# upload step would store them in this collection paired with the wrong
# documents.
document_embeddings = None

# Generate the weighted embeddings (document_texts holds Document objects here)
try:
    document_embeddings = embed_documents(document_texts, embeddings_model)
    print("Embeddings generados correctamente.")
except Exception as e:
    print(f"Error al generar embeddings: {e}")

# Upload only when this run actually produced embeddings; the embedding error
# (if any) has already been reported above.
if document_embeddings is not None:
    try:
        # Create points for Qdrant
        points = [
            PointStruct(
                id=i,  # Unique identifier for each point (document position)
                vector=embedding.tolist(),  # Convert numpy.ndarray to list
                payload={
                    "content": document.page_content,  # Main content
                    "title": document.metadata.get('Title', ''),  # Title metadata
                    "author": document.metadata.get('Author', '')  # Author metadata
                }
            )
            for i, (embedding, document) in enumerate(zip(document_embeddings, document_texts))
        ]

        # Upload points to Qdrant
        qdrant.upsert(
            collection_name=collection_name,
            points=points
        )
        print("Datos almacenados en Qdrant correctamente.")
    except Exception as e:
        print(f"Error al almacenar los datos en Qdrant: {e}")

Embeddings generados correctamente.
Datos almacenados en Qdrant correctamente.
