<a href="https://colab.research.google.com/github/Sebas-gifPaz777/Qubika_reto/blob/main/Proyecto_Qubika.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
from datetime import datetime
import re
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
!pip install langchain_openai langchain_community
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.document_loaders import TextLoader

## BeautifulSoup para web scraping

In [None]:
# Listing URL (page 1 of a BBC Mundo topic) handed to scrape_bbc_news.
url = "https://www.bbc.com/mundo/topics/c06gq9v4xp3t?page=1"
# Number of listing pages handed to scrape_bbc_news.
pages = 20
def _fetch_soup(page_url, timeout):
    """Download ``page_url`` and return it parsed, or ``None`` if the request fails."""
    try:
        response = requests.get(page_url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as error:
        # Best effort: one broken page must not abort the whole crawl.
        print(f"Skipping {page_url}: {error}")
        return None
    return BeautifulSoup(response.content, "html.parser")


def scrape_bbc_news(url, pages, timeout=30):
    """Scrape the title and body text of articles listed on a paginated BBC page.

    Args:
        url: Listing URL. Everything from the first occurrence of "page"
            onward is replaced by ``page=<n>``; if the URL has no "page"
            part, the parameter is appended as a query argument.
        pages: Number of listing pages to visit, starting at page 1.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A pandas DataFrame with the columns ``title`` and ``content``, one row
        per article that could be downloaded. Pages or articles whose request
        fails are skipped.
    """
    from urllib.parse import urljoin

    # The prefix in front of the page number is the same for every listing page.
    if "page" in url:
        base_url = re.split("page", url)[0]
    else:
        base_url = url + ("&" if "?" in url else "?")

    news_data = []
    for page_number in range(1, pages + 1):
        listing_url = f"{base_url}page={page_number}"
        listing_soup = _fetch_soup(listing_url, timeout)
        if listing_soup is None:
            continue

        for article in listing_soup.find_all("li", class_="bbc-t44f9r"):
            anchor = article.find("a", href=True)
            if anchor is None:
                continue

            # urljoin leaves absolute links untouched and resolves relative ones.
            article_soup = _fetch_soup(urljoin(listing_url, anchor["href"]), timeout)
            if article_soup is None:
                continue

            heading = article_soup.find("h1", class_="bbc-14gqcmb e1p3vdyi0")
            title = heading.text if heading is not None else "No title"

            paragraphs = article_soup.find_all("p", class_="bbc-hhl7in e17g058b0")
            content = " ".join(p.get_text() for p in paragraphs)
            # Keep only the text in front of the "read more stories" footer.
            content = content.partition("Haz clic aquí para leer más historias")[0].strip()

            news_data.append({"title": title, "content": content})

    # Explicit columns keep the frame usable even when nothing was scraped.
    return pd.DataFrame(news_data, columns=["title", "content"])

# Run the scraper and report how many rows it returned.
df_news = scrape_bbc_news(url, pages)
print(len(df_news))


## Limpieza de datos

In [None]:
# Drop rows that repeat a title, then rows that repeat a body text.
df_news_unique = (
    df_news
    .drop_duplicates(subset=["title"])
    .drop_duplicates(subset=["content"])
)

print(f"Noticias originales: {len(df_news)}, Noticias sin duplicados: {len(df_news_unique)}")


In [None]:
def preprocess_text(text: str) -> str:
    """Normalise a piece of text for indexing.

    The text is lower-cased, every character that is neither a word character
    nor whitespace is removed, runs of whitespace are collapsed to a single
    space and the result is stripped.

    Args:
        text: Raw text.

    Returns:
        The normalised text.
    """
    text = text.lower()
    # Strip punctuation first: removing a symbol that sits between two spaces
    # would otherwise leave a double space behind after the collapse step.
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# Normalise both text columns with preprocess_text.
for column in ('title', 'content'):
    df_news_unique[column] = df_news_unique[column].apply(preprocess_text)

# Show the first rows as a sanity check.
print(df_news_unique.head())

In [None]:
# Write the cleaned articles to news.csv (without the index column).
df_news_unique.to_csv("news.csv", index=False)

## Se añaden los datos a la base de datos vectorial

In [None]:
# Reload the cleaned articles. keep_default_na=False keeps empty cells as ""
# instead of NaN, which would otherwise become the literal text "nan" when the
# title and content are later concatenated with str().
df_news_unique = pd.read_csv("news.csv", keep_default_na=False)

In [None]:
from langchain.docstore.document import Document

In [None]:
from sentence_transformers import SentenceTransformer

In [None]:
!pip install chromadb

In [None]:
import chromadb
from chromadb.config import Settings
from chromadb.utils import embedding_functions
from chromadb import Client

In [None]:
# NOTE(review): depending on the installed chromadb version, Client(Settings(...))
# may be in-memory only unless persistence is enabled explicitly — confirm.
persist_directory = "chromadb_data"
client = Client(Settings(persist_directory=persist_directory))

# Create the collection, or reuse it if it already exists.
collection_name = "news_collection9"
collection = client.get_or_create_collection(name=collection_name)

# Sentence-transformers model used to embed chunks and queries
# (swap in another model name if preferred).
model = SentenceTransformer('all-MiniLM-L6-v2')

# One text per article: title and content separated by a space.
list_documents = [
    f"{row['title']!s} {row['content']!s}"
    for _, row in df_news_unique.iterrows()
]

# Wrap every text in a Document.
documents = [Document(page_content=text) for text in list_documents]

print(documents[3].page_content)

In [None]:
# Dividir en chunks
def split_text(documents):
    """Split documents into overlapping chunks.

    Uses RecursiveCharacterTextSplitter with a chunk size of 400 and an
    overlap of 100, both measured with ``len``; ``add_start_index`` is enabled.

    Args:
        documents: Documents to split.

    Returns:
        The chunks produced by ``split_documents``.
    """
    splitter_options = {
        "chunk_size": 400,
        "chunk_overlap": 100,
        "length_function": len,
        "add_start_index": True,
    }
    return RecursiveCharacterTextSplitter(**splitter_options).split_documents(documents)

# Chunk every document.
chunks_list = split_text(documents)

# Collect the text and the start offset of each chunk, then embed the texts.
texts = [piece.page_content for piece in chunks_list]
metadatas = [
    {"start_index": piece.metadata["start_index"]}
    for piece in chunks_list
]
embeddings = model.encode(texts, show_progress_bar=True)

# Ids are positional: chunk_0, chunk_1, ...
ids = [f"chunk_{position}" for position, _ in enumerate(texts)]

# Store the chunks in the collection.
collection.add(ids=ids, documents=texts, metadatas=metadatas, embeddings=embeddings)

# Confirm how many chunks the collection holds.
print(f"Se almacenaron {collection.count()} chunks en la colección.")

# Run a sample query against the collection.
query = "trump y elon musk"
query_embedding = model.encode([query])[0]

results = collection.query(query_embeddings=[query_embedding], n_results=2)
print("Resultados de la búsqueda:")
# query() returns one list of hits per query embedding; only one query was
# sent, so its hits are at index 0. The previous loop zipped three sequences
# into two names, which raised ValueError before anything was printed.
for doc, metadata in zip(results["documents"][0], results["metadatas"][0]):
    print(f"Documento: {doc}")
    print(f"Metadata: {metadata}")


In [None]:
import json

In [None]:
# Fetch every stored entry with its document text, metadata and embedding.
all_docs = collection.get(include=["documents", "metadatas", "embeddings"])
# Turn each embedding into a plain list so that json can serialise it.
all_docs["embeddings"] = [vector.tolist() for vector in all_docs["embeddings"]]

# Save the collection contents as a JSON file.
with open("collection_backup.json", "w") as backup_file:
    backup_file.write(json.dumps(all_docs))

## RAG (Retrieval Augmented Generation)

In [None]:
import google.generativeai as genai

In [None]:
# Load the generative model.
import os

# The API key is read from the environment so it is never stored in the
# notebook. SECURITY: a key was previously hard-coded here; treat that key as
# leaked and revoke it.
gemini_api_key = os.environ.get("GOOGLE_API_KEY")
if not gemini_api_key:
    raise RuntimeError("Set the GOOGLE_API_KEY environment variable before running this cell.")
genai.configure(api_key=gemini_api_key)

modelGemini = genai.GenerativeModel(
    model_name='gemini-1.5-pro',
    tools='code_execution')

In [None]:
def retrieve_documents(query, top_k=2):
    """Embed ``query`` and fetch the closest entries from the collection.

    Args:
        query: Text to search for.
        top_k: Number of results requested from the collection.

    Returns:
        The entries of the query result's ``documents`` field, which are also
        printed. NOTE(review): Chroma presumably returns one list of hits per
        query embedding, so this would be a single-element list wrapping the
        hits — confirm.
    """
    encoded_query = model.encode([query])[0]
    hits = collection.query(query_embeddings=[encoded_query], n_results=top_k)

    retrieved_docs = []
    for documents_entry, _metadata_entry in zip(hits["documents"], hits["metadatas"]):
        retrieved_docs.append(documents_entry)

    print(retrieved_docs)
    return retrieved_docs

def generate_answer(query, top_k=3):
    """Answer ``query`` with the Gemini model, using retrieved chunks as context.

    Args:
        query: The question; it is also used as the retrieval query.
        top_k: Number of chunks requested from ``retrieve_documents``.

    Returns:
        The ``text`` of the model response.
    """
    retrieved_docs = retrieve_documents(query, top_k=top_k)

    # The retrieved entries may be passages or lists of passages; flatten one
    # level so each passage becomes one line of context.
    passages = []
    for entry in retrieved_docs:
        if isinstance(entry, str):
            passages.append(entry)
        else:
            passages.extend(entry)

    # Join the passages themselves. Joining str(list) would insert a newline
    # between every single character of the list's repr.
    context = "\n".join(passages)
    prompt = f"A continuación tenemos el siguiente contexto separado con saltos de linea, las partes que esten relacionados con la consulta usalos: {context}\n ---------------- \n Responde la consulta teniendo en cuenta las partes de contexto utiles y hazlo en español: {query}\n --------------- \nRespuesta:"
    response = modelGemini.generate_content(prompt)
    return response.text


## Evaluación

In [None]:
!pip install deepeval

In [None]:
from deepeval import evaluate

In [None]:

# Función para evaluar la calidad de las respuestas
def evaluate_responses(queries, ground_truths, top_k=3):
    """Generate an answer per query and score the answers against references.

    Args:
        queries: Questions passed one by one to ``generate_answer``.
        ground_truths: Expected answers, passed to ``evaluate`` as-is.
        top_k: Forwarded to ``generate_answer``.

    Returns:
        Whatever ``evaluate`` returns; its 'accuracy', 'bleu' and 'rouge'
        entries are printed.

    NOTE(review): confirm that the installed deepeval ``evaluate`` accepts two
    lists of strings and returns a mapping with these keys.
    """
    generated_responses = []
    for query in queries:
        generated_responses.append(generate_answer(query, top_k))
    metrics = evaluate(generated_responses, ground_truths)

    print("Resultados de Evaluación:")
    report_lines = (
        ("Exactitud:", 'accuracy'),
        ("BLEU Score:", 'bleu'),
        ("ROUGE-L:", 'rouge'),
    )
    for label, key in report_lines:
        print(label, metrics[key])
    return metrics


In [None]:
queries = ["Hablame sobre el megapuerto chancay en perú"]

# Expected answers (ground truths).
# NOTE(review): this reference text talks about a different subject than the
# query above — confirm the pair is intentional.
ground_truths = ["El presidente electo de Estados Unidos, Donald Trump, anunció este martes que encargó a Elon Musk, propietario de Tesla y actualmente el hombre más rico del mundo, que lidere junto al ex candidato presidencial republicano Vivek Ramaswamy."]

# Generate the RAG answers and show them.
generated_responses = list(map(generate_answer, queries))
print(generated_responses)

# Score the answers.
metrics = evaluate_responses(queries, ground_truths)

## Comparación de noticias (BBC y El Tiempo)

In [None]:
def web_scraping_bbc_new(url, timeout=30):
    """Download one BBC article and return its title and body as a single string.

    Args:
        url: Address of the article.
        timeout: Seconds to wait for the HTTP request; without a timeout a
            stalled connection would block forever.

    Returns:
        ``"<title> \\n<content>"``, which is also printed. The title falls back
        to "No title" when the heading is not found, and the content stops in
        front of the "Haz clic aquí para leer más historias" footer.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, "html.parser")

    heading = soup.find("h1", class_="bbc-14gqcmb e1p3vdyi0")
    title = heading.text if heading is not None else "No title"

    paragraphs = soup.find_all('p', class_="bbc-hhl7in e17g058b0")
    content = " ".join(p.get_text() for p in paragraphs)
    content = content.partition("Haz clic aquí para leer más historias")[0].strip()

    new_data = f"{title} \n{content}"
    print(new_data)
    return new_data

In [None]:
def web_scraping_el_tiempo_new(url, timeout=30):
    """Download one El Tiempo article and return its title and body as a single string.

    Only bare text and ``<b>`` elements that are direct children of the
    ``div.paragraph`` blocks are collected; other child tags are skipped.

    Args:
        url: Address of the article.
        timeout: Seconds to wait for the HTTP request; without a timeout a
            stalled connection would block forever.

    Returns:
        ``"<title> \\n<content>"``, which is also printed. The title falls back
        to "No title" when the heading is not found.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, "html.parser")

    heading = soup.find("h1", class_="c-detail__title")
    title = heading.text if heading is not None else "No title"

    pieces = []
    for paragraph in soup.find_all("div", class_="paragraph"):
        for child in paragraph.children:
            if child.name == "b":
                # Bold run: take its text without surrounding whitespace.
                pieces.append(child.get_text(strip=True))
            elif child.name is None:
                # Bare text node (no tag name).
                pieces.append(child.strip())

    # Every piece is followed by a single space, as before.
    content = "".join(f"{piece} " for piece in pieces)

    new_data = f"{title} \n{content}"
    print(new_data)
    return new_data

In [None]:
from transformers import pipeline

# Load the summarization pipeline that summarize_text calls.
# NOTE(review): the scraped articles appear to be in Spanish — confirm this checkpoint handles Spanish input.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

def split_text(text, max_length=1024):
    """Split ``text`` into chunks of whole sentences.

    Sentences are detected by the separator ". " and packed greedily into
    chunks of at most ``max_length`` characters. A single sentence longer than
    ``max_length`` is kept whole as its own chunk.

    Args:
        text: Text to split.
        max_length: Maximum chunk length, in characters.

    Returns:
        A list of non-empty, stripped chunks; empty for blank input.
    """
    sentences = text.split(". ")
    # split() removed the period of every sentence except the last one; put it
    # back there only, so a final sentence never ends up with two periods.
    pieces = [sentence + "." for sentence in sentences[:-1]] + sentences[-1:]

    chunks = []
    current_parts = []
    current_length = 0

    for piece in pieces:
        if not piece.strip():
            continue
        # One extra character for the space that joins it to the previous piece.
        added_length = len(piece) + (1 if current_parts else 0)
        if current_parts and current_length + added_length > max_length:
            chunks.append(" ".join(current_parts).strip())
            current_parts = [piece]
            current_length = len(piece)
        else:
            current_parts.append(piece)
            current_length += added_length

    # Flush the last chunk; nothing is appended when no text was collected.
    if current_parts:
        chunks.append(" ".join(current_parts).strip())

    return chunks

# Scrape each article and fragment its text.
bbc_article_url = "https://www.bbc.com/mundo/articles/cnvj2z7evp8o"
el_tiempo_article_url = "https://www.eltiempo.com/mundo/eeuu-y-canada/donald-trump-nombra-a-elon-musk-como-director-del-departamento-de-eficiencia-gubernamental-de-estados-unidos-3399054"

news_1 = split_text(web_scraping_bbc_new(bbc_article_url))
news_2 = split_text(web_scraping_el_tiempo_new(el_tiempo_article_url))


# Generar resúmenes para cada fragmento

def summarize_text(text_chunks):
    """Summarise every chunk and join the partial summaries with spaces.

    Args:
        text_chunks: Iterable of text fragments.

    Returns:
        One string made of the ``summary_text`` of each chunk, in order.
    """
    partial_summaries = [
        summarizer(piece, max_length=100, min_length=50, do_sample=False)[0]['summary_text']
        for piece in text_chunks
    ]
    return " ".join(partial_summaries)

# Replace each list of fragments with its combined summary.
news_1 = summarize_text(news_1)
news_2 = summarize_text(news_2)

# Print both summaries under their headings.
for heading, summary in (("Resumen completo 1:", news_1), ("Resumen completo 2:", news_2)):
    print(heading)
    print(summary)

In [None]:
# Ask generate_answer to compare the two summaries; `answer` is the text narrated by the TTS cell.
answer = generate_answer(f"Compara estas dos noticias, noticia 1:{news_1} \n noticia 2:{news_2}")

## Narración de la respuesta

In [None]:
pip install TTS

In [None]:
from TTS.api import TTS

# Sample sentence.
# NOTE(review): `text` is not used below — the narration reads `answer`.
text = "Este es un ejemplo de narración en español usando Coqui TTS."

# Load the pretrained model.
tts = TTS(model_name="tts_models/es/mai/tacotron2-DDC", progress_bar=True)

# Narrate the generated answer into a WAV file.
tts.tts_to_file(text=answer, file_path="narracion_coqui.wav")
print("Archivo guardado como narracion_coqui.wav")