In [None]:
import os
# Extra arguments handed to PySpark's launcher: request the Kafka SQL
# connector package, followed by the "pyspark-shell" marker.
os.environ['PYSPARK_SUBMIT_ARGS'] = " ".join([
    "--packages=org.apache.spark:spark-sql-kafka-0-10_2.12:3.0.0",
    "pyspark-shell",
])

In [None]:
from pyspark.sql import SparkSession
import pandas as pd
import uuid
import random
import json
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
import requests
import torch

ModuleNotFoundError: No module named 'findspark'

In [None]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model shared by the rest of the notebook.
# NOTE(review): trust_remote_code=True allows code shipped with the model
# repository to run locally — confirm the source is trusted.
model = SentenceTransformer(
    "nomic-ai/nomic-embed-text-v1",
    trust_remote_code=True,
)


In [None]:
# Create the Spark session for this notebook, or reuse the one already running.
spark = (
    SparkSession.builder
    .appName('RealtimeKafkaML')
    .getOrCreate()
)

In [None]:
# Streaming DataFrame over the "datipipe" Kafka topic on broker:29092,
# starting from the latest offsets.
df_raw = (
    spark.readStream
    .format('kafka')
    .option('kafka.bootstrap.servers', "broker:29092")
    .option("startingOffsets", "latest")
    .option('subscribe', "datipipe")
    .load()
)

In [None]:
# Cast the Kafka record's `value` column to a string and expose it as `json`.
df_json = df_raw.select(col("value").cast("string").alias("json"))

In [None]:
# Schema of a single article object; every field is a nullable string except
# `source`, which is a nullable struct of two nullable strings.
article_schema = (
    StructType()
    .add("url", StringType(), True)
    .add("publishedAt", StringType(), True)
    .add("description", StringType(), True)
    .add(
        "source",
        StructType()
        .add("name", StringType(), True)
        .add("id", StringType(), True),
        True,
    )
    .add("title", StringType(), True)
    .add("urlToImage", StringType(), True)
    .add("content", StringType(), True)
    .add("author", StringType(), True)
)

# Schema of the whole JSON message; `articles` holds one article struct.
schema = (
    StructType()
    .add("@timestamp", StringType(), True)
    .add("articles", article_schema, True)
    .add("@version", StringType(), True)
    .add("status", StringType(), True)
    .add("totalResults", StringType(), True)
)

In [None]:
class CustomEmbeddingFunction:
    """Callable that maps a text, or a list of texts, to embedding vectors.

    The embeddings come from an object exposing
    ``encode(list_of_texts, convert_to_tensor=False)`` whose result supports
    ``tolist()``.

    Args:
        embedding_model: Model used to encode text. When omitted, the
            notebook-level ``model`` is used, so ``CustomEmbeddingFunction()``
            behaves as before.
    """

    def __init__(self, embedding_model=None):
        # Only touch the notebook global when no model was supplied, so the
        # class can be built and tested without it.
        self.model = model if embedding_model is None else embedding_model

    def __call__(self, input):
        """Return one embedding per text.

        Args:
            input: A single text or a list of texts.

        Returns:
            A list of embeddings (each a list of numbers), in input order. A
            single non-list input yields a one-element list.
        """
        if isinstance(input, list):
            return [self.generate_embeddings(text) for text in input]
        return [self.generate_embeddings(input)]

    def generate_embeddings(self, text):
        """Encode one text; empty or ``None`` text yields an empty list.

        NOTE(review): an empty vector may not be accepted by the vector store
        downstream — confirm how empty documents should be handled.
        """
        if not text:
            return []
        # encode() is given a one-element batch; take its only row.
        embeddings = self.model.encode([text], convert_to_tensor=False)
        return embeddings.tolist()[0]


Column<'json'>

In [None]:
import chromadb
# Instantiate the embedding callable defined above.
embedding_function = CustomEmbeddingFunction()

# Open a persistent Chroma client and fetch (or create) the "test" collection,
# wired to the embedding callable.
client = chromadb.PersistentClient()
collection = client.get_or_create_collection(
    name="test",
    embedding_function=embedding_function,
)


In [None]:

# Sample texts used to try out the collection.
sentences = [
    "Who is Laurens van der Maaten?",
    "What is machine learning?",
]

# Add the texts under fixed ids; only documents and ids are passed, so the
# embeddings are presumably produced by the collection's embedding function.
collection.add(
    documents=sentences,
    ids=["id4", "id5"],
)

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Splitter used to cut article text into chunks (chunk_size=1000, chunk_overlap=200).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)


In [None]:
import hashlib
def generate_sha256_hash_from_text(text) -> str:
    """Return the hexadecimal SHA-256 digest of ``text`` encoded as UTF-8."""
    encoded = text.encode('utf-8')
    return hashlib.sha256(encoded).hexdigest()

In [None]:
#df_transformed = df_json.select(from_json(col("json"), article_schema).alias('data'))
#df_descriptions = df_transformed.select("data.articles.description")
import uuid

# Definisci una funzione per inviare i dati a ChromaDB
def send_to_chroma(batch_df, epoch_id, collection):
    """Store the text chunks of every row of a micro-batch in a collection.

    Each row's ``content`` is split with the notebook-level ``text_splitter``.
    Every chunk is stored under the SHA-256 hash of its text (via
    ``generate_sha256_hash_from_text``), so a chunk that is already stored is
    not added again.

    Args:
        batch_df: DataFrame of the current micro-batch; rows must expose a
            ``content`` field.
        epoch_id: Batch identifier passed by ``foreachBatch`` (unused).
        collection: Object providing ``get(ids=...)`` and
            ``add(documents=..., ids=...)``.
    """
    # Walk every row of the batch.
    for row in batch_df.collect():
        document = row['content']  # the field of interest is 'content'
        if not document:
            # Null or empty content: nothing to split or store.
            continue

        # hash -> chunk. One id per chunk; the dict also collapses identical
        # chunks so the same id is never sent twice in one add() call.
        chunks_by_id = {}
        for chunk in text_splitter.split_text(document):
            chunks_by_id.setdefault(generate_sha256_hash_from_text(chunk), chunk)
        if not chunks_by_id:
            continue

        already_stored = set(collection.get(ids=list(chunks_by_id))["ids"])
        if already_stored:
            print("Testo gia presente")
        new_ids = [chunk_id for chunk_id in chunks_by_id if chunk_id not in already_stored]
        if not new_ids:
            # Everything is stored already; move on to the next row.
            continue

        try:
            # Send only the chunks that are not stored yet.
            collection.add(documents=[chunks_by_id[chunk_id] for chunk_id in new_ids], ids=new_ids)
        except Exception as e:
            print(f"\rErrore nell'invio del documento a ChromaDB: {e}",end = "")

#query = df_descriptions.writeStream.foreachBatch(lambda df, epoch_id: send_to_chroma(df, epoch_id, collection)).start()

#query.awaitTermination()


In [None]:
def send_to_chroma(batch_df, epoch_id, collection):
    """Store the text chunks of every row of a micro-batch in a collection.

    Each row's ``content`` is split with the notebook-level ``text_splitter``.
    Every chunk is stored under the SHA-256 hash of its text (via
    ``generate_sha256_hash_from_text``), which makes the write idempotent:
    chunks whose hash is already stored are skipped, the others are added.

    A row that is already fully stored no longer ends the whole batch; the
    remaining rows are still processed.

    Args:
        batch_df: DataFrame of the current micro-batch; rows must expose a
            ``content`` field.
        epoch_id: Batch identifier passed by ``foreachBatch`` (unused).
        collection: Object providing ``get(ids=...)`` and
            ``add(documents=..., ids=...)``.
    """
    # Walk every row of the batch.
    for row in batch_df.collect():
        document = row['content']  # the field of interest is 'content'
        if not document:
            # Null or empty content: nothing to split or store.
            continue

        # hash -> chunk. The dict keeps first-seen order and collapses
        # identical chunks so the same id is never sent twice in one add().
        chunks_by_id = {}
        for chunk in text_splitter.split_text(document):
            chunks_by_id.setdefault(generate_sha256_hash_from_text(chunk), chunk)
        if not chunks_by_id:
            # Avoid querying the collection with an empty id list.
            continue

        already_stored = set(collection.get(ids=list(chunks_by_id))["ids"])
        if already_stored:
            print("Testo gia presente")
        new_ids = [chunk_id for chunk_id in chunks_by_id if chunk_id not in already_stored]
        if not new_ids:
            # Skip this row only; the rest of the batch must still be handled.
            continue

        try:
            # Send only the chunks that are not stored yet.
            collection.add(documents=[chunks_by_id[chunk_id] for chunk_id in new_ids], ids=new_ids)
        except Exception as e:
            print(f"\rErrore nell'invio del documento a ChromaDB: {e}",end = "")


In [None]:
# Parse each message with `schema`, keep only the article `content`, pass every
# micro-batch to send_to_chroma, then wait for the query to terminate.
(
    df_json
    .select(from_json(df_json.json, schema).alias('rowdata'))
    .select('rowdata.articles.content')
    .writeStream
    .foreachBatch(lambda df, epoch_id: send_to_chroma(df, epoch_id, collection))
    .start()
    .awaitTermination()
)

In [None]:
# Run a sample text query against the collection and display the result.
query = collection.query(
    query_texts=["what is euclidian telescope"],
)
query

In [None]:

# Sample texts to store.
sentences = ["Who is Laurens van der Maaten?", "What is machine learning?"]

# Add the texts, keying each one by the SHA-256 hash of its own content.
# (The previous call used generate_sha256_hash(), which is not defined in this
# notebook.)
collection.add(
    documents=sentences,
    ids=[generate_sha256_hash_from_text(sentence) for sentence in sentences],
)

In [None]:
# Scratch check: hash two slightly altered texts and the two original texts,
# then look the altered texts' hashes up in the collection.
sentences = [
    "Who is Laurenssaas van der Maaten?",
    "What is machisdadane learning?",
]
caxx = [
    "Who is Laurens van der Maaten?",
    "What is machine learning?",
]

hash2 = list(map(generate_sha256_hash_from_text, caxx))
# NOTE: `hash` rebinds the built-in name for the rest of the session.
hash = list(map(generate_sha256_hash_from_text, sentences))
o = collection.get(ids=list(map(generate_sha256_hash_from_text, sentences)))
