In [None]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.stat import Correlation
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, regexp_replace, monotonically_increasing_id, udf, broadcast
from pyspark.sql.types import StructType, StructField, StringType, BooleanType, FloatType, ArrayType
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Create (or reuse) the Spark session for the export job, with 8 shuffle
# partitions configured.
spark = (
    SparkSession.builder
    .appName("OptimizedExport")
    .config("spark.sql.shuffle.partitions", "8")
    .getOrCreate()
)
# Restrict Spark's log output to ERROR level.
spark.sparkContext.setLogLevel("ERROR")
# Explicit schema for the product CSV. "precio" is declared as a string because
# the cleaning steps below strip a leading "$" and turn "," into "." before
# casting the column to float.
schema = StructType([
    StructField("nombre", StringType(), True),
    StructField("precio", StringType(), True),
    StructField("descuento", BooleanType(), True),
    StructField("categoria", StringType(), True),
    StructField("tienda", StringType(), True)])
# Hand the schema to the reader: previously it was defined but never used, so
# every column (including the boolean "descuento") was loaded as a string.
df = spark.read.option("header", True).schema(schema).csv("../data/combined_products.csv")
# Normalize the price text and convert it to a numeric column.
df = df.withColumn("precio", regexp_replace(col("precio"), "^\\$", ""))
df = df.withColumn("precio", regexp_replace(col("precio"), ",", "."))
df = df.withColumn("precio", df["precio"].cast("float"))
# Keep a single row per (nombre, categoria) pair, then tag each row with an id.
df = df.dropDuplicates(["nombre", "categoria"])
df = df.withColumn("id", monotonically_increasing_id())
# Sentence-embedding model used by get_sentence_embedding below.
model = SentenceTransformer('all-MiniLM-L6-v2')

def get_sentence_embedding(text):
    """Encode a piece of text with the module-level ``model``.

    Args:
        text: The string to embed, or ``None``.

    Returns:
        The embedding as a plain Python list (the result of ``.tolist()`` on
        the model output), or ``None`` when ``text`` is ``None``.
    """
    # This function runs as a Spark UDF, where null column values arrive as
    # None. Return None (a null array) instead of passing it to the model, so
    # one missing product name does not abort the whole job.
    if text is None:
        return None
    embedding = model.encode(text)
    return embedding.tolist()

# Expose the embedding function to Spark as a UDF that returns array<float>.
get_embeddings_udf = udf(f=get_sentence_embedding, returnType=ArrayType(FloatType()))
# Add one embedding per product name and mark the result for caching.
df = df.withColumn("embeddings", get_embeddings_udf(col("nombre")))
df = df.cache()
# Collect everything to the driver as a pandas frame and persist it as Parquet.
df_pandas = df.toPandas()
df_pandas.to_parquet(path="../data/productos_embeddings.parquet", index=False)




The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.stat import Correlation
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, regexp_replace, monotonically_increasing_id, udf, broadcast
from pyspark.sql.types import StructType, StructField, StringType, BooleanType, FloatType, ArrayType
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
# Load the sentence-embedding model that consultaProductos uses to encode queries.
# NOTE(review): the same model is already loaded in the first cell; presumably it is
# repeated here so the query cells can run without the Spark export — confirm.
model = SentenceTransformer('all-MiniLM-L6-v2')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Load the product embeddings into pandas. The path is the one the export cell
# writes to ("../data/"); the bare filename used before pointed at the working
# directory, so a fresh Restart & Run All could not find the exported file.
df_pandas = pd.read_parquet("../data/productos_embeddings.parquet")

def calculate_similarity(embedding, query_embedding):
    """Return the cosine similarity between two vectors as a Python float.

    Args:
        embedding: Vector-like value for the stored item.
        query_embedding: Vector-like value for the query.

    Returns:
        The single entry of the 1x1 matrix produced by ``cosine_similarity``.
    """
    # cosine_similarity works on 2-D inputs, so add a leading axis to each vector.
    item_rows = np.array(embedding)[np.newaxis]
    query_rows = np.array(query_embedding)[np.newaxis]
    similarity_matrix = cosine_similarity(item_rows, query_rows)
    return float(similarity_matrix[0][0])

def consultaProductos(producto, df, top_k=10):
    """Return the cheapest product among the best semantic matches for a query.

    Args:
        producto: Query text; it is encoded with the module-level ``model``.
        df: pandas DataFrame with an ``embeddings`` column (one vector per row)
            and a ``precio`` column. It is not modified.
        top_k: Number of highest-scoring rows to keep before picking the
            cheapest one. Defaults to 10.

    Returns:
        A DataFrame with at most one row (empty when ``df`` has no row with an
        embedding), holding the columns of ``df`` plus ``score``, the cosine
        similarity between the row's embedding and the query.
    """
    query_embedding = model.encode(producto).tolist()
    # Rows without an embedding cannot be scored, so leave them out.
    candidatos = df[df["embeddings"].notna()]
    scores = candidatos["embeddings"].apply(lambda x: calculate_similarity(x, query_embedding))
    # assign() builds a new frame, so the caller's DataFrame does not silently
    # gain (or have overwritten) a "score" column on every query.
    resultado = candidatos.assign(score=scores)
    resultado = resultado.sort_values(by="score", ascending=False).head(top_k)
    # Among the closest matches, keep the one with the lowest price.
    return resultado.sort_values(by="precio", ascending=True).head(1)



In [None]:
productos = ["pan", "leche", "arroz", "sal","naranja","vino"]

# Collect one result frame per query and concatenate once at the end. Growing a
# DataFrame with pd.concat inside the loop re-copies all previous rows on every
# iteration, and starting from an empty frame can distort the column dtypes.
resultados_por_producto = []

for producto in productos:
    resultado = consultaProductos(producto, df_pandas)
    # assign() returns a new frame instead of setting a column on the slice
    # returned by head(), which can raise SettingWithCopyWarning.
    resultado = resultado.assign(producto_consultado=producto)
    resultados_por_producto.append(resultado)

if resultados_por_producto:
    resultados_finales = pd.concat(resultados_por_producto, ignore_index=True)
else:
    # pd.concat raises on an empty list; keep the empty-frame result instead.
    resultados_finales = pd.DataFrame()

# Export without the embedding vectors (the column is dropped only if present).
resultados_finales.drop(columns=["embeddings"], errors="ignore").to_csv('Resultados_Finales.csv', index=False)

In [None]:
# Drop any TOTAL row left by a previous run of this cell, so re-running it does
# not add the old total into the sum or stack up several TOTAL rows.
detalle = resultados_finales[resultados_finales["nombre"] != "TOTAL"]
total_precio = detalle["precio"].sum()
total_fila = pd.DataFrame({"nombre": ["TOTAL"], "precio": [total_precio]})
resultados_finales = pd.concat([detalle, total_fila], ignore_index=True)
# Show the results without the bulky/technical columns (dropped only if present).
resultados_finales.drop(columns=["embeddings","id"],errors="ignore").head(20)

Unnamed: 0,nombre,precio,descuento,categoria,tienda,score,producto_consultado
0,Pan Gusano Granel,0.37,False,Panadería,MEGAMAXI,0.584263,pan
1,Leche Entera Tru 1L,1.24,False,Lácteos y Derivados,Coral Hipermercados,0.596479,leche
2,Tamo De Arroz Go,0.56,False,Alimentos secos y despensa,Coral Hipermercados,0.647715,arroz
3,Sal Yodada Cris-Sal Cris-Sal,0.53,False,Abarrotes,GRAN AKI,0.555953,sal
4,Naranja Kg,1.06,False,Frutas y verduras,Coral Hipermercados,0.82796,naranja
5,Vinagre De Manzana Fruveca,1.51,False,Condimentos y aderezos,Coral Hipermercados,0.573226,vino
6,TOTAL,5.27,,,,,
