In [21]:
import pandas as pd
import numpy as np
import ast
import faiss
from openai import AzureOpenAI

from dotenv import load_dotenv
from azure.storage.blob import BlobServiceClient

import os

import json

In [22]:
# Load environment variables from the project's .env file. Later cells read
# STORAGE_ACCOUNT_CONNECTION_STRING and AZURE_OPENAI_* from the environment.
# The default path is machine-specific, so it can be overridden by setting
# DOTENV_PATH before starting the kernel; the original location stays the fallback.
load_dotenv(os.environ.get('DOTENV_PATH', '/home/xavaki/DAMM/linkedin_gen_contents/.env'))

True

In [23]:
# Fail fast with a clear message when the connection string is missing;
# handing None to the SDK would otherwise surface as a less obvious error.
storage_connection_string = os.getenv('STORAGE_ACCOUNT_CONNECTION_STRING')
if not storage_connection_string:
    raise RuntimeError(
        "STORAGE_ACCOUNT_CONNECTION_STRING is not set; check the .env file loaded above."
    )

# One service client for the storage account, plus one container client per
# container read by the loader functions defined below.
blob_service_client = BlobServiceClient.from_connection_string(storage_connection_string)
embeddings_container_client = blob_service_client.get_container_client('relevant-articles-summaries-embeddings')
relevant_articles_list_container_client = blob_service_client.get_container_client('relevant-articles-list')
relevant_articles_summaries_container_client = blob_service_client.get_container_client('relevant-articles-summaries')


def read_embeddings_from_blob(container_client=None) -> pd.DataFrame:
    """Load every blob of the embeddings container into a single DataFrame.

    Each blob is parsed as JSON and is expected to hold a list of records; the
    lists are concatenated in the order the container lists its blobs.

    Args:
        container_client: Optional container client to read from. Defaults to
            the module-level ``embeddings_container_client``. Any object that
            offers ``list_blobs()`` and ``get_blob_client(name)`` works.

    Returns:
        A DataFrame with one row per record (empty when there are no blobs).
    """
    if container_client is None:
        container_client = embeddings_container_client

    all_embeddings = []
    for blob in container_client.list_blobs():
        # Address the blob by name: handing the listed properties object
        # itself to get_blob_client is deprecated in azure-storage-blob.
        blob_client = container_client.get_blob_client(blob.name)
        all_embeddings.extend(json.loads(blob_client.download_blob().readall()))

    return pd.DataFrame(all_embeddings)

def read_relevant_articles_list_from_blob(container_client=None) -> pd.DataFrame:
    """Load every blob of the relevant-articles-list container into one DataFrame.

    Each blob is parsed as JSON and is expected to hold a list of dict
    records. Every record is tagged with a ``RUNID`` key whose value is the
    part of the blob name that precedes the first ``--``.

    Args:
        container_client: Optional container client to read from. Defaults to
            the module-level ``relevant_articles_list_container_client``. Any
            object offering ``list_blobs()`` and ``get_blob_client(name)`` works.

    Returns:
        A DataFrame with one row per record, including the ``RUNID`` column
        (empty when there are no blobs).
    """
    if container_client is None:
        container_client = relevant_articles_list_container_client

    all_relevant_articles = []
    for blob in container_client.list_blobs():
        runid = blob.name.split("--")[0]
        # Address the blob by name: handing the listed properties object
        # itself to get_blob_client is deprecated in azure-storage-blob.
        blob_client = container_client.get_blob_client(blob.name)
        run_relevant_articles = json.loads(blob_client.download_blob().readall())
        for article in run_relevant_articles:
            article['RUNID'] = runid
        all_relevant_articles.extend(run_relevant_articles)

    return pd.DataFrame(all_relevant_articles)

def read_relevant_articles_summaries_from_blob(container_client=None) -> pd.DataFrame:
    """Load every blob of the summaries container into a single DataFrame.

    Each blob is parsed as JSON and is expected to hold a list of records; the
    lists are concatenated in the order the container lists its blobs.

    Args:
        container_client: Optional container client to read from. Defaults to
            the module-level ``relevant_articles_summaries_container_client``.
            Any object offering ``list_blobs()`` and ``get_blob_client(name)``
            works.

    Returns:
        A DataFrame with one row per record (empty when there are no blobs).
    """
    if container_client is None:
        container_client = relevant_articles_summaries_container_client

    all_summaries = []
    for blob in container_client.list_blobs():
        # Address the blob by name: handing the listed properties object
        # itself to get_blob_client is deprecated in azure-storage-blob.
        blob_client = container_client.get_blob_client(blob.name)
        all_summaries.extend(json.loads(blob_client.download_blob().readall()))

    return pd.DataFrame(all_summaries)

# Pull the three datasets out of blob storage.
pd_embeddings = read_embeddings_from_blob()
pd_relevant_articles = read_relevant_articles_list_from_blob()
pd_relevant_articles_summaries = read_relevant_articles_summaries_from_blob()

# Inner-join everything on article_id, so only articles present in all three
# sources remain in the combined frame.
pd_all_info = (
    pd_embeddings
    .merge(pd_relevant_articles, on="article_id", how="inner")
    .merge(pd_relevant_articles_summaries, on="article_id", how="inner")
)

In [24]:
# Stack the per-row embedding lists into a float32 matrix
# (one matrix row per row of pd_all_info).
embedding_matrix = np.asarray(pd_all_info["summary_embedding"].tolist(), dtype="float32")
# Embedding width is the number of columns of that matrix.
embedding_dim = embedding_matrix.shape[1]

In [25]:
# Create an index (flat, exact search) sized to the embedding width.
# Vectors are added in matrix row order; later cells use the ids returned by
# index.search as positional (.iloc) indices into pd_all_info.
index = faiss.IndexFlatL2(embedding_dim)
index.add(embedding_matrix)  # store embeddings in index

In [26]:
# Centroid of all article embeddings, shaped (1, dim) in float32 so it can be
# passed to the index as a one-row batch of query vectors.
centroid = np.mean(embedding_matrix, axis=0).reshape(1, -1).astype("float32")

# Never request more neighbours than the index holds: faiss pads missing
# results with id -1, which .iloc would silently resolve to the last row.
distances, indices = index.search(centroid, k=min(5, index.ntotal))

In [27]:
# Titles of the rows returned by the most recent index search.
pd_all_info["article_title"].iloc[indices[0]].values

array(['Early AI investor Elad Gil finds his next big bet: AI-powered rollups',
       "Los adolescentes deberían entrenarse para ser 'ninjas' de la IA, según el CEO de Google DeepMind",
       'Así están convirtiendo la ansiedad por la IA en una ventaja los equipos de contratación',
       'El CEO de Duolingo revela 5 formas en las que la IA le ayudará a decidir el futuro de su plantilla',
       'El CEO de Anthropic advierte de que la IA podría eliminar la mitad de los empleos de oficina'],
      dtype=object)

In [28]:
# Azure OpenAI client; the endpoint and API key are taken from the environment.
client = AzureOpenAI(
    azure_endpoint=os.getenv('AZURE_OPENAI_ENDPOINT'),
    api_key=os.getenv('AZURE_OPENAI_API_KEY'),
    api_version="2024-12-01-preview",
)

In [29]:
# Free-text query to look up in the article index.
query = "Insights related to the food industry"

# Embed the query text.
# NOTE(review): this presumably has to be the same model that produced the
# stored summary embeddings, otherwise distances are meaningless — confirm.
response = client.embeddings.create(
    input=[query],
    model="text-embedding-3-small",
)

# Shape the embedding as a one-row float32 batch for the index search.
query_embedding = response.data[0].embedding
query_vector = np.array([query_embedding], dtype="float32")

# Never request more neighbours than the index holds: faiss pads missing
# results with id -1, which .iloc would silently resolve to the last row.
distances, indices = index.search(query_vector, k=min(5, index.ntotal))

In [33]:
# Print the summary of every row returned by the most recent index search,
# in the order the search returned them.
for summary_text in pd_all_info["summary"].iloc[indices[0]].values:
    print(summary_text)

Swedish foodtech startup Millow has opened its first commercial-scale factory in Gothenburg, repurposing a former LEGO production hall to manufacture mycoprotein through a patented dry fermentation process. Using its innovative method, mycelium grows rapidly by feeding on oats, creating a meat alternative that doesn’t require binders or added flavorings. This new process promises significant environmental benefits, including a 95% reduction in water use, 67% less energy consumption, and cuts greenhouse gas emissions by up to 97% compared to beef production.

Millow’s approach addresses key criticisms of traditional plant-based meat products, such as taste and ingredient transparency, and aims to scale production to 500kg of mycoprotein per day. With fermentation increasingly becoming a bright spot in the alternative protein sector amidst overall declining investment, Millow’s factory showcases the potential for more scalable and sustainable meat substitutes. For professionals in foodte