In [1]:
import pandas as pd
import numpy as np
import ast
import faiss
from openai import AzureOpenAI

from dotenv import load_dotenv
from azure.storage.blob import BlobServiceClient

import os

import json

In [2]:
# Path of the .env file to load. The notebook later reads
# STORAGE_ACCOUNT_CONNECTION_STRING and AZURE_OPENAI_* from the environment.
# Set DOTENV_PATH to run on another machine; the original developer path is
# kept as the fallback so existing setups behave exactly as before.
DOTENV_PATH = os.getenv('DOTENV_PATH', '/home/xavaki/DAMM/linkedin_gen_contents/.env')
load_dotenv(DOTENV_PATH)

True

In [3]:
# Fail fast with a readable message when the connection string is missing;
# os.getenv() would otherwise hand None to the storage SDK.
_storage_connection_string = os.getenv('STORAGE_ACCOUNT_CONNECTION_STRING')
if not _storage_connection_string:
    raise RuntimeError(
        "STORAGE_ACCOUNT_CONNECTION_STRING is not set; check the .env file loaded earlier in this notebook."
    )

blob_service_client = BlobServiceClient.from_connection_string(_storage_connection_string)

# One container client per container read by the loader functions in this cell.
embeddings_container_client = blob_service_client.get_container_client('relevant-articles-summaries-embeddings')
relevant_articles_list_container_client = blob_service_client.get_container_client('relevant-articles-list')
relevant_articles_summaries_container_client = blob_service_client.get_container_client('relevant-articles-summaries')
relevant_articles_content_container_client = blob_service_client.get_container_client('relevant-articles-content')

def read_embeddings_from_blob() -> pd.DataFrame:
    """Load every record stored in the embeddings container into one frame.

    Each blob listed by ``embeddings_container_client`` is downloaded, parsed
    as JSON and its items are appended to a single list, so every blob is
    expected to contain a JSON array of records.

    Returns:
        pd.DataFrame: one row per record across all blobs, in listing order;
        an empty frame when the container lists no blobs.
    """
    all_embeddings = []
    for blob in embeddings_container_client.list_blobs():
        # Address the blob by name: passing the listed BlobProperties object to
        # get_blob_client() is deprecated in azure-storage-blob.
        blob_client = embeddings_container_client.get_blob_client(blob.name)
        run_embeddings = json.loads(blob_client.download_blob().readall())
        all_embeddings.extend(run_embeddings)

    return pd.DataFrame(all_embeddings)

def read_relevant_articles_list_from_blob() -> pd.DataFrame:
    """Load every relevant-article record and tag it with the run it came from.

    Each blob listed by ``relevant_articles_list_container_client`` is
    downloaded and parsed as JSON; every record in it receives a ``RUNID`` key
    set to the part of the blob name before the first ``"--"``.

    Returns:
        pd.DataFrame: one row per record across all blobs, including the added
        ``RUNID`` column; an empty frame when the container lists no blobs.
    """
    all_relevant_articles = []
    for blob in relevant_articles_list_container_client.list_blobs():
        # Presumably blob names follow "<run id>--<rest>"; a name without "--"
        # is used whole as the run id.
        runid = blob.name.split("--")[0]
        # Address the blob by name: passing the listed BlobProperties object to
        # get_blob_client() is deprecated in azure-storage-blob.
        blob_client = relevant_articles_list_container_client.get_blob_client(blob.name)
        run_relevant_articles = json.loads(blob_client.download_blob().readall())
        for article in run_relevant_articles:
            article['RUNID'] = runid
        all_relevant_articles.extend(run_relevant_articles)

    return pd.DataFrame(all_relevant_articles)

def read_relevant_articles_summaries_from_blob() -> pd.DataFrame:
    """Load every article-summary record stored in the summaries container.

    Each blob listed by ``relevant_articles_summaries_container_client`` is
    downloaded, parsed as JSON and its items are appended to a single list, so
    every blob is expected to contain a JSON array of records.

    Returns:
        pd.DataFrame: one row per record across all blobs, in listing order;
        an empty frame when the container lists no blobs.
    """
    all_summaries = []
    for blob in relevant_articles_summaries_container_client.list_blobs():
        # Address the blob by name: passing the listed BlobProperties object to
        # get_blob_client() is deprecated in azure-storage-blob.
        blob_client = relevant_articles_summaries_container_client.get_blob_client(blob.name)
        run_summaries = json.loads(blob_client.download_blob().readall())
        all_summaries.extend(run_summaries)

    return pd.DataFrame(all_summaries)

def read_relevant_articles_content_from_blob() -> pd.DataFrame:
    """Load every article-content record stored in the content container.

    Each blob listed by ``relevant_articles_content_container_client`` is
    downloaded, parsed as JSON and its items are appended to a single list, so
    every blob is expected to contain a JSON array of records.

    Returns:
        pd.DataFrame: one row per record across all blobs, in listing order;
        an empty frame when the container lists no blobs.
    """
    all_content = []
    for blob in relevant_articles_content_container_client.list_blobs():
        # Address the blob by name: passing the listed BlobProperties object to
        # get_blob_client() is deprecated in azure-storage-blob.
        blob_client = relevant_articles_content_container_client.get_blob_client(blob.name)
        run_content = json.loads(blob_client.download_blob().readall())
        all_content.extend(run_content)

    return pd.DataFrame(all_content)

In [4]:
# Pull the four datasets out of blob storage.
pd_embeddings = read_embeddings_from_blob()
pd_relevant_articles = read_relevant_articles_list_from_blob()
pd_relevant_articles_summaries = read_relevant_articles_summaries_from_blob()
# The 'content' column is removed before the content frame takes part in the merge.
pd_relevant_articles_content = read_relevant_articles_content_from_blob().drop(columns=['content'])

# Inner-join everything on article_id, so only articles present in all four
# datasets survive. Overlapping column names get pandas' default _x/_y suffixes.
pd_all_info = (
    pd_embeddings
    .merge(pd_relevant_articles, on="article_id", how="inner")
    .merge(pd_relevant_articles_summaries, on="article_id", how="inner")
    .merge(pd_relevant_articles_content, on="article_id", how="inner")
)

In [5]:
# Preview the first rows of the merged frame (head() shows five rows by default).
pd_all_info.head()

Unnamed: 0,article_id,summary_embedding,embedding_model,model_x,run_id_x,task_name_x,relevance,article_language,source_name,article_title,article_url,article_keywords,crawled_at,RUNID,model_y,summary,run_id_y,task_name_y,title,publish_date
0,the_next_web_20250530144751581654,"[-0.009674579836428165, 0.006264630705118179, ...",text-embedding-3-small,gpt-4o-2024-11-20,RUNID_1,relevance_check_v0,2,en,the_next_web,Elon Musk’s Grok chatbot banned by a quarter o...,https://thenextweb.com/news/elon-musks-grok-ch...,"[Elon Musk, chatbot, data security]",2025-05-30 14:47:51,RUNID_1,gpt-4o-2024-11-20,A recent report from cybersecurity firm Netsko...,RUNID_1,article_summarization_v0,Elon Musk’s Grok chatbot banned by a quarter o...,2025-05-27
1,itespresso_20250530144751622699,"[0.011496803723275661, 0.06475908309221268, 0....",text-embedding-3-small,gpt-4o-2024-11-20,RUNID_1,relevance_check_v0,2,es,itespresso,Cómo mantener el impulso de tu startup durante...,https://www.itespresso.es/startups-en-navidad-...,"[startups, christmas season]",2025-05-30 14:47:51,RUNID_1,gpt-4o-2024-11-20,The article provides strategies for startups t...,RUNID_1,article_summarization_v0,Cómo mantener el impulso de tu startup durante...,2023-12-08
2,the_next_web_20250530144751551046,"[-0.007235630415380001, -0.013063831254839897,...",text-embedding-3-small,gpt-4o-2024-11-20,RUNID_1,relevance_check_v0,2,en,the_next_web,‘Purest meat alternative’ to grow in Swedish m...,https://thenextweb.com/news/purest-meat-altern...,"[meat alternative, sustainability, factory]",2025-05-30 14:47:51,RUNID_1,gpt-4o-2024-11-20,Swedish foodtech startup Millow has opened its...,RUNID_1,article_summarization_v0,‘Purest meat alternative’ to grow in Swedish m...,2025-05-28
3,business_insider_20250530144751673882,"[0.022377390414476395, -0.007725527510046959, ...",text-embedding-3-small,gpt-4o-2024-11-20,RUNID_1,relevance_check_v0,2,es,business_insider,Las ventas de Tesla siguen desplomándose en Eu...,https://www.businessinsider.es/tecnologia/vent...,"[Tesla, coches eléctricos, ventas, Europa]",2025-05-30 14:47:51,RUNID_1,gpt-4o-2024-11-20,El artículo señala una crisis de marca y caída...,RUNID_1,article_summarization_v0,Las ventas de Tesla siguen desplomándose en Eu...,2025-05-27
4,itespresso_20250530144751612511,"[0.007954786531627178, -0.029804697260260582, ...",text-embedding-3-small,gpt-4o-2024-11-20,RUNID_1,relevance_check_v0,2,es,itespresso,Apple es la compañía tecnológica que más ganan...,https://www.itespresso.es/apple-ganancias-empl...,"[Apple, employee earnings]",2025-05-30 14:47:51,RUNID_1,gpt-4o-2024-11-20,A recent study by agencyreviews.io highlights ...,RUNID_1,article_summarization_v0,Apple es la compañía tecnológica que más ganan...,2023-12-19


In [6]:
# Most recently published article: order rows newest-first and take the top one.
# NOTE(review): relies on publish_date sorting chronologically; the preview shows
# ISO 'YYYY-MM-DD' values, but the column dtype should be confirmed.
articles_newest_first = pd_all_info.sort_values(by="publish_date", ascending=False)
last_published_article = articles_newest_first.iloc[0]
print(last_published_article[["article_title", "publish_date"]])

article_title    Tecnología Musk rompe el silencio digital: el ...
publish_date                                            2025-06-17
Name: 33, dtype: object


In [7]:
# Number of rows in the merged frame.
len(pd_all_info)

208

In [8]:
# Stack the per-row embedding lists into a 2-D float32 matrix, one row per
# pd_all_info row, in the same order.
embedding_matrix = np.asarray(pd_all_info["summary_embedding"].tolist()).astype("float32")
# Length of a single embedding vector (number of columns in the matrix).
embedding_dim = embedding_matrix.shape[1]

In [9]:
from sklearn.cluster import KMeans  # NOTE(review): consider moving this to the import cell at the top

# How many topics (clusters) the articles are split into
num_clusters = 10

# Cluster the summary embeddings. fit() returns the fitted estimator, and the
# fixed random_state makes the initialisation repeatable between runs.
kmeans = KMeans(n_clusters=num_clusters, random_state=42).fit(embedding_matrix)

# Attach each row's cluster id to the merged frame as the 'topic' column
pd_all_info['topic'] = kmeans.labels_

In [15]:
# First five articles assigned to topic 4, showing only title, summary and topic id.
topic_mask = pd_all_info['topic'] == 4
pd_all_info.loc[topic_mask, ["article_title", "summary", "topic"]].head()

Unnamed: 0,article_title,summary,topic
6,El ecosistema tecnológico español crece un 22%...,The Spanish startup ecosystem is experiencing ...,4
10,ENISA ya cuenta con más de medio millar de sta...,The Empresa Nacional de Innovación (ENISA) lau...,4
28,Telefónica da una pista de su nueva era estrat...,Telefónica has announced its intention to inve...,4
55,Applivery obtiene 1 millón de euros de inversión,"Applivery, a Madrid-based SaaS platform specia...",4
57,Las startups valencianas progresaron adecuadam...,Valencia has emerged as a significant startup ...,4


In [9]:
# Shape of the embedding matrix as (rows, embedding dimension).
# NOTE(review): the saved output for this cell shows 111 rows while
# len(pd_all_info) printed 208 earlier, and the execution counts are out of
# order. Re-run the notebook from a fresh kernel to confirm the two agree.
embedding_matrix.shape

(111, 1536)

In [10]:
# Create a flat FAISS index over vectors of length embedding_dim. IndexFlatL2
# performs exact (non-approximate) search using L2 distance.
index = faiss.IndexFlatL2(embedding_dim)
# The notebook later uses the ids returned by index.search() as positional
# (iloc) indices into pd_all_info, so the rows added here must stay in
# pd_all_info row order.
index.add(embedding_matrix)  # store embeddings in index

In [11]:
# Mean embedding of the whole corpus, kept 2-D (1, dim) so it can be searched as a one-row batch.
centroid = embedding_matrix.mean(axis=0, keepdims=True).astype("float32")
# Five stored vectors closest to that centroid.
distances, indices = index.search(centroid, k=5)

In [12]:
# Titles of the rows returned by the last search (positions from the first, and only, query row).
pd_all_info["article_title"].iloc[indices[0]].values

array(['Early AI investor Elad Gil finds his next big bet: AI-powered rollups',
       'El CEO de Duolingo revela 5 formas en las que la IA le ayudará a decidir el futuro de su plantilla',
       'Así están convirtiendo la ansiedad por la IA en una ventaja los equipos de contratación',
       "Los adolescentes deberían entrenarse para ser 'ninjas' de la IA, según el CEO de Google DeepMind",
       'La inteligencia artificial toma el mando: las empresas dejan de contratar si una IA puede hacer el trabajo'],
      dtype=object)

In [13]:
# Azure OpenAI client; the endpoint and key are read from the environment.
client = AzureOpenAI(
    azure_endpoint=os.getenv('AZURE_OPENAI_ENDPOINT'),
    api_key=os.getenv('AZURE_OPENAI_API_KEY'),
    api_version="2024-12-01-preview",
)

In [14]:
# Free-text question to match against the stored article summaries
query = "Insights related to digital transformation of large enterprises"

# Embed the query; the model name matches the value shown in the
# `embedding_model` column of the merged frame.
response = client.embeddings.create(model="text-embedding-3-small", input=[query])

# Wrap the single embedding as a one-row float32 batch of shape (1, dim) and
# fetch the five nearest stored vectors.
query_embedding = response.data[0].embedding
query_vector = np.asarray(query_embedding, dtype="float32").reshape(1, -1)
distances, indices = index.search(query_vector, k=5)

In [15]:
# Print the summary of each row returned by the last search, one after another.
for summary_text in pd_all_info["summary"].iloc[indices[0]].values:
    print(summary_text)

Watch&Act emphasizes that successful technological integration requires prioritizing people, building their skills, and fostering a corporate culture that supports the overall strategy. The Spanish consulting firm argues that adopting appropriate technologies enhances efficiency, decision-making, and talent attraction, but the success of these transformations ultimately relies on employee engagement. 

To guide companies, Watch&Act outlines a seven-step roadmap focusing on key actions such as identifying impactful technologies, aligning organizational structures and culture, addressing skill gaps through reskilling/upskilling plans, and establishing efficiency metrics tied to business outcomes. For 2024, the firm highlights challenges related to integrating emerging technologies, fostering agile and innovative organizational setups, and ensuring actionable implementations within time and budget constraints. 

This approach matters to professionals and business leaders because it unders