In [1]:
import pandas as pd
import numpy as np
import ast
import faiss
from openai import AzureOpenAI

from dotenv import load_dotenv
from azure.storage.blob import BlobServiceClient

import os

import json

from sklearn.decomposition import PCA
import numpy as np
import pandas as pd
import plotly.express as px

from random import random
from textwrap import dedent

In [2]:
# Load credentials from the project's .env file. The path can be overridden through the
# LINKEDIN_GEN_DOTENV_PATH environment variable so the notebook is not tied to one
# machine's home directory; the original absolute path remains the default.
load_dotenv(os.getenv('LINKEDIN_GEN_DOTENV_PATH', '/home/xavaki/DAMM/linkedin_gen_contents/.env'))

True

In [3]:
# The storage connection string is read from the environment.
storage_connection_string = os.getenv('STORAGE_ACCOUNT_CONNECTION_STRING')
if not storage_connection_string:
    # Fail fast with an explicit message instead of handing None to the SDK.
    raise RuntimeError("STORAGE_ACCOUNT_CONNECTION_STRING is not set; check the .env file")

blob_service_client = BlobServiceClient.from_connection_string(storage_connection_string)

# One container client per container read by the functions defined in this cell.
embeddings_container_client = blob_service_client.get_container_client('relevant-articles-summaries-embeddings')
relevant_articles_list_container_client = blob_service_client.get_container_client('relevant-articles-list')
relevant_articles_summaries_container_client = blob_service_client.get_container_client('relevant-articles-summaries')
relevant_articles_content_container_client = blob_service_client.get_container_client('relevant-articles-content')

def read_embeddings_from_blob() -> pd.DataFrame:
    """Download every blob in the embeddings container and combine them into one DataFrame.

    Each blob is parsed as JSON and its items are appended to a single list
    (the payload is assumed to be a JSON array of records — TODO confirm).

    Returns:
        pd.DataFrame: built from the records collected across all blobs.
    """
    all_embeddings = []
    for blob in embeddings_container_client.list_blobs():
        # Pass the blob name: giving get_blob_client a BlobProperties object is deprecated.
        blob_client = embeddings_container_client.get_blob_client(blob.name)
        run_embeddings = json.loads(blob_client.download_blob().readall())
        all_embeddings.extend(run_embeddings)

    return pd.DataFrame(all_embeddings)

def read_relevant_articles_list_from_blob() -> pd.DataFrame:
    """Download every blob in the relevant-articles-list container into one DataFrame.

    Each blob is parsed as JSON (assumed to be an array of article dicts — TODO confirm).
    Every article is tagged with a "RUNID" key holding the part of the blob name
    that precedes the first "--".

    Returns:
        pd.DataFrame: built from the articles collected across all blobs.
    """
    all_relevant_articles = []
    for blob in relevant_articles_list_container_client.list_blobs():
        runid = blob.name.split("--")[0]
        # Pass the blob name: giving get_blob_client a BlobProperties object is deprecated.
        blob_client = relevant_articles_list_container_client.get_blob_client(blob.name)
        run_relevant_articles = json.loads(blob_client.download_blob().readall())
        for article in run_relevant_articles:
            article['RUNID'] = runid
        all_relevant_articles.extend(run_relevant_articles)

    return pd.DataFrame(all_relevant_articles)

def read_relevant_articles_summaries_from_blob() -> pd.DataFrame:
    """Download every blob in the summaries container and combine them into one DataFrame.

    Each blob is parsed as JSON and its items are appended to a single list
    (the payload is assumed to be a JSON array of records — TODO confirm).

    Returns:
        pd.DataFrame: built from the records collected across all blobs.
    """
    all_summaries = []
    for blob in relevant_articles_summaries_container_client.list_blobs():
        # Pass the blob name: giving get_blob_client a BlobProperties object is deprecated.
        blob_client = relevant_articles_summaries_container_client.get_blob_client(blob.name)
        run_summaries = json.loads(blob_client.download_blob().readall())
        all_summaries.extend(run_summaries)

    return pd.DataFrame(all_summaries)

def read_relevant_articles_content_from_blob() -> pd.DataFrame:
    """Download every blob in the content container and return the parsed payloads without "content".

    Each blob is parsed as JSON and contributes one row (the payload is assumed
    to be a single JSON object per article — TODO confirm). The "content" key is
    removed before the row is stored.

    Returns:
        pd.DataFrame: one row per blob, without a "content" column.
    """
    all_content = []
    for blob_name in relevant_articles_content_container_client.list_blob_names():
        blob_client = relevant_articles_content_container_client.get_blob_client(blob_name)
        run_content = json.loads(blob_client.download_blob().readall())
        # Default of None: a blob without a "content" key must not abort the whole load.
        run_content.pop("content", None)
        all_content.append(run_content)

    return pd.DataFrame(all_content)

In [4]:
# Executive names; the same names appear as keys of the per-article relevance dicts.
personas = [
    "LAURA GIL",
    "FEDE SEGARRA",
    "ELÍSABETH HERNÁNDEZ",
    "JAUME ALEMANY",
    "RICARDO LECHUGA",
    "JORGE VILLAVECCHIA",
    "SALVADOR MARTÍNEZ",
    "JOFRE RIERA",
]

In [5]:
# Preview the article metadata stored in the content container.
# NOTE(review): the result is not kept and the same function is called again when
# pd_relevant_articles_content is built, so every blob is downloaded twice.
read_relevant_articles_content_from_blob()

Unnamed: 0,article_id,title,publish_date
0,business_insider_20250530144751643191,"Meta, la matriz de Facebook, planea abrir tien...",2025-05-29
1,business_insider_20250530144751663668,El ecosistema tecnológico español crece un 22%...,2025-05-28
2,business_insider_20250530144751673882,Las ventas de Tesla siguen desplomándose en Eu...,2025-05-27
3,business_insider_20250530144751694527,El CEO de Duolingo revela 5 formas en las que ...,2025-05-27
4,business_insider_20250530144751704745,Esta es la historia jamás contada de cómo la s...,2025-05-26
...,...,...,...
1455,the_next_web_20250616113906108578,European VCs just made a record bet on this fu...,2025-06-11
1456,the_next_web_20250616113906118818,Opinion: Space startups are pivoting to defenc...,2025-06-13
1457,world_economic_forum_20250616113906461633,What is The First Movers Coalition?,
1458,world_economic_forum_20250616113906472087,Voices for Nature: The call for a new economic...,


In [6]:
# Load each pipeline artefact from blob storage into its own frame.
pd_embeddings = read_embeddings_from_blob()
pd_relevant_articles = read_relevant_articles_list_from_blob()
pd_relevant_articles_summaries = read_relevant_articles_summaries_from_blob()
pd_relevant_articles_content = read_relevant_articles_content_from_blob()

# Inner-join everything on article_id; columns that clash between frames get
# pandas' default _x / _y suffixes.
pd_all_info = (
    pd_embeddings
    .merge(pd_relevant_articles, on="article_id", how="inner")
    .merge(pd_relevant_articles_summaries, on="article_id", how="inner")
    .merge(pd_relevant_articles_content, on="article_id", how="inner")
)



In [7]:
# Keep only the rows of the run under analysis. The explicit copy makes the result an
# independent frame, so adding columns to it later does not trigger SettingWithCopyWarning.
pd_all_info = pd_all_info[pd_all_info["run_id_x"].isin(["RUNID_18"])].copy()

In [8]:
# Inspect the merged frame after restricting it to the selected run.
pd_all_info

Unnamed: 0,article_id,summary_embedding,embedding_model,model_x,run_id_x,task_name_x,relevance,article_language,source_name,article_title,...,article_image_url,ddgs_search_query,query_original_personas,crawling_source,model_y,summary,run_id_y,task_name_y,title,publish_date
1198,businessinsider.es_20250718095108846432,"[-0.0008376312907785177, -0.04853260517120361,...",text-embedding-3-small,gpt-4o-2024-11-20,RUNID_18,relevance_check_v0,"{'LAURA GIL': 2, 'FEDE SEGARRA': 0, 'ELÍSABETH...",es,businessinsider.es,Tesla's geofence innovation for robotaxis,...,,,,jina,gpt-4o-2024-11-20,Tesla has expanded its geofence for robotaxi o...,RUNID_18,article_summarization_v0,Tesla's geofence innovation for robotaxis,2025-07-15
1199,businessinsider.es_20250718095108794859,"[0.03213481232523918, 0.02532116509974003, 0.0...",text-embedding-3-small,gpt-4o-2024-11-20,RUNID_18,relevance_check_v0,"{'LAURA GIL': 2, 'FEDE SEGARRA': 1, 'ELÍSABETH...",es,businessinsider.es,OpenAI Economist preps children for a world wi...,...,,,,jina,gpt-4o-2024-11-20,"Ronnie Chatterji, Chief Economist at OpenAI, h...",RUNID_18,article_summarization_v0,OpenAI Economist preps children for a world wi...,2025-07-16
1200,businessinsider.es_20250718095108970217,"[0.0363992340862751, 0.028270242735743523, 0.0...",text-embedding-3-small,gpt-4o-2024-11-20,RUNID_18,relevance_check_v0,"{'LAURA GIL': 2, 'FEDE SEGARRA': 1, 'ELÍSABETH...",es,businessinsider.es,Detecting scams in SMS and WhatsApp messages,...,,,,jina,gpt-4o-2024-11-20,The article discusses the growing sophisticati...,RUNID_18,article_summarization_v0,Detecting scams in SMS and WhatsApp messages,2025-07-11
1201,businessinsider.es_20250718095108805194,"[-0.01577611267566681, 0.002103183651342988, 0...",text-embedding-3-small,gpt-4o-2024-11-20,RUNID_18,relevance_check_v0,"{'LAURA GIL': 2, 'FEDE SEGARRA': 1, 'ELÍSABETH...",es,businessinsider.es,Sam Altman and OpenAI develop innovative browser,...,,,,jina,gpt-4o-2024-11-20,"OpenAI, known for initiating the AI boom with ...",RUNID_18,article_summarization_v0,Sam Altman and OpenAI develop innovative browser,2025-07-16
1202,businessinsider.es_20250718095108856636,"[0.002676026662811637, -0.030920324847102165, ...",text-embedding-3-small,gpt-4o-2024-11-20,RUNID_18,relevance_check_v0,"{'LAURA GIL': 2, 'FEDE SEGARRA': 1, 'ELÍSABETH...",es,businessinsider.es,Tesla faces trial for fatal Autopilot accident,...,,,,jina,gpt-4o-2024-11-20,A federal jury in Florida is set to evaluate w...,RUNID_18,article_summarization_v0,Tesla faces trial for fatal Autopilot accident,2025-07-14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1288,weforum.org_20250718095109471637,"[0.02796872705221176, -0.010471106506884098, 0...",text-embedding-3-small,gpt-4o-2024-11-20,RUNID_18,relevance_check_v0,"{'LAURA GIL': 2, 'FEDE SEGARRA': 0, 'ELÍSABETH...",English,weforum.org,Global Cybersecurity Outlook 2024,...,,,,jina,gpt-4o-2024-11-20,The World Economic Forum’s Global Cybersecurit...,RUNID_18,article_summarization_v0,Global Cybersecurity Outlook 2024,
1289,weforum.org_20250718095109461399,"[0.041655149310827255, 0.011486723087728024, 0...",text-embedding-3-small,gpt-4o-2024-11-20,RUNID_18,relevance_check_v0,"{'LAURA GIL': 2, 'FEDE SEGARRA': 1, 'ELÍSABETH...",English,weforum.org,Top 10 Emerging Technologies of 2025,...,,,,jina,gpt-4o-2024-11-20,"The ""Top 10 Emerging Technologies of 2025"" rep...",RUNID_18,article_summarization_v0,Top 10 Emerging Technologies of 2025,
1290,weforum.org_20250718095109491980,"[0.029985709115862846, -0.0054162670858204365,...",text-embedding-3-small,gpt-4o-2024-11-20,RUNID_18,relevance_check_v0,"{'LAURA GIL': 1, 'FEDE SEGARRA': 0, 'ELÍSABETH...",English,weforum.org,The Future of Financial Advice,...,,,,jina,gpt-4o-2024-11-20,The white paper examines transformative trends...,RUNID_18,article_summarization_v0,The Future of Financial Advice,
1291,weforum.org_20250718095109502290,"[0.014725111424922943, -0.04100026935338974, 0...",text-embedding-3-small,gpt-4o-2024-11-20,RUNID_18,relevance_check_v0,"{'LAURA GIL': 0, 'FEDE SEGARRA': 0, 'ELÍSABETH...",English,weforum.org,Promoting Health and Well-Being: Employer Stra...,...,,,,jina,gpt-4o-2024-11-20,Investing in employee well-being offers organi...,RUNID_18,article_summarization_v0,Promoting Health and Well-Being: Employer Stra...,


In [9]:
# Most recently published article: order newest-first and take the top row.
articles_newest_first = pd_all_info.sort_values(by="publish_date", ascending=False)
last_published_article = articles_newest_first.iloc[0]
print(last_published_article[["article_title", "publish_date"]])

article_title    Del cumplimiento a la convicción: por qué las ...
publish_date                                            2025-07-18
Name: 1229, dtype: object


In [10]:
# Number of articles left after filtering to the selected run.
len(pd_all_info)

95

In [11]:
# Stack the per-article embedding lists into a float32 matrix, one row per article.
embedding_rows = pd_all_info["summary_embedding"].tolist()
embedding_matrix = np.array(embedding_rows, dtype="float32")
# Embedding dimensionality = number of columns.
embedding_dim = embedding_matrix.shape[1]

In [12]:
from sklearn.cluster import KMeans

# How many clusters (topics) to look for.
num_clusters = 10

# Fit K-means on the summary embeddings with a fixed seed.
kmeans = KMeans(n_clusters=num_clusters, random_state=42).fit(embedding_matrix)

# Record each article's cluster id in a 'topic' column.
pd_all_info['topic'] = kmeans.labels_

In [13]:
# First rows of the articles assigned to topic 4.
pd_all_info.loc[pd_all_info['topic'].eq(4)].head()

Unnamed: 0,article_id,summary_embedding,embedding_model,model_x,run_id_x,task_name_x,relevance,article_language,source_name,article_title,...,ddgs_search_query,query_original_personas,crawling_source,model_y,summary,run_id_y,task_name_y,title,publish_date,topic
1210,businessinsider.es_20250718095108959792,"[0.009057179093360901, -0.008331652730703354, ...",text-embedding-3-small,gpt-4o-2024-11-20,RUNID_18,relevance_check_v0,"{'LAURA GIL': 2, 'FEDE SEGARRA': 1, 'ELÍSABETH...",es,businessinsider.es,"Amazon's AI project ""Starfish"" aims to gather ...",...,,,jina,gpt-4o-2024-11-20,Amazon is leveraging generative AI to transfor...,RUNID_18,article_summarization_v0,"Amazon's AI project ""Starfish"" aims to gather ...",2025-07-11,4
1214,businessinsider.es_20250718095108928684,"[0.027814090251922607, 0.02969183586537838, 0....",text-embedding-3-small,gpt-4o-2024-11-20,RUNID_18,relevance_check_v0,"{'LAURA GIL': 2, 'FEDE SEGARRA': 1, 'ELÍSABETH...",es,businessinsider.es,WhatsApp's AI calling feature stirs consumer p...,...,,,jina,gpt-4o-2024-11-20,"WhatsApp is introducing new functionalities, i...",RUNID_18,article_summarization_v0,WhatsApp's AI calling feature stirs consumer p...,2025-07-12,4
1217,businessinsider.es_20250718095108897782,"[0.010809161700308323, 0.005169728305190802, 0...",text-embedding-3-small,gpt-4o-2024-11-20,RUNID_18,relevance_check_v0,"{'LAURA GIL': 2, 'FEDE SEGARRA': 1, 'ELÍSABETH...",es,businessinsider.es,AI reshapes Hollywood and animations,...,,,jina,gpt-4o-2024-11-20,Artificial Intelligence (AI) is revolutionizin...,RUNID_18,article_summarization_v0,AI reshapes Hollywood and animations,2025-07-13,4
1237,entrepreneur.com_20250718095108035422,"[0.005941552110016346, 0.008063982240855694, 0...",text-embedding-3-small,gpt-4o-2024-11-20,RUNID_18,relevance_check_v0,"{'LAURA GIL': 2, 'FEDE SEGARRA': 0, 'ELÍSABETH...",en,entrepreneur.com,Upgrade Your LinkedIn Profile With AI-Generate...,...,,,jina,gpt-4o-2024-11-20,"The article highlights ResumePhoto, an AI-powe...",RUNID_18,article_summarization_v0,Upgrade Your LinkedIn Profile With AI-Generate...,2025-07-17,4
1244,hbr.org_20250718095106373519,"[0.03865043446421623, 0.007953500375151634, 0....",text-embedding-3-small,gpt-4o-2024-11-20,RUNID_18,relevance_check_v0,"{'LAURA GIL': 2, 'FEDE SEGARRA': 1, 'ELÍSABETH...",en,hbr.org,Can Gen AI and Copyright Coexist?,...,,,jina,gpt-4o-2024-11-20,"A recent industry report highlights that ""core...",RUNID_18,article_summarization_v0,Can Gen AI and Copyright Coexist?,2025-07-16,4


In [14]:
# Sanity check: (number of articles, embedding dimension).
embedding_matrix.shape

(95, 1536)

In [15]:
# Create an index (flat, exact search)
index = faiss.IndexFlatL2(embedding_dim)
index.add(embedding_matrix)  # store embeddings in index

In [16]:
# Mean embedding of all articles, kept as a (1, dim) float32 row so it can be used as a query.
centroid = np.mean(embedding_matrix, axis=0, keepdims=True).astype("float32")
# The five articles closest to that centroid.
distances, indices = index.search(centroid, k=5)

In [17]:
# Titles of the retrieved articles (index positions map to row positions in pd_all_info).
pd_all_info["article_title"].to_numpy()[indices[0]]

array(['CEOs implement AI in various aspects of daily life',
       'La IA irrumpe en Cannes Lions 2025: Amazon, Apple y Mars lideran el debate creativo',
       'La confianza y la colaboración entre humanos y la IA definirá la próxima era de IA basada en agentes',
       "5 shifts in a new era for entrepreneurs: Reflections from 'Summer Davos' 2025",
       'Day 1 at ‘Summer Davos’ 2025: AI, trade and the global economy in focus'],
      dtype=object)

In [18]:
# Azure OpenAI client; endpoint and key are taken from environment variables.
client = AzureOpenAI(
    azure_endpoint=os.getenv('AZURE_OPENAI_ENDPOINT'),
    api_key=os.getenv('AZURE_OPENAI_API_KEY'),
    api_version="2024-12-01-preview",
)

In [19]:
query = "Insights related to digital transformation of large enterprises"

# Embed the free-text query with "text-embedding-3-small".
response = client.embeddings.create(model="text-embedding-3-small", input=[query])
query_embedding = response.data[0].embedding

# Shape the embedding as a (1, dim) float32 row and fetch the five nearest articles.
query_vector = np.asarray(query_embedding, dtype="float32").reshape(1, -1)
distances, indices = index.search(query_vector, k=5)

In [20]:
# Print the summary of each article returned by the last search.
for summary_text in pd_all_info["summary"].iloc[indices[0]]:
    print(summary_text)

The article discusses the transformative impact of generative AI on the insurance industry, highlighting how it has shifted from experimental use to mainstream application over the past two years. Generative AI enables insurers to leverage extensive customer data from documents, contact centers, and service processes, offering potential to achieve a unified customer view comparable to that of the banking sector. The sector's digital transformation emphasizes data sovereignty, privacy concerns, and the importance of educating users about risks linked to free AI tools, such as data misuse.

For professionals, the conversation underscores the urgency of adapting to AI advancements while addressing challenges like shadow IT, talent re-skilling, and ROI for scalable AI pilots. Business leaders are reminded of the need to balance innovation with ethical data practices and customer trust, key drivers of long-term success in highly competitive industries.
The article highlights how the misuse 

In [21]:
cluster_labels = pd_all_info.topic.values
article_titles = pd_all_info["article_title"].values

# Project the embeddings onto three principal components for a 3-D scatter plot.
# PCA is fitted once (it was previously fitted twice with identical code); random_state
# fixes the seed used by PCA's randomized solvers so the projection is reproducible.
pca = PCA(n_components=3, random_state=42)
reduced_embeddings = pca.fit_transform(embedding_matrix)

# One row per article: 3-D coordinates, cluster label (as a string so plotly colours
# it as a category) and the title shown on hover.
df_plot = pd.DataFrame({
    "x": reduced_embeddings[:, 0],
    "y": reduced_embeddings[:, 1],
    "z": reduced_embeddings[:, 2],
    "cluster": cluster_labels.astype(str),
    "title": article_titles
})

# Interactive plot
fig = px.scatter_3d(
    df_plot,
    x="x",
    y="y",
    z="z",
    color="cluster",
    hover_data=["title"],
    title="Embeddings Cluster Visualization with Article Titles",
    labels={"x": "PCA 1", "y": "PCA 2", "z" : "PCA 3" },
)

fig.show()

In [22]:
# Expand the per-article relevance dict into one column per key.
df_expanded_relevance = pd_all_info["relevance"].apply(pd.Series)
# Replace the dict column with the expanded columns, appended on the right.
pd_final = pd.concat(
    [pd_all_info.drop(columns=["relevance"]), df_expanded_relevance],
    axis=1,
)

In [23]:
# Inspect the frame with the relevance scores expanded into one column per persona.
pd_final

Unnamed: 0,article_id,summary_embedding,embedding_model,model_x,run_id_x,task_name_x,article_language,source_name,article_title,article_url,...,publish_date,topic,LAURA GIL,FEDE SEGARRA,ELÍSABETH HERNÁNDEZ,JAUME ALEMANY,RICARDO LECHUGA,JORGE VILLAVECCHIA,SALVADOR MARTÍNEZ,JOFRE RIERA
1198,businessinsider.es_20250718095108846432,"[-0.0008376312907785177, -0.04853260517120361,...",text-embedding-3-small,gpt-4o-2024-11-20,RUNID_18,relevance_check_v0,es,businessinsider.es,Tesla's geofence innovation for robotaxis,https://www.businessinsider.es/tecnologia/gran...,...,2025-07-15,8,2,0,0,1,0,1,0,0
1199,businessinsider.es_20250718095108794859,"[0.03213481232523918, 0.02532116509974003, 0.0...",text-embedding-3-small,gpt-4o-2024-11-20,RUNID_18,relevance_check_v0,es,businessinsider.es,OpenAI Economist preps children for a world wi...,https://www.businessinsider.es/tecnologia/econ...,...,2025-07-16,3,2,1,2,1,2,2,1,0
1200,businessinsider.es_20250718095108970217,"[0.0363992340862751, 0.028270242735743523, 0.0...",text-embedding-3-small,gpt-4o-2024-11-20,RUNID_18,relevance_check_v0,es,businessinsider.es,Detecting scams in SMS and WhatsApp messages,https://www.businessinsider.es/tecnologia/dos-...,...,2025-07-11,0,2,1,1,1,0,0,0,0
1201,businessinsider.es_20250718095108805194,"[-0.01577611267566681, 0.002103183651342988, 0...",text-embedding-3-small,gpt-4o-2024-11-20,RUNID_18,relevance_check_v0,es,businessinsider.es,Sam Altman and OpenAI develop innovative browser,https://www.businessinsider.es/tecnologia/sam-...,...,2025-07-16,6,2,1,0,1,0,1,0,0
1202,businessinsider.es_20250718095108856636,"[0.002676026662811637, -0.030920324847102165, ...",text-embedding-3-small,gpt-4o-2024-11-20,RUNID_18,relevance_check_v0,es,businessinsider.es,Tesla faces trial for fatal Autopilot accident,https://www.businessinsider.es/tecnologia/tesl...,...,2025-07-14,8,2,1,0,1,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1288,weforum.org_20250718095109471637,"[0.02796872705221176, -0.010471106506884098, 0...",text-embedding-3-small,gpt-4o-2024-11-20,RUNID_18,relevance_check_v0,English,weforum.org,Global Cybersecurity Outlook 2024,https://www.weforum.org/publications/global-cy...,...,,1,2,0,1,0,1,1,1,0
1289,weforum.org_20250718095109461399,"[0.041655149310827255, 0.011486723087728024, 0...",text-embedding-3-small,gpt-4o-2024-11-20,RUNID_18,relevance_check_v0,English,weforum.org,Top 10 Emerging Technologies of 2025,https://www.weforum.org/publications/top-10-em...,...,,1,2,1,1,1,1,2,1,0
1290,weforum.org_20250718095109491980,"[0.029985709115862846, -0.0054162670858204365,...",text-embedding-3-small,gpt-4o-2024-11-20,RUNID_18,relevance_check_v0,English,weforum.org,The Future of Financial Advice,https://www.weforum.org/publications/the-futur...,...,,0,1,0,0,0,0,2,2,0
1291,weforum.org_20250718095109502290,"[0.014725111424922943, -0.04100026935338974, 0...",text-embedding-3-small,gpt-4o-2024-11-20,RUNID_18,relevance_check_v0,English,weforum.org,Promoting Health and Well-Being: Employer Stra...,https://www.weforum.org/publications/promoting...,...,,9,0,0,2,0,2,1,0,0


In [24]:
# Titles of the articles scored 2 for FEDE SEGARRA.
for relevant_title in pd_final.loc[pd_final["FEDE SEGARRA"] == 2, "article_title"]:
    print(relevant_title)

Los concursos de publicidad de la administración pública van a peor
Del cumplimiento a la convicción: por qué las empresas deben liderar el reporting de sostenibilidad más allá de la regulación
Francisco Rionda (aea): 'Los brand manager nos cansamos antes de nuestras propias comunicaciones que el consumidor'
“Sé lo que hicisteis el último verano” toma una playa española
Aumentan los concursos publicitarios, pero empeoran las condiciones
NB SCORE 2024: SCOPEN analiza el nuevo negocio en agencias creativas y de medios
Los concursos públicos de publicidad se disparan… y se endurecen
Las nuevas Pepsi no solo se saborean, también se lucen en las calles en la colección cápsula con ScrapWorld
Alberto Knapp es nombrado como nuevo presidente de WPP en España
La «crème de la crème» de las marcas en Europa: estas son las firmas más fuertes y valiosas
Domino’s Pizza lanza TRIBUTE, el primer documental sobre la historia del gaming en España
La nueva campaña de ‘La Chingona’ de KFC, ‘Vuelve La Ching

In [25]:
# display column names
# List every column available in pd_final (used to pick the export columns).
pd_final.columns

Index(['article_id', 'summary_embedding', 'embedding_model', 'model_x',
       'run_id_x', 'task_name_x', 'article_language', 'source_name',
       'article_title', 'article_url', 'article_keywords', 'crawled_at',
       'RUNID', 'source_url', 'article_date', 'article_body',
       'article_image_url', 'ddgs_search_query', 'query_original_personas',
       'crawling_source', 'model_y', 'summary', 'run_id_y', 'task_name_y',
       'title', 'publish_date', 'topic', 'LAURA GIL', 'FEDE SEGARRA',
       'ELÍSABETH HERNÁNDEZ', 'JAUME ALEMANY', 'RICARDO LECHUGA',
       'JORGE VILLAVECCHIA', 'SALVADOR MARTÍNEZ', 'JOFRE RIERA'],
      dtype='object')

In [26]:
# Columns to export, mapped to the Spanish headers used in the spreadsheet.
export_headers = {
    "article_title": "título",
    "article_url": "url",
    "publish_date": "fecha de publicación",
    "summary": "resumen",
    "source_name": "fuente",
    "article_language": "idioma",
}
# Score -> label; only a score of 2 maps to "R" (presumably "relevant" — confirm).
relevance_labels = {2: "R", 1: "NR", 0: "NR"}

pd_export = pd_final[list(export_headers) + personas].rename(columns=export_headers)
# Relabel the persona score columns only: a frame-wide replace would also rewrite any
# other cell whose value happens to equal 0, 1 or 2.
pd_export[personas] = pd_export[personas].replace(relevance_labels)
pd_export.to_excel("./seleccion_articulos_18-7-25.xlsx", index=False)

In [27]:
# Load the 2023-2024 LinkedIn calendar posts as a DataFrame.
# NOTE(review): absolute local path — the notebook only runs where this file exists.
pd_past_posts = pd.read_json("/home/xavaki/DAMM/linkedin_gen_contents/calendario_linkedin_2023_2024_articles.json", encoding="utf-8-sig")

In [28]:
# Preview the first past posts.
pd_past_posts.head()

Unnamed: 0,mes,perfil,fecha,tipo_de_contenido,tema,informacion,segmento,copy,link,article_source
0,enero 2023,ANGEL GUARCH,2022-01-19,CURATED,BANCA EN LA S,,,La banca en la sombra o paralela escapa de la ...,https://elperiodicodemexico.com/nota.php?id=10...,https://elperiodicodemexico.com
1,enero 2023,ANGEL GUARCH,2023-01-25,CURATED,MERCADO DE CAPITALES EUROPEO,,,Todo parece indicar la unión del mercado de ca...,https://www.lavanguardia.com/economia/20221125...,https://www.lavanguardia.com
2,enero 2023,ANGEL GUARCH,2023-01-28,CURATED,ECONOMÍA Y NATURALEZA,,,"Según el Foro Económico Mundial, la mitad de l...",https://www.eleconomista.es/opinion/noticias/1...,https://www.eleconomista.es
3,enero 2023,ELÍSABETH HERNÁNDEZ,2022-01-04,CURATED,HABILIDADES + VALORADAS,,,Las llamadas habilidades duras son y seguirán ...,https://www.rrhhpress.com/tendencias/56109-las...,https://www.rrhhpress.com
4,enero 2023,ELÍSABETH HERNÁNDEZ,2022-01-11,CURATED,DIGITALIZACIÓN DPTO. PERSONAS,,,Las empresas españolas siguen apostando por la...,https://www.silicon.es/el-53-de-las-empresas-e...,https://www.silicon.es


In [29]:
# Load the persona profiles as a DataFrame.
# NOTE(review): absolute local path — the notebook only runs where this file exists.
pd_personas = pd.read_json("/home/xavaki/DAMM/linkedin_gen_contents/personas.json", encoding="utf-8-sig")

In [30]:
# Inspect the persona profiles.
pd_personas

Unnamed: 0,name,role,linkedin_bio,linkedin_job_description
0,LAURA GIL,Chief Data Analytics Officer,Aquellas personas que me conocen saben que mi ...,-
1,FEDE SEGARRA,Chief Communications Officer,Embajador de la comunicación de Damm y sus mar...,-
2,ELÍSABETH HERNÁNDEZ,Human Resources Development Director,Tras 15 años trabajando en la gestión de perso...,Como Directora de Desarrollo de Personas en Da...
3,JAUME ALEMANY,Chief Marketing Officer,Leading talented and creative teams focused on...,-
4,RICARDO LECHUGA,HR Director,Desde el departamento de Recursos Humanos de D...,-
5,JORGE VILLAVECCHIA,President,"Durante más de veinte años, he tenido el privi...",-
6,SALVADOR MARTÍNEZ,Chief Financial Officer,Mi trayectoria profesional ha estado siempre v...,-
7,JOFRE RIERA,Sponsorships Manager,Vivo con pasión y compromiso mi actividad como...,-


In [31]:
def get_example_copys(perfil, n=5, random_state=None):
    """Return up to ``n`` randomly sampled past post texts for one profile.

    Args:
        perfil: value matched against the "perfil" column of ``pd_past_posts``.
        n: maximum number of examples to return.
        random_state: optional seed forwarded to ``Series.sample`` for reproducible
            picks; the default ``None`` keeps the sampling random.

    Returns:
        list: "copy" values of the sampled posts; empty when the profile has no
        post with a non-missing copy.
    """
    # Skip posts without text: a missing copy would be rendered as "nan" in a prompt.
    copys_perfil = pd_past_posts.loc[pd_past_posts["perfil"] == perfil, "copy"].dropna()
    sampled_copys = copys_perfil.sample(min(n, len(copys_perfil)), random_state=random_state)
    return sampled_copys.tolist()

In [32]:
def generate_prompt(persona_description : str, example_copys : list[str]) -> str:
    """Build the base system prompt used to generate LinkedIn posts for one executive.

    Args:
        persona_description: the executive's LinkedIn bio, inserted verbatim.
        example_copys: past post texts shown as numbered style examples; when empty,
            the examples section is left out.

    Returns:
        str: the prompt, with sections separated by blank lines and no leading or
        trailing whitespace.
    """
    task_instructions = [
        "Use the article summary to craft a short LinkedIn post (max 50 words)",
        "Keep the language clear and thoughtful",
        "Highlight the key insight, and add a brief reflection or perspective from their point of view",
        "Avoid generic buzzwords — focus on what feels real and actionable"
    ]

    # On roughly half of the calls, additionally allow a call to action / question.
    if random() < 0.5:
        task_instructions.append("Only when appropriate, include a call to action or question to engage their audience, but don't overdo it")

    # Built outside any f-string: a backslash inside an f-string expression is a
    # SyntaxError before Python 3.12. Every instruction, including the first, gets
    # its own "- " bullet.
    instructions_block = "\n".join(f"- {instruction}" for instruction in task_instructions)

    # Gender-neutral wording ("their"): the same prompt is used for every executive.
    prompt_sections = [
        "You are a content writer creating LinkedIn posts for spanish brewing company DAMM business executive. "
        "Your job is to turn article summaries into short, thoughtful LinkedIn posts that reflect their tone and professional perspective.",
        f"Here's the executive's linkedin bio:\n{persona_description}",
        f"Your task:\n{instructions_block}",
    ]

    # Only announce examples when there are some to show.
    if example_copys:
        examples_block = "\n\n".join(
            f"Example {i+1}:\n{post}" for i, post in enumerate(example_copys)
        )
        prompt_sections.append(
            "Here are some examples of their past posts to guide your writing style and tone:\n\n"
            + examples_block
        )

    return "\n\n".join(prompt_sections).strip()

In [33]:
copys = []

for persona_name in personas:
    print(f"Generating posts for {persona_name}")
    persona_description = pd_personas[pd_personas["name"] == persona_name]["linkedin_bio"].values[0]
    example_copys = get_example_copys(persona_name, n=5)
    # Persona-level prompt, shared unchanged by every article of this persona.
    base_prompt = generate_prompt(persona_description, example_copys)

    # Up to five randomly chosen articles scored 2 for this persona.
    pd_persona_summaries = pd_final[pd_final[persona_name] == 2][["article_title", "article_language", "summary", "article_url"]]
    persona_summaries = pd_persona_summaries.sample(min(5, len(pd_persona_summaries))).to_dict(orient="records")

    for article in persona_summaries:
        article_title = article["article_title"]
        article_language = article["article_language"]
        summary = article["summary"]
        article_url = article["article_url"]

        print(f"Generating post for article: {article_title}")

        # Build the system prompt per article instead of appending to base_prompt:
        # appending in place would stack one language instruction per earlier article,
        # so later requests would carry several (possibly conflicting) languages.
        language_instruction = (
            " \n"
            f"    Generate a Linkedin post in the following language: {article_language}.\n"
            "        \n"
            "    Here's the article summary you need to work with:\n"
            "\n"
            "    "
        )
        system_prompt = base_prompt + language_instruction

        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": dedent(article_title + "\n" + summary).strip()},
        ]
        response = client.chat.completions.create(
            model="gpt-4o-mini", messages=messages, temperature=0.7
        )

        response_raw = response.choices[0].message.content

        # Keep the generated copy together with the article it was written for.
        copys.append({"perfil" : persona_name, "copy": response_raw, "article_title": article_title, "article_language": article_language, "summary": summary, "article_url" : article_url })


Generating posts for LAURA GIL
Generating post for article: What’s next for AI in 2025
Generating post for article: 43% of CFOs say half of business travel could be replaced by virtual meetings
Generating post for article: Google Veo 3 and playable world models
Generating post for article: Google rolls out Veo 3 globally
Generating post for article: ‘Europe is not the US’: Tech insiders call for smarter AI rules
Generating posts for FEDE SEGARRA
Generating post for article: Marca, comunicación y cultura: las claves para crear una agencia
Generating post for article: Cómo combatir la desinformación para fortalecer la confianza en la ciencia
Generating post for article: L’Oréal impulsa el uso de cosméticos recargables con una campaña global por la sostenibilidad
Generating posts for ELÍSABETH HERNÁNDEZ
Generating post for article: The Remote Work Paradox: Higher Engagement, Lower Wellbeing
Generating post for article: Judith Planella (Remote): “Hay que implementar políticas claras y just

In [34]:
# Write one text file per persona with the title, URL and generated copy of each post.
for persona_name in personas:
    output_path = f"./{persona_name}_copys_ejemplo.txt"
    persona_entries = (entry for entry in copys if entry["perfil"] == persona_name)
    with open(output_path, "w", encoding="utf-8") as out_file:
        out_file.writelines(
            f"Title: {entry['article_title']}\n"
            f"URL: {entry['article_url']}\n"
            f"Copy:\n{entry['copy']}\n\n"
            for entry in persona_entries
        )