In [None]:
from duckduckgo_search import DDGS
from time import sleep
import pandas as pd
from pydantic import BaseModel, ValidationError
from time import time
import json
import os
from dotenv import load_dotenv
from azure.storage.blob import BlobServiceClient

In [105]:
load_dotenv('/home/xavaki/DAMM/linkedin_gen_contents/.env')
runid = "DEVRUN"


In [None]:
def search_news_by_keywords(keyword, persona_name, max_results=10000):
    search_term = keyword 
    with DDGS() as ddgs:
        results = ddgs.news(
            keywords=search_term,
            region="es-es",
            safesearch="Moderate",
            timelimit="w",
            max_results=max_results
        )
        return [{"persona": persona_name, "query": search_term, **r} for r in results]


In [107]:
persona_keywords = {
    "Laura Gil": [
        "inteligencia artificial", "datos", "transformación digital",
        "industria 4.0", "automatización", "cultura de innovación"
    ],
    "Fede Segarra": [
        "comunicación corporativa", "reputación de marca", "PR", 
        "estrategias de comunicación", "imagen pública"
    ],
    "Elísabeth Hernández": [
        "recursos humanos", "RSC", "desarrollo del talento", 
        "formación corporativa", "comunicación interna", "voluntariado"
    ],
    "Jaume Alemany": [
        "marketing", "branding", "consumo", "bebidas", 
        "innovación de producto", "campañas publicitarias"
    ],
    "Ricardo Lechuga": [
        "transformación cultural", "talento interno", "digitalización RRHH", 
        "engagement de empleados", "liderazgo cercano"
    ],
    "Jorge Villavecchia": [
        "liderazgo empresarial", "estrategia corporativa", "sostenibilidad", 
        "transformación de negocios", "expansión internacional"
    ],
    "Salvador Martínez": [
        "finanzas sostenibles", "crecimiento responsable", 
        "decisiones basadas en datos", "ética empresarial"
    ],
    "Jofre Riera": [
        "patrocinios deportivos", "marketing cultural", 
        "experiencias de marca", "impacto social"
    ]
}


In [108]:
all_results = []
for persona, keywords in persona_keywords.items():
    keyword_results = []
    for keyword in keywords:
        results = search_news_by_keywords(keyword, persona)
        keyword_results.extend(results)
        print(f"Results for {persona} on '{keyword}': {len(results)} found.")
        print(f"Sleeping for 20 seconds to avoid rate limiting...")
        sleep(20)
        break 
        
    all_results.extend(keyword_results)

Results for Laura Gil on 'inteligencia artificial': 55 found.
Sleeping for 20 seconds to avoid rate limiting...
Results for Fede Segarra on 'comunicación corporativa': 89 found.
Sleeping for 20 seconds to avoid rate limiting...
Results for Elísabeth Hernández on 'recursos humanos': 55 found.
Sleeping for 20 seconds to avoid rate limiting...
Results for Jaume Alemany on 'marketing': 58 found.
Sleeping for 20 seconds to avoid rate limiting...
Results for Ricardo Lechuga on 'transformación cultural': 49 found.
Sleeping for 20 seconds to avoid rate limiting...
Results for Jorge Villavecchia on 'liderazgo empresarial': 48 found.
Sleeping for 20 seconds to avoid rate limiting...
Results for Salvador Martínez on 'finanzas sostenibles': 67 found.
Sleeping for 20 seconds to avoid rate limiting...
Results for Jofre Riera on 'patrocinios deportivos': 28 found.
Sleeping for 20 seconds to avoid rate limiting...


In [109]:
df_results = pd.DataFrame(all_results)
df_results.head()

Unnamed: 0,persona,query,date,title,body,url,image,source
0,Laura Gil,inteligencia artificial,2025-06-23T07:01:00+00:00,La inteligencia artificial revela cuál es la m...,Por todo el Archipiélago se celebra la noche m...,https://www.msn.com/es-es/viajes/noticias/la-i...,https://fotografias.larazon.es/clipping/cmsima...,La Razón
1,Laura Gil,inteligencia artificial,2025-06-23T05:00:00+00:00,Esta es la ciudad de España donde viven las pe...,"En primer lugar, esta tecnología indica que es...",https://www.abc.es/recreo/ciudad-espana-viven-...,https://s3.abcstatics.com/abc/www/multimedia/r...,ABC
2,Laura Gil,inteligencia artificial,2025-06-23T04:15:24+00:00,Las empresas aplican la Inteligencia Artificia...,Las empresas comienzan a utilizar la Inteligen...,https://www.msn.com/es-es/tecnología/inteligen...,https://img-s-msn-com.akamaized.net/tenant/amp...,elEconomista.es
3,Laura Gil,inteligencia artificial,2025-06-22T17:16:45+00:00,Avances y retos de la predicción con inteligen...,¿Hasta dónde puede llegar la predicción con in...,https://www.msn.com/es-es/tecnología/inteligen...,https://img-s-msn-com.akamaized.net/tenant/amp...,Núcleo Visual
4,Laura Gil,inteligencia artificial,2025-06-22T17:29:50+00:00,Más de la mitad de los españoles usan la Intel...,Más de la mitad de los españoles reconoce que ...,https://www.msn.com/es-es/noticias/tecnologia/...,https://img.europapress.es/fotoweb/fotonoticia...,Europa Press


In [110]:
len(df_results)

449

In [118]:
class RawArticle(BaseModel):
    run_id: str
    source_name: str
    source_url: str
    article_id: str
    article_date: str
    article_title: str
    article_url: str
    article_body: str
    article_image_url: str
    article_language: str
    crawled_at: str
    ddgs_search_query: list[str]
    query_original_personas: list[str]

In [112]:
df_results = pd.DataFrame(all_results)
df_results.head()

agg_clause = {col : 'first' for col in df_results.columns}
agg_clause.pop('url')
agg_clause['persona'] = lambda x: list(x)
agg_clause['query'] = lambda x: list(x)
df_results = df_results.groupby('url').agg(agg_clause).reset_index()

df_results["source_url"] = df_results["url"].apply(lambda x: "/".join(x.split("/")[:3]) if pd.notna(x) else None)
df_results["source"] = df_results["source"].apply(lambda x: x.replace(" ", "_").lower() if pd.notna(x) else None)
df_results["article_id"] = df_results["source"].apply(lambda x: x + "_" + str(time()).replace(".", "") if pd.notna(x) else None)
df_results.dropna(subset=["source_url", "source", "article_id"], inplace=True)
df_results["crawled_at"] = pd.to_datetime("now").isoformat()
df_results["article_language"] = "es"  
df_results["run_id"] = runid

df_results.rename(columns={
    "title": "article_title",
    "url": "article_url",
    "body": "article_body",
    "image": "article_image_url",
    "source": "source_name",
    "query": "ddgs_search_query",
    "persona": "query_original_personas",
    "date" : "article_date"
}, inplace=True)

df_results.head()

Unnamed: 0,article_url,query_original_personas,ddgs_search_query,article_date,article_title,article_body,article_image_url,source_name,source_url,article_id,crawled_at,article_language,run_id
0,http://campechehoy.mx/2025/06/17/celebran-la-l...,[Jorge Villavecchia],[liderazgo empresarial],2025-06-17T18:02:00+00:00,Celebran la llegada de Claudia,"CDMX - Con pancartas, banderas y gritos de apo...",http://campechehoy.mx/wp-content/uploads/2025/...,campeche_hoy,http://campechehoy.mx,campeche_hoy_17506663546176462,2025-06-23T10:12:34.618966,es,DEVRUN
1,https://abcnoticias.mx/local/2025/6/19/asegura...,[Jorge Villavecchia],[liderazgo empresarial],2025-06-20T05:24:00+00:00,Asegura Estado que en NL hay 3 millones de tra...,El estado cuenta con una Población Económicame...,https://abcnoticias.mx/u/fotografias/m/2025/6/...,abc_noticias,https://abcnoticias.mx,abc_noticias_17506663546176527,2025-06-23T10:12:34.618966,es,DEVRUN
2,https://acento.com.do/economia/abancord-y-ceri...,[Ricardo Lechuga],[transformación cultural],2025-06-21T00:00:00+00:00,Abancord y Cerise+SPTF firman acuerdo para cap...,Con el respaldo del Banco Europeo de Inversion...,https://media.acento.com.do/media/storage02/up...,acento,https://acento.com.do,acento_17506663546176546,2025-06-23T10:12:34.618966,es,DEVRUN
3,https://acento.com.do/opinion/americo-lugo-con...,[Ricardo Lechuga],[transformación cultural],2025-06-19T04:03:00+00:00,"Américo Lugo, conciencia de nación",La democracia dominicana no alcanza su mayor e...,https://media.acento.com.do/media/storage02/up...,acento,https://acento.com.do,acento_17506663546176555,2025-06-23T10:12:34.618966,es,DEVRUN
4,https://actualidadaeroespacial.com/enaire-refu...,[Fede Segarra],[comunicación corporativa],2025-06-19T05:48:00+00:00,ENAIRE refuerza su transformación digital con ...,El Ministerio de Transportes y Movilidad Soste...,https://actualidadaeroespacial.com/wp-content/...,actualidad_aeroespacial,https://actualidadaeroespacial.com,actualidad_aeroespacial_1750666354617656,2025-06-23T10:12:34.618966,es,DEVRUN


In [124]:
# convert to RawArticle model
raw_articles = []
for _, row in df_results.iterrows():
    try:
        article = RawArticle(
            run_id=row["run_id"],
            source_name=row["source_name"],
            source_url=row["source_url"],
            article_id=row["article_id"],
            article_date=row["article_date"],
            article_title=row["article_title"],
            article_url=row["article_url"],
            article_body=row["article_body"],
            article_image_url=row["article_image_url"],
            article_language=row["article_language"],
            crawled_at=row["crawled_at"],
            ddgs_search_query=row["ddgs_search_query"],
            query_original_personas=row["query_original_personas"]
        )
        raw_articles.append(article.dict())
    except ValidationError as e:
        print(f"Validation error for row {row['article_id']}: {e}")

/tmp/ipykernel_44829/581287695.py:20: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.10/migration/
  raw_articles.append(article.dict())


In [125]:
raw_articles

[{'run_id': 'DEVRUN',
  'source_name': 'campeche_hoy',
  'source_url': 'http://campechehoy.mx',
  'article_id': 'campeche_hoy_17506663546176462',
  'article_date': '2025-06-17T18:02:00+00:00',
  'article_title': 'Celebran la llegada de Claudia',
  'article_url': 'http://campechehoy.mx/2025/06/17/celebran-la-llegada-de-claudia/',
  'article_body': 'CDMX - Con pancartas, banderas y gritos de apoyo, la comunidad mexicana en Canadá dio una cálida bienvenida a la presidenta Claudia Sheinbaum a su llegada a Kananaskis, donde participará en la Cumbre',
  'article_image_url': 'http://campechehoy.mx/wp-content/uploads/2025/06/WhatsApp-Image-2025-06-16-at-5.48.37-PM.jpeg',
  'article_language': 'es',
  'crawled_at': '2025-06-23T10:12:34.618966',
  'ddgs_search_query': ['liderazgo empresarial'],
  'query_original_personas': ['Jorge Villavecchia']},
 {'run_id': 'DEVRUN',
  'source_name': 'abc_noticias',
  'source_url': 'https://abcnoticias.mx',
  'article_id': 'abc_noticias_17506663546176527',
  '

In [127]:
blob_service_client = BlobServiceClient.from_connection_string(os.getenv('STORAGE_ACCOUNT_CONNECTION_STRING'))
output_container_name = 'raw-articles-list-ddgs'
output_container = blob_service_client.get_container_client(output_container_name)
output_blob_name = f"{runid}--{output_container_name.replace("-", "_")}.json"
output_blob_client = output_container.get_blob_client(output_blob_name)
output_blob_client.upload_blob(json.dumps(raw_articles, indent=4), overwrite=True)
print(f"Relevant articles list saved to blob storage as {output_blob_name}")

Relevant articles list saved to blob storage as DEVRUN--raw_articles_list_ddgs.json
