## Extracción de datos de AP news, The Independent, CNN España

In [14]:
import requests
import newspaper
import pandas as pd 

from newspaper import Article
from bs4 import BeautifulSoup
import re

In [15]:

# * Careful because these aren't articles, they are portals
news_sources_world = {
    'bbc': 'https://www.bbc.com/news/world',
    'time':'https://time.com/section/world',
    'france24':'https://www.france24.com/en/world',
    'apnews': 'https://apnews.com/world-news',
    'the_independent': 'https://www.independent.co.uk/world',
    'cnn_espana': 'https://cnnespanol.cnn.com/seccion/mundo/'
}

In [16]:
news_sources_world = {
    'bbc': 'https://www.bbc.com/news/world',
    'time':'https://time.com/section/world',
    'france24':'https://www.france24.com/en/',
    'apnews': 'https://apnews.com/world-news',
    'the_independent': 'https://www.independent.co.uk/world',
    'cnn_espana': 'https://cnnespanol.cnn.com/seccion/mundo/'
}


def scrape_bbc(url):
    """Scrape BBC for article links."""
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
        
    # Encontrar todos los enlaces <a> que contienen noticias
    enlaces = soup.find_all('a', href=True)
        
    # Definir patrones de expresiones regulares para las clases parciales
    clase_promo = re.compile(r'.*PromoLink.*')
    clase_post = re.compile(r'.*LinkPostLink.*')
        
    # Filtrar los enlaces que contienen las clases parciales y no comienzan con "https"
    links = ["https://www.bbc.com" + enlace['href'] for enlace in enlaces if (clase_promo.match(' '.join(enlace.get('class', []))) or clase_post.match(' '.join(enlace.get('class', [])))) and not enlace['href'].startswith('https')]
    return links

def scrape_time(url):
    """Scrape Time for article links."""
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    divs = soup.find_all('div', class_='taxonomy-tout')
    
    links = []
    for div in divs:
        enlace = div.find('a', href=True)
        if enlace:
            links.append("https://time.com" + enlace['href'])
    return links

def scrape_france24(url):
    # Definir el encabezado de agente de usuario, si lo quitas es 403
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }
    # Realizar solicitud HTTP a la página web con el encabezado de agente de usuario
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    enlaces = soup.find_all('a', href=True, attrs={'data-article-item-link': True})
    
    links = []
    for enlace in enlaces:
        enlace_completo = "https://www.france24.com" + enlace['href']
        if enlace_completo not in links:  # Verificar si el enlace es nuevo
            links.append(enlace_completo)  # Agregar a la lista de enlaces a retornar
    return links

def scrape_apnews(url):
    """Scrape AP News for article links."""
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    links = [a['href'] for a in soup.find_all('a', href=True) if a.parent.name == 'h3' and 'PagePromo-title' in a.parent.get('class', [])]
    return links

def scrape_the_independent(url):
    """Scrape The Independent for article links."""
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    links = ['https://www.independent.co.uk' + a['href'] for a in soup.find_all('a', class_='title')]
    return links

def scrape_cnn_espana(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    links = [a['href'] for a in soup.find_all('a', href=True) if 'news__media-item' in a.get('class', []) or (a.parent.name == 'h2' and 'news__title' in a.parent.get('class', []))]
    return links


def extract_articles_from_sources():
    
    bbc_links = scrape_bbc(news_sources_world['bbc'])
    time_links = scrape_time(news_sources_world['time'])
    france24_links = scrape_france24(news_sources_world['france24'])
    apnews_links = scrape_apnews(news_sources_world['apnews'])
    the_independent_links = scrape_the_independent(news_sources_world['the_independent'])
    cnn_espana_links = scrape_cnn_espana(news_sources_world['cnn_espana'])
    
    #print('The Independent Links:', the_independent_links)
    #print('AP News Links:', apnews_links)
    #print('CNN España Links:', cnn_espana_links)
    articles = {'bbc_articles':bbc_links, 'time_articles':time_links, 'france24_articles':france24_links, 'ap_news_articles': apnews_links, 'the_independent_articles': the_independent_links,'cnn_espana_articles': cnn_espana_links}
    return articles


In [17]:
print("Articles from the WORLD news page from today:")
extract_articles_from_sources()

Articles from the WORLD news page from today:


{'bbc_articles': ['https://www.bbc.com/news/world-europe-68423229',
  'https://www.bbc.com/news/world-europe-68425869',
  'https://www.bbc.com/news/world-us-canada-68428697',
  'https://www.bbc.com/news/world-us-canada-68422118',
  'https://www.bbc.com/news/world-europe-68423990',
  'https://www.bbc.com/news/world-africa-68353437',
  'https://www.bbc.com/news/world-us-canada-68420543',
  'https://www.bbc.com/news/world-europe-68428670',
  'https://www.bbc.com/news/world-africa-68424863',
  'https://www.bbc.com/news/world-australia-68420791',
  'https://www.bbc.com/news/world-us-canada-68420519',
  'https://www.bbc.com/news/world-asia-68402139',
  'https://www.bbc.com/news/world-middle-east-68419581',
  'https://www.bbc.com/news/world-middle-east-68401909',
  'https://www.bbc.com/news/world-middle-east-68380776',
  'https://www.bbc.com/news/world-middle-east-68375460',
  'https://www.bbc.com/news/world-europe-68425869',
  'https://www.bbc.com/news/world-europe-68419267',
  'https://www.

## Preprocessing from the newspapers3k API: 

In [18]:
def extract_article_details(url):
    article = Article(url)
    article.download()
    article.parse()
    article.nlp()
    article_details = {
        'title':article.title, 
        'text': article.text, 
        'authors': article.authors, 
        'date': article.publish_date, 
        'keywords': article.keywords
    }
    return article_details

In [19]:
def process_article_urls(article_urls_dict):
    articles_data = []

    for source, urls in article_urls_dict.items():
        for url in urls:
            try:
                article_details = extract_article_details(url)
                article_details['source'] = source  # Adding the source information
                articles_data.append(article_details)
            except Exception as e:
                print(f"Failed to process {url} from {source}: {str(e)}")
    
    return articles_data

In [20]:
articles_data = process_article_urls(extract_articles_from_sources())
df = pd.DataFrame(articles_data)

df.head()

Failed to process https://www.france24.com//observers.france24.com/en/tv-shows/the-observers/20240227-gaza-why-are-israeli-protesters-blocking-humanitarian-aid-for-gaza from france24_articles: Article `download()` failed with 404 Client Error: Not Found for url: https://www.france24.com//observers.france24.com/en/tv-shows/the-observers/20240227-gaza-why-are-israeli-protesters-blocking-humanitarian-aid-for-gaza on URL https://www.france24.com//observers.france24.com/en/tv-shows/the-observers/20240227-gaza-why-are-israeli-protesters-blocking-humanitarian-aid-for-gaza
Failed to process https://www.france24.com//observers.france24.com/en/asia-pacific/20240220-misinformation-endangering-rohingya-refugees-indonesia from france24_articles: Article `download()` failed with 404 Client Error: Not Found for url: https://www.france24.com//observers.france24.com/en/asia-pacific/20240220-misinformation-endangering-rohingya-refugees-indonesia on URL https://www.france24.com//observers.france24.com/en

KeyboardInterrupt: 

In [None]:
df.tail()

Unnamed: 0,title,text,authors,date,keywords,source
328,Japón registra un mínimo histórico de natalida...,Japón le paga a familias para que salgan de To...,[Melissa Velásquez Loaiza],2024-02-27 00:00:00,"[es, una, la, que, los, octavo, registra, nata...",cnn_espana_articles
329,La Policía de Australia encuentra restos en la...,"Luke Davies, izquierda, y Jesse Baird fueron v...",[Melissa Velásquez Loaiza],2024-02-27 00:00:00,"[se, por, una, del, los, y, pareja, gay, resto...",cnn_espana_articles
330,La Policía de Australia encuentra restos en la...,"Luke Davies, izquierda, y Jesse Baird fueron v...",[Melissa Velásquez Loaiza],2024-02-27 00:00:00,"[se, por, una, del, los, y, pareja, gay, resto...",cnn_espana_articles
331,ANÁLISIS | Los peligrosos paralelismos entre l...,"Vladimir Putin y Xi Jinping, dos líderes fuert...",[Melissa Velásquez Loaiza],2024-02-27 00:00:00,"[sobre, reclamaciones, que, una, los, en, puti...",cnn_espana_articles
332,ANÁLISIS | Los peligrosos paralelismos entre l...,"Vladimir Putin y Xi Jinping, dos líderes fuert...",[Melissa Velásquez Loaiza],2024-02-27 00:00:00,"[sobre, reclamaciones, que, una, los, en, puti...",cnn_espana_articles


NEWS API exploration
Posible exploración para conectar al agente, a la hora de extraer las noticias de manera más generalista: 

In [21]:
news_api_key = '6ddc83d9e2974d0fabbac57924805fa3'

In [40]:
def search_articles(query, from_date, to_date, language='en',page_size=50):
    base_url = "https://newsapi.org/v2/everything?"
    api_key = news_api_key
    
    # Constructing the API request URL with parameters
    search_url = f"{base_url}q={query}&from={from_date}&to={to_date}&language={language}&apiKey={api_key}&pageSize={page_size}"
    print(search_url)
    # Making the request to NewsAPI
    response = requests.get(search_url)
    
    # Checking if the request was successful
    if response.status_code == 200:
        return response.json()  # Return the JSON response if successful
    else:
        return None


In [23]:
# Example usage:
from_date = "2024-02-01"
to_date = "2024-02-27"
query = "Ábalos&PSOE"
articles_response = search_articles(query, from_date, to_date, 'es')

if articles_response:
    # Loop through the articles and print some details
    for article in articles_response['articles']:
        print(f"Title: {article['title']}")
        print(f"Author: {article['author']}")
        print(f"Published At: {article['publishedAt']}")
        print(f"Source: {article['source']['name']}")
        print(f"URL: {article['url']}\n")
else:
    print("Failed to retrieve articles")

https://newsapi.org/v2/everything?q=Ábalos&PSOE&from=2024-02-01&to=2024-02-27&language=es&apiKey=6ddc83d9e2974d0fabbac57924805fa3
Title: Ábalos no cede a la presión del PSOE y mantiene su acta de diputado: "No estoy acusado de nada"
Author: marca.com
Published At: 2024-02-27T13:58:19Z
Source: Marca
URL: https://www.marca.com/tiramillas/actualidad/2024/02/27/65ddc005268e3ef7618b4576.html

Title: Wyoming ironiza sobre la negativa de Ábalos a dejar su acta de diputado: "Qué poco le gusta moverse"
Author: marca.com
Published At: 2024-02-27T09:05:18Z
Source: Marca
URL: https://www.marca.com/tiramillas/television/2024/02/27/65dda557e2704e34b68b45a4.html

Title: Ábalos descarta dimitir pero admite que si el 'caso Koldo' hubiera estallado cuando era ministro sí lo habría hecho
Author: Europa Press
Published At: 2024-02-24T21:23:31Z
Source: El Mundo
URL: https://www.elmundo.es/espana/2024/02/24/65da5dabe4d4d8bb3b8b459a.html

Title: El PP apunta a Sánchez por el 'caso Koldo': "Le afecta directam

# Uso de Maisa con noticias en texto de la API

In [34]:
def convert_articles_to_dict(articles_list):
    # Crear un diccionario para almacenar las URLs categorizadas por fuente
    article_urls_dict = {}
    for article in articles_list:
        # Obtenemos el ID de la fuente
        source_id = article['source']['id']
        # Si la fuente aún no está en el diccionario, la añadimos
        if source_id not in article_urls_dict:
            article_urls_dict[source_id] = []
        # Añadimos la URL del artículo a la lista correspondiente a su fuente
        article_urls_dict[source_id].append(article['url'])
    return article_urls_dict

In [41]:
# Example usage:
from_date = "2024-02-26"
to_date = "2024-02-27"
query = "Ábalos&PSOE"
articles_response = search_articles(query, from_date, to_date, 'es', 3)

if articles_response:
    articles_data = process_article_urls(convert_articles_to_dict(articles_response['articles']))
    df = pd.DataFrame(articles_data)
else:
    print("Failed to retrieve articles")

https://newsapi.org/v2/everything?q=Ábalos&PSOE&from=2024-02-26&to=2024-02-27&language=es&apiKey=6ddc83d9e2974d0fabbac57924805fa3&pageSize=3


In [82]:
api_key = "sk_live_QgMwc89NHPxhEaD6CeYPMx_kLFsPqUz8ciTP3ErH5L24t" #Enter the API key
from maisa import Maisa
import requests

maisa = Maisa(
    # Jesus's Api
    api_key= api_key,
)

language = 'es' # Si no se especifica es inglés 
summary_length = 'medium' #(short, medium, long)

In [96]:
# Topic and Purpose
# Target Audience
# Quotes or Sources

def query_generator(df_texts,df_sources, language_query='en',purpose='breaking news',target='general public'):
    i=0
    query = f"You are tasked with crafting a comprehensive news article for a specific publication. This article will address a current issue or topic of interest, drawing upon {len(df_texts)+1} articles to provide a well-rounded perspective in the language {language_query}. "
    for i in range(0, len(df_texts)):
        summary = maisa.capabilities.summarize(
            text = df_texts[i], # (Required)
            lang = language,
            length = summary_length,
        )
        query += f"This is the article number {i+1} from the source "+ df_sources[i] +": "
        query += summary.summary+ " "
    query += "After providing the details of the source articles, you will be asked to integrate this information into a coherent and well-structured news article, adhering to a style for the "+ target +" and the style is of "+ purpose +"."
    return query

In [95]:
# Example
query_generator(df['text'],df['source'],'es')

'You are tasked with crafting a comprehensive news article for a specific publication. This article will address a current issue or topic of interest, drawing upon 4 articles to provide a well-rounded perspective in the language es. This is the article number 1 from the source marca: El exministro José Luis Ábalos ha decidido mantener su escaño en el Congreso de los Diputados a pesar del ultimátum del PSOE para que dejara su acta de diputado. Ábalos anunció su dimisión como presidente de la Comisión de Interior, pero mantuvo su acta de diputado. En una comparecencia en la sala de prensa del Congreso, Ábalos afirmó que no está acusado de nada y que no tiene ningún enriquecimiento ilícito. Además, anunció que pasará al grupo mixto del Congreso. El PSOE ha suspendido de militancia a Ábalos tras esta decisión.\nThis is the article number 2 from the source marca: El panorama político está a la espera de la decisión de José Luis Ábalos sobre si dejará su acta de diputado con el PSOE. En el p

In [97]:
url = "https://api.maisa.ai/v1/kpu/run?explain_steps=false&retries=1"

query = query_generator(df['text'],df['source'],'es')

payload = "-----011000010111000001101001\r\nContent-Disposition: form-data; name=\"query\"\r\n\r\n"+ query +"\r\n-----011000010111000001101001--\r\n\r\n"

headers = {
    "X-API-Key": api_key,
    "accept": "application/json",
    "content-type": "multipart/form-data; boundary=---011000010111000001101001"
}

response = requests.post(url, data=payload, headers=headers)

print(response.text)

<html>
<head><title>504 Gateway Time-out</title></head>
<body>
<center><h1>504 Gateway Time-out</h1></center>
</body>
</html>

