## Extracción de datos de AP news, The Independent, CNN España

In [24]:
import requests
import newspaper
import pandas as pd 

from newspaper import Article
from bs4 import BeautifulSoup

In [1]:

# * Careful because these aren't articles, they are portals
news_sources_world = {
    'apnews': 'https://apnews.com/world-news',
    'the_independent': 'https://www.independent.co.uk/world',
    'cnn_espana': 'https://cnnespanol.cnn.com/seccion/mundo/'
}

In [12]:
news_sources_world = {
    'apnews': 'https://apnews.com/world-news',
    'the_independent': 'https://www.independent.co.uk/world',
    'cnn_espana': 'https://cnnespanol.cnn.com/seccion/mundo/'
}

def scrape_apnews(url):
    """Scrape AP News for article links."""
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    links = [a['href'] for a in soup.find_all('a', href=True) if a.parent.name == 'h3' and 'PagePromo-title' in a.parent.get('class', [])]
    return links

def scrape_the_independent(url):
    """Scrape The Independent for article links."""
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    links = ['https://www.independent.co.uk' + a['href'] for a in soup.find_all('a', class_='title')]
    return links

def scrape_cnn_espana(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    links = [a['href'] for a in soup.find_all('a', href=True) if 'news__media-item' in a.get('class', []) or (a.parent.name == 'h2' and 'news__title' in a.parent.get('class', []))]
    return links

def extract_articles_from_sources():
    
    apnews_links = scrape_apnews(news_sources_world['apnews'])
    the_independent_links = scrape_the_independent(news_sources_world['the_independent'])
    cnn_espana_links = scrape_cnn_espana(news_sources_world['cnn_espana'])
    
    #print('The Independent Links:', the_independent_links)
    #print('AP News Links:', apnews_links)
    #print('CNN España Links:', cnn_espana_links)
    articles = {'ap_news_articles': apnews_links, 'the_independent_articles': the_independent_links,'cnn_espana_articles': cnn_espana_links}
    return articles


In [13]:
print("Articles from the WORLD news page from today:")
extract_articles_from_sources()

Articles from the WORLD news page from today:
AP News Links: ['https://apnews.com/article/gaza-hamas-israel-ceasefire-deal-2ef0d5b960c4f132bfe9d91b19878a12', 'https://apnews.com/article/israel-hamas-war-news-02-27-2024-e17abedeaf5a005fcd5e7095fecacb7a', 'https://apnews.com/article/israel-hamas-war-latest-02-27-2024-5c62b0deecf503de9f1e02f70a6504b6', 'https://apnews.com/article/russia-crackdown-prison-navalny-karamurza-putin-3e5b9f5d3cfde3256819fbde5e405067', 'https://apnews.com/article/russia-ukraine-orlov-nobel-sentenced-8539517d8b2c846706607584ba5f9bbb', 'https://apnews.com/article/pakistan-mariam-nawaz-chief-minister-punjab-6b5296c5adfb42bd9ec5003196ab292c', 'https://apnews.com/article/senegal-elections-president-sall-173327b00f6026070d4095d90d0cdf34', 'https://apnews.com/article/senate-president-election-hun-sen-manet-c8613f0cc226f938f3e09b28e65a565e', 'https://apnews.com/article/tuvalu-prime-minister-feleti-teo-bae1874651a7a0fc86594f04b9ebe7dd', 'https://apnews.com/25-elections-in

{'ap_news_articles': ['https://apnews.com/article/gaza-hamas-israel-ceasefire-deal-2ef0d5b960c4f132bfe9d91b19878a12',
  'https://apnews.com/article/israel-hamas-war-news-02-27-2024-e17abedeaf5a005fcd5e7095fecacb7a',
  'https://apnews.com/article/israel-hamas-war-latest-02-27-2024-5c62b0deecf503de9f1e02f70a6504b6',
  'https://apnews.com/article/russia-crackdown-prison-navalny-karamurza-putin-3e5b9f5d3cfde3256819fbde5e405067',
  'https://apnews.com/article/russia-ukraine-orlov-nobel-sentenced-8539517d8b2c846706607584ba5f9bbb',
  'https://apnews.com/article/pakistan-mariam-nawaz-chief-minister-punjab-6b5296c5adfb42bd9ec5003196ab292c',
  'https://apnews.com/article/senegal-elections-president-sall-173327b00f6026070d4095d90d0cdf34',
  'https://apnews.com/article/senate-president-election-hun-sen-manet-c8613f0cc226f938f3e09b28e65a565e',
  'https://apnews.com/article/tuvalu-prime-minister-feleti-teo-bae1874651a7a0fc86594f04b9ebe7dd',
  'https://apnews.com/25-elections-in-2024-that-could-chang

## Preprocessing from the newspapers3k API: 

In [11]:
def extract_article_details(url):
    article = Article(url)
    article.download()
    article.parse()
    article.nlp()
    article_details = {
        'title':article.title, 
        'text': article.text, 
        'authors': article.authors, 
        'date': article.publish_date, 
        'keywords': article.keywords
    }
    return article_details

In [14]:
def process_article_urls(article_urls_dict):
    articles_data = []

    for source, urls in article_urls_dict.items():
        for url in urls:
            try:
                article_details = extract_article_details(url)
                article_details['source'] = source  # Adding the source information
                articles_data.append(article_details)
            except Exception as e:
                print(f"Failed to process {url} from {source}: {str(e)}")
    
    return articles_data

In [17]:
articles_data = process_article_urls(extract_articles_from_sources())
df = pd.DataFrame(articles_data)

df.head()

AP News Links: ['https://apnews.com/article/gaza-hamas-israel-ceasefire-deal-2ef0d5b960c4f132bfe9d91b19878a12', 'https://apnews.com/article/israel-hamas-war-news-02-27-2024-e17abedeaf5a005fcd5e7095fecacb7a', 'https://apnews.com/article/israel-hamas-war-latest-02-27-2024-5c62b0deecf503de9f1e02f70a6504b6', 'https://apnews.com/article/russia-crackdown-prison-navalny-karamurza-putin-3e5b9f5d3cfde3256819fbde5e405067', 'https://apnews.com/article/russia-ukraine-orlov-nobel-sentenced-8539517d8b2c846706607584ba5f9bbb', 'https://apnews.com/article/pakistan-mariam-nawaz-chief-minister-punjab-6b5296c5adfb42bd9ec5003196ab292c', 'https://apnews.com/article/senegal-elections-president-sall-173327b00f6026070d4095d90d0cdf34', 'https://apnews.com/article/senate-president-election-hun-sen-manet-c8613f0cc226f938f3e09b28e65a565e', 'https://apnews.com/article/tuvalu-prime-minister-feleti-teo-bae1874651a7a0fc86594f04b9ebe7dd', 'https://apnews.com/25-elections-in-2024-that-could-change-the-world', 'https://a

Unnamed: 0,title,text,authors,date,keywords,source
0,A deal between Israel and Hamas appears to be ...,CAIRO (AP) — Israel and Hamas are inching towa...,[],2024-02-27 17:47:47,"[deal, egyptian, shape, hostages, official, is...",ap_news_articles
1,Israel and Hamas indicate no deal is imminent ...,JERUSALEM (AP) — Israel and Hamas on Tuesday p...,[],2024-02-27 05:31:50,"[rafah, deal, women, offensive, hostages, kill...",ap_news_articles
2,The Latest | Biden says Israel is willing to h...,President Joe Biden says Israel would be willi...,[],2024-02-27 05:46:31,"[military, deal, latest, hostages, war, hostag...",ap_news_articles
3,What’s life like for Russia’s political prison...,"TALLINN, Estonia (AP) — Vladimir Kara-Murza co...",[Litvinova Is An Associated Press Corresponden...,2024-02-27 05:03:06,"[life, punishment, russias, arbitrary, karamur...",ap_news_articles
4,Prominent Russian human rights activist Oleg O...,A veteran human rights campaigner who criticiz...,[],2024-02-27 09:44:52,"[oleg, russian, sentenced, prison, court, stat...",ap_news_articles


In [18]:
df.tail()

Unnamed: 0,title,text,authors,date,keywords,source
207,La Policía de Australia encuentra restos en la...,"Luke Davies, izquierda, y Jesse Baird fueron v...",[Melissa Velásquez Loaiza],2024-02-27 00:00:00,"[se, que, gay, del, policía, en, y, la, pareja...",cnn_espana_articles
208,ANÁLISIS | Los peligrosos paralelismos entre l...,"Vladimir Putin y Xi Jinping, dos líderes fuert...",[Melissa Velásquez Loaiza],2024-02-27 00:00:00,"[xi, paralelismos, que, una, su, en, sobre, la...",cnn_espana_articles
209,ANÁLISIS | Los peligrosos paralelismos entre l...,"Vladimir Putin y Xi Jinping, dos líderes fuert...",[Melissa Velásquez Loaiza],2024-02-27 00:00:00,"[xi, paralelismos, que, una, su, en, sobre, la...",cnn_espana_articles
210,"Antes de morir, Navalny estuvo posiblemente a ...",¿Cómo sigue la política rusa tras la muerte de...,[],2024-02-26 00:00:00,"[se, que, estuvo, navalny, posiblemente, su, p...",cnn_espana_articles
211,"Antes de morir, Navalny estuvo posiblemente a ...",¿Cómo sigue la política rusa tras la muerte de...,[],2024-02-26 00:00:00,"[se, que, estuvo, navalny, posiblemente, su, p...",cnn_espana_articles


NEWS API exploration
Posible exploración para conectar al agente, a la hora de extraer las noticias de manera más generalista: 

In [19]:
news_api_key = '6ddc83d9e2974d0fabbac57924805fa3'

In [30]:
def search_articles(query, from_date, to_date, language='en'):
    base_url = "https://newsapi.org/v2/everything?"
    api_key = news_api_key
    
    # Constructing the API request URL with parameters
    search_url = f"{base_url}q={query}&from={from_date}&to={to_date}&language={language}&apiKey={api_key}"
    print(search_url)
    # Making the request to NewsAPI
    response = requests.get(search_url)
    
    # Checking if the request was successful
    if response.status_code == 200:
        return response.json()  # Return the JSON response if successful
    else:
        return None


In [34]:
# Example usage:
from_date = "2024-02-01"
to_date = "2024-02-27"
query = "Ábalos&PSOE"
articles_response = search_articles(query, from_date, to_date, 'es')

if articles_response:
    # Loop through the articles and print some details
    for article in articles_response['articles']:
        print(f"Title: {article['title']}")
        print(f"Author: {article['author']}")
        print(f"Published At: {article['publishedAt']}")
        print(f"Source: {article['source']['name']}")
        print(f"URL: {article['url']}\n")
else:
    print("Failed to retrieve articles")

https://newsapi.org/v2/everything?q=Ábalos&PSOE&from=2024-02-01&to=2024-02-27&language=es&apiKey=6ddc83d9e2974d0fabbac57924805fa3
Title: Ábalos descarta dimitir pero admite que si el 'caso Koldo' hubiera estallado cuando era ministro sí lo habría hecho
Author: Europa Press
Published At: 2024-02-24T21:23:31Z
Source: El Mundo
URL: https://www.elmundo.es/espana/2024/02/24/65da5dabe4d4d8bb3b8b459a.html

Title: El PP apunta a Sánchez por el 'caso Koldo': "Le afecta directamente"
Author: Juanma Lamet
Published At: 2024-02-22T10:48:07Z
Source: El Mundo
URL: https://www.elmundo.es/espana/2024/02/22/65d7232ce85ece84228b45c5.html

Title: El alcalde de León (PSOE) revela cómo fue amenazado por Koldo García: "Me dijo que le quedaban tres años para joderme"
Author: Marta Belver
Published At: 2024-02-22T09:29:59Z
Source: El Mundo
URL: https://www.elmundo.es/espana/2024/02/22/65d70b69fdddff1a7f8b45a5.html

Title: La Audiencia Nacional intervino los teléfonos de la trama de Koldo tras 17 meses de inve