In [1]:
import requests
import json
from dotenv import load_dotenv
import os
from bs4 import BeautifulSoup
from datetime import datetime
import hashlib

# NewsAPI Test

In [23]:
# Load variables from a local .env file (if any) into the process environment.
load_dotenv()
key = os.getenv("NEWS_API_KEY")
# Fail fast: os.getenv returns None for an unset variable, and the request would
# then be sent without any API key.
if not key:
    raise RuntimeError("NEWS_API_KEY is not set; define it in the environment or in .env")
url = 'https://newsapi.org/v2/everything'

In [24]:
# Query-string parameters for the NewsAPI request; the search terms are OR-ed together.
params = dict(
    q=' OR '.join(['bitcoin', 'cryptocurrency', 'inflation', 'interest rates']),
    language='en',
    pageSize=5,
    apiKey=key,
)

In [25]:
# Bound the wait so a stalled connection cannot hang the kernel.
r = requests.get(url, params=params, timeout=10)
# Surface HTTP failures explicitly. r.raise_for_status() is deliberately avoided:
# its message embeds the full request URL, which carries the apiKey parameter and
# would end up in the saved notebook output.
if not r.ok:
    raise RuntimeError(f"NewsAPI request failed with HTTP {r.status_code}: {r.text[:200]}")
data = r.json()

In [None]:
# Persist the raw response so the sample can be inspected offline.
with open("../data/00/sample_newsapi.json", 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=2)

# The payload may lack "articles" (e.g. an error response) or hold an empty list,
# so guard before indexing the first element.
newsapi_articles = data.get("articles", [])
print("NewsApi: ", len(newsapi_articles))
print('Primer titulo:', newsapi_articles[0]["title"] if newsapi_articles else "No disponible")

NewsApi:  5
Primer titulo: Bitcoin Flash Crash Roils Crypto Market


# CryptoCompare News API Test

In [27]:
url = "https://min-api.cryptocompare.com/data/v2/news/"
params = {'lang':'EN'}
# Bound the wait and fail loudly on HTTP errors instead of decoding an error body.
r = requests.get(url, params=params, timeout=10)
r.raise_for_status()
data = r.json()

In [None]:
# Persist the raw response so the sample can be inspected offline.
with open("../data/00/sample_cryptocompare.json", 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=2)

# Guard against a missing or empty "Data" list before indexing the first article.
cryptocompare_articles = data.get("Data", [])
print("CryptoCompare: ", len(cryptocompare_articles), "articles")
print("Primer titulo:", cryptocompare_articles[0]["title"] if cryptocompare_articles else "No disponible")

CryptoCompare:  50 articles
Primer titulo: Bitcoin Price Astounding Surge: BTC Rockets Above $112,000


# Infobae scraping test

In [29]:
url = "https://www.infobae.com/economia/"
# Bound the wait and stop on HTTP errors so an error page is never parsed as content.
r = requests.get(url, timeout=15)
r.raise_for_status()
soup = BeautifulSoup(r.content, 'html.parser')

In [None]:
# Keep the text of the first five <h2> elements as a quick sanity sample.
titulos = [h2.get_text(strip=True) for h2 in soup.find_all("h2")[:5]]

# Raw page, for offline inspection.
with open("../data/00/sample_infobae.html", "w", encoding="utf-8") as f:
    f.write(r.text)
# Titles are stored next to the other samples under ../data/00/ (this file was
# previously written one directory up, unlike every other sample in the notebook).
with open("../data/00/sample_infobae_titles.json", 'w', encoding='utf-8') as f:
    json.dump(titulos, f, ensure_ascii=False, indent=2)

print("Infobae: ", len(titulos), "artículos")
print("Primer título:", titulos[0] if titulos else "No disponible")

Infobae:  5 artículos
Primer título: El Gobierno asegura que llegó a un acuerdo con los controladores aéreos y que se levantará la medida de  fuerza


In [2]:
# Site root and the URL of the page fetched for the "cripto" search.
base_url = "https://www.infobae.com"
search_url = "https://www.infobae.com/buscar/cripto/"

In [3]:
headers = {'User-Agent': 'Mozilla/5.0'}
# Bound the wait and stop on HTTP errors so an error page is never parsed as results.
response = requests.get(search_url, headers=headers, timeout=15)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

In [4]:
# Collect every <div> with class "queryly_item_container" from the parsed page.
containers = soup.find_all('div', class_='queryly_item_container')

In [7]:
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import hashlib
from urllib.parse import urljoin

BASE_URL = "https://www.infobae.com"
SEARCH_URL = "https://www.infobae.com/buscar/cripto/"

headers = {"User-Agent": "Mozilla/5.0"}
# Bound the wait and stop on HTTP errors so an error page is never parsed as results.
response = requests.get(SEARCH_URL, headers=headers, timeout=15)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")

# NOTE(review): the recorded run of this cell found 0 containers; presumably the
# results are injected client-side and are absent from the static HTML — confirm.
containers = soup.find_all("div", class_="queryly_item_container")

data = []
for cont in containers:
    title_tag = cont.find("a", class_="resultlink double-teaser__title")
    desc_tag = cont.find("span", class_="queryly_item_description")
    date_tag = cont.find("div", class_="queryly_item_pubdate")
    img_tag = cont.find("div", class_="queryly_item_imagecontainer")

    # Skip cards without a title anchor or without an href: the URL is what
    # identifies the item, and calling str methods on a missing href would raise.
    href = title_tag.get("href") if title_tag else None
    if not href:
        continue

    # Absolute URL: urljoin keeps absolute links as they are and resolves
    # root-relative, relative and protocol-relative links against the site root.
    link = urljoin(BASE_URL, href)

    # Derive the ID from the URL.
    id_hash = hashlib.md5(link.encode()).hexdigest()

    # Pull the image URL out of the inline style's url(...) expression.
    img_url = None
    style = img_tag.get("style", "") if img_tag else ""
    if "background-image" in style:
        start = style.find("url(")
        end = style.find(")", start)
        if start != -1 and end != -1:
            img_url = style[start + 4:end].replace("'", "").replace('"', "").strip()

    item = {
        "id": id_hash,
        "title": title_tag.get_text(strip=True),
        "description": desc_tag.get_text(strip=True) if desc_tag else None,
        "content": None,  # filled in later, when the article page itself is fetched
        "url": link,
        "source": "Infobae",
        "published_at": date_tag.get_text(strip=True) if date_tag else None,
        "collected_at": datetime.utcnow().isoformat(),
        "extra": {"image_url": img_url}
    }
    data.append(item)

print(f"Encontradas {len(data)} noticias")
for d in data[:2]:
    print(d)


Encontradas 0 noticias


In [12]:
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import hashlib
from urllib.parse import urljoin

BASE_URL = "https://www.cronista.com"
SECTION_URL = "https://www.cronista.com/criptomonedas/"

headers = {"User-Agent": "Mozilla/5.0"}

# First fetch the section page and collect the article cards.
# Bound the wait and stop on HTTP errors so an error page is never parsed as a listing.
response = requests.get(SECTION_URL, headers=headers, timeout=15)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")
articles = soup.find_all("article", class_=lambda x: x and "item" in x)

data = []

for art in articles:
    a_tag = art.find("a", class_="link")
    # Skip cards without a link or without an href: the URL identifies the item,
    # and calling str methods on a missing href would raise.
    href = a_tag.get("href") if a_tag else None
    if not href:
        continue

    title_tag = a_tag.find("h2", class_="title")
    kicker_tag = a_tag.find("span", class_="kicker")
    # Look the image wrapper up once instead of searching the card twice.
    image_div = art.find("div", class_="image")
    img_tag = image_div.find("img") if image_div else None

    # urljoin keeps absolute links as they are and resolves relative ones.
    link = urljoin(BASE_URL, href)

    id_hash = hashlib.md5(link.encode()).hexdigest()
    # .get avoids a KeyError when the <img> tag has no src attribute.
    img_url = img_tag.get("src") if img_tag else None

    # ---- Fetch the full article body and publication date ----
    try:
        art_resp = requests.get(link, headers=headers, timeout=15)
        # Raised inside the try so a failed article is logged and stored with
        # empty content instead of having its error page parsed as the body.
        art_resp.raise_for_status()
        art_soup = BeautifulSoup(art_resp.text, "html.parser")

        # Content: every <p> inside the div with class "block-content"
        body = art_soup.find("div", class_="block-content")
        content = "\n".join([p.get_text(strip=True) for p in body.find_all("p")]) if body else None

        # Date: <time> or <span class="date">
        time_tag = art_soup.find("time") or art_soup.find("span", class_="date")
        published_at = time_tag.get_text(strip=True) if time_tag else None

    except Exception as e:
        # Best effort: one failing article must not abort the whole crawl.
        print(f"Error al extraer {link}: {e}")
        content = None
        published_at = None

    item = {
        "id": id_hash,
        "title": title_tag.get_text(strip=True) if title_tag else None,
        "description": kicker_tag.get_text(strip=True) if kicker_tag else None,
        "content": content,
        "url": link,
        "source": "El Cronista",
        "published_at": published_at,
        "collected_at": datetime.utcnow().isoformat(),
        "extra": {"image_url": img_url}
    }

    data.append(item)

print(f"Se extrajeron {len(data)} artículos con contenido completo")
for d in data[:5]:
    print(d)


Se extrajeron 27 artículos con contenido completo
{'id': 'f968a8c03beee75ab68e4b60372ae127', 'title': 'Ethereum: a cuánto cotiza este sábado 30 de agosto', 'description': 'Mundo cripto', 'content': 'La cotización de la criptomonedaEthereumy eleurode este sábado, 30 de agosto de 2025 en España es de5.104,15 euros. En base a este precio, la variación de dicho activo digital en comparación con el día pasado es de 0,22%.\nEl precio del Ethereum y el euro ha mostrado una tendencia positiva en los últimos días. Esto indica que ambos activos están en un proceso de crecimiento sostenido.\nEn la última semana, la cotización de la criptomoneda Ethereum ha experimentado un leve descenso del -0.32%, lo que sugiere una estabilidad en su valor a corto plazo. Sin embargo, al observar su evolución en el último año, se destaca un notable incremento del 41.81%, evidenciando un crecimiento significativo y una rentabilidad atractiva para los inversores a largo plazo. Esta combinación de resultados refleja

In [6]:
import praw
from newspaper import Article, Config
import re
from dotenv import load_dotenv
import os
import time
import random
import json

# -------- CONFIG --------
# Read the Reddit API credentials from the environment (after loading .env).
load_dotenv()
client_id = os.getenv("client_id")
client_secret = os.getenv("client_secret")
user_agent = os.getenv("user_agent")

# Newspaper headers (so the requests look like a real browser)
news_config = Config()
news_config.browser_user_agent = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/128.0.0.0 Safari/537.36"
)

# -------- INIT REDDIT --------
# NOTE(review): os.getenv returns None for unset variables, so missing credentials
# reach praw.Reddit as None — confirm how praw reports that.
reddit = praw.Reddit(
    client_id=client_id,
    client_secret=client_secret,
    user_agent=user_agent
)

# -------- FUNCIONES --------
def es_articulo(url: str) -> bool:
    """Return True when ``url`` looks like an external news article.

    A URL is rejected when it is empty, cannot be parsed, points at a media or
    document file (jpg, jpeg, png, gif, mp4, pdf), or is hosted on Reddit
    itself (``reddit.com`` / ``redd.it`` and their subdomains).

    The file extension is checked on the URL *path* only, so links such as
    ``.../photo.jpg?width=640`` or ``.../clip.mp4#t=10`` are recognised as media.
    The Reddit check uses the host name, so an article that merely mentions
    "reddit.com" in its path or query string is not discarded.

    Args:
        url: Link to classify; may be empty or None.

    Returns:
        True if the link should be scraped as an article, False otherwise.
    """
    # Local import keeps the function self-contained.
    from urllib.parse import urlparse

    if not url:
        return False

    try:
        # Scheme-less input ("www.reddit.com/r/...") would otherwise be parsed
        # as a bare path, so prefix "//" to make the host visible.
        partes = urlparse(url if "://" in url else "//" + url)
        host = (partes.hostname or "").lower()
    except ValueError:
        # Malformed URL (e.g. unbalanced IPv6 brackets): nothing to scrape.
        return False

    if re.search(r"\.(jpg|jpeg|png|gif|mp4|pdf)$", partes.path, re.IGNORECASE):
        return False

    hosts_reddit = ("reddit.com", "redd.it")
    if any(host == h or host.endswith("." + h) for h in hosts_reddit):
        return False

    return True

def extraer_contenido(url: str) -> str:
    """Download and parse the article at ``url`` and return its stripped text.

    Any exception raised along the way is caught and reported through the
    return value, as a bracketed error message, rather than propagated.
    """
    try:
        articulo = Article(url, config=news_config)
        articulo.download()
        articulo.parse()
        texto = articulo.text.strip()
    except Exception as exc:
        return f"[ERROR extrayendo contenido: {exc}]"
    return texto

# -------- MAIN --------
def main(limit=50, flair="GENERAL-NEWS"):
    """Collect article posts from r/CryptoCurrency's hot listing and save them as JSON.

    Args:
        limit: Value passed as ``limit`` to the subreddit's hot listing.
        flair: Flair text a post must carry (compared case-insensitively); a
            falsy value disables the flair filter.

    Side effects: sleeps 1-2 seconds before each article download, prints a
    preview of every kept post, and writes the results to ``reddit_news.json``
    in the current working directory.
    """
    recolectados = []
    separador = "=" * 80

    for post in reddit.subreddit("CryptoCurrency").hot(limit=limit):
        # Flair filter (skipped entirely when no flair was requested).
        if flair:
            flair_post = post.link_flair_text or ""
            if flair_post.upper() != flair.upper():
                continue

        url = post.url
        if not es_articulo(url):
            continue

        # Random 1-2 second pause to avoid being rate-limited.
        time.sleep(random.uniform(1, 2))
        content = extraer_contenido(url)

        registro = {
            "title": post.title,
            "url": url,
            "flair": post.link_flair_text,
            "content": content,
        }
        recolectados.append(registro)

        # Console preview: at most the first 400 characters of the content.
        sufijo = "..." if len(content) > 400 else ""
        print(separador)
        print("TITLE:", registro["title"])
        print("FLAIR:", registro["flair"])
        print("URL:", url)
        print("CONTENT (preview):")
        print(content[:400], sufijo)

    # Save everything as JSON.
    with open("reddit_news.json", "w", encoding="utf-8") as f:
        json.dump(recolectados, f, ensure_ascii=False, indent=4)

    print(f"\n✅ Guardados {len(recolectados)} artículos en reddit_news.json")

# Run the collector when executed as the main module; limit=100 overrides
# main()'s default of 50.
if __name__ == "__main__":
    main(limit=100)

TITLE: Banking giant Morgan Stanley reportedly plans to introduce crypto trading on E*Trade
FLAIR: GENERAL-NEWS
URL: https://www.aol.com/finance/banking-giant-morgan-stanley-reportedly-193439102.html
CONTENT (preview):
Morgan Stanley, one of the world’s largest investment banks, plans to introduce crypto trading on its consumer platform in the latest move by a traditional financial institution to capitalize on President Donald Trump’s deregulation of the crypto industry.

The banking giant plans to allow its customers to buy and sell crypto on its subsidiary, E*Trade, starting sometime next year, according to a  ...
TITLE: Bitwise Forecasts $1.3M Bitcoin as Institutional Giants Could Deploy $5 Trillion
FLAIR: GENERAL-NEWS
URL: https://news.bitcoin.com/bitwise-forecasts-1-3m-bitcoin-as-institutional-giants-could-deploy-5-trillion/
CONTENT (preview):
 
TITLE: Trump Family’s $750 Million Crypto Deal Raises Questions Ahead of WLFI Token Debut
FLAIR: GENERAL-NEWS
URL: https://beincrypto.com