In [1]:
import requests
import json
from dotenv import load_dotenv
import os
from bs4 import BeautifulSoup
from datetime import datetime
import hashlib

# NewsAPI Test

In [2]:
load_dotenv()
key = os.getenv("NEWS_API_KEY")
url = 'https://newsapi.org/v2/everything'

In [24]:
params = {
    'q' : 'bitcoin OR cryptocurrency OR inflation OR interest rates',
    'language' : 'en',
    'pageSize': 5,
    'apiKey': key
}

In [25]:
r = requests.get(url, params=params)
data= r.json()

In [None]:
with open("../data/00/sample_newsapi.json", 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=2)

print("NewsApi: ", len(data.get("articles", [])))
print('Primer titulo:', data["articles"][0]["title"])

NewsApi:  5
Primer titulo: Bitcoin Flash Crash Roils Crypto Market


# CryptoCompare News API Test

In [27]:
url = "https://min-api.cryptocompare.com/data/v2/news/"
params = {'lang':'EN'}
r = requests.get(url, params=params)
data = r.json()

In [None]:
with open("../data/00/sample_cryptocompare.json", 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=2)

print("CryptoCompare: ", len(data.get("Data", [])), "articles")
print("Primer titulo:", data["Data"][0]["title"])

CryptoCompare:  50 articles
Primer titulo: Bitcoin Price Astounding Surge: BTC Rockets Above $112,000


# Infobae scraping test

In [29]:
url = "https://www.infobae.com/economia/"
r = requests.get(url)
soup = BeautifulSoup(r.content, 'html.parser')

In [None]:
titulos = [h2.get_text(strip=True) for h2 in soup.find_all("h2")[:5]]

with open("../data/00/sample_infobae.html", "w", encoding="utf-8") as f:
    f.write(r.text)
with open("../data/sample_infobae_titles.json", 'w', encoding='utf-8') as f:
    json.dump(titulos, f, ensure_ascii=False, indent=2)

print("Infobae: ", len(titulos), "artículos")
print("Primer título:", titulos[0] if titulos else "No disponible")

Infobae:  5 artículos
Primer título: El Gobierno asegura que llegó a un acuerdo con los controladores aéreos y que se levantará la medida de  fuerza


In [2]:
base_url = "https://www.infobae.com"
search_url = "https://www.infobae.com/buscar/cripto/"

In [3]:
headers = {'User-Agent': 'Mozilla/5.0'}
response = requests.get(search_url, headers=headers)
soup = BeautifulSoup(response.text, 'html.parser')

In [4]:
containers = soup.find_all('div', class_='queryly_item_container')

In [7]:
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import hashlib

BASE_URL = "https://www.infobae.com"
SEARCH_URL = "https://www.infobae.com/buscar/cripto/"

headers = {"User-Agent": "Mozilla/5.0"}
response = requests.get(SEARCH_URL, headers=headers)
soup = BeautifulSoup(response.text, "html.parser")

containers = soup.find_all("div", class_="queryly_item_container")

data = []
for cont in containers:
    title_tag = cont.find("a", class_="resultlink double-teaser__title")
    desc_tag = cont.find("span", class_="queryly_item_description")
    date_tag = cont.find("div", class_="queryly_item_pubdate")
    img_tag = cont.find("div", class_="queryly_item_imagecontainer")

    if not title_tag:
        continue

    # URL absoluta
    link = title_tag.get("href")
    if link.startswith("/"):
        link = BASE_URL + link

    # Generar ID a partir de la URL
    id_hash = hashlib.md5(link.encode()).hexdigest()

    # Extraer imagen del estilo inline
    img_url = None
    if img_tag and "background-image" in img_tag.get("style", ""):
        style = img_tag["style"]
        start = style.find("url(")
        end = style.find(")", start)
        if start != -1 and end != -1:
            img_url = style[start+4:end].replace("'", "").replace('"', "")

    item = {
        "id": id_hash,
        "title": title_tag.get_text(strip=True),
        "description": desc_tag.get_text(strip=True) if desc_tag else None,
        "content": None,  # pendiente al entrar al artículo
        "url": link,
        "source": "Infobae",
        "published_at": date_tag.get_text(strip=True) if date_tag else None,
        "collected_at": datetime.utcnow().isoformat(),
        "extra": {"image_url": img_url}
    }
    data.append(item)

print(f"Encontradas {len(data)} noticias")
for d in data[:2]:
    print(d)


Encontradas 0 noticias


In [12]:
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import hashlib

BASE_URL = "https://www.cronista.com"
SECTION_URL = "https://www.cronista.com/criptomonedas/"

headers = {"User-Agent": "Mozilla/5.0"}

# Primero sacamos la lista de artículos (tarjetas)
response = requests.get(SECTION_URL, headers=headers)
soup = BeautifulSoup(response.text, "html.parser")
articles = soup.find_all("article", class_=lambda x: x and "item" in x)

data = []

for art in articles:
    a_tag = art.find("a", class_="link")
    if not a_tag:
        continue

    title_tag = a_tag.find("h2", class_="title")
    kicker_tag = a_tag.find("span", class_="kicker")
    img_tag = art.find("div", class_="image").find("img") if art.find("div", class_="image") else None

    link = a_tag.get("href")
    if link.startswith("/"):
        link = BASE_URL + link

    id_hash = hashlib.md5(link.encode()).hexdigest()
    img_url = img_tag["src"] if img_tag else None

    # ---- Extraer contenido completo y fecha ----
    try:
        art_resp = requests.get(link, headers=headers)
        art_soup = BeautifulSoup(art_resp.text, "html.parser")

        # Contenido: todos los <p> dentro de content-media
        body = art_soup.find("div", class_="block-content")
        content = "\n".join([p.get_text(strip=True) for p in body.find_all("p")]) if body else None

        # Fecha: <time> o <span class="date">
        time_tag = art_soup.find("time") or art_soup.find("span", class_="date")
        published_at = time_tag.get_text(strip=True) if time_tag else None

    except Exception as e:
        print(f"Error al extraer {link}: {e}")
        content = None
        published_at = None

    item = {
        "id": id_hash,
        "title": title_tag.get_text(strip=True) if title_tag else None,
        "description": kicker_tag.get_text(strip=True) if kicker_tag else None,
        "content": content,
        "url": link,
        "source": "El Cronista",
        "published_at": published_at,
        "collected_at": datetime.utcnow().isoformat(),
        "extra": {"image_url": img_url}
    }

    data.append(item)

print(f"Se extrajeron {len(data)} artículos con contenido completo")
for d in data[:5]:
    print(d)


Se extrajeron 27 artículos con contenido completo
{'id': 'f968a8c03beee75ab68e4b60372ae127', 'title': 'Ethereum: a cuánto cotiza este sábado 30 de agosto', 'description': 'Mundo cripto', 'content': 'La cotización de la criptomonedaEthereumy eleurode este sábado, 30 de agosto de 2025 en España es de5.104,15 euros. En base a este precio, la variación de dicho activo digital en comparación con el día pasado es de 0,22%.\nEl precio del Ethereum y el euro ha mostrado una tendencia positiva en los últimos días. Esto indica que ambos activos están en un proceso de crecimiento sostenido.\nEn la última semana, la cotización de la criptomoneda Ethereum ha experimentado un leve descenso del -0.32%, lo que sugiere una estabilidad en su valor a corto plazo. Sin embargo, al observar su evolución en el último año, se destaca un notable incremento del 41.81%, evidenciando un crecimiento significativo y una rentabilidad atractiva para los inversores a largo plazo. Esta combinación de resultados refleja

In [28]:
import requests
import time

def get_coindesk_articles(max_articles=100, page_size=20, pause=1.0):
    base_url = "https://www.coindesk.com/api/v1/articles/timeline"
    articles = []
    last_id = None
    last_display_date = None

    # Usamos session para mantener cookies
    session = requests.Session()
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/139.0.0.0 Safari/537.36",
        "Accept": "application/json, text/javascript, */*; q=0.01",
        "Accept-Language": "en-US,en;q=0.9",
        "Referer": "https://www.coindesk.com/latest-crypto-news",
        "X-Requested-With": "XMLHttpRequest",
        "Origin": "https://www.coindesk.com",
        "Connection": "keep-alive",
        "Sec-Fetch-Site": "same-origin",
        "Sec-Fetch-Mode": "cors",
        "Sec-Fetch-Dest": "empty",
    }

    # Request inicial para obtener cookies válidas
    session.get("https://www.coindesk.com/latest-crypto-news", headers=headers)

    while len(articles) < max_articles:
        params = {"size": page_size, "lang": "en"}
        if last_id and last_display_date:
            params["lastId"] = last_id
            params["lastDisplayDate"] = last_display_date

        response = session.get(base_url, headers=headers, params=params)
        if response.status_code != 200:
            print(f"Request failed with status: {response.status_code}")
            break

        data = response.json()
        new_articles = data.get("articles", [])
        if not new_articles:
            print("No more articles returned by API.")
            break

        articles.extend(new_articles)

        # Actualizar lastId y lastDisplayDate para la siguiente página
        last_article = new_articles[-1]
        last_id = last_article["_id"]
        last_display_date = last_article["articleDates"]["displayDate"]

        # Pausa entre requests
        time.sleep(pause)

    return articles[:max_articles]


# Ejemplo de uso
articles = get_coindesk_articles(max_articles=100)
for a in articles:
    print(a["title"], "-", a["articleDates"]["displayDate"])
print(f"Total articles fetched: {len(articles)}")


Asia Morning Briefing: August ETF Flows Show the Massive Scale of BTC to ETH Rotation  - 2025-09-01T01:00:59.985Z
Rich Bitcoiners Are Reportedly Spending BTC on Luxury Holidays: Does This Really Make Sense? - 2025-08-31T19:53:44.421Z
Yen-Backed Stablecoin Can’t Come at a Better Time as BOJ Seen Raising Rates - 2025-08-31T18:00:44.714Z
Major Bitcoin Breakout Could be Brewing as Retail and Institutions Stack ‘Relentlessly’ - 2025-08-31T17:15:15.954Z
Bitcoin's Rough August Wiped Out Summer Rally; What September Might Bring - 2025-08-31T12:00:00.000Z
Bitcoin or Gold: Which Is the Better Hedging Asset in 2025? - 2025-08-31T08:12:41.427Z
DOGE Rebounds From $0.21 Floor, 'Cup-and-Handle' Pattern Targets $0.30 - 2025-08-31T05:34:05.686Z
XRP Bullish Patterns Point to $5 as Korean Buyers Start to Accumulate - 2025-08-31T05:25:59.675Z
Crypto Charts Look 'So Broken and Bearish They’re Bullish' Ahead of Fed Meeting, Says Analyst - 2025-08-30T21:00:00.000Z
Web3 Funding Hit $9.6B in Q2 Despite Fewer D

In [29]:
import tweepy

BEARER_TOKEN = "AAAAAAAAAAAAAAAAAAAAAPeO3wEAAAAAS7lfw1Z%2F1aENBLB5mYf6AW5hB14%3DiHWH3AqioR36UqQDppVcdrZkW7473xFe6VAVy4d8CZLRhbOff9"

client = tweepy.Client(bearer_token=BEARER_TOKEN)

# Probamos con un solo tweet
tweets = client.search_recent_tweets(query="bitcoin", max_results=5, tweet_fields=["created_at", "public_metrics"])
for tweet in tweets.data:
    print(tweet.id, tweet.text[:50])


TooManyRequests: 429 Too Many Requests
Too Many Requests

In [32]:
import praw
from newspaper import Article

# Configuración de Reddit API
reddit = praw.Reddit(
        client_id="wcgoWTJ2JQdpvKvWDWq0fg",
        client_secret="ipr9OEOC6USZ3iALk1DRnLkMIBMXQw",
        user_agent="financial-news-sentiment"
    )

# Elegimos el subreddit
subreddit = reddit.subreddit("CryptoCurrency")

# Extraer posts recientes con link externo
posts = []
for post in subreddit.new(limit=50):
    if not post.is_self:  # evita posts de solo texto
        url = post.url
        try:
            article = Article(url)
            article.download()
            article.parse()
            posts.append({
                "reddit_title": post.title,
                "reddit_url": f"https://reddit.com{post.permalink}",
                "article_title": article.title,
                "article_text": article.text,
                "article_url": url
            })
        except Exception as e:
            posts.append({
                "reddit_title": post.title,
                "reddit_url": f"https://reddit.com{post.permalink}",
                "article_title": None,
                "article_text": None,
                "article_url": url,
                "error": str(e)
            })

# Ejemplo: mostrar los primeros 2 resultados
for p in posts[:2]:
    print("\n---")
    print("Reddit title:", p["reddit_title"])
    print("Reddit url:", p["reddit_url"])
    print("Article title:", p["article_title"])
    print("Article url:", p["article_url"])
    print("Texto (primeros 300 chars):", (p["article_text"] or "")[:300])



---
Reddit title: Metaplanet buys $112 million worth of bitcoin; total holdings reach 20,000 BTC
Reddit url: https://reddit.com/r/CryptoCurrency/comments/1n5gp3j/metaplanet_buys_112_million_worth_of_bitcoin/
Article title: None
Article url: https://www.theblock.co/post/368866/metaplanet-bitcoin-total-holdings-20000?utm_source=news.xml&utm_medium=rss
Texto (primeros 300 chars): 

---
Reddit title: Every run counts Vitalik.Run
Reddit url: https://reddit.com/r/CryptoCurrency/comments/1n5g4jq/every_run_counts_vitalikrun/
Article title: Vitalik Run
Article url: https://vitalik.run
Texto (primeros 300 chars): Navigate the digital realm using WASD keys or arrow keys to move, hold space to code



On mobile devices, tap the screen to control your character through the digital space.

← Back


In [33]:
headers = {"User-Agent": "Mozilla/5.0"}

def get_article_title(url):
    try:
        resp = requests.get(url, headers=headers, timeout=10)
        if resp.status_code == 200:
            soup = BeautifulSoup(resp.text, "html.parser")
            # primero intento con <title>
            if soup.title and soup.title.string:
                return soup.title.string.strip()
            # si no, intento con og:title
            og_title = soup.find("meta", property="og:title")
            if og_title and og_title.get("content"):
                return og_title["content"].strip()
    except Exception as e:
        return None
    return None

subreddit = reddit.subreddit("CryptoCurrency")

for submission in subreddit.hot(limit=30):  # probá con 30 primero
    print("---")
    print("Reddit title:", submission.title)
    print("Reddit url:", submission.url)

    if submission.url and submission.url.startswith("http"):
        art_title = get_article_title(submission.url)
        print("Article title:", art_title)
        print("Article url:", submission.url)

---
Reddit title: Daily Crypto Discussion - September 1, 2025 (GMT+0)
Reddit url: https://www.reddit.com/r/CryptoCurrency/comments/1n5a1og/daily_crypto_discussion_september_1_2025_gmt0/
Article title: Reddit - The heart of the internet
Article url: https://www.reddit.com/r/CryptoCurrency/comments/1n5a1og/daily_crypto_discussion_september_1_2025_gmt0/
---
Reddit title: [AMA] Polygon Foundation AMA with Sandeep Nailwal. Questions answered on August 27
Reddit url: https://www.reddit.com/r/CryptoCurrency/comments/1mzozcz/ama_polygon_foundation_ama_with_sandeep_nailwal/
Article title: Reddit - The heart of the internet
Article url: https://www.reddit.com/r/CryptoCurrency/comments/1mzozcz/ama_polygon_foundation_ama_with_sandeep_nailwal/
---
Reddit title: Never forget what we used to celebrate 12 months ago
Reddit url: https://i.redd.it/bagvcvq9mbmf1.jpeg
Article title: None
Article url: https://i.redd.it/bagvcvq9mbmf1.jpeg
---
Reddit title: Trump Family’s $750 Million Crypto Deal Raises Ques