In [1]:
import requests
import json
from dotenv import load_dotenv
import os
from bs4 import BeautifulSoup
from datetime import datetime
import hashlib

# NewsAPI Test

In [23]:
load_dotenv()
key = os.getenv("NEWS_API_KEY")
url = 'https://newsapi.org/v2/everything'

In [24]:
params = {
    'q' : 'bitcoin OR cryptocurrency OR inflation OR interest rates',
    'language' : 'en',
    'pageSize': 5,
    'apiKey': key
}

In [25]:
r = requests.get(url, params=params)
data= r.json()

In [None]:
with open("../data/00/sample_newsapi.json", 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=2)

print("NewsApi: ", len(data.get("articles", [])))
print('Primer titulo:', data["articles"][0]["title"])

NewsApi:  5
Primer titulo: Bitcoin Flash Crash Roils Crypto Market


# CryptoCompare News API Test

In [27]:
url = "https://min-api.cryptocompare.com/data/v2/news/"
params = {'lang':'EN'}
r = requests.get(url, params=params)
data = r.json()

In [None]:
with open("../data/00/sample_cryptocompare.json", 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=2)

print("CryptoCompare: ", len(data.get("Data", [])), "articles")
print("Primer titulo:", data["Data"][0]["title"])

CryptoCompare:  50 articles
Primer titulo: Bitcoin Price Astounding Surge: BTC Rockets Above $112,000


# Infobae scraping test

In [29]:
url = "https://www.infobae.com/economia/"
r = requests.get(url)
soup = BeautifulSoup(r.content, 'html.parser')

In [None]:
titulos = [h2.get_text(strip=True) for h2 in soup.find_all("h2")[:5]]

with open("../data/00/sample_infobae.html", "w", encoding="utf-8") as f:
    f.write(r.text)
with open("../data/sample_infobae_titles.json", 'w', encoding='utf-8') as f:
    json.dump(titulos, f, ensure_ascii=False, indent=2)

print("Infobae: ", len(titulos), "artículos")
print("Primer título:", titulos[0] if titulos else "No disponible")

Infobae:  5 artículos
Primer título: El Gobierno asegura que llegó a un acuerdo con los controladores aéreos y que se levantará la medida de  fuerza


In [2]:
base_url = "https://www.infobae.com"
search_url = "https://www.infobae.com/buscar/cripto/"

In [3]:
headers = {'User-Agent': 'Mozilla/5.0'}
response = requests.get(search_url, headers=headers)
soup = BeautifulSoup(response.text, 'html.parser')

In [4]:
containers = soup.find_all('div', class_='queryly_item_container')

In [7]:
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import hashlib

BASE_URL = "https://www.infobae.com"
SEARCH_URL = "https://www.infobae.com/buscar/cripto/"

headers = {"User-Agent": "Mozilla/5.0"}
response = requests.get(SEARCH_URL, headers=headers)
soup = BeautifulSoup(response.text, "html.parser")

containers = soup.find_all("div", class_="queryly_item_container")

data = []
for cont in containers:
    title_tag = cont.find("a", class_="resultlink double-teaser__title")
    desc_tag = cont.find("span", class_="queryly_item_description")
    date_tag = cont.find("div", class_="queryly_item_pubdate")
    img_tag = cont.find("div", class_="queryly_item_imagecontainer")

    if not title_tag:
        continue

    # URL absoluta
    link = title_tag.get("href")
    if link.startswith("/"):
        link = BASE_URL + link

    # Generar ID a partir de la URL
    id_hash = hashlib.md5(link.encode()).hexdigest()

    # Extraer imagen del estilo inline
    img_url = None
    if img_tag and "background-image" in img_tag.get("style", ""):
        style = img_tag["style"]
        start = style.find("url(")
        end = style.find(")", start)
        if start != -1 and end != -1:
            img_url = style[start+4:end].replace("'", "").replace('"', "")

    item = {
        "id": id_hash,
        "title": title_tag.get_text(strip=True),
        "description": desc_tag.get_text(strip=True) if desc_tag else None,
        "content": None,  # pendiente al entrar al artículo
        "url": link,
        "source": "Infobae",
        "published_at": date_tag.get_text(strip=True) if date_tag else None,
        "collected_at": datetime.utcnow().isoformat(),
        "extra": {"image_url": img_url}
    }
    data.append(item)

print(f"Encontradas {len(data)} noticias")
for d in data[:2]:
    print(d)


Encontradas 0 noticias


In [12]:
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import hashlib

BASE_URL = "https://www.cronista.com"
SECTION_URL = "https://www.cronista.com/criptomonedas/"

headers = {"User-Agent": "Mozilla/5.0"}

# Primero sacamos la lista de artículos (tarjetas)
response = requests.get(SECTION_URL, headers=headers)
soup = BeautifulSoup(response.text, "html.parser")
articles = soup.find_all("article", class_=lambda x: x and "item" in x)

data = []

for art in articles:
    a_tag = art.find("a", class_="link")
    if not a_tag:
        continue

    title_tag = a_tag.find("h2", class_="title")
    kicker_tag = a_tag.find("span", class_="kicker")
    img_tag = art.find("div", class_="image").find("img") if art.find("div", class_="image") else None

    link = a_tag.get("href")
    if link.startswith("/"):
        link = BASE_URL + link

    id_hash = hashlib.md5(link.encode()).hexdigest()
    img_url = img_tag["src"] if img_tag else None

    # ---- Extraer contenido completo y fecha ----
    try:
        art_resp = requests.get(link, headers=headers)
        art_soup = BeautifulSoup(art_resp.text, "html.parser")

        # Contenido: todos los <p> dentro de content-media
        body = art_soup.find("div", class_="block-content")
        content = "\n".join([p.get_text(strip=True) for p in body.find_all("p")]) if body else None

        # Fecha: <time> o <span class="date">
        time_tag = art_soup.find("time") or art_soup.find("span", class_="date")
        published_at = time_tag.get_text(strip=True) if time_tag else None

    except Exception as e:
        print(f"Error al extraer {link}: {e}")
        content = None
        published_at = None

    item = {
        "id": id_hash,
        "title": title_tag.get_text(strip=True) if title_tag else None,
        "description": kicker_tag.get_text(strip=True) if kicker_tag else None,
        "content": content,
        "url": link,
        "source": "El Cronista",
        "published_at": published_at,
        "collected_at": datetime.utcnow().isoformat(),
        "extra": {"image_url": img_url}
    }

    data.append(item)

print(f"Se extrajeron {len(data)} artículos con contenido completo")
for d in data[:5]:
    print(d)


Se extrajeron 27 artículos con contenido completo
{'id': 'f968a8c03beee75ab68e4b60372ae127', 'title': 'Ethereum: a cuánto cotiza este sábado 30 de agosto', 'description': 'Mundo cripto', 'content': 'La cotización de la criptomonedaEthereumy eleurode este sábado, 30 de agosto de 2025 en España es de5.104,15 euros. En base a este precio, la variación de dicho activo digital en comparación con el día pasado es de 0,22%.\nEl precio del Ethereum y el euro ha mostrado una tendencia positiva en los últimos días. Esto indica que ambos activos están en un proceso de crecimiento sostenido.\nEn la última semana, la cotización de la criptomoneda Ethereum ha experimentado un leve descenso del -0.32%, lo que sugiere una estabilidad en su valor a corto plazo. Sin embargo, al observar su evolución en el último año, se destaca un notable incremento del 41.81%, evidenciando un crecimiento significativo y una rentabilidad atractiva para los inversores a largo plazo. Esta combinación de resultados refleja

In [6]:
import praw
from newspaper import Article, Config
import re
from dotenv import load_dotenv
import os
import time
import random
import json

# -------- CONFIG --------
load_dotenv()
client_id = os.getenv("client_id")
client_secret = os.getenv("client_secret")
user_agent = os.getenv("user_agent")

# Newspaper headers (para que parezca navegador real)
news_config = Config()
news_config.browser_user_agent = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/128.0.0.0 Safari/537.36"
)

# -------- INIT REDDIT --------
reddit = praw.Reddit(
    client_id=client_id,
    client_secret=client_secret,
    user_agent=user_agent
)

# -------- FUNCIONES --------
def es_articulo(url: str) -> bool:
    """Filtra URLs que sean links de noticias y no imágenes/otros recursos."""
    if not url:
        return False
    if re.search(r"\.(jpg|jpeg|png|gif|mp4|pdf)$", url, re.IGNORECASE):
        return False
    if "reddit.com" in url:
        return False
    return True

def extraer_contenido(url: str) -> str:
    """Intenta scrapear el contenido de un artículo."""
    try:
        art = Article(url, config=news_config)
        art.download()
        art.parse()
        return art.text.strip()
    except Exception as e:
        return f"[ERROR extrayendo contenido: {e}]"

# -------- MAIN --------
def main(limit=50, flair="GENERAL-NEWS"):
    subreddit = reddit.subreddit("CryptoCurrency")
    posts = subreddit.hot(limit=limit)

    resultados = []

    for post in posts:
        # Filtramos por flair
        if flair and (post.link_flair_text or "").upper() != flair.upper():
            continue

        url = post.url
        if not es_articulo(url):
            continue

        # Sleep aleatorio 1-2 segundos para evitar rate-limit
        time.sleep(random.uniform(1, 2))

        content = extraer_contenido(url)

        resultados.append({
            "title": post.title,
            "url": url,
            "flair": post.link_flair_text,
            "content": content
        })

        # Preview consola
        print("=" * 80)
        print("TITLE:", post.title)
        print("FLAIR:", post.link_flair_text)
        print("URL:", url)
        print("CONTENT (preview):")
        print(content[:400], "..." if len(content) > 400 else "")

    # Guardar en JSON
    with open("reddit_news.json", "w", encoding="utf-8") as f:
        json.dump(resultados, f, ensure_ascii=False, indent=4)

    print(f"\n✅ Guardados {len(resultados)} artículos en reddit_news.json")

if __name__ == "__main__":
    main(limit=100)

TITLE: Banking giant Morgan Stanley reportedly plans to introduce crypto trading on E*Trade
FLAIR: GENERAL-NEWS
URL: https://www.aol.com/finance/banking-giant-morgan-stanley-reportedly-193439102.html
CONTENT (preview):
Morgan Stanley, one of the world’s largest investment banks, plans to introduce crypto trading on its consumer platform in the latest move by a traditional financial institution to capitalize on President Donald Trump’s deregulation of the crypto industry.

The banking giant plans to allow its customers to buy and sell crypto on its subsidiary, E*Trade, starting sometime next year, according to a  ...
TITLE: Bitwise Forecasts $1.3M Bitcoin as Institutional Giants Could Deploy $5 Trillion
FLAIR: GENERAL-NEWS
URL: https://news.bitcoin.com/bitwise-forecasts-1-3m-bitcoin-as-institutional-giants-could-deploy-5-trillion/
CONTENT (preview):
 
TITLE: Trump Family’s $750 Million Crypto Deal Raises Questions Ahead of WLFI Token Debut
FLAIR: GENERAL-NEWS
URL: https://beincrypto.com

In [None]:
import requests
import json

response = requests.get('https://data-api.coindesk.com/news/v1/search',
    params={"search_string":"BTC","lang":"EN","source_key":""},
    headers={"Content-type":"application/json; charset=UTF-8"}
)

json_response = response.json()
with open("coindesk_articles.json", "w", encoding="utf-8") as f:
    json.dump(json_response, f, ensure_ascii=False, indent=2)
print(f"Guardados {len(json_response.get('Data', []))} artículos en coindesk_articles.json")


Guardados 10 artículos en coindesk_articles.json


In [13]:
import requests
from datetime import datetime, timedelta

def obtener_articulos(fecha):
    # Convierte la fecha en un formato compatible con la API (puedes ajustar según como la API lo reciba)
    fecha_timestamp = int(fecha.timestamp())
    
    # Realiza la consulta a la API
    url = f"https://data-api.coindesk.com/news/v1/article/list"
    params = {
        'search_string': 'BTC',
        'fecha': fecha_timestamp,
        'limit': 2,  # Pedir solo 2 artículos
        'order': 'score',  # Asumiendo que puedes ordenar por score, si no ajusta según la API
    }
    response = requests.get(url, params=params)
    
    if response.status_code == 200:
        return response.json()  # Devuelve la lista de artículos
    else:
        return []

def recolectar_articulos():
    articulos = []
    fecha_actual = datetime.now()  # Fecha de hoy
    total_articulos = 0

    while total_articulos < 100:
        # Obtener los artículos para el día actual
        articulos_response = obtener_articulos(fecha_actual)
        articulos_dia = articulos_response.get("Data", []) if isinstance(articulos_response, dict) else []

        # Filtrar los artículos con score > 0
        articulos_dia = [articulo for articulo in articulos_dia if articulo.get('score', 0) > 0]
        
        if not articulos_dia:  # Si no hay artículos válidos, detener la recolección
            break
        
        # Agregar los artículos válidos a la lista
        articulos.extend(articulos_dia)
        total_articulos += len(articulos_dia)
        
        # Decrementamos la fecha para obtener los artículos del día anterior
        fecha_actual -= timedelta(days=1)
        
    return articulos[:100]  # Limita a 100 artículos si es necesario

# Llamada para recolectar los artículos
articulos = recolectar_articulos()
print(f"Se han recolectado {len(articulos)} artículos.")


Se han recolectado 0 artículos.


In [4]:
import requests
from bs4 import BeautifulSoup
from datetime import datetime

def fetch_cronista(query="bitcoin", max_pages=10, items_per_page=18):
    url = "https://www.cronista.com/0//buscar/list/ajax.vnc"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:142.0) Gecko/20100101 Firefox/142.0",
        "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
    }

    all_articles = []

    for page in range(1, max_pages + 1):
        payload = {
            "id": query,
            "page": page,
            "itemsPerPage": items_per_page,
            "type": "news",
        }
        resp = requests.post(url, data=payload, headers=headers)
        print(resp.headers.get("Content-Type"))
        print(resp.text[:500])
        resp.raise_for_status()

        soup = BeautifulSoup(resp.text, "html.parser")
        articles = soup.find_all("article")

        if not articles:
            print(f"No more articles found (stopped at page {page})")
            break

        for art in articles:
            try:
                title_tag = art.find("h2") or art.find("h3")
                title = title_tag.get_text(strip=True) if title_tag else None

                link_tag = art.find("a", href=True)
                link = link_tag["href"] if link_tag else None

                # fecha viene en un <time datetime="...">
                time_tag = art.find("time")
                published_at = None
                if time_tag and time_tag.has_attr("datetime"):
                    published_at = time_tag["datetime"]

                all_articles.append({
                    "id": link,  # podés hashear si querés
                    "title": title,
                    "url": link,
                    "source": "Cronista",
                    "published_at": published_at,
                    "collected_at": datetime.now().isoformat(),
                })
            except Exception as e:
                print(f"Error parsing article: {e}")

    print(f"Fetched {len(all_articles)} articles from Cronista")
    return all_articles

raw_cronista = fetch_cronista()

text/html; charset=iso-8859-1
<div class="piece grid standard"><div class="items" id=section-items-listv76438v1  data-type=items><div class="item news-minisite locked" data-type=item><div class="kicker">Cotizaciones</div><h2 class="title"><a   href="/infotechnology/finanzas-digitales/baja-el-dolar-cual-es-el-nuevo-precio-que-ahora-anticipan-en-cuevas-virtuales-1430/">Baja el dólar: cuál es el nuevo precio que ahora anticipan en cuevas virtuales</a></h2><div class="media"><div class="image"><picture><source srcset="data:image
No more articles found (stopped at page 1)
Fetched 0 articles from Cronista


In [5]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException
import time
import pandas as pd

# --- Configuración del driver ---
chrome_options = Options()
chrome_options.add_argument("--headless")  # ejecuta sin abrir ventana
chrome_options.add_argument("--window-size=1920,1080")
service = Service("PATH/TO/chromedriver")  # reemplaza con la ruta a tu chromedriver
driver = webdriver.Chrome(service=service, options=chrome_options)

# --- Abrir la página ---
url = "https://www.cronista.com/bitcoin/"
driver.get(url)
time.sleep(3)  # espera inicial para que cargue la página

# --- Scroll / click "Ver más" hasta que no haya más ---
while True:
    try:
        boton = driver.find_element(By.CSS_SELECTOR, "button.load-more, a.load-more")
        driver.execute_script("arguments[0].scrollIntoView();", boton)
        boton.click()
        time.sleep(2)  # espera que carguen los nuevos artículos
    except (NoSuchElementException, ElementClickInterceptedException):
        break

# --- Extraer artículos ---
articulos = driver.find_elements(By.CSS_SELECTOR, "article.item")

data = []
seen_links = set()

for art in articulos:
    try:
        link_el = art.find_element(By.CSS_SELECTOR, "a.link")
        link = link_el.get_attribute("href")
        if link in seen_links:
            continue
        seen_links.add(link)

        titulo = link_el.get_attribute("title")
        fecha = art.find_element(By.CSS_SELECTOR, ".date").text
        data.append({
            "titulo": titulo,
            "link": link,
            "fecha": fecha
        })
    except:
        continue

driver.quit()

# --- Guardar en DataFrame ---
df = pd.DataFrame(data)
print(df.head())
print(f"Total de artículos únicos: {len(df)}")


NoSuchDriverException: Message: Unable to obtain driver for chrome; For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors/driver_location


In [None]:
import requests
from bs4 import BeautifulSoup

BASE_URL = "https://www.cronista.com/buscar/?q=bitcoin&page={}"

def scrape_cronista(max_pages=5):
    articles = []
    for page in range(1, max_pages + 1):
        url = BASE_URL.format(page)
        print(f"Scraping {url}")
        r = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
        if r.status_code != 200:
            break

        soup = BeautifulSoup(r.text, "html.parser")
        for item in soup.select(".article-item"):  # <-- ajustar al selector real
            link = item.select_one("a")["href"]
            title = item.select_one("h2, h3").get_text(strip=True)
            description = item.select_one("p").get_text(strip=True) if item.select_one("p") else ""
            articles.append({
                "title": title,
                "description": description,
                "url": "https://www.cronista.com" + link
            })
    return articles



In [7]:
import requests
from bs4 import BeautifulSoup

URL = "https://www.cronista.com/0//etiqueta/list/ajax.vnc"

HEADERS = {
    "User-Agent": "Mozilla/5.0",
    "Content-Type": "application/x-www-form-urlencoded",
}

def scrape_cronista(max_pages=5):
    articles = []

    for page in range(1, max_pages + 1):
        payload = {
            "id": "41349",            # el ID de la etiqueta (Bitcoin)
            "page": page,
            "itemsPerPage": 19,
            "type": "grid",
            "pieceProperties": "YUxmUU9BQUc2YWM2ZlpxZENZM25EWG5rZ200TFRhQy9aeWlDaEJ5UTZ4bytzdEJwRkJLeXZqWmkyOXNFZytWVGNPK0FPbFVIcjdKM1BOeUlIWTduWHliOTNXNUhCdjZxTG1PZHp4WEgveDV3c2",
        }

        r = requests.post(URL, headers=HEADERS, data=payload)
        if r.status_code != 200:
            break

        soup = BeautifulSoup(r.text, "html.parser")
        for item in soup.select("article.item"):
            a = item.select_one("a")
            if not a:
                continue
            link = a["href"]
            title = a.get("title") or a.get_text(strip=True)
            date = item.select_one(".date")
            articles.append({
                "title": title.strip(),
                "url": "https://www.cronista.com" + link,
                "date": date.get_text(strip=True) if date else None,
            })

        # si no devuelve más artículos, corto
        if not soup.select("article.item"):
            break

    return articles

if __name__ == "__main__":
    data = scrape_cronista(3)
    for d in data:
        print(d)


In [8]:
import requests
import re

url = "https://www.cronista.com/tema/bitcoin/"
r = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})

print(f"Status: {r.status_code}, length={len(r.text)}")

# Buscamos pieceProperties
pp_candidates = re.findall(r'pieceProperties["\']?\s*[:=]\s*["\']([^"\']+)["\']', r.text)
print("pieceProperties encontrados:", pp_candidates[:3])

# Buscamos algún "id" parecido a lo que pide el AJAX
id_candidates = re.findall(r'data-id=["\']?(\d+)["\']?', r.text)
print("id encontrados:", id_candidates[:3])

# Si no aparece, imprimimos una muestra del HTML para inspección
print("\n--- snippet ---\n", r.text[:2000])


Status: 200, length=389535
pieceProperties encontrados: []
id encontrados: []

--- snippet ---
 <!DOCTYPE HTML> 
<html lang="es">
<head>
    <meta charset="ISO-8859-1"><link rel="icon" type="image/png" href="/files/image/467/467707/62c2501e9cac7-favicon-32x32_32_32!.png?s=6aa31041de3178c3336f772aedfaae76&d=1747952205" sizes="32x32"><link rel="icon" type="image/png" href="/files/image/467/467707/62c2501e9cac7-favicon-16x16_16_16!.png?s=16a6c3ab222eaf453b8b12860a41e084&d=1747952216" sizes="16x16"><link rel="apple-touch-icon" href="/files/image/467/467707/62c2501e9cac7-favicon-512x512_256_256!.png?s=42c5dcfd444ce7eb3984235db1d2bcac&d=1747952205&42902v24"><meta name="apple-mobile-web-app-title" content="ECC"><meta name="apple-mobile-web-app-capable" content="yes"><meta name="apple-mobile-web-app-status-bar-style" content="black"><meta name="theme-color" content="#3D939B"><link rel="manifest" href="/files/site/manifest.json"><meta name="viewport" content="width=device-width, initial-scale=1

In [8]:
import requests 

response = requests.get('https://data-api.coindesk.com/news/v1/source/list',
    params={"lang":"EN","source_type":"RSS","status":"ACTIVE"},
    headers={"Content-type":"application/json; charset=UTF-8"}
)

json_response = response.json()

names = []

for source in json_response.get("Data", []):
    names.append(source.get("NAME"))

names


['CoinDesk',
 'CoinTelegraph',
 'Blockworks',
 'Crypto Potato',
 'Decrypt',
 'Bitcoin.com',
 'NewsBTC',
 'U.Today',
 'Bitcoinist',
 'Cryptopolitan',
 'TimesTabloid',
 'Investing.com Crypto News',
 'Invezz',
 'CoinTurk News',
 'Bitfinex blog',
 'Huobi blog',
 'Bloomberg (Crypto)',
 'CryptoCoin.News',
 'The Crypto Basic',
 'Coin Edition',
 'Forbes Digital Assets',
 'BTC Pulse',
 'TrustNodes',
 'Kraken Blog',
 'BitDegree',
 'AMB Crypto',
 'Bitcoin World',
 'The Coin Rise',
 'Investing.Com Crypto Opinion and Analysis',
 'Crypto Daily',
 'ZyCrypto',
 'CryptoNewsZ',
 'Coinpaprika',
 'CryptoIntelligence',
 'NullTx',
 'BitcoinSistemi',
 'Coinpaper',
 'Financial Times (Crypto)',
 'Finbold',
 'CoinOtag',
 'Seeking Alpha',
 'Blokt',
 'cryptonews',
 'Bitzo',
 'Chainwire',
 'CryptoCompare']

In [41]:
from pymongo import MongoClient
import os

client = MongoClient(
    f"mongodb://{os.getenv('MONGO_USER')}:{os.getenv('MONGO_PASSWORD')}@localhost:{os.getenv('MONGO_PORT')}/",
    authSource="admin"
)
db = client[os.getenv('MONGO_DB')]
print(db[os.getenv('MONGO_COLLECTION')].count_documents({}))

8003


In [39]:
import psycopg2
import pandas as pd
import os
from dotenv import load_dotenv

load_dotenv()

POSTGRES_HOST = "localhost"  # Cambia a localhost para desarrollo local
POSTGRES_PORT = int(os.getenv("POSTGRES_PORT"))
POSTGRES_USER = os.getenv("POSTGRES_USER")
POSTGRES_PASSWORD = os.getenv("POSTGRES_PASSWORD")
POSTGRES_DB = os.getenv("POSTGRES_DB")

pg_conn = psycopg2.connect(
	host=POSTGRES_HOST,
	port=POSTGRES_PORT,
	user=POSTGRES_USER,
	password=POSTGRES_PASSWORD,
	dbname=POSTGRES_DB,
)

df = pd.read_sql("SELECT * FROM articles LIMIT 5;", pg_conn)
print(df)


  df = pd.read_sql("SELECT * FROM articles LIMIT 5;", pg_conn)


                                 id  \
0  9f216d301f699e61fab74a71fdc59498   
1  58fc2dff9d107f3d94c8bae5a9604473   
2  1e99b221db9aa154a7db23056f76d051   
3  614e2efea1577b72e1a72823954ab31f   
4  77985c5ac47de2a1728e02ff9e50958d   

                                               title description  \
0  Solana May Hold $210 Support, Could Extend Tow...               
1  Trump Reportedly Considering Alternative CFTC ...               
2  Trump weighs new CFTC chair candidates as Quin...               
3  XRP Price Stability Signals Opportunity – Is a...               
4  Ethereum Price Squeezes Tight – Watch Out for ...               

                                             content  \
0  Solana price confirms $210 as key support afte...   
1  Trump’s team is vetting new candidates for CFT...   
2  Trump is reportedly exploring other CFTC leade...   
3  XRP price started a fresh increase above the $...   
4  Ethereum price started a fresh increase above ...   

                   

In [40]:
df

Unnamed: 0,id,title,description,content,url,source,published_at,collected_at,extra
0,9f216d301f699e61fab74a71fdc59498,"Solana May Hold $210 Support, Could Extend Tow...",,Solana price confirms $210 as key support afte...,https://en.coinotag.com/solana-may-hold-210-su...,CoinOtag,2025-09-19 01:17:07,2025-09-19 01:27:42.677014,"{'lang': 'EN', 'image': 'https://resources.cry..."
1,58fc2dff9d107f3d94c8bae5a9604473,Trump Reportedly Considering Alternative CFTC ...,,Trump’s team is vetting new candidates for CFT...,https://en.coinotag.com/trump-reportedly-consi...,CoinOtag,2025-09-19 01:14:44,2025-09-19 01:27:42.677014,"{'lang': 'EN', 'image': 'https://resources.cry..."
2,1e99b221db9aa154a7db23056f76d051,Trump weighs new CFTC chair candidates as Quin...,,Trump is reportedly exploring other CFTC leade...,https://cointelegraph.com/news/trump-weighs-ne...,CoinTelegraph,2025-09-19 01:12:34,2025-09-19 01:27:42.677014,"{'lang': 'EN', 'image': 'https://images.crypto..."
3,614e2efea1577b72e1a72823954ab31f,XRP Price Stability Signals Opportunity – Is a...,,XRP price started a fresh increase above the $...,https://www.newsbtc.com/analysis/xrp/xrp-price...,NewsBTC,2025-09-19 01:08:51,2025-09-19 01:27:42.677014,"{'lang': 'EN', 'image': 'https://resources.cry..."
4,77985c5ac47de2a1728e02ff9e50958d,Ethereum Price Squeezes Tight – Watch Out for ...,,Ethereum price started a fresh increase above ...,https://www.newsbtc.com/analysis/eth/ethereum-...,NewsBTC,2025-09-19 01:08:46,2025-09-19 01:27:42.677014,"{'lang': 'EN', 'image': 'https://resources.cry..."
