# Libraries

In [None]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import warnings
import re
import csv
from tqdm import tqdm  
from datetime import datetime
import os
warnings.filterwarnings("ignore", category=SyntaxWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

# FED Scraper 

In [None]:
# Scrape the yearly speech index pages for 1996-2005 and collect one record
# (link, title, event, year) per speech listed on each page.
speech_records = []
years = range(1996, 2006)
for year in years:
    # A timeout keeps one stalled connection from hanging the whole run.
    page = requests.get(
        f'https://www.federalreserve.gov/newsevents/speech/{year}speech.htm',
        timeout=30,
    )
    soup = BeautifulSoup(page.text, 'html.parser')
    titles = soup.select(".title")
    locations = soup.select(".location")
    for position, title in enumerate(titles):
        speech_records.append({
            # The first anchor inside the title element carries the site-relative URL.
            'link': 'https://www.federalreserve.gov' + title.find_all('a', href=True)[0]['href'],
            # The second line of the element's text is used as the title / event.
            'title': title.text.split('\n')[1],
            'event': locations[position].text.split('\n')[1].strip(),
            'year': year,
        })

# Build the frame once from the collected records instead of growing it cell by
# cell and concatenating onto an empty frame; rows keep a 0..n-1 index in
# ascending year order.
all_years_1996_2005 = pd.DataFrame(speech_records, columns=['link', 'title', 'event', 'year'])

# Download the text of speeches dated before 1999 (older page layout).
# The frame is built in ascending year order, so these speeches occupy the
# first `old_site_version_length` index positions.
old_site_version_length = int((all_years_1996_2005['year'] < 1999).sum())
if 'text' not in all_years_1996_2005.columns:
    # Make sure the column exists even if every page below is skipped.
    all_years_1996_2005['text'] = None
for i in range(old_site_version_length):
    if i % 50 == 0:
        print(i)  # coarse progress indicator
    speech_url = all_years_1996_2005.loc[i, 'link']
    # A timeout keeps one stalled connection from hanging the whole run.
    page = requests.get(speech_url, timeout=30)
    soup = BeautifulSoup(page.text, 'html.parser')
    first_paragraph = soup.find('p')
    if first_paragraph is None:
        # No <p> element: leave the text missing so the row is dropped later
        # instead of aborting the whole scrape with an AttributeError.
        print(f"No se encontró texto en {speech_url}; se omite.")
        continue
    # Keep non-empty lines only, then drop the trailing 8 lines (irrelevant footer).
    lines = [line for line in first_paragraph.getText().split('\n') if line]
    lines = lines[:-8]
    speech_text = '\n'.join(lines)
    speech_text = speech_text.replace('--', ' ').replace('\r', '').replace('\t', '')
    all_years_1996_2005.loc[i, 'text'] = speech_text

# Download the text of speeches dated 1999 through 2005, whose body is read
# from an HTML <table> element.
if 'text' not in all_years_1996_2005.columns:
    # Make sure the column exists even if every page below is skipped.
    all_years_1996_2005['text'] = None
for i in range(len(all_years_1996_2005)):
    if not 1998 < all_years_1996_2005.loc[i, 'year'] < 2006:
        continue
    if i % 50 == 0:
        print(i)  # coarse progress indicator
    speech_url = all_years_1996_2005['link'].iloc[i]
    # A timeout keeps one stalled connection from hanging the whole run.
    page = requests.get(speech_url, timeout=30)
    soup = BeautifulSoup(page.text, 'html.parser')
    tables = soup.select("table")
    if not tables:
        # No table at all: leave the text missing so the row is dropped later
        # instead of aborting the whole scrape with an IndexError.
        print(f"No se encontraron tablas en {speech_url}; se omite.")
        continue
    # Use the first table when it is long enough (> 600 characters) to be the
    # speech body; otherwise fall back to the second table when there is one.
    first_table_text = tables[0].text
    if len(first_table_text) > 600 or len(tables) == 1:
        speech_text = first_table_text
    else:
        speech_text = tables[1].text
    speech_text = speech_text.replace('--', '').replace('\r', '\n').replace('\t', '')
    # NOTE(review): rows 383-536 get extra whitespace squeezing; presumably these
    # pages use padded layouts -- the index range is hard-coded, confirm it still
    # matches the scraped ordering.
    if 383 <= i <= 536:
        speech_text = speech_text.replace('     ', ' ').replace('    ', ' ')
    all_years_1996_2005.loc[i, 'text'] = speech_text

# Derive helper columns and persist the 1996-2005 speeches.
# The first run of eight digits in the link is taken as the date (YYYYMMDD);
# a raw string avoids the invalid-escape SyntaxWarning of '\d' in a normal literal.
all_years_1996_2005['date'] = all_years_1996_2005['link'].str.extract(r'(\d{8})', expand=False)
# Keep only rows whose text was downloaded; .copy() makes the result an
# independent frame so the column assignments below do not write to a slice.
all_years_1996_2005 = all_years_1996_2005[all_years_1996_2005['text'].notna()].copy()
# Number of whitespace-separated tokens in each speech.
all_years_1996_2005['text_len'] = all_years_1996_2005['text'].str.split().apply(len)
# The last comma-separated component of the event description is stored as the location.
all_years_1996_2005['location'] = all_years_1996_2005['event'].str.split(', ').str[-1]

all_years_1996_2005.to_csv('fed_speeches_1996_2005.csv', index=False)

In [None]:
# Fetch the main speeches page and save every link that points under
# /newsevents/speech/ to 'fed_speech_urls.csv'.
base_url = "https://www.federalreserve.gov"
speeches_url = f"{base_url}/newsevents/speeches.htm"
# A timeout keeps a stalled connection from hanging the run forever.
response = requests.get(speeches_url, timeout=30)
response.raise_for_status()  # abort early on an HTTP error status

# Parse the page and build absolute URLs.
soup = BeautifulSoup(response.text, 'html.parser')
speech_links = soup.select("a[href^='/newsevents/speech/']")
# dict.fromkeys drops repeated links while keeping first-seen order, so the
# same page is not listed (and later downloaded) more than once.
speech_urls = list(dict.fromkeys(f"{base_url}{link['href']}" for link in speech_links))

# Write the URLs to a one-column CSV file.
with open("fed_speech_urls.csv", mode="w", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(["URL"])  # CSV header
    writer.writerows([url] for url in speech_urls)
print(f"Se han guardado {len(speech_urls)} URLs de discursos en 'fed_speech_urls.csv'")

In [None]:
# Visit every URL saved in 'fed_speech_urls.csv' and collect the individual
# speech links (ending in .htm) found on each page into 'all_fed_speech_urls.csv'.
with open("fed_speech_urls.csv", mode="r") as file:
    annual_urls = [row["URL"] for row in csv.DictReader(file)]

# Base URL used to turn site-relative links into absolute ones.
base_url = "https://www.federalreserve.gov"
all_speech_urls = []
# Set of URLs already collected: constant-time duplicate checks, while the
# list above preserves first-seen order.
seen_urls = set()

for annual_url in annual_urls:
    # A timeout keeps one stalled connection from hanging the whole run.
    response = requests.get(annual_url, timeout=30)
    response.raise_for_status()  # abort on an HTTP error status

    soup = BeautifulSoup(response.text, 'html.parser')
    speech_links = soup.select("a[href^='/newsevents/speech/']")
    for link in speech_links:
        url = f"{base_url}{link['href']}"
        # Keep only .htm pages and skip anything already collected.
        if url.endswith(".htm") and url not in seen_urls:
            seen_urls.add(url)
            all_speech_urls.append(url)

# Write the URLs to a one-column CSV file.
with open("all_fed_speech_urls.csv", mode="w", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(["URL"])
    writer.writerows([url] for url in all_speech_urls)
print(f"Se han guardado {len(all_speech_urls)} URLs de discursos en 'all_fed_speech_urls.csv'")

In [None]:
# Download every speech listed in 'all_fed_speech_urls.csv' and store its
# date, URL and text in 'fed_speeches_2006_Today.csv'.
with open("all_fed_speech_urls.csv", mode="r") as file:
    speech_urls = [row["URL"] for row in csv.DictReader(file)]
speeches_data = []

# Matches the first M/D/YYYY-style date appearing in the page text.
date_pattern = re.compile(r"\b(\d{1,2}/\d{1,2}/\d{4})\b")
for url in tqdm(speech_urls, desc="Procesando discursos", unit="discurso"):
    # A timeout keeps one stalled connection from hanging a run that issues
    # one request per speech.
    response = requests.get(url, timeout=30)
    response.raise_for_status()  # abort on an HTTP error status

    soup = BeautifulSoup(response.text, 'html.parser')
    match = date_pattern.search(soup.get_text())
    date_text = match.group(0) if match else ""
    # The speech body is taken from the div with this exact class string;
    # pages without it yield an empty text.
    content_div = soup.find("div", {"class": "col-xs-12 col-sm-8 col-md-8"})
    speech_content = content_div.get_text(separator=" ", strip=True) if content_div else ""
    speeches_data.append({
        "Fecha": date_text,
        "URL": url,
        "Discurso": speech_content
    })

# Persist the scraped speeches.
with open("fed_speeches_2006_Today.csv", mode="w", newline="", encoding="utf-8") as file:
    writer = csv.DictWriter(file, fieldnames=["Fecha", "URL", "Discurso"])
    writer.writeheader()
    writer.writerows(speeches_data)
print("Todos los discursos han sido extraídos y guardados en 'fed_speeches_2006_Today.csv'")

# Combine the 1996-2005 and 2006-today datasets

In [None]:
# Rewrite the "Fecha" column of the 2006-today CSV with the date embedded in
# each speech URL, formatted as YYYY-MM-DD (empty when no date can be derived).
input_file = "fed_speeches_2006_Today.csv"
output_file = "fed_speeches_2006_Today_with_dates.csv"
# Eight digits followed by an optional single lowercase letter and ".htm".
# Accepting any suffix letter (not just "a") keeps URLs ending in e.g.
# "...b.htm" from losing their date.
date_pattern = re.compile(r"(\d{8})[a-z]?\.htm")
updated_data = []
# newline="" lets the csv module handle line breaks embedded in quoted fields.
with open(input_file, mode="r", encoding="utf-8", newline="") as file:
    reader = csv.DictReader(file)
    for row in reader:
        match = date_pattern.search(row["URL"])
        if match:
            try:
                # Convert YYYYMMDD to YYYY-MM-DD.
                row["Fecha"] = datetime.strptime(match.group(1), "%Y%m%d").strftime("%Y-%m-%d")
            except ValueError:
                # Eight digits that do not form a valid calendar date.
                row["Fecha"] = ""
        else:
            row["Fecha"] = ""
        updated_data.append(row)

# Write the CSV with the updated "Fecha" column.
with open(output_file, mode="w", newline="", encoding="utf-8") as file:
    writer = csv.DictWriter(file, fieldnames=["Fecha", "URL", "Discurso"])
    writer.writeheader()
    writer.writerows(updated_data)
print(f"El archivo con las fechas actualizadas se ha guardado como '{output_file}'")

In [None]:
# Merge the 1996-2005 and 2006-today speech files into a single CSV with the
# columns Date / URL / Speech.

# Read "date" as text: left to type inference, values such as 19960613 are
# parsed as numbers, and pd.to_datetime would then interpret them as
# nanoseconds since the epoch (yielding 1970-01-01 for every row).
csv1 = pd.read_csv(
    "fed_speeches_1996_2005.csv",
    usecols=["date", "link", "text"],
    dtype={"date": str},
)
# Convert YYYYMMDD to YYYY-MM-DD; values that do not parse become missing.
csv1["date"] = pd.to_datetime(csv1["date"], format="%Y%m%d", errors="coerce").dt.strftime("%Y-%m-%d")
# Rename the columns to the final layout.
csv1 = csv1.rename(columns={
    "date": "Date",
    "link": "URL",
    "text": "Speech"
})
# Read the second CSV (2006 to today) and normalize it the same way.
csv2 = pd.read_csv("fed_speeches_2006_Today_with_dates.csv", usecols=["Fecha", "URL", "Discurso"])
csv2["Fecha"] = pd.to_datetime(csv2["Fecha"], errors='coerce').dt.strftime("%Y-%m-%d")
csv2 = csv2.rename(columns={
    "Fecha": "Date",
    "Discurso": "Speech"
})
combined_data = pd.concat([csv1, csv2], ignore_index=True)
combined_data.to_csv("Data_1996_Today_FED.csv", index=False, encoding="utf-8")
print("El archivo combinado se ha guardado como 'Data_1996_Today_FED.csv'")

# Limpiar

In [None]:
# Intermediate CSV files produced by the earlier steps; they are deleted once
# the combined dataset has been written.
files_to_delete = [
    name + ".csv"
    for name in (
        "all_fed_speech_urls",
        "fed_speech_urls",
        "fed_speeches_1996_2005",
        "fed_speeches_2006_Today_with_dates",
        "fed_speeches_2006_Today",
    )
]
for leftover in files_to_delete:
    try:
        os.remove(leftover)
    except FileNotFoundError:
        # Already gone (or never created): report it and move on.
        print(f"{leftover} no se encontró o ya fue eliminado.")
    except Exception as e:
        # Any other failure (e.g. permissions) is reported without stopping the cleanup.
        print(f"No se pudo eliminar {leftover}: {e}")
    else:
        print(f"{leftover} ha sido eliminado exitosamente.")