In [None]:
import requests
from bs4 import BeautifulSoup
import os
from urllib.parse import urljoin
import time
import re
from datetime import datetime

# Base directory to save files
BASE_DIR = "federal_reserve_downloads"
os.makedirs(BASE_DIR, exist_ok=True)

# Federal Reserve base URL
BASE_URL = "https://www.federalreserve.gov"

# Maximum filename length (excluding extension and path)
MAX_FILENAME_LENGTH = 100

# Function to sanitize folder and file names
def sanitize_name(name):
    return re.sub(r'[<>:"/\\|?*]', '', name)

# Function to truncate filename if too long
def truncate_filename(name, max_length=MAX_FILENAME_LENGTH):
    if len(name) > max_length:
        return name[:max_length].rsplit(' ', 1)[0]  # Truncate at last space
    return name

# Function to extract date from URL
def extract_date(time_str, url):
    match = re.search(r'(\d{4})(\d{2})?\d{2}', url)
    if match:
        year = match.group(1)
        month = match.group(2) or "Unknown"
        if month != "Unknown":
            dt = datetime(int(year), int(month), 1)
            return dt.strftime("%Y/%B")
        return f"{year}/Unknown"
    return "Unknown/Unknown"

# Function to download a file with a unique, truncated name
def download_file(url, category, date_folder, title):
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        
        # Determine filename and extension
        if url.endswith(".pdf"):
            ext = ".pdf"
        else:
            ext = ".htm"  # Older speeches are typically .htm
        
        filename_base = sanitize_name(title)
        filename_base = truncate_filename(filename_base)
        filename = filename_base + ext
        
        folder_path = os.path.join(BASE_DIR, category, sanitize_name(date_folder))
        filepath = os.path.join(folder_path, filename)
        
        # Avoid overwriting
        base, ext = os.path.splitext(filepath)
        counter = 1
        while os.path.exists(filepath):
            filename = f"{filename_base}_{counter}{ext}"
            filepath = os.path.join(folder_path, filename)
            counter += 1
        
        os.makedirs(folder_path, exist_ok=True)
        with open(filepath, "wb") as f:
            f.write(response.content)
        print(f"Downloaded: {filepath}")
    except Exception as e:
        print(f"Error downloading {url}: {e}")

# Function to scrape historical speeches
def scrape_historical_testimonies(url, category, start_year=1996, end_year=2005):
    try:
        # Add headers to mimic a browser
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        }
        # Follow redirects explicitly
        response = requests.get(url, headers=headers, timeout=10, allow_redirects=True)
        response.raise_for_status()
        
        # Print URL after redirects and content length
        print(f"Reached {response.url}")
        print(f"Retrieved {len(response.text)} characters from {response.url}")
        
        # Use html.parser (or lxml if installed)
        soup = BeautifulSoup(response.content, "html.parser")
        
        # Find all links
        links = soup.find_all("a", href=True)
        print(f"Found {len(links)} links in {response.url}")
        
        # Debug: Print all links found
        if len(links) == 0:
            print("No links found. Dumping first 500 characters of HTML:")
            print(response.text[:500])
        else:
            print("Sample of links found:")
            for link in links[:5]:  # Print first 5 links as a sample
                print(f" - {link['href']} : {link.get_text(strip=True)}")
        
        for link in links:
            href = link["href"]
            if not href.startswith("http"):
                href = urljoin(BASE_URL, href)
            
            # Filter for speech links (adjust based on new structure)
            if re.search(r'\d{8}', href) or href.endswith(".htm") or href.endswith(".pdf"):
                year_match = re.search(r'(\d{4})', href)
                if year_match:
                    year = int(year_match.group(1))
                    if start_year <= year <= end_year:
                        title = link.get_text(strip=True) or "untitled"
                        date_folder = extract_date("", href)
                        download_file(href, category, date_folder, title)
            time.sleep(1)  # Be polite
    except requests.exceptions.RequestException as e:
        print(f"Failed to access {url}: {e}")
# Main function to download speeches for 1996-2005
def download_testimonies():
    category = "speeches"
    
    historical_urls = [
        f"https://www.federalreserve.gov/boarddocs/speeches/{year}/"
        for year in range(1996, 2006)
    ]
    
    print("Scraping historical speeches from 1996 to 2005...")
    for url in historical_urls:
        print(f"Processing {url}")
        scrape_historical_testimonies(url, category)

if __name__ == "__main__":
    download_testimonies()

Scraping historical speeches from 1996 to 2005...
Processing https://www.federalreserve.gov/boarddocs/speeches/1996/
Reached https://www.federalreserve.gov/boarddocs/speeches/1996/
Retrieved 1137 characters from https://www.federalreserve.gov/boarddocs/speeches/1996/
Found 0 links in https://www.federalreserve.gov/boarddocs/speeches/1996/
No links found. Dumping first 500 characters of HTML:
<!DOCTYPE HTML PUBLIC "-//SQ//DTD HTML 2.0 HoTMetaL + extensions//EN">
<HTML>
<HEAD>
<meta http-equiv="refresh" content="0;url=/newsevents/speech/1996speech.htm"> 
</HEAD>
<BODY><script>(function(){function c(){var b=a.contentDocument||a.contentWindow.document;if(b){var d=b.createElement('script');d.innerHTML="window.__CF$cv$params={r:'91ccb9c6fe043aee',t:'MTc0MTM3ODExNS4wMDAwMDA='};var a=document.createElement('script');a.nonce='';a.src='/cdn-cgi/challenge-platform/scripts/jsd/main.js';doc
Processing https://www.federalreserve.gov/boarddocs/speeches/1997/
Reached https://www.federalreserve.gov/boa

In [None]:
import requests
from bs4 import BeautifulSoup
import feedparser
import os
from urllib.parse import urljoin
import time
import re
from datetime import datetime, timedelta

# Base directory to save files
BASE_DIR = "federal_reserve_downloads"
os.makedirs(BASE_DIR, exist_ok=True)

# Federal Reserve base URL
BASE_URL = "https://www.federalreserve.gov"

# Maximum filename length (excluding extension and path)
MAX_FILENAME_LENGTH = 100

# Function to sanitize folder and file names
def sanitize_name(name):
    return re.sub(r'[<>:"/\\|?*]', '', name)

# Function to truncate filename if too long
def truncate_filename(name, max_length=MAX_FILENAME_LENGTH):
    if len(name) > max_length:
        return name[:max_length].rsplit(' ', 1)[0]
    return name

# Function to extract date from URL or content
def extract_date(url, soup=None):
    match = re.search(r'(\d{4})(\d{2})(\d{2})', url)
    if match:
        year, month, day = match.groups()
        dt = datetime(int(year), int(month), int(day))
        return dt.strftime("%Y/%B")
    if soup:
        date_tag = soup.find("div", class_="article__time")
        if date_tag:
            date_str = date_tag.get_text(strip=True)
            try:
                dt = datetime.strptime(date_str, "%B %d, %Y")
                return dt.strftime("%Y/%B")
            except ValueError:
                pass
    year_match = re.search(r'(\d{4})', url)
    if year_match:
        return f"{year_match.group(1)}/Unknown"
    return "Unknown/Unknown"

# Function to download a file
def download_file(url, category, date_folder, title):
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        
        if url.endswith(".pdf"):
            default_filename = url.split("/")[-1]
            ext = ".pdf"
        else:
            default_filename = "speech.html"
            ext = ".html"
        
        filename_base = sanitize_name(title)
        filename_base = truncate_filename(filename_base)
        filename = filename_base + ext
        
        folder_path = os.path.join(BASE_DIR, category, sanitize_name(date_folder))
        filepath = os.path.join(folder_path, filename)
        
        base, ext = os.path.splitext(filepath)
        counter = 1
        while os.path.exists(filepath):
            filename = f"{filename_base}_{counter}{ext}"
            filepath = os.path.join(folder_path, filename)
            counter += 1
        
        os.makedirs(folder_path, exist_ok=True)
        with open(filepath, "wb") as f:
            f.write(response.content)
        print(f"Downloaded: {filepath}")
    except requests.exceptions.RequestException as e:
        print(f"Failed to download {url}: {e}")

# Function to generate potential speech URLs
def generate_speech_urls(start_date, end_date):
    urls = []
    current_date = start_date
    while current_date <= end_date:
        # Common pattern: YYYYMMDDa.htm or YYYYMMDDb.htm
        for suffix in ['a', 'b', 'c']:  # Some days have multiple speeches (a, b, c)
            url = f"https://www.federalreserve.gov/newsevents/speech/{current_date.strftime('%Y%m%d')}{suffix}.htm"
            urls.append(url)
        current_date += timedelta(days=1)
    return urls

# Function to scrape RSS feed for recent speeches
def scrape_rss_speeches(rss_url, category):
    feed = feedparser.parse(rss_url)
    for entry in feed.entries:
        link = entry.link
        pub_date = entry.get("published", None)
        title = entry.get("title", "untitled")
        
        try:
            dt = datetime.strptime(pub_date, "%a, %d %b %Y %H:%M:%S %Z")
            if dt >= datetime(2011, 1, 1) and dt <= datetime(2025, 2, 28):
                date_folder = dt.strftime("%Y/%B")
                response = requests.get(link)
                soup = BeautifulSoup(response.content, "html.parser")
                pdf_link = soup.find("a", href=lambda x: x and x.endswith(".pdf"))
                if pdf_link:
                    pdf_url = urljoin(BASE_URL, pdf_link["href"])
                    download_file(pdf_url, category, date_folder, title)
                else:
                    download_file(link, category, date_folder, title)
        except (ValueError, AttributeError):
            date_folder = extract_date(link)
            download_file(link, category, date_folder, title)
        time.sleep(1)

# Main function to download speeches
def download_speeches():
    category = "speeches"
    start_date = datetime(2011, 1, 1)
    end_date = datetime(2025, 2, 28)
    
    # Step 1: Generate and test historical URLs (2011–2022)
    print("Scraping historical speeches (2011–2022)...")
    potential_urls = generate_speech_urls(start_date, datetime(2022, 12, 31))
    for url in potential_urls:
        response = requests.head(url, timeout=10)  # Use HEAD to check existence
        if response.status_code == 200:
            response = requests.get(url)  # Full request if valid
            soup = BeautifulSoup(response.content, "html.parser")
            title_tag = soup.find("h3", class_="title") or soup.find("title")
            title = title_tag.get_text(strip=True) if title_tag else "untitled"
            date_folder = extract_date(url, soup)
            download_file(url, category, date_folder, title)
        time.sleep(1)
    
    # Step 2: Scrape RSS for recent speeches (2023–2025)
    rss_url = "https://www.federalreserve.gov/feeds/speeches.xml"
    print(f"Scraping RSS feed for recent speeches from {rss_url}...")
    scrape_rss_speeches(rss_url, category)

if __name__ == "__main__":
    download_speeches()

In [None]:
import yfinance as yf
import pandas as pd

# Define the tickers and date range
tickers = ["^DJI", "^GSPC"]
start_date = "2006-01-01"
end_date = "2025-02-28"

# Download monthly data
for ticker in tickers:
    data = yf.download(ticker, start=start_date, end=end_date, interval="1mo")
    # Save to CSV
    filename = f"{ticker}_2006_2025.csv"
    data.to_csv(filename)
    print(f"Saved {ticker} data to {filename}")

In [None]:
import requests
import os
from bs4 import BeautifulSoup
from datetime import datetime
import re

# List of URLs you want to download
urls = [
    'https://www.federalreserve.gov/newsevents/testimony/yellen20161117a.htm',
'https://www.federalreserve.gov/newsevents/testimony/sullivan20160928a.htm',
'https://www.federalreserve.gov/newsevents/testimony/yellen20160928a.htm',
'https://www.federalreserve.gov/newsevents/testimony/yellen20160621a.htm',
'https://www.federalreserve.gov/newsevents/testimony/powell20160414.htm',
'https://www.federalreserve.gov/newsevents/testimony/yellen20160210a.htm',
'https://www.federalreserve.gov/newsevents/testimony/yellen20151203a.htm',
'https://www.federalreserve.gov/newsevents/testimony/yellen20151104a.htm',
'https://www.federalreserve.gov/newsevents/testimony/sullivan20150929a.htm',
'https://www.federalreserve.gov/newsevents/testimony/yellen20150715a.htm',
'https://www.federalreserve.gov/newsevents/testimony/vanderweide20150428a.htm',
'https://www.federalreserve.gov/newsevents/testimony/hunter20150423a.htm',
'https://www.federalreserve.gov/newsevents/testimony/tarullo20150319a.htm',
'https://www.federalreserve.gov/newsevents/testimony/yellen20150224a.htm',
'https://www.federalreserve.gov/newsevents/testimony/hunter20150210a.htm',
'https://www.federalreserve.gov/newsevents/testimony/tarullo20141121a.htm',
'https://www.federalreserve.gov/newsevents/testimony/sullivan20141118a.htm',
'https://www.federalreserve.gov/newsevents/testimony/hunter20140916a.htm',
'https://www.federalreserve.gov/newsevents/testimony/tarullo20140909a.htm',
'https://www.federalreserve.gov/newsevents/testimony/yellen20140715a.htm',
'https://www.federalreserve.gov/newsevents/testimony/tarullo20130214a.htm',
'https://www.federalreserve.gov/newsevents/testimony/bernanke20130226a.htm',
'https://www.federalreserve.gov/newsevents/testimony/ashton20130411a.htm',
'https://www.federalreserve.gov/newsevents/testimony/powell20130307a.htm',
'https://www.federalreserve.gov/newsevents/testimony/alvarez20130416a.htm',
'https://www.federalreserve.gov/newsevents/testimony/gibson20130515a.htm',
'https://www.federalreserve.gov/newsevents/testimony/bernanke20130522a.htm',
'https://www.federalreserve.gov/newsevents/testimony/vermilyea20130625a.htm',
'https://www.federalreserve.gov/newsevents/testimony/tarullo20130711a.htm',
'https://www.federalreserve.gov/newsevents/testimony/bernanke20130717a.htm',
'https://www.federalreserve.gov/newsevents/testimony/yellen20131114a.htm',
'https://www.federalreserve.gov/newsevents/testimony/gibson20140114a.htm',
'https://www.federalreserve.gov/newsevents/testimony/tarullo20140205a.htm',
'https://www.federalreserve.gov/newsevents/testimony/tarullo20140206a.htm',
'https://www.federalreserve.gov/newsevents/testimony/yellen20140211a.htm',
'https://www.federalreserve.gov/newsevents/testimony/powell20140313a.htm',
'https://www.federalreserve.gov/newsevents/testimony/alvarez20140408a.htm',
'https://www.federalreserve.gov/newsevents/testimony/yellen20140507a.htm',
'https://www.federalreserve.gov/newsevents/testimony/roseman20140611a.htm',
'https://www.federalreserve.gov/newsevents/testimony/alvarez20140715a.htm',
'https://www.federalreserve.gov/newsevents/testimony/tarullo20120118a.htm',
'https://www.federalreserve.gov/newsevents/testimony/bertsch20120201a.htm',
'https://www.federalreserve.gov/newsevents/testimony/bernanke20120202a.htm',
'https://www.federalreserve.gov/newsevents/testimony/kamin20120216a.htm',
'https://www.federalreserve.gov/newsevents/testimony/duke20120228a.htm',
'https://www.federalreserve.gov/newsevents/testimony/bernanke20120229a.htm',
'https://www.federalreserve.gov/newsevents/testimony/killian20120319a.htm',
'https://www.federalreserve.gov/newsevents/testimony/bernanke20120321a.htm',
'https://www.federalreserve.gov/newsevents/testimony/tarullo20120322a.htm',
'https://www.federalreserve.gov/newsevents/testimony/kamin20120327a.htm',
'https://www.federalreserve.gov/newsevents/testimony/braunstein20120329a.htm',
'https://www.federalreserve.gov/newsevents/testimony/gibson20120516a.htm',
'https://www.federalreserve.gov/newsevents/testimony/alvarez20120517a.htm',
'https://www.federalreserve.gov/newsevents/testimony/tarullo20120606a.htm',
'https://www.federalreserve.gov/newsevents/testimony/bernanke20120607a.htm',
'https://www.federalreserve.gov/newsevents/testimony/alvarez20120619a.htm',
'https://www.federalreserve.gov/newsevents/testimony/martin20120628a.htm',
'https://www.federalreserve.gov/newsevents/testimony/bernanke20120717a.htm',
'https://www.federalreserve.gov/newsevents/testimony/eichner20120802a.htm',
'https://www.federalreserve.gov/newsevents/testimony/gibson20121114a.htm',
'https://www.federalreserve.gov/newsevents/testimony/hunter20110406a.htm',
'https://www.federalreserve.gov/newsevents/testimony/tarullo20110412a.htm',
'https://www.federalreserve.gov/newsevents/testimony/liang20110414a.htm',
'https://www.federalreserve.gov/newsevents/testimony/alvarez20110414a.htm',
'https://www.federalreserve.gov/newsevents/testimony/bernanke20110420a.htm',
'https://www.federalreserve.gov/newsevents/testimony/bernanke20110512a.htm',
'https://www.federalreserve.gov/newsevents/testimony/alvarez20110601a.htm',
'https://www.federalreserve.gov/newsevents/testimony/gibson20110615a.htm',
'https://www.federalreserve.gov/newsevents/testimony/tarullo20110616a.htm',
'https://www.federalreserve.gov/newsevents/testimony/foley20110615a.htm',
'https://www.federalreserve.gov/newsevents/testimony/statement20110707a.htm',
'https://www.federalreserve.gov/newsevents/testimony/bernanke20110713a.htm',
'https://www.federalreserve.gov/newsevents/testimony/braunstein20110713a.htm',
'https://www.federalreserve.gov/newsevents/testimony/bernanke20110721a.htm',
'https://www.federalreserve.gov/newsevents/testimony/vanderweide20110727a.htm',
'https://www.federalreserve.gov/newsevents/testimony/bertsch20110812.htm',
'https://www.federalreserve.gov/newsevents/testimony/bernanke20111004a.htm',
'https://www.federalreserve.gov/newsevents/testimony/tarullo20111206a.htm',
'https://www.federalreserve.gov/newsevents/testimony/alvarez20111213a.htm',
'https://www.federalreserve.gov/newsevents/testimony/kamin20111216a.htm',
'https://www.federalreserve.gov/newsevents/testimony/bernanke20110107a.htm',
'https://www.federalreserve.gov/newsevents/testimony/parkinson20110204a.htm',
'https://www.federalreserve.gov/newsevents/testimony/bernanke20110209a.htm',
'https://www.federalreserve.gov/newsevents/testimony/tarullo20110215a.htm',
'https://www.federalreserve.gov/newsevents/testimony/raskin20110217a.htm',
'https://www.federalreserve.gov/newsevents/testimony/bernanke20110217a.htm',
'https://www.federalreserve.gov/newsevents/testimony/bernanke20110301a.htm',
'https://www.federalreserve.gov/newsevents/testimony/nelson20110304a.htm',

]

# Headers to mimic a browser request
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

def get_date_from_page(url):
    """Attempt to extract the publication date from the webpage."""
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Common places where dates might be found (adjust based on your websites)
        date_selectors = [
            'meta[name="date"]',
            'meta[property="article:published_time"]',
            'time[datetime]',
            '.date', '.post-date', 'time'
        ]
        
        for selector in date_selectors:
            date_element = soup.select_one(selector)
            if date_element:
                date_str = date_element.get('content') or date_element.get('datetime') or date_element.text
                if date_str:
                    # Try to parse the date
                    try:
                        date_obj = datetime.strptime(date_str, "%Y-%m-%d")  # Adjust format as needed
                        return date_obj.strftime("%Y%B")  # e.g., 2025March
                    except ValueError:
                        # Try a more flexible approach with regex
                        date_match = re.search(r'(\d{4})[-/](\d{1,2})', date_str)
                        if date_match:
                            year, month = date_match.groups()
                            date_obj = datetime(int(year), int(month), 1)
                            return date_obj.strftime("%Y%B")
        
        # Fallback: use current date if no date is found
        print(f"Could not find date for {url}, using current date.")
        return datetime.now().strftime("%Y%B")
    
    except Exception as e:
        print(f"Error fetching {url}: {e}")
        return datetime.now().strftime("%Y%B")

def download_page(url, folder_name):
    """Download the webpage as an HTML file into the specified folder."""
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        
        # Create folder if it doesn't exist
        if not os.path.exists(folder_name):
            os.makedirs(folder_name)
        
        # Generate a filename from the URL (you can customize this)
        filename = url.split('/')[-1] or "index"
        if not filename.endswith('.html'):
            filename += '.html'
        filepath = os.path.join(folder_name, filename)
        
        # Save the HTML content
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(response.text)
        print(f"Saved {url} to {filepath}")
    
    except Exception as e:
        print(f"Error downloading {url}: {e}")

# Main process
for url in urls:
    folder_name = get_date_from_page(url)
    download_page(url, folder_name)

In [9]:
import requests
from bs4 import BeautifulSoup
import os
import time
import re

def download_speeches(year):
    # Create directory for the year if it doesn't exist
    directory = f"fed_pressrelease_{year}"
    if not os.path.exists(directory):
        os.makedirs(directory)
    
    # Use different base URL pattern based on year
    
    url = f"https://www.federalreserve.gov/newsevents/press/all/{year}all.htm"
  
       
    
    try:
        # Get the archive page
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Find all speech links
        speech_links = []
        for link in soup.find_all('a', href=True):
            href = link['href']
            # Adjust pattern matching based on year
            if year < 2001:
                if 'press' in href and not href.endswith('all.htm'):
                    if href.startswith('/'):
                        full_url = f"https://www.federalreserve.gov{href}"
                    else:
                        full_url = href
                    speech_links.append(full_url)
            else:
                if re.match(r'\d{8}/default.htm', href.split('/')[-2:][0] + '/' + href.split('/')[-1]):
                    if href.startswith('/'):
                        full_url = f"https://www.federalreserve.gov{href}"
                    else:
                        full_url = href
                    speech_links.append(full_url)
        
        # Download each speech
        for i, speech_url in enumerate(speech_links, 1):
            try:
                speech_response = requests.get(speech_url)
                speech_response.raise_for_status()
                
                # Extract filename based on URL pattern
                if year < 2001:
                    filename = speech_url.split('/')[-2]
                    if not filename.endswith('.htm') and not filename.endswith('.html'):
                        filename = f"{filename}.html"
                else:
                    # For post-2001, extract the date portion
                    # Option 1: Use just the date (20011218.htm)
                    date_part = speech_url.split('/')[-2]
                    filename = f"{date_part}.htm"
                    
                    # Option 2: Use date/default.htm (uncomment if preferred)
                    # date_part = speech_url.split('/')[-2] + '/' + speech_url.split('/')[-1]
                    # filename = date_part
                
                filepath = os.path.join(directory, filename)
                
                # Save the speech content
                with open(filepath, 'w', encoding='utf-8') as f:
                    f.write(speech_response.text)
                
                print(f"Downloaded: {filename} from {year}")
                
                # Be polite to the server - wait between requests
                time.sleep(1)
                
            except requests.RequestException as e:
                print(f"Failed to download {speech_url}: {e}")
                
    except requests.RequestException as e:
        print(f"Failed to access archive page for {year}: {e}")

def main():
    # Years to download (1996-2005)
    years = range(1996, 2006)
    
    for year in years:
        print(f"\nProcessing year: {year}")
        download_speeches(year)

if __name__ == "__main__":
    # Install required packages if not already installed
    try:
        import requests
        from bs4 import BeautifulSoup
    except ImportError:
        print("Installing required packages...")
        import subprocess
        subprocess.check_call(['pip', 'install', 'requests', 'beautifulsoup4'])
    
    main()


Processing year: 1996
Downloaded: 19961227.html from 1996
Downloaded: 19961226.html from 1996
Downloaded: 19961224.html from 1996
Downloaded: 19961223.html from 1996
Downloaded: 19961223.html from 1996
Downloaded: 199612233.html from 1996
Downloaded: 199612232.html from 1996
Downloaded: 19961220.html from 1996
Downloaded: 19961220.html from 1996
Downloaded: 199612202.html from 1996
Downloaded: 19961218.html from 1996
Downloaded: 19961218.html from 1996
Downloaded: 199612182.html from 1996
Downloaded: 19961217.html from 1996
Downloaded: 19961217.html from 1996
Downloaded: 19961216.html from 1996
Downloaded: 199612162.html from 1996
Downloaded: 19961211.html from 1996
Downloaded: 19961209.html from 1996
Downloaded: 199612092.html from 1996
Downloaded: 199612093.html from 1996
Downloaded: 19961204.html from 1996
Downloaded: 19961202.html from 1996
Downloaded: 199612022.html from 1996
Downloaded: 19961127.html from 1996
Downloaded: 19961120.html from 1996
Downloaded: 19961021.html from 19

In [10]:
import os
import shutil
import calendar

def sort_files_by_date_in_subfolders(root_directory):
    """
    Traverse all subdirectories in root_directory and sort files into YYYYMonth folders
    based on the first six digits (YYYYMM) in the filename.
    """
    if not os.path.exists(root_directory):
        print(f"Directory '{root_directory}' does not exist.")
        return
    
    for subdir, _, files in os.walk(root_directory):
        for filename in files:
            if len(filename) < 6 or not filename[:6].isdigit():
                continue  # Skip files that do not match the pattern
            
            year_month = filename[:6]  # Extract YYYYMM
            year = year_month[:4]
            month = int(year_month[4:6])
            
            if month < 1 or month > 12:
                continue  # Skip invalid month values
            
            month_name = calendar.month_name[month]  # Convert to full month name
            destination_folder = os.path.join(subdir, f"{year}{month_name}")
            
            # Create destination folder if it doesn't exist
            if not os.path.exists(destination_folder):
                os.makedirs(destination_folder)
            
            # Move the file
            source_file = os.path.join(subdir, filename)
            destination_file = os.path.join(destination_folder, filename)
            
            shutil.move(source_file, destination_file)
            print(f"Moved: {filename} -> {destination_folder}")

# Example usage
root_directory = "archive"  # Change this to your actual folder path
sort_files_by_date_in_subfolders(root_directory)



Moved: 19960828.html -> archive\fed_pressrelease_1996\1996August
Moved: 19960927.html -> archive\fed_pressrelease_1996\1996September
Moved: 19961021.html -> archive\fed_pressrelease_1996\1996October
Moved: 19961120.html -> archive\fed_pressrelease_1996\1996November
Moved: 19961127.html -> archive\fed_pressrelease_1996\1996November
Moved: 19961202.html -> archive\fed_pressrelease_1996\1996December
Moved: 199612022.html -> archive\fed_pressrelease_1996\1996December
Moved: 19961204.html -> archive\fed_pressrelease_1996\1996December
Moved: 19961209.html -> archive\fed_pressrelease_1996\1996December
Moved: 199612092.html -> archive\fed_pressrelease_1996\1996December
Moved: 199612093.html -> archive\fed_pressrelease_1996\1996December
Moved: 19961211.html -> archive\fed_pressrelease_1996\1996December
Moved: 19961216.html -> archive\fed_pressrelease_1996\1996December
Moved: 199612162.html -> archive\fed_pressrelease_1996\1996December
Moved: 19961217.html -> archive\fed_pressrelease_1996\1996De

In [None]:
import os
from bs4 import BeautifulSoup
import re
from collections import defaultdict
import pandas as pd


# Define the root directory
# Option 1: If uploading files directly to Colab, use '/content/' as the root
# Option 2: If using Google Drive, specify the path like '/content/drive/My Drive/your_folder'
root_folder = 'Federal_Reserve_Archive'  # Default Colab temp storage; change to Google Drive path if needed

# Define document types
doc_types = ["Press_Releases", "Speeches", "Testimonies"]

# Dictionary to store content grouped by YYYYMonth
grouped_content = defaultdict(list)

def extract_text_content(file_path, doc_type):
    """
    Extract the relevant text content from the HTML file between the document type designation
    and "Last Update:", returning it as a plain text string.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file, 'html.parser')
        
        # Step 1: Find the document type designation
        if doc_type == "Press_Releases":
            target = soup.find('h2', string=re.compile(r'\s*(Joint )?Press Release\s*', re.I))
        elif doc_type == "Speeches":
            target = soup.find('h2', string=re.compile(r'\s*Speech\s*', re.I))
        elif doc_type == "Testimonies":
            target = soup.find('h2', string=re.compile(r'\s*Testimony\s*', re.I))
        else:
            return None
        
        if not target:
            print(f"Could not find document type designation in {file_path}")
            return None
        
        # Step 2: Find "Last Update:" to determine the end
        last_update = soup.find(string=re.compile(r'Last Update:'))
        if last_update:
            last_update_parent = last_update.find_parent()
        else:
            last_update_parent = None
        
        # Step 3: Extract text between target and last_update_parent
        text_content = []
        current = target.next_element
        while current and (not last_update_parent or current != last_update_parent):
            if isinstance(current, str) and current.strip():
                # Clean up whitespace and decode HTML entities
                cleaned_text = re.sub(r'\s+', ' ', current.strip())
                if cleaned_text:
                    text_content.append(cleaned_text)
            current = current.next_element
        
        # Join the text with newlines for readability
        return '\n\n'.join(text_content)

def process_files():
    """
    Traverse the folder structure, extract text, and group by YYYYMonth across all doc_types.
    """
    for doc_type in doc_types:
        doc_type_path = os.path.join(root_folder, doc_type)
        if not os.path.exists(doc_type_path):
            print(f"Folder not found: {doc_type_path}")
            continue
        
        for month_folder in os.listdir(doc_type_path):
            month_path = os.path.join(doc_type_path, month_folder)
            if not os.path.isdir(month_path):
                continue
            
            for filename in os.listdir(month_path):
                if filename.endswith('.html'):
                    file_path = os.path.join(month_path, filename)
                    text_content = extract_text_content(file_path, doc_type)
                    
                    if text_content:
                        # Group by YYYYMonth only, combining all doc_types
                        grouped_content[month_folder].append(text_content)
                        print(f"Processed {file_path}")

def save_to_csv(output_folder):
    """
    Create a DataFrame with Date (YYYYMonth) and Text (concatenated string), then save as CSV.
    """
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    
    # Prepare data for DataFrame
    data = []
    for month, texts in grouped_content.items():
        # Concatenate all text for this month across all doc_types
        concatenated_text = '\n\n'.join(texts)
        data.append({'Date': month, 'Text': concatenated_text})
    
    # Create DataFrame
    df = pd.DataFrame(data, columns=['Date', 'Text'])
    
    # Sort by Date for consistency
    df = df.sort_values('Date')
    
    # Save to CSV
    output_file = os.path.join(output_folder, 'Combined_all.csv')
    df.to_csv(output_file, index=False, encoding='utf-8')
    print(f"Saved DataFrame to {output_file}")

def main():
    process_files()
    output_folder = os.path.join(root_folder, "Grouped_Output")
    save_to_csv(output_folder)

if __name__ == "__main__":
    main()

In [None]:
import os
import re
import nltk
from bs4 import BeautifulSoup

# Ensure necessary NLTK data is downloaded
nltk.download('punkt')

def extract_clean_text(file_path):
    """Extracts and cleans text from an HTML or text file, removing headers above document type designation."""
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
            
            # If the file is HTML, parse it
            if file_path.endswith('.html'):
                soup = BeautifulSoup(content, 'html.parser')
                content = soup.get_text()
            
            # Identify where the actual document starts (after type designation)
            match = re.search(r'(Press Release|Speech|Testimony)', content, re.IGNORECASE)
            if match:
                content = content[match.start():]  # Keep everything after the designation
            
            # Tokenize and clean text
            sentences = nltk.sent_tokenize(content)
            cleaned_text = ' '.join(sentences)  # Concatenate sentences into a single string
            return cleaned_text
        
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return ""

def process_documents(base_folder):
    """Processes all documents, grouping them by YYYYMonth folder."""
    monthly_text = {}  # Dictionary to store concatenated text by YYYYMonth
    
    for doc_type in os.listdir(base_folder):
        doc_type_path = os.path.join(base_folder, doc_type)
        
        if os.path.isdir(doc_type_path):  # Ensure it's a folder
            for month_folder in os.listdir(doc_type_path):
                month_path = os.path.join(doc_type_path, month_folder)
                
                if os.path.isdir(month_path):
                    if month_folder not in monthly_text:
                        monthly_text[month_folder] = ""
                    
                    for file_name in os.listdir(month_path):
                        file_path = os.path.join(month_path, file_name)
                        if file_path.endswith(('.html', '.txt')):  # Process only HTML and text files
                            text = extract_clean_text(file_path)
                            monthly_text[month_folder] += " " + text
    
    return monthly_text

# Example usage
base_folder = "Federal_Reserve_Archive"  # Replace with the actual path
monthly_data = process_documents(base_folder)

# Now `monthly_data` contains cleaned, concatenated text per YYYYMonth folder


In [None]:
from bs4 import BeautifulSoup
import re

def extract_body_text(html_content):
    # Parse HTML with BeautifulSoup
    soup = BeautifulSoup(html_content, 'html.parser')
    
    # Find the main article content div
    article = soup.find('div', id='article')
    
    if article:
        # Find all paragraphs within the main content area
        paragraphs = article.find_all('p')
        
        # Filter out unwanted paragraphs (like those with email or metadata)
        content_paragraphs = []
        for p in paragraphs:
            # Skip paragraphs with email links or specific classes we don't want
            if (p.find('a', href=re.compile('mailto')) or 
                'article__time' in p.get('class', []) or 
                'releaseTime' in p.get('class', [])):
                continue
            content_paragraphs.append(p)
        
        # Extract text and join paragraphs
        text = ' '.join(p.get_text(strip=True) for p in content_paragraphs)
        
        # Clean up extra whitespace
        text = re.sub(r'\s+', ' ', text).strip()
        
        return text
    return "No content found"

# Example usage with your HTML file
with open('Agencies announce public outreach meeting as part of their review of regulations.html', 'r', encoding='utf-8') as file:
    html_content = file.read()
    result = extract_body_text(html_content)
    print(result)

In [17]:
import os

input_root_directory = 'Federal_Reserve_Archive'

print("Scanning Directories...")
for dirpath, dirnames, filenames in os.walk(input_root_directory):
    print(f"Processing folder: {dirpath}")
    for filename in filenames:
        if filename.endswith('.html'):
            print(f"Found file: {os.path.join(dirpath, filename)}")
        elif filename.endswith('.htm'):
            print(f"Found file: {os.path.join(dirpath, filename)}")
            print("Filename different")


Scanning Directories...
Processing folder: Federal_Reserve_Archive
Processing folder: Federal_Reserve_Archive\1996August
Found file: Federal_Reserve_Archive\1996August\19960828.html
Processing folder: Federal_Reserve_Archive\1996December
Found file: Federal_Reserve_Archive\1996December\19961202.html
Found file: Federal_Reserve_Archive\1996December\199612022.html
Found file: Federal_Reserve_Archive\1996December\19961204.html
Found file: Federal_Reserve_Archive\1996December\19961209.html
Found file: Federal_Reserve_Archive\1996December\199612092.html
Found file: Federal_Reserve_Archive\1996December\199612093.html
Found file: Federal_Reserve_Archive\1996December\19961211.html
Found file: Federal_Reserve_Archive\1996December\19961216.html
Found file: Federal_Reserve_Archive\1996December\199612162.html
Found file: Federal_Reserve_Archive\1996December\19961217.html
Found file: Federal_Reserve_Archive\1996December\19961218.html
Found file: Federal_Reserve_Archive\1996December\199612182.html
F

In [18]:
import os
from bs4 import BeautifulSoup
import re

def extract_body_text(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    article = soup.find('div', id='article')
    
    if article:
        paragraphs = article.find_all('p')
        content_paragraphs = []
        for p in paragraphs:
            if (p.find('a', href=re.compile('mailto')) or 
                'article__time' in p.get('class', []) or 
                'releaseTime' in p.get('class', [])):
                continue
            content_paragraphs.append(p)
        
        text = ' '.join(p.get_text(strip=True) for p in content_paragraphs)
        text = re.sub(r'\s+', ' ', text).strip()
        return text
    return "No content found"

def process_html_files(input_root, output_root):
    # Ensure output root directory exists
    os.makedirs(output_root, exist_ok=True)
    
    # Walk through all directories and files in input_root
    for dirpath, dirnames, filenames in os.walk(input_root):
        # Calculate the relative path to maintain folder structure
        relative_path = os.path.relpath(dirpath, input_root)
        output_dir = os.path.join(output_root, relative_path)
        
        # Create corresponding output directory if it doesn't exist
        os.makedirs(output_dir, exist_ok=True)
        
        # Process each HTML file in the current directory
        for filename in filenames:
            if filename.endswith(('.html','.htm')):
                input_path = os.path.join(dirpath, filename)
                output_filename = f"{os.path.splitext(filename)[0]}.txt"
                output_path = os.path.join(output_dir, output_filename)
                
                try:
                    with open(input_path, 'r', encoding='utf-8') as file:
                        html_content = file.read()
                        result = extract_body_text(html_content)
                        
                        with open(output_path, 'w', encoding='utf-8') as out_file:
                            out_file.write(result)
                    print(f"Processed: {os.path.join(relative_path, filename)}")
                except Exception as e:
                    print(f"Error processing {os.path.join(relative_path, filename)}: {str(e)}")

# Example usage
input_root_directory = 'Federal_Reserve_Archive'  # Where your HTML files are stored
output_root_directory = 'Federal_Reserve_Archive_adj'  # Where you want text files to be saved

process_html_files(input_root_directory, output_root_directory)

Processed: 1996August\19960828.html
Processed: 1996December\19961202.html
Processed: 1996December\199612022.html
Processed: 1996December\19961204.html
Processed: 1996December\19961209.html
Processed: 1996December\199612092.html
Processed: 1996December\199612093.html
Processed: 1996December\19961211.html
Processed: 1996December\19961216.html
Processed: 1996December\199612162.html
Processed: 1996December\19961217.html
Processed: 1996December\19961218.html
Processed: 1996December\199612182.html
Processed: 1996December\19961220.html
Processed: 1996December\199612202.html
Processed: 1996December\19961223.html
Processed: 1996December\199612232.html
Processed: 1996December\199612233.html
Processed: 1996December\19961224.html
Processed: 1996December\19961226.html
Processed: 1996December\19961227.html
Processed: 1996July\19960717.htm
Processed: 1996July\19960724.htm
Processed: 1996July\19960726.htm
Processed: 1996June\19960626.htm
Processed: 1996November\19961120.html
Processed: 1996November\19

In [5]:
# for archive links
from bs4 import BeautifulSoup
import re
import os

def extract_body_text(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    
    # Extract content from all table cells (which contain body text)
    table_cells = soup.find_all('td')
    
    content_paragraphs = []
    for cell in table_cells:
        # Remove unnecessary links, footers, and scripts
        for tag in cell.find_all(['a', 'script', 'style']):
            tag.decompose()
        
        text = cell.get_text(separator=' ', strip=True)
        
        # Exclude footer-like content
        if "Last update:" in text or "Home" in text or "Contact Us" in text:
            continue
        
        content_paragraphs.append(text)

    # Clean up whitespace and return extracted text
    extracted_text = ' '.join(content_paragraphs)
    extracted_text = re.sub(r'\s+', ' ', extracted_text).strip()
    
    return extracted_text if extracted_text else "No content found"

# Processing function remains unchanged
def process_html_files(input_root, output_root):
    os.makedirs(output_root, exist_ok=True)
    
    for dirpath, _, filenames in os.walk(input_root):
        relative_path = os.path.relpath(dirpath, input_root)
        output_dir = os.path.join(output_root, relative_path)
        os.makedirs(output_dir, exist_ok=True)
        
        for filename in filenames:
            if filename.endswith(('.html','.htm')):
                input_path = os.path.join(dirpath, filename)
                output_filename = f"{os.path.splitext(filename)[0]}.txt"
                output_path = os.path.join(output_dir, output_filename)
                
                try:
                    with open(input_path, 'r', encoding='utf-8') as file:
                        html_content = file.read()
                        result = extract_body_text(html_content)
                        
                        with open(output_path, 'w', encoding='utf-8') as out_file:
                            out_file.write(result)
                    print(f"Processed: {os.path.join(relative_path, filename)}")
                except Exception as e:
                    print(f"Error processing {os.path.join(relative_path, filename)}: {str(e)}")

# Example usage
input_root_directory = 'Federal_Reserve_Archive'
output_root_directory = 'Federal_Reserve_Archive_adj'
process_html_files(input_root_directory, output_root_directory)


Processed: 1996August\19960828.html
Processed: 1996December\19961202.html
Processed: 1996December\199612022.html
Processed: 1996December\19961204.html
Processed: 1996December\19961209.html
Processed: 1996December\199612092.html
Processed: 1996December\199612093.html
Processed: 1996December\19961211.html
Processed: 1996December\19961216.html
Processed: 1996December\199612162.html
Processed: 1996December\19961217.html
Processed: 1996December\19961218.html
Processed: 1996December\199612182.html
Processed: 1996December\19961220.html
Processed: 1996December\199612202.html
Processed: 1996December\19961223.html
Processed: 1996December\199612232.html
Processed: 1996December\199612233.html
Processed: 1996December\19961224.html
Processed: 1996December\19961226.html
Processed: 1996December\19961227.html
Processed: 1996July\19960717.htm
Processed: 1996July\19960724.htm
Processed: 1996July\19960726.htm
Processed: 1996June\19960626.htm
Processed: 1996November\19961120.html
Processed: 1996November\19

In [4]:
for dirpath, _, filenames in os.walk(input_root_directory):
    for filename in filenames:
        if filename.endswith(('.html', '.htm')):
            print(f"Detected File: {os.path.join(dirpath, filename)}")


Detected File: Federal_Reserve_Archive\1996August\19960828.html
Detected File: Federal_Reserve_Archive\1996December\19961202.html
Detected File: Federal_Reserve_Archive\1996December\199612022.html
Detected File: Federal_Reserve_Archive\1996December\19961204.html
Detected File: Federal_Reserve_Archive\1996December\19961209.html
Detected File: Federal_Reserve_Archive\1996December\199612092.html
Detected File: Federal_Reserve_Archive\1996December\199612093.html
Detected File: Federal_Reserve_Archive\1996December\19961211.html
Detected File: Federal_Reserve_Archive\1996December\19961216.html
Detected File: Federal_Reserve_Archive\1996December\199612162.html
Detected File: Federal_Reserve_Archive\1996December\19961217.html
Detected File: Federal_Reserve_Archive\1996December\19961218.html
Detected File: Federal_Reserve_Archive\1996December\199612182.html
Detected File: Federal_Reserve_Archive\1996December\19961220.html
Detected File: Federal_Reserve_Archive\1996December\199612202.html
Detect

In [6]:
import os

# Define the path where the folders are stored
base_path = "Federal_Reserve_Archive_adj"  # Change this to the path containing your folders
output_folder = os.path.join(base_path, "archive_concatenated_text")

# Create the output folder if it doesn't exist
os.makedirs(output_folder, exist_ok=True)

# Iterate through all folders in the base directory
for folder in os.listdir(base_path):
    folder_path = os.path.join(base_path, folder)

    # Ensure it's a directory
    if os.path.isdir(folder_path):
        combined_text = []

        # Iterate through all text files in the folder
        for file in os.listdir(folder_path):
            file_path = os.path.join(folder_path, file)

            # Ensure it's a text file
            if os.path.isfile(file_path) and file.endswith(".txt"):
                with open(file_path, "r", encoding="utf-8") as f:
                    combined_text.append(f.read())

        # Create the output file in the concatenated_text folder
        if combined_text:
            output_file = os.path.join(output_folder, f"{folder}_combined.txt")
            with open(output_file, "w", encoding="utf-8") as f:
                f.write("\n".join(combined_text))

print("Concatenation complete. Check the 'concatenated_text' folder.")


Concatenation complete. Check the 'concatenated_text' folder.


In [None]:
import os
import glob
import pandas as pd
from datetime import datetime

# Folder containing text files
folder_path = "concatenated_text"  # Change this to your folder path

# Function to extract MM/YYYY from the filename format YYYYMonth
def extract_date_from_filename(filename):
    base_name = os.path.basename(filename)  # Get the file name
    date_part = base_name.split('_')[0]
    year = date_part[:4]  # Extract YYYY
    month_name = date_part[4:]  # Extract Month name
    month_number = datetime.strptime(month_name, "%B").month  # Convert month name to number
    return f"{year}-{month_number:02d}"  # Format as YYYY-MM

# Create an empty list to store data
data = []

# Read all text files from the folder
for file_path in glob.glob(os.path.join(folder_path, "*.txt")):
    with open(file_path, 'r', encoding='utf-8') as file:
        text_content = file.read().strip()  # Read text from file
        date_str = extract_date_from_filename(file_path)  # Extract date
        data.append([date_str, text_content])  # Append to list

# Convert to DataFrame
df = pd.DataFrame(data, columns=['Date', 'Text'])

# Convert 'Date' column to datetime format
df['Date'] = pd.to_datetime(df['Date'], format="%Y-%m")

# Sort by Date
df = df.sort_values(by='Date')

# Display DataFrame
print(df)


          Date                                               Text
4   2006-01-01  The Federal Reserve Board on Tuesday announced...
3   2006-02-01  The Federal Reserve Board on Monday announced ...
7   2006-03-01  The federal financial regulatory agencies toda...
0   2006-04-01  Governor Susan Schmidt Bies At the Enterprise ...
8   2006-05-01  Five federal agencies today requested public c...
..         ...                                                ...
226 2024-10-01  Vice Chair Philip N. Jefferson At Davidson Col...
225 2024-11-01  Governor Michelle W. Bowman At the Forum Club ...
218 2024-12-01  Governor Adriana D. Kugler At the Detroit Econ...
229 2025-01-01  Federal bank regulatory agencies will hold a v...
228 2025-02-01  Vice Chair for Supervision Michael S. Barr At ...

[230 rows x 2 columns]


In [None]:
# Import necessary libraries
import pandas as pd
import os

# File paths for the stock market data
dji_file = "DJIReturns.csv"  # Update this with the correct path
sp500_file = "SP500Returns.csv"  # Update this with the correct path

# Load the CSV files, keeping only relevant columns
dji_df = pd.read_csv(dji_file, usecols=["Date", "Open", "Percent growth", "Trend"])
sp500_df = pd.read_csv(sp500_file, usecols=["Date", "Open", "Percent Growth", "Return"])

# Rename columns to indicate the source of data
dji_df = dji_df.rename(columns={
    "Open": "DJI_Open",
    "Percent Growth": "DJI_Percent_Growth",
    "Trend": "DJI_Trend"
})

sp500_df = sp500_df.rename(columns={
    "Open": "SP500_Open",
    "Percent Growth": "SP500_Percent_Growth",
    "Return": "SP500_Trend"
})

# Convert Date column in CSV files to match 'YYYY-MM' format
dji_df["Date"] = pd.to_datetime(dji_df["Date"], format="%m/%d/%Y").dt.to_period("M").astype(str)
sp500_df["Date"] = pd.to_datetime(sp500_df["Date"], format="%m/%d/%Y").dt.to_period("M").astype(str)

# Ensure the Date column in df is also in the same 'YYYY-MM' format
df["Date"] = df["Date"].dt.to_period("M").astype(str)


# Merge all data on the Date column
df = df.merge(dji_df, on="Date", how="left")
df = df.merge(sp500_df, on="Date", how="left")

print(df)


In [7]:
import yfinance as yf
import pandas as pd

# Define the ticker symbol (e.g., S&P 500: "^GSPC", Dow Jones: "^DJI", Nasdaq: "^IXIC")
ticker_symbol = "^GSPC"  # Change to your desired index

# Define the date range
start_date = "1996-01-01"
end_date = "2025-03-01"

# Download the data
data = yf.download(ticker_symbol, start=start_date, end=end_date, interval="1mo")

# Display the first few rows
print(data.head())

# Save to CSV if needed
data.to_csv("SP_monthly.csv")


YF.download() has changed argument auto_adjust default to True


[*********************100%***********************]  1 of 1 completed

Price            Close        High         Low        Open      Volume
Ticker           ^GSPC       ^GSPC       ^GSPC       ^GSPC       ^GSPC
Date                                                                  
1996-01-01  636.020020  636.179993  597.289978  615.929993  9188050000
1996-02-01  640.429993  664.229980  633.710022  636.020020  8749960000
1996-03-01  645.500000  656.969971  627.630005  640.429993  8984200000
1996-04-01  654.169983  656.679993  624.140015  645.500000  8875580000
1996-05-01  669.119995  681.099976  630.070007  654.169983  8921140000





In [8]:
import yfinance as yf
import pandas as pd

# Define the ticker symbol (e.g., S&P 500: "^GSPC", Dow Jones: "^DJI", Nasdaq: "^IXIC")
ticker_symbol = "^DJI"  # Change to your desired index

# Define the date range
start_date = "1996-01-01"
end_date = "2025-03-01"

# Download the data
data = yf.download(ticker_symbol, start=start_date, end=end_date, interval="1mo")

# Display the first few rows
print(data.head())

# Save to CSV if needed
data.to_csv("DJI_monthly.csv")


[*********************100%***********************]  1 of 1 completed

Price             Close         High          Low         Open     Volume
Ticker             ^DJI         ^DJI         ^DJI         ^DJI       ^DJI
Date                                                                     
1996-01-01  5395.299805  5408.669922  5014.520020  5115.700195  851030000
1996-02-01  5485.620117  5659.399902  5348.339844  5394.899902  769130000
1996-03-01  5587.140137  5709.970215  5424.200195  5488.200195  831070000
1996-04-01  5569.080078  5697.689941  5415.529785  5588.600098  756530000
1996-05-01  5643.180176  5796.100098  5342.200195  5567.600098  801900000



