In [10]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import re  # For cleaning strings

In [None]:
# Going to scrape data from the Federal Reserve Bank of St. Louis (FRED), specifically the consumer opinion survey: Future Tendency for the United States.
# This survey is a gauge of consumer sentiment about the direction of future consumer price changes.
# Positive values indicate increased inflation expectations. For example, we can see that consumers' expectations of inflation have recently been decreasing, which could indicate that we are headed towards an economic downturn.

# Step 1: Fetch the page
url = 'https://fred.stlouisfed.org/data/CSINFT02USQ460S'  # FRED data page; the consumer sentiment values are in the second table on the page

# Mimic a browser to avoid being blocked.
# Deliberately NOT named `headers`: Step 3 rebinds `headers` to the table's column names,
# so sharing the name would make this cell fail (a list passed as HTTP headers) when re-run later.
request_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}

# Without a timeout, requests waits indefinitely if the server never answers.
response = requests.get(url, headers=request_headers, timeout=30)
if response.status_code != 200:
    # Raise instead of calling exit(): inside a notebook exit() stops the kernel and discards all state.
    raise RuntimeError(f"Error: {response.status_code} - Check URL to make sure site actually works")
print(response)

<Response [200]>


In [8]:
# Step 2: Parse HTML with BeautifulSoup
# Build the parse tree from the raw response bytes using Python's built-in parser
# ('lxml' is a faster alternative if it is installed).
soup = BeautifulSoup(markup=response.content, features='html.parser')

In [14]:
# Find the observations table. Inspecting the page shows it is the second element carrying
# the 'table' CSS class (class="table table-bordered", id="data-table-observations").
tables = soup.find_all(class_='table')
# Always bind `table` (None when fewer than two tables are present) so the check below can
# neither raise NameError on a fresh kernel nor silently reuse a stale `table` from an earlier run.
table = tables[1] if len(tables) >= 2 else None  # Adjust selector via browser inspect
if table is None:
    # Raise instead of calling exit(): inside a notebook exit() stops the kernel and discards all state.
    raise RuntimeError("Table not found, ensure the URL actually has a table to scrape from")
print(table)

<table aria-labelledby="data-table-title" class="table table-bordered" id="data-table-observations">
<thead>
<tr>
<th scope="col">DATE</th>
<th scope="col">VALUE</th>
</tr>
</thead>
<tbody>
<tr>
<th class="pe-5" scope="row">1978-01-01</th>
<td class="pe-5">7.3666670000000000</td>
</tr>
<tr>
<th class="pe-5" scope="row">1978-04-01</th>
<td class="pe-5">8.3000000000000000</td>
</tr>
<tr>
<th class="pe-5" scope="row">1978-07-01</th>
<td class="pe-5">8.8666670000000000</td>
</tr>
<tr>
<th class="pe-5" scope="row">1978-10-01</th>
<td class="pe-5">8.6000000000000000</td>
</tr>
<tr>
<th class="pe-5" scope="row">1979-01-01</th>
<td class="pe-5">10.3666700000000000</td>
</tr>
<tr>
<th class="pe-5" scope="row">1979-04-01</th>
<td class="pe-5">11.7000000000000000</td>
</tr>
<tr>
<th class="pe-5" scope="row">1979-07-01</th>
<td class="pe-5">11.0333300000000000</td>
</tr>
<tr>
<th class="pe-5" scope="row">1979-10-01</th>
<td class="pe-5">11.0333300000000000</td>
</tr>
<tr>
<th class="pe-5" scope="r

In [47]:
# HTML table anatomy, for reference when pulling rows, cells or headers:
#   tr - table row: wraps one whole row of cells
#   th - table header cell
#   td - table data cell


# Step 3: Extract data and put into rows and columns
rows = []  # collected [date, value] pairs, filled with .append() below

# Column names come from the <th> cells of the first <tr> (the header row).
header_row = table.find('tr')
headers = [th.get_text(strip=True) for th in header_row.find_all('th')]

# Every <tr> after the header row is a candidate observation.
for tr in table.find_all('tr')[1:]:
    # On this page the date cell of each body row is a <th scope="row">, so collect <td> and <th> alike.
    cols = [c.get_text(strip=True) for c in tr.find_all(['td', 'th'])]

    # Ignore rows with fewer than two cells or an empty date.
    if len(cols) < 2 or not cols[0].strip():
        continue

    date, value_str = cols[0], cols[1]
    try:
        value = float(value_str)  # Direct conversion - no cleaning
    except ValueError:
        print(f"Skipping invalid row (non-numeric value): {cols[:2]}")  # Log for diagnosis
        continue

    rows.append([date, value])
    if len(rows) <= 3:  # Debug: show the first few accepted rows
        print(f"Added row: [{date}, {value}]")

print(f"Number of valid rows extracted: {len(rows)}")

Added row: [1978-01-01, 7.366667]
Added row: [1978-04-01, 8.3]
Added row: [1978-07-01, 8.866667]
Number of valid rows extracted: 191


In [46]:
# Step 4: Convert to DataFrame
# `rows` holds [date, value] pairs, so only the first two scraped header names are used as column labels.
df = pd.DataFrame(rows, columns=headers[:2])
# Look the date column up by position instead of hard-coding 'DATE', so this line stays consistent
# with the scraped labels above and does not raise KeyError if the page's header text changes.
date_col = df.columns[0]
df[date_col] = pd.to_datetime(df[date_col])  # Real datetimes for time-series analysis

print("This data is the consumer opinion survey of consumer prices, what is the future price tendency for the US")
print("\n")
print(df.head(20))

This data is the consumer opinion survevy of consumer prices, what is the future price tendancy for the US


         DATE      VALUE
0  1978-01-01   7.366667
1  1978-04-01   8.300000
2  1978-07-01   8.866667
3  1978-10-01   8.600000
4  1979-01-01  10.366670
5  1979-04-01  11.700000
6  1979-07-01  11.033330
7  1979-10-01  11.033330
8  1980-01-01  12.566670
9  1980-04-01  10.166670
10 1980-07-01   9.366667
11 1980-10-01  10.233330
12 1981-01-01   8.533334
13 1981-04-01   8.566667
14 1981-07-01   8.033334
15 1981-10-01   7.866667
16 1982-01-01   6.200000
17 1982-04-01   5.233333
18 1982-07-01   5.833333
19 1982-10-01   5.300000
