In [1]:
import re
import sqlite3 as sq
from datetime import datetime
from io import StringIO

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [2]:
def scrape_wsj_diaries():
    """Scrape the WSJ Markets Diary page and return its first table.

    Opens the page in a Chrome session driven by Selenium, waits until a
    ``<table>`` element is present, reads the date from the page's timestamp
    element and parses the first HTML table with pandas.

    Returns:
        pandas.DataFrame: the first table found on the page, with an extra
        ``Date`` column holding the parsed page date on every row.

    Raises:
        ValueError: if no table appears within the wait timeout, if the
            timestamp element is missing, or if the timestamp text does not
            match the ``'%B %d, %Y'`` format.
    """
    url = "https://www.wsj.com/market-data/stocks/marketsdiary"
    wait_seconds = 10

    # Set up the Selenium driver
    options = webdriver.ChromeOptions()
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url)

        # An implicit wait only applies to element lookups, so it would not
        # delay reading page_source. Wait explicitly for a table instead.
        try:
            WebDriverWait(driver, wait_seconds).until(
                EC.presence_of_element_located((By.TAG_NAME, "table"))
            )
        except TimeoutException as exc:
            raise ValueError(
                "Timed out waiting for a table to load on the page."
            ) from exc

        # Extract the page source
        page_source = driver.page_source
    finally:
        # quit() ends the whole WebDriver session (close() only closes the
        # current window), and the finally clause runs even when loading fails.
        driver.quit()

    # Use BeautifulSoup to parse the page source
    soup = BeautifulSoup(page_source, 'html.parser')

    # Extract the date using the exact class name
    date_element = soup.find('span', class_='WSJBase--card__timestamp--3F2HxyAE')
    if not date_element:
        raise ValueError("Unable to extract the date from the page source.")

    # Drop the first whitespace-separated token of the timestamp text and
    # parse the remainder as e.g. "August 17, 2023".
    date_str = ' '.join(date_element.text.split()[1:])
    date_obj = datetime.strptime(date_str, '%B %d, %Y')

    # Wrap the markup in StringIO: passing literal HTML to read_html is deprecated.
    tables = pd.read_html(StringIO(page_source))

    # Assuming the first table is the one we need
    df = tables[0]

    # Add the date column
    df['Date'] = date_obj

    return df

def organize_dataframe(df, market_names=None):
    """Label consecutive, equally sized row blocks of ``df`` with a market name.

    The frame is split into ``len(market_names)`` consecutive blocks of equal
    length; block ``i`` gets ``market_names[i]`` in a new ``Market`` column.
    The ``'Unnamed: 0_level_0'`` column label is renamed to ``'Index'`` and
    the columns are returned in a fixed order. The input frame is not modified.

    Args:
        df: DataFrame containing the columns ``'Unnamed: 0_level_0'``,
            ``'Latest Close'``, ``'Previous Close'``, ``'Week Ago'`` and
            ``'Date'``.
        market_names: optional sequence of market labels, in the order their
            row blocks appear in ``df``. Defaults to
            ``['NYSE', 'NASDAQ', 'NYSE American', 'NYSE Arca']``.

    Returns:
        pandas.DataFrame with columns ``Market, Date, Index, Latest Close,
        Previous Close, Week Ago`` and a fresh 0..n-1 index.

    Raises:
        ValueError: if ``market_names`` is empty, or if the number of rows is
            not a multiple of the number of markets (an equal split would
            otherwise drop trailing rows and mislabel the blocks).
    """
    if market_names is None:
        market_names = ['NYSE', 'NASDAQ', 'NYSE American', 'NYSE Arca']
    else:
        market_names = list(market_names)
    if not market_names:
        raise ValueError("market_names must contain at least one market.")

    rows_per_market, leftover = divmod(len(df), len(market_names))
    if leftover:
        raise ValueError(
            f"Cannot split {len(df)} rows evenly across "
            f"{len(market_names)} markets."
        )

    labelled_blocks = []
    for position, market in enumerate(market_names):
        start = position * rows_per_market
        # copy() so that adding the Market column never touches the caller's frame
        block = df.iloc[start:start + rows_per_market].copy()
        block['Market'] = market
        labelled_blocks.append(block)

    final_df = pd.concat(labelled_blocks, ignore_index=True).rename(
        columns={'Unnamed: 0_level_0': 'Index'}
    )
    column_order = ['Market', 'Date', 'Index', 'Latest Close', 'Previous Close', 'Week Ago']
    return final_df[column_order]

In [3]:
# Scrape the diary table, then tag each block of rows with its market.
df = organize_dataframe(scrape_wsj_diaries())

# Keep only the top level of the column labels so columns can be addressed
# by plain names such as 'Market' or 'Latest Close'.
df.columns = df.columns.get_level_values(0)

# Display the resulting frame as the cell output.
df

Unnamed: 0,Market,Date,Index,Latest Close,Previous Close,Week Ago
0,NYSE,2023-08-17,Issues traded,3022.0,3017.0,3003.0
1,NYSE,2023-08-17,Advances,897.0,760.0,1282.0
2,NYSE,2023-08-17,Declines,2017.0,2155.0,1609.0
3,NYSE,2023-08-17,Unchanged,108.0,102.0,112.0
4,NYSE,2023-08-17,New highs,19.0,42.0,72.0
5,NYSE,2023-08-17,New lows,88.0,95.0,31.0
6,NYSE,2023-08-17,Adv. volume*,330043900.0,193374200.0,382072100.0
7,NYSE,2023-08-17,Decl. volume*,527148900.0,587716900.0,459888600.0
8,NYSE,2023-08-17,Total volume*,871033100.0,795140000.0,851098500.0
9,NYSE,2023-08-17,Closing Arms (TRIN)†,0.7,1.02,0.69


In [6]:
# Persist the rows of `df` in the `mkt_breadth` table, inserting only the
# (Market, Date, Index) combinations that are not stored yet.

# Connect to SQLite database (will be created if doesn't exist)
conn = sq.connect('../market_data.db')
try:
    cursor = conn.cursor()

    # Check if the table exists
    cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='mkt_breadth'")
    if cursor.fetchone() is None:
        # If table doesn't exist, create it from the DataFrame
        df.to_sql('mkt_breadth', conn, if_exists='replace', index=False)
    else:
        # INDEX is a reserved SQLite keyword and several column names contain
        # spaces, so every identifier is double-quoted.
        exists_sql = (
            'SELECT COUNT(*) FROM mkt_breadth '
            'WHERE "Market"=? AND "Date"=? AND "Index"=?'
        )
        insert_sql = (
            'INSERT INTO mkt_breadth '
            '("Market", "Date", "Index", "Latest Close", "Previous Close", "Week Ago") '
            'VALUES (?, ?, ?, ?, ?, ?)'
        )

        # If table exists, check for each row in the DataFrame
        for _, row in df.iterrows():
            market = row['Market']
            index_val = row['Index']
            # sqlite3 cannot bind a pandas Timestamp. Use the
            # 'YYYY-MM-DD HH:MM:SS' text form so the value matches what
            # to_sql wrote for datetime columns. NOTE(review): confirm against
            # an existing database file.
            date = pd.Timestamp(row['Date']).to_pydatetime().isoformat(sep=' ')

            cursor.execute(exists_sql, (market, date, index_val))
            count = cursor.fetchone()[0]

            # If the combination doesn't exist in the table, insert the row
            if count == 0:
                cursor.execute(
                    insert_sql,
                    (market, date, index_val,
                     row['Latest Close'], row['Previous Close'], row['Week Ago']),
                )

    # One commit for the whole batch instead of one per inserted row.
    conn.commit()
finally:
    # Close the database connection even if a statement above failed
    conn.close()
