<a href="https://colab.research.google.com/github/TomMcIver/Stock/blob/Data-CleanUp/stock.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install ace-tools yfinance

Collecting ace-tools
  Downloading ace_tools-0.0-py3-none-any.whl.metadata (300 bytes)
Downloading ace_tools-0.0-py3-none-any.whl (1.1 kB)
Installing collected packages: ace-tools
Successfully installed ace-tools-0.0


In [3]:
import yfinance as yf
import pandas as pd
import time
import os
from datetime import datetime, timedelta

TECH_STOCKS = [
    "AAPL", "MSFT", "NVDA", "AVGO", "CRM", "ORCL", "CSCO", "ACN", "NOW", "IBM",
    "AMD", "ADBE", "INTU", "QCOM", "TXN", "AMAT", "PLTR", "ANET", "PANW", "MU",
    "ADI", "LRCX", "KLAC", "APH", "INTC", "CDNS", "CRWD", "MSI", "SNPS", "ADSK",
    "FTNT", "ROP", "WDAY", "NXPI", "FICO", "TEL", "IT", "CTSH", "GLW", "DELL",
    "HPQ", "MCHP", "ANSS", "MPWR", "HPE", "KEYS", "GDDY", "CDW", "TYL", "NTAP"
]

MACRO_TICKERS = {
    '^VIX': 'Volatility Index',
    '^TNX': '10-Year Treasury Yield',
    '^FVX': '5-Year Treasury Yield',
    'CL=F': 'Crude Oil Futures'
}

DATA_FOLDER = "yfinance_tech_stock_data"
MACRO_FOLDER = "macroeconomic_data"
os.makedirs(DATA_FOLDER, exist_ok=True)
os.makedirs(MACRO_FOLDER, exist_ok=True)

def fetch_stock_data(symbol, interval="1h", period="2y"):
    """Fetch and save stock data with enhanced error handling"""
    try:
        print(f"\n📈 Fetching {symbol} ({interval} interval)...")
        df = yf.Ticker(symbol).history(period=period, interval=interval)

        if df.empty:
            print(f"🚨 No data for {symbol}")
            return None

        df = df.reset_index().rename(columns={"Datetime": "timestamp"})
        df['symbol'] = symbol


        path = os.path.join(DATA_FOLDER, f"{symbol}_stock.csv")
        df.to_csv(path, index=False)
        print(f"✅ Saved {len(df)} rows to {path}")

        return df

    except Exception as e:
        print(f"🚨 Error fetching {symbol}: {str(e)}")
        return None

def fetch_macro_data():
    """Fetch macroeconomic indicators with retry logic"""
    macro_data = {}

    for ticker, name in MACRO_TICKERS.items():
        retries = 3
        while retries > 0:
            try:
                print(f"\n🌐 Fetching {name} ({ticker})...")
                df = yf.download(ticker, start=datetime.now()-timedelta(days=730), interval='1h')

                if not df.empty:
                    df = df.reset_index().rename(columns={'Date': 'timestamp'})
                    df['ticker'] = ticker
                    macro_data[ticker] = df


                    path = os.path.join(MACRO_FOLDER, f"{ticker.replace('^','')}.csv")
                    df.to_csv(path, index=False)
                    print(f"✅ Saved {ticker} data")


                    print(f"\n📋 Columns for {ticker} ({name}):")
                    print(df.columns.tolist())
                    print(f"\nFirst row for {ticker}:")
                    print(df.head(1))
                    break
                else:
                    print(f"🚨 Empty data for {ticker}")
                    retries -= 1

            except Exception as e:
                print(f"🚨 Error ({retries} retries left): {str(e)}")
                retries -= 1
                time.sleep(5)

    return macro_data

def main_data_pipeline():
    """Orchestrate data fetching with rate limiting"""
    stock_dfs = []
    preview_printed = False


    for symbol in TECH_STOCKS:
        df = fetch_stock_data(symbol)
        if df is not None:
            stock_dfs.append(df)
            if not preview_printed:
                print(f"\n📋 Columns for {symbol}:")
                print(df.columns.tolist())
                print(f"\nFirst row for {symbol}:")
                print(df.head(1))
                preview_printed = True
        time.sleep(1.5)


    macro_data = fetch_macro_data()

    return pd.concat(stock_dfs), macro_data


if __name__ == "__main__":
    stock_data, macro_data = main_data_pipeline()
    print("\n🎉 Data pipeline complete!")
    print(f"Stock data shape: {stock_data.shape}")
    print(f"Macro data keys: {list(macro_data.keys())}")


📈 Fetching AAPL (1h interval)...
✅ Saved 3494 rows to yfinance_tech_stock_data/AAPL_stock.csv

📋 Columns for AAPL:
['timestamp', 'Open', 'High', 'Low', 'Close', 'Volume', 'Dividends', 'Stock Splits', 'symbol']

First row for AAPL:
                  timestamp        Open        High         Low       Close  \
0 2023-02-01 09:30:00-05:00  143.860001  143.869995  142.589996  142.960007   

     Volume  Dividends  Stock Splits symbol  
0  11912462        0.0           0.0   AAPL  

📈 Fetching MSFT (1h interval)...
✅ Saved 3494 rows to yfinance_tech_stock_data/MSFT_stock.csv

📈 Fetching NVDA (1h interval)...
✅ Saved 3494 rows to yfinance_tech_stock_data/NVDA_stock.csv

📈 Fetching AVGO (1h interval)...
✅ Saved 3494 rows to yfinance_tech_stock_data/AVGO_stock.csv

📈 Fetching CRM (1h interval)...
✅ Saved 3494 rows to yfinance_tech_stock_data/CRM_stock.csv

📈 Fetching ORCL (1h interval)...
✅ Saved 3494 rows to yfinance_tech_stock_data/ORCL_stock.csv

📈 Fetching CSCO (1h interval)...
✅ Saved 34

[*********************100%***********************]  1 of 1 completed


✅ Saved ^VIX data

📋 Columns for ^VIX (Volatility Index):
[('Datetime', ''), ('Close', '^VIX'), ('High', '^VIX'), ('Low', '^VIX'), ('Open', '^VIX'), ('Volume', '^VIX'), ('ticker', '')]

First row for ^VIX:
Price                   Datetime      Close   High        Low   Open Volume  \
Ticker                                 ^VIX   ^VIX       ^VIX   ^VIX   ^VIX   
0      2023-02-02 08:00:00+00:00  17.620001  17.75  17.610001  17.74      0   

Price  ticker  
Ticker         
0        ^VIX  

🌐 Fetching 10-Year Treasury Yield (^TNX)...


[*********************100%***********************]  1 of 1 completed


✅ Saved ^TNX data

📋 Columns for ^TNX (10-Year Treasury Yield):
[('Datetime', ''), ('Close', '^TNX'), ('High', '^TNX'), ('Low', '^TNX'), ('Open', '^TNX'), ('Volume', '^TNX'), ('ticker', '')]

First row for ^TNX:
Price                   Datetime  Close   High    Low   Open Volume ticker
Ticker                             ^TNX   ^TNX   ^TNX   ^TNX   ^TNX       
0      2023-02-02 13:20:00+00:00  3.344  3.378  3.344  3.373      0   ^TNX

🌐 Fetching 5-Year Treasury Yield (^FVX)...


[*********************100%***********************]  1 of 1 completed


✅ Saved ^FVX data

📋 Columns for ^FVX (5-Year Treasury Yield):
[('Datetime', ''), ('Close', '^FVX'), ('High', '^FVX'), ('Low', '^FVX'), ('Open', '^FVX'), ('Volume', '^FVX'), ('ticker', '')]

First row for ^FVX:
Price                   Datetime  Close   High    Low   Open Volume ticker
Ticker                             ^FVX   ^FVX   ^FVX   ^FVX   ^FVX       
0      2023-02-02 13:20:00+00:00  3.423  3.462  3.423  3.454      0   ^FVX

🌐 Fetching Crude Oil Futures (CL=F)...


[*********************100%***********************]  1 of 1 completed

✅ Saved CL=F data

📋 Columns for CL=F (Crude Oil Futures):
[('Datetime', ''), ('Close', 'CL=F'), ('High', 'CL=F'), ('Low', 'CL=F'), ('Open', 'CL=F'), ('Volume', 'CL=F'), ('ticker', '')]

First row for CL=F:
Price                   Datetime      Close       High        Low       Open  \
Ticker                                 CL=F       CL=F       CL=F       CL=F   
0      2023-02-02 06:00:00+00:00  76.870003  77.089996  76.800003  77.029999   

Price  Volume ticker  
Ticker   CL=F         
0           0   CL=F  

🎉 Data pipeline complete!
Stock data shape: (174699, 9)
Macro data keys: ['^VIX', '^TNX', '^FVX', 'CL=F']





In [4]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import sys
import logging
import re
from datetime import datetime, timedelta
from typing import List, Dict
from concurrent.futures import ThreadPoolExecutor, as_completed

TECH_STOCKS = [
    "AAPL", "MSFT", "NVDA", "AVGO", "CRM", "ORCL", "CSCO", "ACN", "NOW", "IBM",
    "AMD", "ADBE", "INTU", "QCOM", "TXN", "AMAT", "PLTR", "ANET", "PANW", "MU",
    "ADI", "LRCX", "KLAC", "APH", "INTC", "CDNS", "CRWD", "MSI", "SNPS", "ADSK",
    "FTNT", "ROP", "WDAY", "NXPI", "FICO", "TEL", "IT", "CTSH", "GLW", "DELL",
    "HPQ", "MCHP", "ANSS", "MPWR", "HPE", "KEYS", "GDDY", "CDW", "TYL", "NTAP"
]

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

class ProgressTracker:
    def __init__(self, total_months: int):
        self.start_time = time.time()
        self.total_months = total_months
        self.completed_months = 0
        self.total_articles = 0
        self.current_month = None

    def update_progress(self):
        elapsed = time.time() - self.start_time
        avg_time = elapsed / (self.completed_months + 1e-6)
        remaining = avg_time * (self.total_months - self.completed_months)

        progress = (
            f"\n📅 Current: {self.current_month} | "
            f"Completed: {self.completed_months}/{self.total_months} months | "
            f"Articles: {self.total_articles} | "
            f"Elapsed: {timedelta(seconds=int(elapsed))} | "
            f"ETA: {timedelta(seconds=int(remaining))}"
        )
        sys.stdout.write("\r\033[K" + progress)
        sys.stdout.flush()

class VoxScraper:
    def __init__(self, start_date: datetime, end_date: datetime):
        self.base_url = "https://www.vox.com"
        self.rate_limiter = RateLimiter(calls=3, period=1)
        self.article_buffer = []
        self.all_articles = []  # Stores all articles across months
        self.start_date = start_date
        self.end_date = end_date
        self.total_months = ((end_date.year - start_date.year) * 12
                            + end_date.month - start_date.month + 1)
        self.progress = ProgressTracker(self.total_months)
        self.financial_terms = re.compile(
            r'\b(stock|market|tech|economy|investment|trading|'
            r'earnings|valuation|merger|acquisition|IPO)\b',
            re.IGNORECASE
        )

    def fetch_article_content(self, url: str) -> Dict:
        with self.rate_limiter:
            try:
                response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
                response.raise_for_status()
                soup = BeautifulSoup(response.text, 'html.parser')

                time_tag = soup.find('time')
                article_date = pd.to_datetime(time_tag['datetime']) if time_tag else None

                article_body = soup.find('div', class_='c-entry-content')
                text_content = ' '.join([p.get_text() for p in article_body.find_all('p')]) if article_body else ''

                self.progress.total_articles += 1
                self.progress.update_progress()

                return {
                    'url': url,
                    'timestamp': article_date,
                    'content': text_content,
                    'is_financial': bool(self.financial_terms.search(text_content)),
                    'tickers': self._extract_tickers(text_content)
                }

            except Exception as e:
                logging.error(f"Failed to fetch {url}: {str(e)}")
                return None

    def _extract_tickers(self, text: str) -> List[str]:
        return [ticker for ticker in TECH_STOCKS if re.search(r'\b' + ticker + r'\b', text)]

    def scrape_time_range(self):
        current_date = self.start_date
        while current_date <= self.end_date:
            year = current_date.year
            month = current_date.month
            self.progress.current_month = f"{year}-{month:02d}"
            self.progress.update_progress()

            self._scrape_month(year, month)
            current_date += timedelta(days=32)
            self.progress.completed_months += 1
            self.progress.update_progress()

    def _scrape_month(self, year: int, month: int):
        page = 1
        while True:
            url = f"{self.base_url}/archives/{year}/{month}/{page}"
            try:
                article_links = self._get_article_links(url)
                if not article_links:
                    break

                with ThreadPoolExecutor(max_workers=4) as executor:
                    futures = [executor.submit(self.fetch_article_content, link) for link in article_links]
                    for future in as_completed(futures):
                        if (result := future.result()) is not None:
                            self.article_buffer.append(result)

                page += 1
                time.sleep(0.5)

            except Exception as e:
                logging.error(f"Stopping scrape for {year}-{month:02d}: {str(e)}")
                break

        self._save_buffer(year, month)

    def _get_article_links(self, url: str) -> List[str]:
        try:
            response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            return [
                self.base_url + a['href']
                for a in soup.select('div._1p9ghgz3 a[href^="/"]')
                if not a['href'].startswith("/archives")
            ]
        except Exception as e:
            logging.error(f"Failed to get links from {url}: {str(e)}")
            return []

    def _save_buffer(self, year: int, month: int):
        if not self.article_buffer:
            return

        df = pd.DataFrame(self.article_buffer)
        self.all_articles.extend(self.article_buffer)

        df['date_hour'] = df['timestamp'].dt.floor('h')
        df = df.dropna(subset=['date_hour'])

        path = f"vox_articles_{year}_{month:02d}.parquet"
        df.to_parquet(path, index=False)
        logging.info(f"\n✅ Saved {len(df)} articles to {path}")
        self.article_buffer.clear()

class RateLimiter:
    def __init__(self, calls: int, period: int):
        self.calls = calls
        self.period = period
        self.timestamps = []

    def __enter__(self):
        now = time.time()
        self.timestamps = [t for t in self.timestamps if t > now - self.period]

        if len(self.timestamps) >= self.calls:
            sleep_time = self.period - (now - self.timestamps[0])
            time.sleep(sleep_time)

        self.timestamps.append(time.time())

    def __exit__(self, *args):
        pass

if __name__ == "__main__":
    start_date = datetime(2023, 1, 1)
    end_date = datetime.now()

    scraper = VoxScraper(start_date, end_date)
    print("🚀 Starting Vox.com scraper with progress tracking:")
    print(f"⏳ Time range: {start_date.strftime('%Y-%m')} to {end_date.strftime('%Y-%m')}")
    print("📊 Progress will be updated in real-time below:\n")

    try:
        scraper.scrape_time_range()

        final_df = pd.DataFrame(scraper.all_articles)

        print("\n\n🎉 Final Report:")
        print(f"Total articles collected: {len(final_df)}")

        if not final_df.empty:
            stock_articles = final_df[final_df['tickers'].apply(len) > 0]
            print(f"\n📈 Tech Stock Articles Found ({len(stock_articles)}):")
            for idx, row in stock_articles.iterrows():
                print(f"\n📰 Article {idx + 1}:")
                print(f"   URL: {row['url']}")
                print(f"   Date: {row['timestamp'].strftime('%Y-%m-%d') if pd.notnull(row['timestamp']) else 'Unknown'}")
                print(f"   Tickers: {', '.join(row['tickers'])}")
                print(f"   Financial Terms Found: {'Yes' if row['is_financial'] else 'No'}")
            print(f"\n💾 Full data saved to vox_articles_YYYY_MM.parquet files")
        else:
            print("\nNo articles found in the specified date range")

        print(f"\nProcessed months: {scraper.progress.completed_months}/{scraper.total_months}")

    except KeyboardInterrupt:
        print("\n\n⚠️ Scraping interrupted by user! Partial results saved.")

🚀 Starting Vox.com scraper with progress tracking:
⏳ Time range: 2023-01 to 2025-02
📊 Progress will be updated in real-time below:

[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
📅 Current: 2023-01 | Completed: 0/26 months | Articles: 164 | Elapsed: 0:01:40 | ETA: 30268 days, 12:53:09

ERROR:root:Failed to fetch https://www.vox.com/policy-and-politics/2023/1/4/23537232/new-york-kathy-hochul-hector-lasalle-court-of-appeals: HTTPSConnectionPool(host='www.vox.com', port=443): Read timed out. (read timeout=10)


[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
📅 Current: 2023-01 | Completed: 0/26 months | Articles: 175 | Elapsed: 0:01:55 | ETA: 34904 days, 19:04:46

ERROR:root:Failed to fetch https://www.vox.com/future-perfect/23516639/veterinarians-avma-factory-farming-ventilation-shutdown: HTTPSConnectionPool(host='www.vox.com', port=443): Read timed out. (read timeout=10)


[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
📅 Current: 2023-02 | Completed: 1/26 months | Articles: 219 | Elapsed: 0:02:28 | ETA: 1:01:45

ERROR:root:Failed to fetch https://www.vox.com/culture/2023/2/22/23601729/cocaine-bear-movie-true-story: HTTPSConnectionPool(host='www.vox.com', port=443): Read timed out. (read timeout=10)


[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
📅 Current: 2023-02 | Completed: 1/26 months | Articles: 278 | Elapsed: 0:03:15 | ETA: 1:21:18

ERROR:root:Failed to fetch https://www.vox.com/policy-and-politics/2023/2/15/23599953/biden-junk-fee-protection-act-white-house-ticketmaster-resort: HTTPSConnectionPool(host='www.vox.com', port=443): Read timed out. (read timeout=10)


[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K


ERROR:root:Failed to fetch https://www.vox.com/policy/2023/4/13/23679164/abortion-pill-reversal-5th-circuit-ruling: HTTPSConnectionPool(host='www.vox.com', port=443): Read timed out. (read timeout=10)
ERROR:root:Failed to fetch https://www.vox.com/politics/23678636/supreme-court-anthony-comstock-abortion-mifepristone-matthew-kacsmaryk: HTTPSConnectionPool(host='www.vox.com', port=443): Read timed out. (read timeout=10)


[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
📅 Current: 2023-04 | Completed: 3/26 months | Articles: 799 | Elapsed: 0:08:33 | ETA: 1:05:35

ERROR:root:Failed to fetch https://www.vox.com/future-perfect/2023/4/3/23647615/raj-chetty-housing-vouchers-cmto-seattle: HTTPSConnectionPool(host='www.vox.com', port=443): Read timed out. (read timeout=10)


[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
📅 Current: 2023-05 | Completed: 4/26 months | Articles: 877 | Elapsed: 0:09:37 | ETA: 0:52:58

ERROR:root:Failed to fetch https://www.vox.com/culture/23730350/little-mermaid-remake-review-halle-bailey-visuals: HTTPSConnectionPool(host='www.vox.com', port=443): Read timed out. (read timeout=10)


[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K


ERROR:root:Failed to get links from https://www.vox.com/archives/2023/6/10: HTTPSConnectionPool(host='www.vox.com', port=443): Read timed out. (read timeout=10)


[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
📅 Current: 2023-07 | Completed: 6/26 months | Articles: 1229 | Elapsed: 0:13:26 | ETA: 0:44:46

ERROR:root:Failed to fetch https://www.vox.com/climate/23807520/heat-wave-record-temperature-history-death-valley-climate: HTTPSConnectionPool(host='www.vox.com', port=443): Read timed out. (read timeout=10)


[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
📅 Current: 2023-07 | Completed: 6/26 months | Articles: 1318 | Elapsed: 0:14:20 | ETA: 0:47:46

ERROR:root:Failed to fetch https://www.vox.com/climate/23746045/state-farm-california-climate-change-insurance-wildfire-florida-flood: HTTPSConnectionPool(host='www.vox.com', port=443): Read timed out. (read timeout=10)


[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K


ERROR:root:Failed to fetch https://www.vox.com/politics/378025/trump-harris-2024-election-polls-union-voters: HTTPSConnectionPool(host='www.vox.com', port=443): Read timed out. (read timeout=10)


[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
📅 Current: 2024-10 | Completed: 20/26 months | Articles: 3824 | Elapsed: 0:37:49 | ETA: 0:11:20

ERROR:root:Failed to fetch https://www.vox.com/culture/377787/apprentice-movie-sebastian-stan-donald-trump-oscar-nominated-ali-abbasi: HTTPSConnectionPool(host='www.vox.com', port=443): Read timed out. (read timeout=10)


[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
📅 Current: 2024-10 | Completed: 20/26 months | Articles: 3853 | Elapsed: 0:38:12 | ETA: 0:11:27

ERROR:root:Failed to fetch https://www.vox.com/climate/377094/hurricane-milton-helene-home-insurance-flooding-damage: HTTPSConnectionPool(host='www.vox.com', port=443): Read timed out. (read timeout=10)


[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
📅 Current: 2024-10 | Completed: 20/26 months | Articles: 3867 | Elapsed: 0:38:26 | ETA: 0:11:32

ERROR:root:Failed to fetch https://www.vox.com/videos/376628/why-is-cheese-yellow: HTTPSConnectionPool(host='www.vox.com', port=443): Read timed out. (read timeout=10)


[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
📅 Current: 2024-10 | Completed: 20/26 months | Articles: 3880 | Elapsed: 0:38:43 | ETA: 0:11:37

ERROR:root:Failed to fetch https://www.vox.com/culture/376069/joker-folie-a-deux-review-lady-gaga: HTTPSConnectionPool(host='www.vox.com', port=443): Read timed out. (read timeout=10)
ERROR:root:Failed to fetch https://www.vox.com/world-politics/375398/israel-palestine-lebanon-october-7-anniversary-one-year: HTTPSConnectionPool(host='www.vox.com', port=443): Read timed out. (read timeout=10)


[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
📅 Current: 2024-11 | Completed: 21/26 months | Articles: 3956 | Elapsed: 0:39:34 | ETA: 0:09:25

ERROR:root:Failed to get links from https://www.vox.com/archives/2024/11/4: HTTPSConnectionPool(host='www.vox.com', port=443): Read timed out. (read timeout=10)


[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
📅 Current: 2024-12 | Completed: 22/26 months | Articles: 4060 | Elapsed: 0:40:32 | ETA: 0:07:22

ERROR:root:Failed to fetch https://www.vox.com/culture/390327/jay-z-diddy-rape-lawsuit-fbi-sex-trafficking: HTTPSConnectionPool(host='www.vox.com', port=443): Read timed out. (read timeout=10)


[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
📅 Current: 2025-01 | Completed: 23/26 months | Articles: 4185 | Elapsed: 0:41:36 | ETA: 0:05:25

ERROR:root:Failed to fetch https://www.vox.com/abortion/396252/abortion-trump-bans-march-for-life-ivf-reproductive-rights-contraception: HTTPSConnectionPool(host='www.vox.com', port=443): Read timed out. (read timeout=10)


[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
[K
📅 Current: 2025-01 | Completed: 24/26 months | Articles: 4306 | Elapsed: 0:42:37 | ETA: 0:03:33

🎉 Final Report:
Total articles collected: 4306

📈 Tech Stock Articles Found (0):

💾 Full data saved to vox_articles_YYYY_MM.parquet files

Processed months: 24/26


In [5]:
import os
import pandas as pd
from datetime import timezone

def merge_and_clean_parquet():
    """Merge all Parquet files and delete individual files"""


    parquet_files = [f for f in os.listdir()
                    if f.startswith("vox_articles_")
                    and f.endswith(".parquet")]

    if not parquet_files:
        print("❌ No Parquet files found to merge")
        return

    print(f"🔍 Found {len(parquet_files)} files to merge")


    merged_df = pd.DataFrame()
    for idx, file in enumerate(parquet_files, 1):
        try:
            df = pd.read_parquet(file)
            merged_df = pd.concat([merged_df, df], ignore_index=True)
            print(f"✅ Merged {file} ({len(df)} articles)")
        except Exception as e:
            print(f"🚨 Error merging {file}: {str(e)}")

    if merged_df.empty:
        print("❌ No data to save")
        return


    merged_file = "merged_vox_articles.parquet"
    merged_df.to_parquet(merged_file, index=False)
    print(f"\n💾 Saved merged data to {merged_file}")
    print(f"📊 Total articles: {len(merged_df):,}")

    if input("\n🚨 Delete original files? (y/n): ").lower() == 'y':
        for file in parquet_files:
            try:
                os.remove(file)
                print(f"🗑️ Deleted {file}")
            except Exception as e:
                print(f"🚨 Error deleting {file}: {str(e)}")
        print("\n✅ Cleanup complete")
    else:
        print("\n🔵 Original files preserved")

if __name__ == "__main__":
    merge_and_clean_parquet()


🔍 Found 24 files to merge
✅ Merged vox_articles_2024_10.parquet (204 articles)
✅ Merged vox_articles_2023_03.parquet (215 articles)
✅ Merged vox_articles_2023_06.parquet (135 articles)
✅ Merged vox_articles_2023_11.parquet (150 articles)
✅ Merged vox_articles_2025_01.parquet (195 articles)
✅ Merged vox_articles_2023_08.parquet (205 articles)
✅ Merged vox_articles_2024_04.parquet (168 articles)
✅ Merged vox_articles_2023_04.parquet (224 articles)
✅ Merged vox_articles_2023_01.parquet (175 articles)
✅ Merged vox_articles_2023_02.parquet (186 articles)
✅ Merged vox_articles_2024_02.parquet (165 articles)
✅ Merged vox_articles_2024_12.parquet (154 articles)
✅ Merged vox_articles_2024_08.parquet (196 articles)
✅ Merged vox_articles_2024_03.parquet (189 articles)
✅ Merged vox_articles_2023_05.parquet (246 articles)
✅ Merged vox_articles_2024_11.parquet (45 articles)
✅ Merged vox_articles_2023_10.parquet (182 articles)
✅ Merged vox_articles_2024_05.parquet (178 articles)
✅ Merged vox_articles

In [9]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
import sys
from concurrent.futures import ThreadPoolExecutor, as_completed


INPUT_FILE = "/content/merged_vox_articles.parquet"
OUTPUT_FILE = "/content/scraped_vox_articles.parquet"

def format_eta(seconds):
    """Convert seconds to HH:MM:SS format"""
    return time.strftime("%H:%M:%S", time.gmtime(seconds))

def scrape_article(url):
    """Your original working scraper"""
    try:
        response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        article_tag = soup.find("article")
        time_tag = article_tag.find("time", datetime=True) if article_tag else None
        author_tags = soup.select("a[href^='/authors/']")


        content = []
        if article_tag:
            for tag in article_tag.find_all(["h1", "h2", "h3", "p", "blockquote", "ul", "ol"]):
                text = tag.get_text(" ", strip=True)
                if text:
                    content.append(text)

        return {
            "url": url,
            "date_time": time_tag["datetime"] if time_tag else None,
            "author": ", ".join([a.get_text(strip=True) for a in author_tags]) or "Unknown",
            "content": "\n\n".join(content) if content else None
        }

    except Exception as e:
        return {"url": url, "error": str(e)}

def main():
    """Parallel version with Parquet handling and ETA"""
    start_time = time.time()


    df = pd.read_parquet(INPUT_FILE)
    urls = df["url"].tolist()
    total = len(urls)

    results = []
    with ThreadPoolExecutor(max_workers=5) as executor:
        futures = {executor.submit(scrape_article, url): url for url in urls}

        processed = 0
        for future in as_completed(futures):
            processed += 1
            result = future.result()
            if result.get('content'):
                results.append(result)

            elapsed = time.time() - start_time
            avg_time = elapsed / processed
            remaining = total - processed
            eta = format_eta(avg_time * remaining)

            sys.stdout.write(
                f"\r🚀 Processed: {processed}/{total} | "
                f"✅ Valid: {len(results)} | "
                f"⏳ ETA: {eta} | "
                f"Elapsed: {format_eta(elapsed)}"
            )
            sys.stdout.flush()


    pd.DataFrame(results).to_parquet(OUTPUT_FILE, index=False)
    print(f"\n\n✅ Saved {len(results)} articles to {OUTPUT_FILE}")

if __name__ == "__main__":
    main()

🚀 Processed: 4295/4295 | ✅ Valid: 4295 | ⏳ ETA: 00:00:00 | Elapsed: 00:08:58

✅ Saved 4295 articles to /content/scraped_vox_articles.parquet


In [11]:
import pandas as pd

file_path = "/content/scraped_vox_articles.parquet"
df = pd.read_parquet(file_path)


print(df.head())


                                                 url  \
0  https://www.vox.com/politics/381470/trump-rfk-...   
1  https://www.vox.com/politics/379738/progressiv...   
2  https://www.vox.com/health-care/381484/2024-el...   
3  https://www.vox.com/politics/380978/trump-goog...   
4  https://www.vox.com/culture/381178/sean-combs-...   

                   date_time                                  author  \
0  2024-10-31T18:25:00+00:00            Andrew Prokop, Andrew Prokop   
1  2024-10-31T17:00:00+00:00                Eric Levitz, Eric Levitz   
2  2024-10-31T19:45:02+00:00                Dylan Scott, Dylan Scott   
3  2024-10-31T17:00:00+00:00                Eric Levitz, Eric Levitz   
4  2024-10-31T11:30:00+00:00  Kyndall Cunningham, Kyndall Cunningham   

                                             content  
0  Politics\n\nTrump really could empower RFK Jr....  
1  Politics\n\nThe one big thing progressive crit...  
2  Politics / 2024 Elections\n\nRepublicans are s...  
3  Politic

In [12]:
import pandas as pd
import re

def clean_data_preview():
    try:
        df = pd.read_parquet("/content/scraped_vox_articles.parquet")

        df['date_time'] = pd.to_datetime(df['date_time'], errors='coerce')
        df['date'] = df['date_time'].dt.strftime('%Y-%m-%d')
        df['time'] = df['date_time'].dt.strftime('%H:%M UTC')


        def parse_author_category(text):
            if pd.isna(text):
                return ('Unknown', 'General')


            parts = re.split(r'\n\n|: ', text, 1)
            if len(parts) > 1:
                author = parts[0].split(',')[0].strip()
                category = parts[1].split('\n')[0].strip()
                return (author, category)
            return (text.split(',')[0].strip(), 'General')

        df[['author', 'category']] = df['author'].apply(
            lambda x: pd.Series(parse_author_category(x))
        )


        df['clean_content'] = df['content'].str.replace('\n', ' ', regex=True).str.strip()

        preview = df.head().copy()
        preview['content_preview'] = preview['clean_content'].str.slice(0, 150) + '...'


        print(f"📆 Articles from {df['date'].min()} to {df['date'].max()}")
        print("\n🧑💻 Authors & Categories:")
        print(preview[['date', 'time', 'author', 'category']].to_string(index=False))

        print("\n📰 Content Previews:")
        for i, row in preview.iterrows():
            print(f"\n=== {row['date']} {row['time']} [{row['category']}] ===")
            print(f"AUTHOR: {row['author']}")
            print(f"CONTENT: {row['content_preview']}\n")

    except Exception as e:
        print(f"🚨 Error: {str(e)}")

clean_data_preview()

📆 Articles from 2023-01-01 to 2025-01-31

🧑💻 Authors & Categories:
      date      time             author category
2024-10-31 18:25 UTC      Andrew Prokop  General
2024-10-31 17:00 UTC        Eric Levitz  General
2024-10-31 19:45 UTC        Dylan Scott  General
2024-10-31 17:00 UTC        Eric Levitz  General
2024-10-31 11:30 UTC Kyndall Cunningham  General

📰 Content Previews:

=== 2024-10-31 18:25 UTC [General] ===
AUTHOR: Andrew Prokop
CONTENT: Politics  Trump really could empower RFK Jr. to wreck public health  Even if Kennedy can’t get Senate confirmation, he could still wield vast influenc...


=== 2024-10-31 17:00 UTC [General] ===
AUTHOR: Eric Levitz
CONTENT: Politics  The one big thing progressive critics of Big Business get wrong  Corporate power isn’t the cause of every social problem.  by Eric Levitz  F...


=== 2024-10-31 19:45 UTC [General] ===
AUTHOR: Dylan Scott
CONTENT: Politics / 2024 Elections  Republicans are serious about cutting people’s health care  Donald Trump