---
title: "scrape"
eval: false
---

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import argparse
from datetime import datetime

def _month_date_range(year, month):
    """Return the first and last day of a month as 'YYYY-MM-DD' strings.

    Raises:
        ValueError: If month is outside 1-12.
    """
    import calendar  # local import: calendar is not imported at file level

    if not 1 <= month <= 12:
        raise ValueError(f"month must be between 1 and 12, got {month}")

    # monthrange() returns (weekday of the 1st, number of days in the month).
    days_in_month = calendar.monthrange(year, month)[1]
    first_day = f"{year}-{month:02d}-01"
    last_day = f"{year}-{month:02d}-{days_in_month:02d}"
    return first_day, last_day


def _season_code(year, month):
    """Return the season identifier (e.g. '20242025') the given month belongs to."""
    # September and later count toward the season that starts in `year`;
    # earlier months belong to the season that started the previous year.
    season_start = year if month >= 9 else year - 1
    return f"{season_start}{season_start + 1}"


def _parse_player_table(html):
    """Extract (column_names, rows) from the 'playerTable' element of an HTML page."""
    soup = BeautifulSoup(html, 'html.parser')
    table = soup.find('table', {'id': 'playerTable'})
    if table is None:
        raise RuntimeError("Could not find the player stats table on the page")

    head = table.find('thead')
    body = table.find('tbody')
    if head is None or body is None:
        # Fail with a readable message instead of an AttributeError on None.
        raise RuntimeError("The player stats table is missing its header or body section")

    column_names = [th.text.strip() for th in head.find_all('th')]

    rows = []
    for tr in body.find_all('tr'):
        cells = [td.text.strip() for td in tr.find_all('td')]
        # Rows without any <td> cells carry no player data; skipping them
        # avoids blank records in the resulting DataFrame.
        if cells:
            rows.append(cells)

    return column_names, rows


def get_nhl_stats(team_code, year, month, session_cookie=None):
    """
    Scrape NHL player stats from Natural Stat Trick for a specific team and month.

    Args:
        team_code (str): The three-letter team code (e.g., 'VAN' for Vancouver)
        year (int): The year (e.g., 2025)
        month (int): The month (1-12)
        session_cookie (str, optional): Your Natural Stat Trick session cookie for premium access

    Returns:
        pandas.DataFrame: Player statistics for the specified period, with
        extra 'team', 'year', 'month' and 'url' columns describing the request.

    Raises:
        ValueError: If month is outside 1-12.
        RuntimeError: If the response status is not 200 or the stats table
            (or its header/body) cannot be found in the page.

    Errors raised by ``requests`` itself (connection failures, timeouts) are
    not caught here and propagate to the caller.
    """
    # Validate the month and build the date window before any network access.
    first_day, last_day = _month_date_range(year, month)
    season = _season_code(year, month)

    url = (f"https://www.naturalstattrick.com/playerteams.php?"
           f"fromseason={season}&thruseason={season}&stype=2&sit=all&score=all&"
           f"stdoi=oi&rate=n&team={team_code}&pos=S&loc=B&toi=0&"
           f"gpfilt=gpdate&fd={first_day}&td={last_day}&tgp=5&lines=single&draftteam=ALL")

    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    }

    # Add session cookie if provided (needed for premium features)
    cookies = {}
    if session_cookie:
        cookies['PHPSESSID'] = session_cookie

    # requests never times out by default; bound the wait so a stalled
    # connection cannot hang the whole scrape.
    response = requests.get(url, headers=request_headers, cookies=cookies, timeout=30)

    if response.status_code != 200:
        raise RuntimeError(f"Failed to fetch data: Status code {response.status_code}")

    column_names, rows = _parse_player_table(response.text)

    df = pd.DataFrame(rows, columns=column_names)

    # Add metadata so rows from different requests stay distinguishable
    # after they are concatenated.
    df['team'] = team_code
    df['year'] = year
    df['month'] = month
    df['url'] = url

    return df

def scrape_multiple_months(team_code, start_year, start_month, end_year, end_month, session_cookie=None):
    """
    Scrape data for multiple consecutive months

    Walks every month from (start_year, start_month) through
    (end_year, end_month) inclusive, calling get_nhl_stats for each one.
    A failure for a single month is reported on stdout and skipped so the
    remaining months are still attempted.

    Args:
        team_code (str): The three-letter team code (e.g., 'VAN')
        start_year (int): Year of the first month to scrape
        start_month (int): First month to scrape (1-12)
        end_year (int): Year of the last month to scrape
        end_month (int): Last month to scrape (1-12)
        session_cookie (str, optional): Session cookie passed through to get_nhl_stats

    Returns:
        pandas.DataFrame: All monthly results concatenated, or an empty
        DataFrame if the range is empty or every month failed.

    Raises:
        ValueError: If start_month or end_month is outside 1-12.
    """
    # Reject invalid months up front: an out-of-range start month would never
    # hit the December rollover and the walk would not terminate.
    for label, value in (("start_month", start_month), ("end_month", end_month)):
        if not 1 <= value <= 12:
            raise ValueError(f"{label} must be between 1 and 12, got {value}")

    # Number the months linearly so the range can be walked with plain integers.
    first_index = start_year * 12 + (start_month - 1)
    last_index = end_year * 12 + (end_month - 1)

    all_data = []
    for index in range(first_index, last_index + 1):
        current_year, month_offset = divmod(index, 12)
        current_month = month_offset + 1

        # Be nice to the server: pause between requests (not after the last one)
        if index > first_index:
            time.sleep(2)

        print(f"Scraping {team_code} data for {current_year}-{current_month:02d}...")
        try:
            df = get_nhl_stats(team_code, current_year, current_month, session_cookie)
        except Exception as e:
            # Deliberately best-effort: report the failure and keep going.
            print(f"Error scraping {current_year}-{current_month:02d}: {e}")
        else:
            all_data.append(df)
            print(f"Found {len(df)} player records")

    if all_data:
        return pd.concat(all_data, ignore_index=True)
    return pd.DataFrame()

def main(argv=None):
    """
    Command-line entry point: parse arguments, scrape the requested months
    and write the result to a CSV file.

    Args:
        argv (list of str, optional): Arguments to parse. When None, argparse
            reads them from sys.argv.
    """
    parser = argparse.ArgumentParser(description='Scrape NHL player stats from Natural Stat Trick')
    parser.add_argument('team', type=str, help='Three-letter team code (e.g., VAN)')
    parser.add_argument('--start-year', type=int, required=True, help='Starting year (e.g., 2024)')
    # Restrict months to 1-12 so bad input is rejected with a usage error
    # before any scraping starts; metavar keeps the help text compact.
    parser.add_argument('--start-month', type=int, required=True, choices=range(1, 13),
                        metavar='MONTH', help='Starting month (1-12)')
    parser.add_argument('--end-year', type=int, required=True, help='Ending year (e.g., 2025)')
    parser.add_argument('--end-month', type=int, required=True, choices=range(1, 13),
                        metavar='MONTH', help='Ending month (1-12)')
    parser.add_argument('--session', type=str, help='Your Natural Stat Trick session cookie (for premium access)')
    parser.add_argument('--output', type=str, default='nhl_stats.csv', help='Output CSV filename')

    args = parser.parse_args(argv)

    df = scrape_multiple_months(
        args.team,
        args.start_year,
        args.start_month,
        args.end_year,
        args.end_month,
        args.session,
    )

    if df.empty:
        print("No data was collected")
        return

    df.to_csv(args.output, index=False)
    print(f"Data saved to {args.output}")


if __name__ == "__main__":
    main()
