# Creating a hybrid Retrieval Augmented Generation (RAG) system that uses both structured and unstructured data
## 01_Datascraping

### Data Extraction:
Extracting structured data from a database or API (here basketball-reference, nba.com), and unstructured data from web pages or documents (Wikipedia, Play-By-Play Data off of Basketball-Reference, Injury Data from ). For structured data, you can use a database query language like SQL or use pdfplumber on PDF files, and for unstructured data, we will use web scraping libraries like Beautiful Soup or Selenium

Unstructured Data: Scrape data from Basketball-Reference, Wikipedia (Gamelogs, Conference Data, Injury Data).
Structured Data: Collect structured datasets (CSV-Files) for detailed analysis and predictions.

### Imports

In [15]:
# Pandas
import pandas as pd

# Selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

# Beautiful soup
from bs4 import BeautifulSoup

# requests
import requests

# time
import time

### Step 1: Data Extraction - NBA.com + Basketball-Reference
Use of Selenium for NBA.com + Basketball-Reference
- After trying to use a web scraping library like Beautiful Soup for team stats I realized that because of the dynamic nature of the site it failed to extract unstructured data from basketball-reference and used Selenium instead
- Additionally for the future I might use a database query language like SQL to extract structured data from a database or API
- Since the player stats are static I was able to use Beatiful Soup which will be used here

In this following case I used requests and Selenium libraries to scrape data from basketball-reference.com.

Steps done beforehand:
Installed Selenium WebDriver
Installed BeautifulSoup

Installed chromedriver using ChromeDriverManager
https://googlechromelabs.github.io/chrome-for-testing/

Use:
Open the Basketball-Reference advanced team stats page
Allow the page to load completely
Extract Data from the Table:

Locate the advanced team stats table using its ID
Extract headers and rows from the table
Store the data in a pandas DataFrame

Save Data:
Save the DataFrame to a CSV file

#### Advanced Stats

In [16]:
def scrape_advanced_team_stats():
    try:
        service = Service(ChromeDriverManager().install())
        options = Options()
        options.add_argument('--headless')
        driver = webdriver.Chrome(service=service, options=options)
        url = "https://www.basketball-reference.com/leagues/NBA_2024.html"
        driver.get(url)
        wait = WebDriverWait(driver, 10)
        table = wait.until(EC.presence_of_element_located((By.ID, 'advanced-team')))

        # Get headers
        header_row = table.find_elements(By.XPATH, './/thead/tr[2]/th')
        headers = [th.text for th in header_row]

        # Get rows
        rows = table.find_elements(By.XPATH, './/tbody/tr')
        team_stats = []
        for row in rows:
            stats = [td.text for td in row.find_elements(By.XPATH, './/td')]
            if stats:
                team_stats.append(stats)

        # Create DataFrame
        df = pd.DataFrame(team_stats, columns=headers[1:])

        # Remove * from team name column
        df['Team'] = df['Team'].str.replace('*', '')

        df = df.rename(columns={
            # fixing all other % names
            'TS%': 'TSpct',
            # having a . at the End throws nesting Errors for Elasticsearch, so fixing this
            'Attend.': 'Attend',
            'Attend./G': 'Attend/G'
        })

        # Convert Attend and Attend/G columns to integers
        df['Attend'] = df['Attend'].str.replace(',', '').astype(int)
        df['Attend/G'] = df['Attend/G'].str.replace(',', '').astype(int)

        # Drop unnecessary columns with only null values
        columns_to_drop = [' ', ' .1', ' .2']
        df = df.drop(columns=[col for col in columns_to_drop if col in df.columns])

        # Rename columns to distinguish between Offense and Defense Four Factors
        df.columns.values[16:20] = ['Off_eFGpct', 'Off_TOVpct', 'Off_ORBpct', 'Off_FT/FGA']
        df.columns.values[20:24] = ['Def_eFGpct', 'Def_TOVpct', 'Def_DRBpct', 'Def_FT/FGA']

        # Save to CSV
        df.to_csv('advanced_team_stats_2024.csv', index=False)
        print("Scraping complete, data saved to 'advanced_team_stats_2024.csv'")
    except Exception as e:
        print(f"Error during scraping: {e}")
    finally:
        driver.quit()

scrape_advanced_team_stats()

Scraping complete, data saved to 'advanced_team_stats_2024.csv'


#### Clutch Team Stats

In [17]:
# scraping clutch stats using selenium

def scrape_clutch_team_stats():
    driver = webdriver.Chrome()
    driver.get('https://www.nba.com/stats/teams/clutch-traditional?SeasonType=Regular+Season')
    wait = WebDriverWait(driver, 10)
    table = wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'Crom_table__p1iZz')))
    # define table headers - threw a couple of errors
    header_row = table.find_element(By.CLASS_NAME, 'Crom_headers__mzI_m')
    headers = header_row.find_elements(By.TAG_NAME, 'th')
    header_texts = [header.text for header in headers]
    # filter empty headers
    filtered_headers = [header for header in header_texts if header]
    print(f"Filtered Headers: {filtered_headers}")
    # table rows
    body = table.find_element(By.CLASS_NAME, 'Crom_body__UYOcU')
    rows = body.find_elements(By.TAG_NAME, 'tr')
    # extract the data from the site
    data = []
    for row in rows:
        cells = row.find_elements(By.TAG_NAME, 'td')
        # exclude the first cell (the extra number)
        data.append([cell.text for cell in cells[1:]])

    df_clutch = pd.DataFrame(data, columns=filtered_headers)
    # function to rename LA Clippers for further data usage - reasoning: basketball-reference page lists them as Los Angeles Clippers
    df_clutch.replace({'LA Clippers': 'Los Angeles Clippers'}, inplace=True)
    # fixing % values for elasticsearch later on
    df_clutch.columns = df_clutch.columns.str.replace('%', 'pct')
    df_clutch.to_csv('clutch_team_stats_2024.csv', index=False)
    print("Scraping complete, data saved to 'clutch_team_stats_2024.csv'")

    # close drive again to stop the process
    driver.quit()

scrape_clutch_team_stats()

Filtered Headers: ['TEAM', 'GP', 'W', 'L', 'WIN%', 'MIN', 'PTS', 'FGM', 'FGA', 'FG%', '3PM', '3PA', '3P%', 'FTM', 'FTA', 'FT%', 'OREB', 'DREB', 'REB', 'AST', 'TOV', 'STL', 'BLK', 'BLKA', 'PF', 'PFD', '+/-']
Scraping complete, data saved to 'clutch_team_stats_2024.csv'


#### Post-All-Star Stats

In [18]:
def scrape_post_all_star_advanced_stats():
    # Set up the WebDriver (make sure to have the appropriate driver installed, e.g., chromedriver for Chrome)
    with webdriver.Chrome() as driver:
        driver.get('https://www.nba.com/stats/teams/traditional?SeasonType=Regular+Season&SeasonSegment=Post+All-Star')
        wait = WebDriverWait(driver, 10)
        table = wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'Crom_table__p1iZz')))

        # table headers
        header_row = table.find_element(By.CLASS_NAME, 'Crom_headers__mzI_m')
        headers = header_row.find_elements(By.TAG_NAME, 'th')
        header_texts = [header.text for header in headers]

        # emtpy headers needed to be filtered
        filtered_headers = [header for header in header_texts if header]
        print(f"Filtered Headers: {filtered_headers}")

        # table rows finding
        body = table.find_element(By.CLASS_NAME, 'Crom_body__UYOcU')
        rows = body.find_elements(By.TAG_NAME, 'tr')

        # data extraction
        data = []
        for row in rows:
            cells = row.find_elements(By.TAG_NAME, 'td')
            # Use all cells since there's no extra number
            data.append([cell.text for cell in cells])

        # print for debugging since I had some issues
        print(f"Sample data row: {data[:2]}")

        # create df
        df = pd.DataFrame(data, columns=filtered_headers)

        # function to rename LA Clippers for further data usage
        df.replace({'LA Clippers': 'Los Angeles Clippers'}, inplace=True)
        df.columns = df.columns.str.replace('%', 'pct')

        df.drop(df.columns[0], axis=1, inplace=True)

        # Save to CSV
        df.to_csv('post_all_star_advanced_stats_2024.csv', index=False)
        print("Scraping complete, data saved to 'post_all_star_advanced_stats_2024.csv'")

        # close drive again to stop the process
        driver.quit()

scrape_post_all_star_advanced_stats()

Filtered Headers: [' ', 'TEAM', 'GP', 'W', 'L', 'WIN%', 'MIN', 'PTS', 'FGM', 'FGA', 'FG%', '3PM', '3PA', '3P%', 'FTM', 'FTA', 'FT%', 'OREB', 'DREB', 'REB', 'AST', 'TOV', 'STL', 'BLK', 'BLKA', 'PF', 'PFD', '+/-']
Sample data row: [['1', 'Boston Celtics', '27', '21', '6', '.778', '48.2', '120.4', '44.8', '90.0', '49.8', '17.0', '42.0', '40.4', '13.8', '17.2', '80.2', '10.7', '33.6', '44.3', '28.3', '10.7', '7.7', '6.7', '3.7', '14.4', '15.3', '13.9'], ['1', 'Denver Nuggets', '27', '21', '6', '.778', '48.2', '116.7', '45.3', '88.7', '51.0', '12.1', '31.4', '38.6', '14.1', '18.2', '77.4', '10.0', '35.1', '45.1', '31.1', '12.7', '7.9', '6.1', '4.4', '17.4', '16.3', '9.6']]
Scraping complete, data saved to 'post_all_star_advanced_stats_2024.csv'


#### Player Data Scraping

In [19]:
# dictionary mapping full team names and common abbreviations to their standard NBA abbreviations (used in different function - but needed in reverse for the scraping)
team_map = {
    'Atlanta Hawks': 'ATL',
    'Boston Celtics': 'BOS',
    'Brooklyn Nets': 'BKN',
    'Charlotte Hornets': 'CHA',
    'Chicago Bulls': 'CHI',
    'Cleveland Cavaliers': 'CLE',
    'Cleveland': 'CLE',
    'Dallas Mavericks': 'DAL',
    'Denver Nuggets': 'DEN',
    'Detroit Pistons': 'DET',
    'Golden State Warriors': 'GSW',
    'Houston Rockets': 'HOU',
    'Indiana Pacers': 'IND',
    'Los Angeles Clippers': 'LAC',
    'Los Angeles Lakers': 'LAL',
    'Memphis Grizzlies': 'MEM',
    'Miami Heat': 'MIA',
    'Milwaukee Bucks': 'MIL',
    'Minnesota Timberwolves': 'MIN',
    'New Orleans Pelicans': 'NOP',
    'New York Knicks': 'NYK',
    'Oklahoma City Thunder': 'OKC',
    'Orlando Magic': 'ORL',
    'Philadelphia 76ers': 'PHI',
    'Phoenix Suns': 'PHX',
    'Portland Trail Blazers': 'POR',
    'Sacramento Kings': 'SAC',
    'San Antonio Spurs': 'SAS',
    'Toronto Raptors': 'TOR',
    'Utah Jazz': 'UTA',
    'Washington Wizards': 'WAS',
}

In [20]:
# reverse mapping this time
reverse_team_map = {v: k for k, v in team_map.items()}

In [21]:
# URLs for the playoff teams in the 2024 season to limit dataset size for now - can be increased later on
playoff_team_urls = [
    "https://www.basketball-reference.com/teams/BOS/2024.html",
    "https://www.basketball-reference.com/teams/MIA/2024.html",
    "https://www.basketball-reference.com/teams/NYK/2024.html",
    "https://www.basketball-reference.com/teams/PHI/2024.html",
    "https://www.basketball-reference.com/teams/MIL/2024.html",
    "https://www.basketball-reference.com/teams/IND/2024.html",
    "https://www.basketball-reference.com/teams/CLE/2024.html",
    "https://www.basketball-reference.com/teams/ORL/2024.html",
    "https://www.basketball-reference.com/teams/OKC/2024.html",
    "https://www.basketball-reference.com/teams/NOP/2024.html",
    "https://www.basketball-reference.com/teams/DEN/2024.html",
    "https://www.basketball-reference.com/teams/LAL/2024.html",
    "https://www.basketball-reference.com/teams/MIN/2024.html",
    "https://www.basketball-reference.com/teams/PHO/2024.html",
    "https://www.basketball-reference.com/teams/LAC/2024.html",
    "https://www.basketball-reference.com/teams/DAL/2024.html"
]

# DataFrame for player data
player_df = pd.DataFrame()

In [22]:
# Define a function to scrape player data from a team's page
def scrape_player_data(url):
    tries = 3
    for _ in range(tries):
        try:
            response = requests.get(url)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            table = soup.find('table', {'id': 'advanced'})

            # Get the team abbreviation from the URL
            team_abbreviation = url.split('/')[4]
            # Map the abbreviation to the full team name
            team_name = reverse_team_map.get(team_abbreviation, team_abbreviation)

            # Extract column headers, replacing "%" with "_percent"
            headers = [th.get_text().replace('%', 'pct') for th in table.find('thead').find_all('th')]

            # Drop columns 17 and 22 adjusted 0 base index
            columns_to_drop = [17, 22]
            headers = [header for i, header in enumerate(headers) if i not in columns_to_drop]

            # Extract player rows
            rows = table.find('tbody').find_all('tr')

            # Extract data from each row
            team_data = []
            for row in rows:
                if row.find('th', {'scope': 'row'}) is not None:  # Only process rows with player data
                    player_data = [td.get_text() for td in row.find_all('td')]
                    player_data.insert(0, row.find('th').get_text())  # Insert the rank
                    # Drop columns 17 and 22
                    player_data = [data for i, data in enumerate(player_data) if i not in columns_to_drop]
                    player_data.append(team_name)  # Add the team name
                    team_data.append(player_data)
            team_df = pd.DataFrame(team_data, columns=headers + ['Team'])
            return team_df
        except (requests.exceptions.RequestException, AttributeError) as e:
            print(f"Error scraping {url}: {e}")
            time.sleep(5)  # Wait before retrying
    else:
        print(f"Failed to scrape {url} after {tries} tries.")
        return pd.DataFrame()

In [23]:
# scrape data for teams but stagger to not strain servers
for url in playoff_team_urls:
    team_df = scrape_player_data(url)
    player_df = pd.concat([player_df, team_df], ignore_index=True)
    print(f"Scraped data from {url}")
    time.sleep(10)  # 10 seconds before the next request

Scraped data from https://www.basketball-reference.com/teams/BOS/2024.html
Scraped data from https://www.basketball-reference.com/teams/MIA/2024.html
Scraped data from https://www.basketball-reference.com/teams/NYK/2024.html
Scraped data from https://www.basketball-reference.com/teams/PHI/2024.html
Scraped data from https://www.basketball-reference.com/teams/MIL/2024.html
Scraped data from https://www.basketball-reference.com/teams/IND/2024.html
Scraped data from https://www.basketball-reference.com/teams/CLE/2024.html
Scraped data from https://www.basketball-reference.com/teams/ORL/2024.html
Scraped data from https://www.basketball-reference.com/teams/OKC/2024.html
Scraped data from https://www.basketball-reference.com/teams/NOP/2024.html
Scraped data from https://www.basketball-reference.com/teams/DEN/2024.html
Scraped data from https://www.basketball-reference.com/teams/LAL/2024.html
Scraped data from https://www.basketball-reference.com/teams/MIN/2024.html
Scraped data from https:/

In [24]:
# check data
player_df

Unnamed: 0,Rk,Player,Age,G,MP,PER,TSpct,3PAr,FTr,ORBpct,...,USGpct,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP,Team
0,1,Jayson Tatum,25,74,2645,22.3,.604,.427,.349,2.9,...,30.2,6.4,4.1,10.4,.189,4.5,0.6,5.1,4.7,Boston Celtics
1,2,Derrick White,29,73,2381,16.9,.611,.590,.181,2.4,...,18.6,5.2,3.3,8.5,.171,2.4,1.4,3.8,3.5,Boston Celtics
2,3,Jaylen Brown,27,70,2343,18.6,.580,.326,.239,4.0,...,28.9,2.7,3.2,5.9,.121,0.9,-0.1,0.7,1.6,Boston Celtics
3,4,Jrue Holiday,33,69,2263,14.4,.597,.467,.104,4.2,...,16.3,3.3,3.0,6.3,.133,1.0,1.1,2.1,2.4,Boston Celtics
4,5,Payton Pritchard,26,82,1825,15.3,.597,.602,.088,4.3,...,17.2,4.2,1.9,6.1,.160,1.3,-0.2,1.0,1.4,Boston Celtics
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
332,18,Markieff Morris,34,26,216,6.6,.467,.618,.088,2.0,...,16.4,-0.3,0.2,-0.1,-0.014,-4.0,-1.3,-5.3,-0.2,Dallas Mavericks
333,19,Brandon Williams,24,17,113,9.3,.439,.278,.315,3.9,...,26.0,-0.2,0.1,-0.1,-0.044,-2.7,-2.8,-5.5,-0.1,Dallas Mavericks
334,20,Greg Brown III,22,6,40,10.5,.501,.273,.818,8.2,...,20.3,0.0,0.0,0.0,.000,-5.0,0.6,-4.4,0.0,Dallas Mavericks
335,21,Dexter Dennis,24,4,30,18.3,.541,.421,.158,3.7,...,33.3,0.0,0.0,0.0,.004,1.7,-0.1,1.7,0.0,Dallas Mavericks


In [25]:
# save the player data to a CSV
player_df.to_csv('nba_2024_player_data.csv', index=False)

print("Player data saved to 'nba_2024_player_data.csv'")

Player data saved to 'nba_2024_player_data.csv'


#### Play by Play Scraping

In [26]:
# using full team map this time
# dictionary mapping full team names and common abbreviations to their standard NBA abbreviations
team_map = {
    'Atlanta Hawks': 'ATL',
    'Hawks': 'ATL',
    'Atlanta': 'ATL',
    'Boston Celtics': 'BOS',
    'Celtics': 'BOS',
    'Boston': 'BOS',
    'Brooklyn Nets': 'BKN',
    'Nets': 'BKN',
    'Brooklyn': 'BKN',
    'Charlotte Hornets': 'CHA',
    'Hornets': 'CHA',
    'Charlotte': 'CHA',
    'Chicago Bulls': 'CHI',
    'Bulls': 'CHI',
    'Chicago': 'CHI',
    'Cleveland Cavaliers': 'CLE',
    'Cavaliers': 'CLE',
    'Cavs': 'CLE',
    'Cleveland': 'CLE',
    'Dallas Mavericks': 'DAL',
    'Mavericks': 'DAL',
    'Mavs': 'DAL',
    'Dallas': 'DAL',
    'Denver Nuggets': 'DEN',
    'Nuggets': 'DEN',
    'Denver': 'DEN',
    'Detroit Pistons': 'DET',
    'Pistons': 'DET',
    'Detroit': 'DET',
    'Golden State Warriors': 'GSW',
    'Warriors': 'GSW',
    'Golden State': 'GSW',
    'Houston Rockets': 'HOU',
    'Rockets': 'HOU',
    'Houston': 'HOU',
    'Indiana Pacers': 'IND',
    'Pacers': 'IND',
    'Indiana': 'IND',
    'Los Angeles Clippers': 'LAC',
    'Clippers': 'LAC',
    'LA Clippers': 'LAC',
    'Los Angeles': 'LAC',
    'LA': 'LAC',  # Assuming LA Clippers context
    'Los Angeles Lakers': 'LAL',
    'Lakers': 'LAL',
    'LA Lakers': 'LAL',
    'Memphis Grizzlies': 'MEM',
    'Grizzlies': 'MEM',
    'Memphis': 'MEM',
    'Miami Heat': 'MIA',
    'Heat': 'MIA',
    'Miami': 'MIA',
    'Milwaukee Bucks': 'MIL',
    'Bucks': 'MIL',
    'Milwaukee': 'MIL',
    'Minnesota Timberwolves': 'MIN',
    'Timberwolves': 'MIN',
    'Wolves': 'MIN',
    'Minnesota': 'MIN',
    'New Orleans Pelicans': 'NOP',
    'Pelicans': 'NOP',
    'New Orleans': 'NOP',
    'New York Knicks': 'NYK',
    'Knicks': 'NYK',
    'New York': 'NYK',
    'Oklahoma City Thunder': 'OKC',
    'Thunder': 'OKC',
    'Oklahoma City': 'OKC',
    'Orlando Magic': 'ORL',
    'Magic': 'ORL',
    'Orlando': 'ORL',
    'Philadelphia 76ers': 'PHI',
    '76ers': 'PHI',
    'Sixers': 'PHI',
    'Philadelphia': 'PHI',
    'Phoenix Suns': 'PHX',
    'Suns': 'PHX',
    'Phoenix': 'PHX',
    'Portland Trail Blazers': 'POR',
    'Trail Blazers': 'POR',
    'Blazers': 'POR',
    'Portland': 'POR',
    'Sacramento Kings': 'SAC',
    'Kings': 'SAC',
    'Sacramento': 'SAC',
    'San Antonio Spurs': 'SAS',
    'Spurs': 'SAS',
    'San Antonio': 'SAS',
    'Toronto Raptors': 'TOR',
    'Raptors': 'TOR',
    'Toronto': 'TOR',
    'Utah Jazz': 'UTA',
    'Jazz': 'UTA',
    'Utah': 'UTA',
    'Washington Wizards': 'WAS',
    'Wizards': 'WAS',
    'Washington': 'WAS'
}

In [27]:
# fetches the play-by-play links for games between two specified teams in a given season
# Inputs: team_name, opponent_name, season year

def get_matchup_links(team_name, opponent_name, season):
    team_abbr = team_map.get(team_name, team_name)
    url = f"https://www.basketball-reference.com/teams/{team_abbr}/{season}_games.html"
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    game_links = []
    for row in soup.select('table#games tbody tr'):
        opponent_cell = row.find('td', {'data-stat': 'opp_name'})
        if opponent_cell:
            opponent_full_name = opponent_cell.get_text()
            if any(name_part.lower() in opponent_full_name.lower() for name_part in opponent_name.split()):
                box_score_cell = row.find('td', {'data-stat': 'box_score_text'})
                if box_score_cell and box_score_cell.find('a'):
                    game_link = box_score_cell.find('a')['href']
                    # Insert '/pbp/' in the link
                    game_link = game_link.replace('/boxscores/', '/boxscores/pbp/')
                    game_links.append(f"https://www.basketball-reference.com{game_link}")

    return game_links

# fetching play by plays -> used this as a guideline: https://github.com/schadam26/BR_Scrape
# https://jman4190.medium.com/how-to-accessing-live-nba-play-by-play-data-f24e02b0a976
# Iterates over each URL and fetches the page content, extracts team names using get_teams, parses the play-by-play table to extract detailed play information
# Structures each play with time, score, away team actions, and home team actions.

def fetch_and_structure_play_by_play(urls):
    all_plays = {}

    for url in urls:
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')

        teams = get_teams(soup)
        plays = []
        table = soup.find('table', {'id': 'pbp'})

        if table:
            for row in table.find_all('tr'):
                cells = row.find_all('td')
                if len(cells) > 4:
                    time = cells[0].get_text()
                    away_play = extract_player_action(cells[1])
                    score = cells[3].get_text()
                    home_play = extract_player_action(cells[5])

                    play = {
                        'time': time,
                        team_map.get(teams['away'], teams['away']): away_play,
                        'score': score,
                        team_map.get(teams['home'], teams['home']): home_play
                    }
                    plays.append(play)

        all_plays[url] = plays

    return all_plays

# extracting teamnames from soup object
def get_teams(soup):
    scorebox = soup.find('div', {'class': 'scorebox'})
    teams = scorebox.find_all('strong')
    away_team = teams[0].get_text()
    home_team = teams[1].get_text()
    return {'away': away_team, 'home': home_team}

# extracting player names
def extract_player_action(cell):
    if not cell or cell.get_text(strip=True) == '':
        return {'player': '', 'action': ''}

    player_tags = cell.find_all('a')
    players = ' '.join(tag.get_text() for tag in player_tags)
    full_text = cell.get_text(separator=' ', strip=True)
    action_text = full_text.replace(players, "").strip()

    if not players:
        players = ''
        action_text = full_text

    return {
        'player': players.strip(),
        'action': action_text.strip()
    }

# example
team_name = 'Boston Celtics'
opponent_name = 'Dallas Mavericks'
season = 2024

matchup_links = get_matchup_links(team_name, opponent_name, season)
structured_data = fetch_and_structure_play_by_play(matchup_links)

In [28]:
structured_data

{'https://www.basketball-reference.com/boxscores/pbp/202401220DAL.html': [{'time': '11:49.0',
   '\nBoston Celtics\n': {'player': 'J. Brown',
    'action': 'misses 2-pt jump shot from 15 ft'},
   'score': '0-0',
   '\nDallas Mavericks\n': {'player': '', 'action': ''}},
  {'time': '11:47.0',
   '\nBoston Celtics\n': {'player': '', 'action': ''},
   'score': '0-0',
   '\nDallas Mavericks\n': {'player': 'L. Dončić',
    'action': 'Defensive rebound by'}},
  {'time': '11:31.0',
   '\nBoston Celtics\n': {'player': '', 'action': ''},
   'score': '0-2',
   '\nDallas Mavericks\n': {'player': 'L. Dončić',
    'action': 'makes 2-pt layup from 2 ft'}},
  {'time': '11:10.0',
   '\nBoston Celtics\n': {'player': 'A. Horford',
    'action': 'misses 3-pt jump shot from 25 ft'},
   'score': '0-2',
   '\nDallas Mavericks\n': {'player': '', 'action': ''}},
  {'time': '11:09.0',
   '\nBoston Celtics\n': {'player': '', 'action': ''},
   'score': '0-2',
   '\nDallas Mavericks\n': {'player': 'J. Green',
    

Regarding use case of RAG - ensuring play by play data is consistent:

Ensuring Consistent Field Names: Making sure that the field names are consistent across all entries : the field names are time, score, and the team abbreviations for away and home plays

Flattening Nested Structures: Flattening nested dictionaries (e.g., {'player': 'J. Brown', 'action': 'misses 2-pt jump shot from 15 ft'}) these structures into a single-level dictionary

Providing Contextual Information: Ensuring that each play contains enough context, keeping track of which team is currently playing home or away, and including this information in each play entry

In [29]:
# consistency, flatteing and context as seen above
def export_to_csv(data, filename):

    all_data = []

    for url, plays in data.items():
        for play in plays:
            # Flatten the nested structures
            flattened_play = {
                'url': url,
                'time': play['time'],
                'score': play['score'],
                'away_team': list(play.keys())[2],  # Get the away team abbreviation
                'away_player': play[list(play.keys())[2]]['player'],
                'away_action': play[list(play.keys())[2]]['action'],
                'home_team': list(play.keys())[4],  # Get the home team abbreviation
                'home_player': play[list(play.keys())[4]]['player'],
                'home_action': play[list(play.keys())[4]]['action'],
            }
            all_data.append(flattened_play)

    df = pd.DataFrame(all_data)
    df.to_csv(filename, index=False)

# Example usage
export_to_csv(structured_data, 'structured_play_by_play.csv')

TypeError: string indices must be integers, not 'str'

#### Scraping game results

In [None]:
import time
import requests
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime

# Function to fetch game results from a single page
def fetch_game_results(page_url):
    response = requests.get(page_url)
    soup = BeautifulSoup(response.content, 'html.parser')

    games = []

    schedule_table = soup.find('table', {'id': 'schedule'})
    if not schedule_table:
        return games  # If the table is not found, return an empty list

    for row in schedule_table.find('tbody').find_all('tr'):
        if 'class' in row.attrs and 'thead' in row.attrs['class']:
            continue  # Skip header rows

        date_str = row.find('th', {'data-stat': 'date_game'}).text.strip()
        date = datetime.strptime(date_str, '%a, %b %d, %Y')
        if date > datetime(2024, 4, 19):
            continue  # Skip games after April 19th

        home_team = row.find('td', {'data-stat': 'home_team_name'}).text.strip()
        away_team = row.find('td', {'data-stat': 'visitor_team_name'}).text.strip()
        home_score = int(row.find('td', {'data-stat': 'home_pts'}).text.strip())
        away_score = int(row.find('td', {'data-stat': 'visitor_pts'}).text.strip())

        winning_team = home_team if home_score > away_score else away_team
        losing_team = away_team if home_score > away_score else home_team
        point_differential = abs(home_score - away_score)

        games.append({
            "Date": date_str,
            "Home Team": home_team,
            "Away Team": away_team,
            "Home Score": home_score,
            "Away Score": away_score,
            "Winning Team": winning_team,
            "Losing Team": losing_team,
            "Point Differential": point_differential
        })

    return games

# Generate URLs for each month from October to April
base_url = "https://www.basketball-reference.com/leagues/NBA_2024_games-"
months = ["october", "november", "december", "january", "february", "march", "april"]
urls = [f"{base_url}{month}.html" for month in months]

all_games = []

for url in urls:
    games = fetch_game_results(url)
    all_games.extend(games)
    time.sleep(5)  # Staggering the requests to avoid overloading the server

# Creating a DataFrame and saving to CSV
df = pd.DataFrame(all_games)
csv_file_path = "NBA_2024_comprehensive_results.csv"
df.to_csv(csv_file_path, index=False)
print(f"CSV file saved to {csv_file_path}")

#### possible injury scraping for live search

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime

def fetch_injury_data(team, start_date, end_date):
    # Format the URL with the provided team and date range
    url = f"https://www.prosportstransactions.com/basketball/Search/SearchResults.php?Player=&Team={team}&BeginDate={start_date}&EndDate={end_date}&PlayerMovementChkBx=yes&ILChkBx=yes&NBADLChkBx=yes&InjuriesChkBx=yes&PersonalChkBx=yes&DisciplinaryChkBx=yes&LegalChkBx=yes&Submit=Search"

    # Fetch the webpage content
    response = requests.get(url)
    response.raise_for_status()  # Ensure we notice bad responses

    # Parse the HTML content
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find the relevant table
    table = soup.find('table', class_='datatable center')

    if not table:
        print(f"No table found for team {team} between {start_date} and {end_date}")
        return None

    # Extract column headers
    headers = [th.get_text(strip=True) for th in table.find('tr', class_='DraftTableLabel').find_all('td')]

    # Extract data rows
    rows = []
    for row in table.find_all('tr')[1:]:  # Skip the header row
        cells = row.find_all('td')
        if len(cells) == len(headers):
            row_data = [cell.get_text(strip=True) for cell in cells]
            rows.append(row_data)

    # Create a DataFrame from the extracted data
    df = pd.DataFrame(rows, columns=headers)

    return df

# Define the teams, start date, and end date
teams = ['Celtics', 'Lakers']
start_date = '2024-06-01' #datetime.today().strftime('%Y-%m-%d')
end_date = datetime.today().strftime('%Y-%m-%d')

# Fetch data for each team and concatenate the results
dataframes = []
for team in teams:
    team_data = fetch_injury_data(team, start_date, end_date)
    if team_data is not None:
        dataframes.append(team_data)

if dataframes:
    all_teams_data = pd.concat(dataframes, ignore_index=True)
else:
    print("No data found for the specified teams and date range.")