In [None]:
import re
import time
from pathlib import Path

import bs4
import pandas as pd
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select, WebDriverWait

In [None]:
url = 'https://quotes.toscrape.com/'  # address of the site we want to scrape

# Send the HTTP GET request with the requests library.
# - timeout: without it the cell can hang indefinitely if the server never answers.
# - raise_for_status(): fail loudly on a 4xx/5xx answer instead of handing an
#   error page to the parser in the following cells.
# NOTE: despite its name, `request` holds the response returned by requests.get.
request = requests.get(url, timeout=10)
request.raise_for_status()

In [None]:
# Parse the downloaded HTML text with the 'html.parser' backend.
# To see what BeautifulSoup returns, put `data` on the last line of this cell
# (the output is rather long).
data = bs4.BeautifulSoup(markup=request.text, features='html.parser')

In [None]:
# find_all returns every <div> tag in the parsed page that carries class "quote".
quote_list = data.find_all(name='div', attrs={'class': 'quote'})
# quote_list  # uncomment to display the matched tags


In [None]:
# Build the scraped columns directly from the list of quote tags.
# For every quote we pull three things: the quote text, the author, and the
# list of associated tags. `.text` strips the surrounding HTML markup and
# leaves only the text content.
quote_data = {
    'quote_text': [
        entry.find('span', attrs={'class':'text'}).text for entry in quote_list
    ],
    'author': [
        entry.find('small', attrs={'class':'author'}).text for entry in quote_list
    ],
    'tags': [
        [link.text for link in entry.find_all('a', attrs={'class':'tag'})]
        for entry in quote_list
    ],
}

# One row per quote, columns in the same order as the dictionary keys.
quote_df = pd.DataFrame(data=quote_data, columns=quote_data.keys())
quote_df

In [None]:
# changing pages manually for static sites

quote_data = {  # dictionary that collects the scraped values, one list per column
    'quote_text': [],
    'author': [],
    'tags': [],
}

# This works because we know how the url changes when you change the page:
# the page number is simply part of the path. The site's pages are numbered
# starting at 1, so the loop covers /page/1/ .. /page/10/ and does not request
# /page/0/, which is not one of the site's pages.
for page in range(1, 11):
    url = f'https://quotes.toscrape.com/page/{page}/'
    # timeout keeps a stalled request from hanging the cell; raise_for_status
    # stops the loop on an HTTP error instead of parsing an error page.
    request = requests.get(url, timeout=10)
    request.raise_for_status()
    data = bs4.BeautifulSoup(request.text, 'html.parser')
    quote_list = data.find_all('div', attrs={'class':'quote'})

    for quote in quote_list:
        # for every quote on the page we collect the quote text, the author,
        # and the associated tags; .text drops the html around the text
        quote_data['quote_text'].append(quote.find('span', attrs={'class':'text'}).text)
        quote_data['author'].append(quote.find('small', attrs={'class':'author'}).text)
        tags = [x.text for x in quote.find_all('a', attrs={'class':'tag'})]
        quote_data['tags'].append(tags)

quote_df_two = pd.DataFrame(data = quote_data, columns = quote_data.keys())
quote_df_two


In [None]:
# super short example about automated webscraping with Selenium, chromedriver, and (still) beautiful soup

# starting point: the advanced player stats page
url = 'https://www.nba.com/stats/players/advanced'

# start a Chrome session controlled by chromedriver and point it at that url
driver = webdriver.Chrome()
driver.get(url)

In [None]:
# nba.com is a dynamic site, clicking around won't necessarily change the url

# we need to change the site to display all the players at once, which requires us to change the dynamic parts of the page
rows_dropdown_xpath = r"/html/body/div[1]/div[2]/div[2]/div[3]/section[2]/div/div[2]/div[2]/div[1]/div[3]/div/label/div/select"

# Because the page is dynamic, the dropdown may not be in the DOM yet when this
# cell runs right after driver.get() (e.g. under "Run All"). Poll for it for up
# to 15 seconds instead of looking once and failing immediately; until() returns
# the element as soon as find_element succeeds.
dropdown = WebDriverWait(driver, 15).until(
    lambda d: d.find_element(By.XPATH, rows_dropdown_xpath)
)
select = Select(dropdown)
select.select_by_index(0)  # first option in the dropdown (presumably "All" - confirm on the page)

In [None]:
# Columns of the final dataframe, in display order; every column starts empty.
statsDict = {
    stat: []
    for stat in (
        'NAME', 'AGE', 'TEAM', 'GP', 'W', 'L', 'MINUTES',
        'OFFRTG', 'DEFRTG', 'NETRTG',
        'AST%', 'AST/TO', 'AST RATIO',
        'OREB%', 'DREB%', 'REB%',
        'TO RATIO', 'EFG%', 'TS%', 'USG%', 'PACE', 'PIE',
    )
}

# Order in which the stats are read from a table row: the first entry comes
# from cell 1 of the row, the second from cell 2, and so on (cell 0 is not used).
# Note TEAM comes before AGE here, unlike in the dataframe column order above.
row_layout = (
    'NAME', 'TEAM', 'AGE', 'GP', 'W', 'L', 'MINUTES',
    'OFFRTG', 'DEFRTG', 'NETRTG',
    'AST%', 'AST/TO', 'AST RATIO',
    'OREB%', 'DREB%', 'REB%',
    'TO RATIO', 'EFG%', 'TS%', 'USG%', 'PACE', 'PIE',
)

# hand the html currently shown by the browser to beautiful soup
src = driver.page_source
parser = bs4.BeautifulSoup(src, 'html.parser')

# locate the stats table and its body
table = parser.find('table', attrs={'class': 'Crom_table__p1iZz'})
tt = table.find('tbody', attrs={'class':'Crom_body__UYOcU'})

# keep the first 20 rows and split each one into its <td> cells
players = tt.find_all('tr')[:20]
playerList = [p.find_all('td') for p in players]

# Store each cell under its stat name. The whole <td> tag is stored, not its
# text, which is why the resulting frame is called "dirty".
for cells in playerList:
    for position, stat in enumerate(row_layout, start=1):
        statsDict[stat].append(cells[position])

# creating a dataframe with our scraped data
player_df_dirty = pd.DataFrame(data = statsDict, columns= statsDict.keys())



In [None]:
# defining functions in order to clean our dataframe

def extract_name(html_string):
    """Return the first piece of text enclosed between a '>' and a '<'.

    Parameters
    ----------
    html_string : str
        Text that may contain HTML markup, e.g. '<td><a href="#">Name</a></td>'.

    Returns
    -------
    str
        The first non-empty run of characters (containing neither '<' nor '>')
        that sits directly between a '>' and a '<'. If there is no such run,
        html_string is returned unchanged.
    """
    match = re.search(r'>([^<>]+)<', html_string)
    return match.group(1) if match else html_string


def clean_html_columns(df, columns):
    """Return a copy of df with HTML markup stripped from the given columns.

    The input frame is left untouched, so the cell that calls this function can
    be re-run and the raw frame stays available for inspection.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose cells may hold HTML (as strings or objects whose str() is HTML).
    columns : iterable
        Column labels to clean; labels not present in df are skipped.

    Returns
    -------
    pandas.DataFrame
        New frame in which every value of the selected columns has been
        converted to str and passed through extract_name.
    """
    cleaned = df.copy()  # work on a copy: never mutate the caller's frame
    for col in columns:
        if col in cleaned.columns:
            cleaned[col] = cleaned[col].astype(str).apply(extract_name)
    return cleaned

# Run the cleaner over every column of the scraped frame and display the result.
player_df = clean_html_columns(df=player_df_dirty, columns=player_df_dirty.columns)
player_df

In [None]:
# quit() closes every window of the session and ends the WebDriver session;
# close() would only close the current window and leave the session running.
driver.quit()

In [None]:
# Interactive section:
# We want to find the FIRST QUARTER statistics for Austin Reaves in the game on November 10th, using Selenium and Chromedriver

In [None]:
# traditional box scores of player 1630559 for the 2023-24 season
url = 'https://www.nba.com/stats/player/1630559/boxscores-traditional?Season=2023-24'

# open a fresh Chrome session and load the page
driver = webdriver.Chrome()
driver.get(url)


In [None]:
# We want to expand the advanced filters selector, so we'll need the xpath for it
# (the string below is a placeholder - replace it with the element's xpath)

button = driver.find_element(By.XPATH, r"How do we find the xpath\?")
button.click()  # clicks the element the driver has just located

In [None]:
# We want to select the dropdown menu for the quarter box
# (both the xpath and the index below are placeholders to be filled in)

QuarterSelect = Select(
    driver.find_element(By.XPATH, r"How do we find the xpath\?")
)
QuarterSelect.select_by_index('What index should you select?')

In [None]:
# And now we want to confirm our filter with the website by locating and
# clicking the large 'Get Stats' button
# (the xpath below is a placeholder to be filled in)

getStats = driver.find_element(By.XPATH, r"How do we find the xpath\?")
getStats.click()

In [None]:
# Like in the example, the website has to display all available games instead of
# only the first 50, so that the game on November 10th is shown
# (the xpath below is a placeholder to be filled in)

AllGamesSelect = Select(
    driver.find_element(By.XPATH, r"How do we find the xpath\?")
)
AllGamesSelect.select_by_index(0)  # pre-entered this one because it was acting strange during testing


In [None]:
# Run this, it's already all written out - just trust me

# parse whatever the browser is currently showing
src = driver.page_source
parser = bs4.BeautifulSoup(src, 'html.parser')

# drill down to the body of the box score table
table = parser.find('table', attrs={'class': 'Crom_table__p1iZz'})
tt = table.find('tbody', attrs={'class':'Crom_body__UYOcU'})

# NOTE(review): 73 is a hard-coded row position; presumably the row of the
# November 10th game once all games are displayed - confirm if the table changes.
game = tt.find_all('tr')[73]

# text of every child of the row, in order
N10game = [cell.text for cell in game]

# one column of values -> transpose into a single row, then label the columns
N10gameDF = pd.DataFrame(data=N10game).T
N10gameDF.columns = [
    'Match Up', 'W/L', 'MIN', 'PTS',
    'FGM', 'FGA', 'FG%',
    '3PM', '3PA', '3P%',
    'FTM', 'FTA', 'FT%',
    'OREB', 'DREB', 'REB',
    'AST', 'STL', 'BLK', 'TOV', 'PF', '+/-',
]
N10gameDF


In [None]:
# should return True for all columns

# the values we expect to have scraped for the game, in column order
expected_row = [
    'Nov 10, 2023 - LAL @ PHX', 'W', '6:30', '3',
    '1', '3', '33.3',
    '0', '1', '0.0',
    '1', '1', '100',
    '0', '0', '0',
    '0', '1', '0', '0', '0', '-7',
]

# element-wise comparison of the scraped row against the expected values
N10gameDF.iloc[0] == expected_row

In [None]:
# Don't forget to shut down the webdriver.
# quit() closes every window of the session and ends the WebDriver session;
# close() would only close the current window and leave the session running.

driver.quit()