In [2]:
import pandas as pd
import requests
import bs4
import time
import re
from pathlib import Path
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select

In [3]:
url = 'https://quotes.toscrape.com/' # get the url of the site you want to scrape

request = requests.get(url) # use the requests library to the http request

In [4]:
data = bs4.BeautifulSoup(request.text, 'html.parser') 
# uncomment this if you want to see what beautifulsoup returns, it is rather long

In [5]:
quote_list = data.find_all('div', attrs={'class':'quote'}) # find_all finds every instance of the tag 'div', with the listed attributes
#quote_list


In [6]:
quote_data = { # establishing a dictionary in which to house the scrape data
    'quote_text': [],
    'author': [],
    'tags': [],
}

for quote in quote_list:
    
    # for every quote in the quote list, we find the instance of the three things we're looking for, the quote text, the author, and the associated tags
    # also note that .text gets rid of all the html stuff around the text
    
    quote_data['quote_text'].append(quote.find('span', attrs={'class':'text'}).text)
    quote_data['author'].append(quote.find('small', attrs={'class':'author'}).text)
    tags = [x.text for x in quote.find_all('a', attrs={'class':'tag'})]
    quote_data['tags'].append(tags)
    
quote_df = pd.DataFrame(data = quote_data, columns = quote_data.keys())
quote_df

Unnamed: 0,quote_text,author,tags
0,“The world as we have created it is a process ...,Albert Einstein,"[change, deep-thoughts, thinking, world]"
1,"“It is our choices, Harry, that show what we t...",J.K. Rowling,"[abilities, choices]"
2,“There are only two ways to live your life. On...,Albert Einstein,"[inspirational, life, live, miracle, miracles]"
3,"“The person, be it gentleman or lady, who has ...",Jane Austen,"[aliteracy, books, classic, humor]"
4,"“Imperfection is beauty, madness is genius and...",Marilyn Monroe,"[be-yourself, inspirational]"
5,“Try not to become a man of success. Rather be...,Albert Einstein,"[adulthood, success, value]"
6,“It is better to be hated for what you are tha...,André Gide,"[life, love]"
7,"“I have not failed. I've just found 10,000 way...",Thomas A. Edison,"[edison, failure, inspirational, paraphrased]"
8,“A woman is like a tea bag; you never know how...,Eleanor Roosevelt,[misattributed-eleanor-roosevelt]
9,"“A day without sunshine is like, you know, nig...",Steve Martin,"[humor, obvious, simile]"


In [7]:
# changing pages manually for static sites

quote_data = { # establishing a dictionary in which to house the scrape data
    'quote_text': [],
    'author': [],
    'tags': [],
}

for i in range(0,11): # this workds because we know the way the url works, in terms of what happens when you change the page
    url = f'https://quotes.toscrape.com/page/{i}/' 
    request = requests.get(url) 
    data = bs4.BeautifulSoup(request.text, 'html.parser') 
    quote_list = data.find_all('div', attrs={'class':'quote'})
    
    for quote in quote_list:
    
    # for every quote in the quote list, we find the instance of the three things we're looking for, the quote text, the author, and the associated tags
    # also note that .text gets rid of all the html stuff around the text
    
        quote_data['quote_text'].append(quote.find('span', attrs={'class':'text'}).text)
        quote_data['author'].append(quote.find('small', attrs={'class':'author'}).text)
        tags = [x.text for x in quote.find_all('a', attrs={'class':'tag'})]
        quote_data['tags'].append(tags)
    
quote_df_two = pd.DataFrame(data = quote_data, columns = quote_data.keys())
quote_df_two


Unnamed: 0,quote_text,author,tags
0,“The world as we have created it is a process ...,Albert Einstein,"[change, deep-thoughts, thinking, world]"
1,"“It is our choices, Harry, that show what we t...",J.K. Rowling,"[abilities, choices]"
2,“There are only two ways to live your life. On...,Albert Einstein,"[inspirational, life, live, miracle, miracles]"
3,"“The person, be it gentleman or lady, who has ...",Jane Austen,"[aliteracy, books, classic, humor]"
4,"“Imperfection is beauty, madness is genius and...",Marilyn Monroe,"[be-yourself, inspirational]"
...,...,...,...
95,“You never really understand a person until yo...,Harper Lee,[better-life-empathy]
96,“You have to write the book that wants to be w...,Madeleine L'Engle,"[books, children, difficult, grown-ups, write,..."
97,“Never tell the truth to people who are not wo...,Mark Twain,[truth]
98,"“A person's a person, no matter how small.”",Dr. Seuss,[inspirational]


In [8]:
# super short example about automated webscraping with Selenium, chromedriver, and (still) beautiful soup

driver = webdriver.Chrome() # establishing the chromedriver
url = 'https://www.nba.com/stats/players/advanced' # getting our starting point url
driver.get(url) # giving the url to our webdriver

In [9]:
# nba.com is a dynamic site, clicking around won't neccesarily change the url

# we need to change the site to display all the players at once, which requires us to change the dynamic parts of the page
select = Select(driver.find_element('xpath',r"/html/body/div[1]/div[2]/div[2]/div[3]/section[2]/div/div[2]/div[2]/div[1]/div[3]/div/label/div/select"))
select.select_by_index(0)

In [10]:
statsDict = {
    'NAME' : [],
    'AGE' : [],
    'TEAM' : [],
    'GP' : [],
    'W' : [],
    'L' : [],
    'MINUTES' : [],
    'OFFRTG' : [],
    'DEFRTG' : [],
    'NETRTG' : [],
    'AST%' : [],
    'AST/TO' : [],
    'AST RATIO' : [],
    'OREB%' : [],
    'DREB%' : [],
    'REB%' : [],
    'TO RATIO' : [],
    'EFG%' : [],
    'TS%' : [],
    'USG%' : [],
    'PACE' : [],
    'PIE' : [], 
}

src = driver.page_source
parser = bs4.BeautifulSoup(src, 'html.parser') # establishing beautiful soup as our parser

# finding the table 
table = parser.find('table', attrs={'class': 'Crom_table__p1iZz'}) 
tt = table.find('tbody', attrs={'class':'Crom_body__UYOcU'})

# finding the player data
players = tt.find_all('tr')[:20]
playerList = [p.find_all('td') for p in players]

# we want to append all of the data into our dictionary
for player in playerList:
    statsDict['NAME'].append(player[1])
    statsDict['TEAM'].append(player[2])
    statsDict['AGE'].append(player[3])
    statsDict['GP'].append(player[4])
    statsDict['W'].append(player[5])
    statsDict['L'].append(player[6])
    statsDict['MINUTES'].append(player[7])
    statsDict['OFFRTG'].append(player[8])
    statsDict['DEFRTG'].append(player[9])
    statsDict['NETRTG'].append(player[10])
    statsDict['AST%'].append(player[11])
    statsDict['AST/TO'].append(player[12])
    statsDict['AST RATIO'].append(player[13])
    statsDict['OREB%'].append(player[14])
    statsDict['DREB%'].append(player[15])
    statsDict['REB%'].append(player[16])
    statsDict['TO RATIO'].append(player[17])
    statsDict['EFG%'].append(player[18])
    statsDict['TS%'].append(player[19])
    statsDict['USG%'].append(player[20])
    statsDict['PACE'].append(player[21])
    statsDict['PIE'].append(player[22])
    
# creating a dataframe with our scraped data
player_df_dirty = pd.DataFrame(data = statsDict, columns= statsDict.keys())



In [11]:
# defining functions in order to clean our dataframe 

def extract_name(html_string):
    match = re.search(r'>([^<>]+)<', html_string)
    return match.group(1) if match else html_string

def clean_html_columns(df, columns):
    for col in columns:
        if col in df.columns:
            df[col] = df[col].astype(str).apply(extract_name)
    return df

player_df = clean_html_columns(player_df_dirty, player_df_dirty.columns)
player_df

Unnamed: 0,NAME,AGE,TEAM,GP,W,L,MINUTES,OFFRTG,DEFRTG,NETRTG,...,AST RATIO,OREB%,DREB%,REB%,TO RATIO,EFG%,TS%,USG%,PACE,PIE
0,AJ Green,25,MIL,1,1,0,4.1,133.3,114.3,19.0,...,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,94.31,10.0
1,AJ Johnson,19,MIL,1,1,0,1.1,133.3,200.0,-66.7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,25.0,109.09,-22.2
2,Aaron Nesmith,25,IND,1,1,0,18.8,123.7,117.1,6.6,...,0.0,5.6,5.3,5.4,0.0,62.5,71.7,11.9,101.09,5.4
3,Adem Bona,21,PHI,1,0,1,2.7,166.7,116.7,50.0,...,0.0,33.3,0.0,16.7,0.0,100.0,100.0,11.1,108.68,13.6
4,Al Horford,38,BOS,1,1,0,26.1,154.9,117.6,37.3,...,41.7,0.0,15.8,6.8,0.0,78.6,78.6,11.7,93.85,10.8
5,Alperen Sengun,22,HOU,1,0,1,33.9,109.7,104.5,5.2,...,16.1,12.5,36.4,22.2,9.7,50.0,52.6,28.9,97.74,26.7
6,Amen Thompson,21,HOU,1,0,1,21.8,93.0,128.3,-35.2,...,0.0,3.2,11.1,6.9,0.0,50.0,50.9,24.5,97.86,7.6
7,Amir Coffey,27,LAC,1,0,1,25.8,87.3,109.1,-21.8,...,16.7,0.0,15.8,6.1,0.0,125.0,112.7,8.1,102.24,14.0
8,Andre Drummond,31,PHI,1,0,1,25.1,107.7,121.6,-13.9,...,0.0,14.7,40.0,24.1,10.0,50.0,56.3,15.4,98.68,10.4
9,Andre Jackson Jr.,22,MIL,1,1,0,1.1,133.3,200.0,-66.7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,109.09,0.0


In [39]:
driver.close()

In [None]:
# Interactive section:
# We want to find the FIRST QUARTER statistics for Austin Reaves in the game on November 10th, using Selenium and Chromedriver

In [81]:
driver = webdriver.Chrome() 
url = 'https://www.nba.com/stats/player/1630559/boxscores-traditional?Season=2023-24' 
driver.get(url) 


In [82]:
# We want to expand the advanced filters selector, so we'll need the xpath for it

button = driver.find_element('xpath',r"How do we find the xpath\?")
button.click() # this line just clicks the button you have selected by the driver

In [83]:
# We want to select the dropdown menu for the quarter box

QuarterSelect = Select(driver.find_element('xpath',r"How do we find the xpath\?"))
QuarterSelect.select_by_index('What index should you select?')

In [84]:
# And now we want to confirm our filter with the website by selecting and clicking the large 'Get Stats button'

getStats = driver.find_element('xpath',r"How do we find the xpath\?")
getStats.click()

In [85]:
# Like in the example, we have to have the website display all available games instead of the first 50 to display the game on November 11th

AllGamesSelect = Select(driver.find_element('xpath',r"How do we find the xpath\?"))
AllGamesSelect.select_by_index(0) # pre-entered this one because it was acting strange during testing


In [90]:
# Run this it's already all written out just trust me

src = driver.page_source
parser = bs4.BeautifulSoup(src, 'html.parser')
table = parser.find('table', attrs={'class': 'Crom_table__p1iZz'}) 
tt = table.find('tbody', attrs={'class':'Crom_body__UYOcU'})
game = tt.find_all('tr')[73]
N10game = []
for i in game:
    N10game.append(i.text)
    
N10gameDF = pd.DataFrame(data = N10game )
N10gameDF = N10gameDF.T
N10gameDF.columns = ['Match Up','W/L','MIN','PTS','FGM','FGA','FG%','3PM','3PA','3P%','FTM','FTA','FT%','OREB'	,'DREB','REB','AST','STL','BLK','TOV','PF','+/-']
N10gameDF


IndexError: list index out of range

In [93]:
# should return true for all columns

N10gameDF.iloc[0] == ['Nov 10, 2023 - LAL @ PHX','W','6:30','3','1','3','33.3','0','1','0.0','1','1','100','0','0','0','0','1','0','0','0','-7']

Match Up    True
W/L         True
MIN         True
PTS         True
FGM         True
FGA         True
FG%         True
3PM         True
3PA         True
3P%         True
FTM         True
FTA         True
FT%         True
OREB        True
DREB        True
REB         True
AST         True
STL         True
BLK         True
TOV         True
PF          True
+/-         True
Name: 0, dtype: bool

In [94]:
# Don't forget to close the webdriver

driver.close()