In [171]:
import pandas as pd
import requests
import bs4
import time
import re
from pathlib import Path
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
from selenium.common.exceptions import NoSuchElementException, TimeoutException

In [29]:
url = 'https://quotes.toscrape.com/' # get the url of the site you want to scrape

request = requests.get(url) # use the requests library to the http request

In [74]:
data = bs4.BeautifulSoup(request.text, 'html.parser') #
#data # uncomment this if you want to see what beautifulsoup returns, it is rather long

In [86]:
quote_list = data.find_all('div', attrs={'class':'quote'}) # find_all finds every instance of the tag 'div', with the listed attributes
#quote_list


In [91]:
quote_data = { # establishing a dictionary in which to house the scrape data
    'quote_text': [],
    'author': [],
    'tags': [],
}

for quote in quote_list:
    
    # for every quote in the quote list, we find the instance of the three things we're looking for, the quote text, the author, and the associated tags
    # also note that .text gets rid of all the html stuff around the text
    
    quote_data['quote_text'].append(quote.find('span', attrs={'class':'text'}).text)
    quote_data['author'].append(quote.find('small', attrs={'class':'author'}).text)
    tags = [x.text for x in quote.find_all('a', attrs={'class':'tag'})]
    quote_data['tags'].append(tags)
    
quote_df = pd.DataFrame(data = quote_data, columns = quote_data.keys())
quote_df

Unnamed: 0,quote_text,author,tags
0,"“The truth."" Dumbledore sighed. ""It is a beaut...",J.K. Rowling,[truth]
1,“I'm the one that's got to die when it's time ...,Jimi Hendrix,"[death, life]"
2,“To die will be an awfully big adventure.”,J.M. Barrie,"[adventure, love]"
3,“It takes courage to grow up and become who yo...,E.E. Cummings,[courage]
4,“But better to get hurt by the truth than comf...,Khaled Hosseini,[life]
5,“You never really understand a person until yo...,Harper Lee,[better-life-empathy]
6,“You have to write the book that wants to be w...,Madeleine L'Engle,"[books, children, difficult, grown-ups, write,..."
7,“Never tell the truth to people who are not wo...,Mark Twain,[truth]
8,"“A person's a person, no matter how small.”",Dr. Seuss,[inspirational]
9,“... a mind needs books as a sword needs a whe...,George R.R. Martin,"[books, mind]"


In [93]:
# changing pages manually for static sites

quote_data = { # establishing a dictionary in which to house the scrape data
    'quote_text': [],
    'author': [],
    'tags': [],
}

for i in range(0,11): # this workds because we know the way the url works, in terms of what happens when you change the page
    url = f'https://quotes.toscrape.com/page/{i}/' 
    request = requests.get(url) 
    data = bs4.BeautifulSoup(request.text, 'html.parser') 
    quote_list = data.find_all('div', attrs={'class':'quote'})
    
    for quote in quote_list:
    
    # for every quote in the quote list, we find the instance of the three things we're looking for, the quote text, the author, and the associated tags
    # also note that .text gets rid of all the html stuff around the text
    
        quote_data['quote_text'].append(quote.find('span', attrs={'class':'text'}).text)
        quote_data['author'].append(quote.find('small', attrs={'class':'author'}).text)
        tags = [x.text for x in quote.find_all('a', attrs={'class':'tag'})]
        quote_data['tags'].append(tags)
    
quote_df_two = pd.DataFrame(data = quote_data, columns = quote_data.keys())
quote_df_two


Unnamed: 0,quote_text,author,tags
0,“The world as we have created it is a process ...,Albert Einstein,"[change, deep-thoughts, thinking, world]"
1,"“It is our choices, Harry, that show what we t...",J.K. Rowling,"[abilities, choices]"
2,“There are only two ways to live your life. On...,Albert Einstein,"[inspirational, life, live, miracle, miracles]"
3,"“The person, be it gentleman or lady, who has ...",Jane Austen,"[aliteracy, books, classic, humor]"
4,"“Imperfection is beauty, madness is genius and...",Marilyn Monroe,"[be-yourself, inspirational]"
...,...,...,...
95,“You never really understand a person until yo...,Harper Lee,[better-life-empathy]
96,“You have to write the book that wants to be w...,Madeleine L'Engle,"[books, children, difficult, grown-ups, write,..."
97,“Never tell the truth to people who are not wo...,Mark Twain,[truth]
98,"“A person's a person, no matter how small.”",Dr. Seuss,[inspirational]


In [153]:
# automated webscraping with Selenium, chromedriver, and (still) beautiful soup

driver = webdriver.Chrome() # establishing the chromedriver
url = 'https://www.nba.com/stats/players/advanced' # getting our starting point url
driver.get(url) # giving the url to our webdriver

In [154]:
# nba.com is a dynamic site, clicking around won't neccesarily change the url

# we need to change the site to display all the players at once, which requires us to change the dynamic parts of the page
select = Select(driver.find_element('xpath',r"/html/body/div[1]/div[2]/div[2]/div[3]/section[2]/div/div[2]/div[2]/div[1]/div[3]/div/label/div/select"))
select.select_by_index(0)

In [166]:
statsDict = {
    'NAME' : [],
    'AGE' : [],
    'TEAM' : [],
    'GP' : [],
    'W' : [],
    'L' : [],
    'MINUTES' : [],
    'OFFRTG' : [],
    'DEFRTG' : [],
    'NETRTG' : [],
    'AST%' : [],
    'AST/TO' : [],
    'AST RATIO' : [],
    'OREB%' : [],
    'DREB%' : [],
    'REB%' : [],
    'TO RATIO' : [],
    'EFG%' : [],
    'TS%' : [],
    'USG%' : [],
    'PACE' : [],
    'PIE' : [], 
}

src = driver.page_source
parser = bs4.BeautifulSoup(src, 'html.parser') # establishing beautiful soup as our parser

# finding the table 
table = parser.find('table', attrs={'class': 'Crom_table__p1iZz'}) 
tt = table.find('tbody', attrs={'class':'Crom_body__UYOcU'})

# finding the player data
players = tt.find_all('tr')[:20]
playerList = [p.find_all('td') for p in players]

# we want to append all of the data into our dictionary
for player in playerList:
    statsDict['NAME'].append(player[1])
    statsDict['TEAM'].append(player[2])
    statsDict['AGE'].append(player[3])
    statsDict['GP'].append(player[4])
    statsDict['W'].append(player[5])
    statsDict['L'].append(player[6])
    statsDict['MINUTES'].append(player[7])
    statsDict['OFFRTG'].append(player[8])
    statsDict['DEFRTG'].append(player[9])
    statsDict['NETRTG'].append(player[10])
    statsDict['AST%'].append(player[11])
    statsDict['AST/TO'].append(player[12])
    statsDict['AST RATIO'].append(player[13])
    statsDict['OREB%'].append(player[14])
    statsDict['DREB%'].append(player[15])
    statsDict['REB%'].append(player[16])
    statsDict['TO RATIO'].append(player[17])
    statsDict['EFG%'].append(player[18])
    statsDict['TS%'].append(player[19])
    statsDict['USG%'].append(player[20])
    statsDict['PACE'].append(player[21])
    statsDict['PIE'].append(player[22])
    
# creating a dataframe with our scraped data
player_df_dirty = pd.DataFrame(data = statsDict, columns= statsDict.keys())



In [170]:
# defining functions in order to clean our dataframe 

def extract_name(html_string):
    match = re.search(r'>([^<>]+)<', html_string)
    return match.group(1) if match else html_string

def clean_html_columns(df, columns):
    for col in columns:
        if col in df.columns:
            df[col] = df[col].astype(str).apply(extract_name)
    return df

player_df = clean_html_columns(player_df_dirty, player_df_dirty.columns)
player_df

Unnamed: 0,NAME,AGE,TEAM,GP,W,L,MINUTES,OFFRTG,DEFRTG,NETRTG,...,AST RATIO,OREB%,DREB%,REB%,TO RATIO,EFG%,TS%,USG%,PACE,PIE
0,Adem Bona,21,PHI,6,3,3,10.8,103.7,106.8,-3.1,...,11.8,6.3,15.2,10.3,17.6,60.0,57.4,9.3,99.86,4.2
1,Brice Sensabaugh,20,UTA,6,4,2,21.4,102.8,101.7,1.0,...,14.7,3.1,12.4,8.2,14.7,52.2,57.6,19.5,108.81,10.1
2,Bronny James,20,LAL,6,2,4,16.2,88.9,118.4,-29.6,...,4.2,2.6,7.2,4.7,14.6,31.1,32.2,18.7,101.87,-1.0
3,Buddy Hield,31,GSW,6,6,0,17.2,115.2,84.0,31.2,...,13.9,0.9,11.5,6.6,13.9,70.2,70.2,23.9,102.89,13.9
4,Cody Williams,19,UTA,6,4,2,23.9,103.7,105.1,-1.4,...,17.9,3.2,6.4,5.0,12.5,46.9,53.7,13.1,106.31,5.0
5,Collin Sexton,25,UTA,6,4,2,22.5,108.9,108.3,0.7,...,21.0,1.6,11.4,7.2,16.2,53.2,55.7,25.4,103.37,10.9
6,Gary Payton II,31,GSW,6,6,0,13.1,122.8,92.4,30.4,...,34.4,6.5,10.6,8.6,18.8,75.0,71.8,10.6,104.42,7.7
7,Guerschon Yabusele,28,PHI,6,3,3,17.9,103.6,111.1,-7.5,...,21.2,6.5,20.4,13.6,7.7,69.7,73.1,16.5,100.73,17.2
8,Gui Santos,22,GSW,6,6,0,7.7,101.0,90.0,11.0,...,26.7,4.1,8.2,6.4,0.0,40.0,46.8,10.1,96.21,5.0
9,Jalen Hood-Schifino,21,LAL,6,2,4,19.8,92.9,117.7,-24.8,...,29.1,2.9,9.1,5.8,16.5,29.8,31.2,19.2,102.48,2.3


In [127]:
driver.close()