# Wikipedia

In [34]:
# Wikipedia page that the cells below download and parse
URL = "https://en.wikipedia.org/wiki/Academy_Award_for_Best_Actor"

## Scrape data

In [35]:
import requests

# Download the page through a Session; timeout=10 makes the request fail
# instead of waiting indefinitely on an unresponsive server
s= requests.Session()
response = s.get(URL, timeout=10)
# Bare expression so the notebook displays the Response (shows the status code)
response

<Response [200]>

In [36]:
import bs4
from bs4 import BeautifulSoup

# Parse the raw bytes of the response body with Python's built-in HTML parser
soup = BeautifulSoup(response.content, 'html.parser')

In [8]:
# Indented string rendering of the parsed document, handy for inspecting the HTML
# (only assigned here; print or display `pretty_soup` to actually view it)
pretty_soup = soup.prettify()

## Find Tables

In [37]:
# First <table> whose class attribute is 'wikitable sortable'
# NOTE(review): find() returns None when nothing matches; the cells below assume a hit
right_table=soup.find('table', {"class":'wikitable sortable'})

In [48]:
# Number of <td> cells in the LAST row of the table: `cells` is overwritten on
# every iteration, so only the final row's cells survive the loop.
# NOTE(review): this is not the table's column count; the header row is read
# from <th> cells in a later cell and lists five columns.
for row in right_table.findAll("tr"):
    cells = row.findAll('td')

len(cells)

3

In [43]:
# Number of rows in the table including header
# (`rows` is kept because the header-extraction cell reads rows[0])
rows = right_table.findAll("tr")

len(rows)

463

In [65]:
# Column names: text of each <th> in the first table row, trailing whitespace removed
header = [th.text.rstrip() for th in rows[0].find_all('th')]
print(header)

['Year', 'Actor', 'Role(s)', 'Film', 'Ref.']


## Scrape the Data into lists

In [None]:
# One list per scraped column; the dictionary cell later stores them under
# 'Actor', 'Role(s)' and 'Film' respectively
c1, c2, c3 = [], [], []

# Walk every table row and keep only rows with exactly four <td> cells.
# Per the original note: 4 pulls the winners, 3 pulls all nominees -
# change the number below to switch.
for row in right_table.findAll("tr"):
    cells = row.findAll('td')
    if len(cells) != 4:
        continue
    # zip stops after the three target lists, so any further cell is ignored;
    # each stored value is the first text node found inside the cell
    for column, cell in zip((c1, c2, c3), cells):
        column.append(cell.find(text=True))

# Check the data
print(c1)

In [None]:
# Seed the dictionary from the scraped header so the keys keep the page's
# column order; every column starts with 0 as a placeholder.
# Recall the headers we pulled were:
# ['Year', 'Actor', 'Role(s)', 'Film', 'Ref.']
d = dict.fromkeys(header, 0)

# Replace the placeholders with the scraped lists; the remaining keys keep 0
d.update({'Actor': c1, 'Role(s)': c2, 'Film': c3})

## Clean the Data

In [None]:
import pandas as pd  # imported in this cell so it does not rely on another cell

# Convert dict to DataFrame (scalar placeholder values are repeated on every row)
df_table = pd.DataFrame(d)

# Cheeky workaround to add the years: number the rows consecutively from 1928
# in column 0, written in a single positional assignment rather than one
# cell at a time.
# NOTE(review): assumes one scraped row per year, in chronological order - confirm
df_table.iloc[:, 0] = list(range(1928, 1928 + len(df_table)))

# Drop the unused "Ref." column from the table
df_table = df_table.drop('Ref.', axis=1)

# Give the DataFrame a meaningful name for analysis (same object, not a copy)
best_actor = df_table

# Write a copy to .csv so you can access it later
best_actor.to_csv("best_actor_1-10-2020.csv")

# Metacritic

In [14]:

import pandas as pd  # imported in this cell so it does not rely on another cell

# Column layout of the resulting Data Frame
columns = ["Title", "Platform", "Metascore", "User Score", "Release Date"]

# Set a user agent so that metacritic thinks it is being accessed by a browser
user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.112 Safari/537.36'
headers = {'User-Agent':user_agent,}

# Scraped rows are collected in a plain list and converted to a Data Frame once
# at the end: DataFrame.append copied the whole frame on every call and no
# longer exists in pandas 2.0+.
records = []

try:
    # One Session reused for every page and closed automatically afterwards
    with requests.Session() as s:
        for i in range(0,162): # Going on the site, there are 162 pages of games
            # Condensed view of the site; the page number sits at the end of the
            # URL, so it can simply be appended to the string
            URL = "https://www.metacritic.com/browse/games/score/metascore/all/all/filtered?view=condensed&sort=desc&page="+str(i)

            # Include the headers in the request to the page
            response = s.get(URL, headers=headers, timeout=10)

            # Parse the response body; BeautifulSoup cannot parse the Response
            # object itself
            soup = BeautifulSoup(response.content, "html.parser")

            ## Note, the title and platform are formatted a little funny, so they
            ## are cleaned up while scraping
            for games in soup.find_all('div', class_ = "product_wrap", limit=100):

                # Extract title and platform: remove the double-space padding,
                # then split on the first newline that remains
                title_platform = games.find('a').text
                title_platform = title_platform.replace("  ", "").strip()

                newline_at = title_platform.find('\n')
                title = title_platform[0:newline_at]
                # Skip the character after the newline and drop the final character
                platform = title_platform[newline_at+2:len(title_platform)-1]

                # Extract Metascore and User score
                m_score = games.find('div', class_ = 'metascore_w').text
                u_score = games.find('span', class_ = 'textscore').text

                # Extract release date (second 'data' span of the entry)
                r_date = games.find_all('span', class_= 'data')[1].text

                records.append({'Title':title, 'Platform':platform ,'Metascore':m_score, 'User Score':u_score, 'Release Date':r_date})
finally:
    # Build the frame even if a page fails, so rows scraped so far are kept
    df = pd.DataFrame(records, columns=columns)