In [1]:
# Initial imports
from io import StringIO
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup as soup
from splinter import Browser
from sqlalchemy import create_engine
from webdriver_manager.chrome import ChromeDriverManager

from config import db_password

In [2]:
# Set up Splinter: webdriver_manager installs (or reuses a cached) chromedriver
# and returns its path, which is handed to a visible (non-headless) Chrome session.
executable_path = dict(executable_path=ChromeDriverManager().install())
browser = Browser('chrome', headless=False, **executable_path)



Current google-chrome version is 101.0.4951
Get LATEST chromedriver version for 101.0.4951 google-chrome
Trying to download new driver from https://chromedriver.storage.googleapis.com/101.0.4951.41/chromedriver_mac64.zip
Driver has been saved in cache [/Users/kaiyahull/.wdm/drivers/chromedriver/mac64/101.0.4951.41]


In [3]:
# Visit the Wikipedia list of actors with Academy Award nominations
url = 'https://en.wikipedia.org/wiki/List_of_actors_with_Academy_Award_nominations#List_of_actors'
browser.visit(url)

# Wait (up to wait_time seconds) for the sortable actor table to be present.
# The `jquery-tablesorter` class is part of the class string the table lookup
# further down relies on, so that is the element worth waiting for; the value
# displayed by this cell is True when the table was found in time.
browser.is_element_present_by_css('table.wikitable.jquery-tablesorter', wait_time=1)

False

In [4]:
# Take the page source from the browser and parse it with
# BeautifulSoup's built-in 'html.parser' backend.
html = browser.html
html_soup = soup(html, features='html.parser')

In [5]:
# Get the first sortable wikitable on the page (the list of actors).
# A CSS selector matches the three classes in any order, whereas
# find(class_='a b c') only matches that exact attribute string.
table = html_soup.select_one('table.sortable.wikitable.jquery-tablesorter')

# Fail here with a clear message instead of an obscure parser error later on.
if table is None:
    raise ValueError('Actor table not found on the page; the page layout may have changed.')


In [6]:
# Read the table of the list of actors and create a dataframe.
# read_html returns a list of frames; the markup holds a single <table>, so take [0].
# The markup is wrapped in StringIO because passing literal HTML strings to
# read_html is deprecated in recent pandas releases.
actor_table = pd.read_html(StringIO(str(table)))[0]

In [7]:
# Show dataframe (the bare last expression is rendered as the cell's output)
actor_table

Unnamed: 0,Actor,Unnamed: 1,Born,Died,Age,Nomina-tions,Wins,Lead and sup-porting details,First winning film role or first nomination(also see list of all nominated roles),Firstyear,Lastyear
0,Barkhad Abdi,M,1985,~,37,1,0,S,Captain Phillips,2013,2013
1,F. Murray Abraham,M,1939,~,82,1,1,L,Amadeus,1984,1984
2,Amy Adams,F,1974,~,47,6,0,1L:5S,American Hustle (2013),2005,2018
3,Nick Adams,M,1931,1968,36,1,0,S,Twilight of Honor,1963,1963
4,Isabelle Adjani,F,1955,~,66,2,0,L,"Story of Adele H., The",1975,1989
...,...,...,...,...,...,...,...,...,...,...,...
949,Loretta Young,F,1913,2000,87,2,1,L,"Farmer's Daughter, The",1947,1949
950,Roland Young,M,1887,1953,65,1,0,S,Topper,1937,1937
951,Youn Yuh-jung,F,1947,~,74,1,1,S,Minari,2020,2020
952,Renée Zellweger,F,1969,~,53,4,2,3L1:1S1,Judy [w 42],2001,2019


In [8]:
# Rename columns positionally: the list must name every scraped column, in order.
# set_axis returns a new frame by default; its `inplace` keyword was removed in
# pandas 2.0 (passing it raises TypeError there), so it is no longer passed.
actor_table = actor_table.set_axis(
    [
        'actor',
        'gender',
        'birth_year',
        'death_year',
        'age',
        'nominations',
        'awards_won',
        'lead',
        'film_role',
        'firstyear',
        'lastyear',
    ],
    axis=1,
)

In [9]:
# Discard everything except the target information: each actor's name,
# number of nominations and number of awards won.
actor_table = actor_table.drop(
    columns=[
        'gender',
        'birth_year',
        'death_year',
        'age',
        'lead',
        'film_role',
        'firstyear',
        'lastyear',
    ]
)

In [10]:
# Preview the first rows to confirm only actor, nominations and awards_won remain
actor_table.head()

Unnamed: 0,actor,nominations,awards_won
0,Barkhad Abdi,1,0
1,F. Murray Abraham,1,1
2,Amy Adams,6,0
3,Nick Adams,1,0
4,Isabelle Adjani,2,0


In [11]:
# Check data types before cleaning (the count columns are cast to int afterwards)
actor_table.dtypes

actor          object
nominations    object
awards_won     object
dtype: object

In [12]:
# Clean up numbers (some rows have references): keep only the first run of
# digits in each value, then cast the column to int.
# The pattern is a raw string so that `\d` is not treated as an (invalid)
# string escape sequence by Python.
actor_table['nominations'] = (
    actor_table['nominations']
    .str.extract(r'(\d+)', expand=False)
    .astype(str)
    .astype(int)
)

In [13]:
# Clean up numbers: keep only the first run of digits in each value, then
# cast the column to int.
# The pattern is a raw string so that `\d` is not treated as an (invalid)
# string escape sequence by Python.
actor_table['awards_won'] = (
    actor_table['awards_won']
    .str.extract(r'(\d+)', expand=False)
    .astype(str)
    .astype(int)
)

In [14]:
# Verify that nominations and awards_won were cast to integers
actor_table.dtypes

actor          object
nominations     int64
awards_won      int64
dtype: object

In [15]:
# Connection string to PostgreSQL (user "postgres", local server, database "group_project").
# The password is percent-encoded so that characters such as '@', ':' or '/'
# cannot be mistaken for URL delimiters when the string is parsed.
db_string = f"postgresql://postgres:{quote_plus(str(db_password))}@127.0.0.1:5432/group_project"

In [16]:
# Create the SQLAlchemy database engine from the PostgreSQL connection string
engine = create_engine(db_string)

In [17]:
# Export actor awards to a SQL table (the dataframe index is not written).
# if_exists="replace" lets the notebook be re-run from a fresh kernel: the
# default ("fail") raises once the actor_awards table already exists.
actor_table.to_sql(name="actor_awards", con=engine, index=False, if_exists="replace")

In [18]:
# Quit the Splinter-driven browser now that scraping and export are finished
browser.quit()