In [26]:
import wikipedia
import requests
from bs4 import BeautifulSoup
import time
import numpy as np

# first pull the HTML from the page that links to all of the pages with the links.
# in this case, this page gives the links list pages of sci-fi films by decade.
# just go to https://en.wikipedia.org/wiki/Lists_of_science_fiction_films
# to see what I'm pulling from.
html = requests.get('https://en.wikipedia.org/wiki/Lists_of_science_fiction_films')


#turn the HTML into a beautiful soup text object
b = BeautifulSoup(html.text, 'lxml')

# create an mpty list where those links will go.
links = []

# in this case, all of the links we're in a '<li>' brackets.
for i in b.find_all(name = 'li'):
    # pull the actual link for each one
    for link in i.find_all('a', href=True):
        links.append(link['href'])
links

['/wiki/List_of_science_fiction_films_before_1920',
 '/wiki/List_of_science_fiction_films_of_the_1920s',
 '/wiki/List_of_science_fiction_films_of_the_1930s',
 '/wiki/List_of_science_fiction_films_of_the_1940s',
 '/wiki/List_of_science_fiction_films_of_the_1950s',
 '/wiki/List_of_science_fiction_films_of_the_1960s',
 '/wiki/List_of_science_fiction_films_of_the_1970s',
 '/wiki/List_of_science_fiction_films_of_the_1980s',
 '/wiki/List_of_science_fiction_films_of_the_1990s',
 '/wiki/List_of_science_fiction_films_of_the_2000s',
 '/wiki/List_of_science_fiction_films_of_the_2010s',
 '/wiki/List_of_science_fiction_television_films',
 '/wiki/Template:Scififilmlist',
 '/wiki/Template_talk:Scififilmlist',
 '//en.wikipedia.org/w/index.php?title=Template:Scififilmlist&action=edit',
 '#Lists_by_decade',
 '#See_also',
 '#References',
 '#External_links',
 '/wiki/List_of_science_fiction_films_of_the_1920s',
 '/wiki/List_of_science_fiction_films_of_the_1930s',
 '/wiki/List_of_science_fiction_films_of_th

In [27]:
# the above code ends up pulling more links than I want,
# so I just use the ones I want
links = links[1:11]
# each link only returns something like 'wiki/List_of_science_fiction_films_of_the_1920s'
# so I add the other part of the URL to each.
decade_links = ['https://en.wikipedia.org' + i for i in links]

decade_links

['https://en.wikipedia.org/wiki/List_of_science_fiction_films_of_the_1920s',
 'https://en.wikipedia.org/wiki/List_of_science_fiction_films_of_the_1930s',
 'https://en.wikipedia.org/wiki/List_of_science_fiction_films_of_the_1940s',
 'https://en.wikipedia.org/wiki/List_of_science_fiction_films_of_the_1950s',
 'https://en.wikipedia.org/wiki/List_of_science_fiction_films_of_the_1960s',
 'https://en.wikipedia.org/wiki/List_of_science_fiction_films_of_the_1970s',
 'https://en.wikipedia.org/wiki/List_of_science_fiction_films_of_the_1980s',
 'https://en.wikipedia.org/wiki/List_of_science_fiction_films_of_the_1990s',
 'https://en.wikipedia.org/wiki/List_of_science_fiction_films_of_the_2000s',
 'https://en.wikipedia.org/wiki/List_of_science_fiction_films_of_the_2010s']

In [28]:
# create two new lists, one for the title of the page, 
# and one for the link to the page
film_titles = []
film_links = []
# for loop to pull from each decade page with list of films.
# look at https://en.wikipedia.org/wiki/List_of_science_fiction_films_of_the_1920s
# to follow along as an exampe
for decade in decade_links:
    print(f'Collecting films from {decade}')
    html = requests.get(decade)
    b = BeautifulSoup(html.text, 'lxml')
    # get to the table on the page
    for i in b.find_all(name='table', class_='wikitable'):
        # get to the row of each film
        for j in i.find_all(name='tr'):
            #get just the title cell for each row.
            # contains the title and the URL
            for k in j.find_all(name='i'):
                # get within that cell to just get the words
                for link in k.find_all('a', href=True):
                    # get the title and add to the list
                    film_titles.append(link['title'])
                    # get the link and add to that list
                    film_links.append(link['href'])
    #be a conscientious scraper and pause between scrapes
    time.sleep(1)
print(f'Number of Film Links Collected: {len(film_links)}')
print(f'Number of Film Titles Collected: {len(film_titles)}')


Collecting films from https://en.wikipedia.org/wiki/List_of_science_fiction_films_of_the_1920s
Collecting films from https://en.wikipedia.org/wiki/List_of_science_fiction_films_of_the_1930s
Collecting films from https://en.wikipedia.org/wiki/List_of_science_fiction_films_of_the_1940s
Collecting films from https://en.wikipedia.org/wiki/List_of_science_fiction_films_of_the_1950s
Collecting films from https://en.wikipedia.org/wiki/List_of_science_fiction_films_of_the_1960s
Collecting films from https://en.wikipedia.org/wiki/List_of_science_fiction_films_of_the_1970s
Collecting films from https://en.wikipedia.org/wiki/List_of_science_fiction_films_of_the_1980s
Collecting films from https://en.wikipedia.org/wiki/List_of_science_fiction_films_of_the_1990s
Collecting films from https://en.wikipedia.org/wiki/List_of_science_fiction_films_of_the_2000s
Collecting films from https://en.wikipedia.org/wiki/List_of_science_fiction_films_of_the_2010s
Number of Film Links Collected: 1828
Number of Fil

In [29]:
# remove film links that don't have a description page on Wikipedia
new_film_links = [i for i in film_links if 'redlink' not in i]

# same goes for titles
new_film_titles = [i for i in film_titles if '(page does not exist)' not in i]

print(f'Number of Film Links with Wikipedia Pages: {len(new_film_links)}')
print(f'Number of Film Titles with Wikipedia Pages: {len(new_film_titles)}')
#use this list to fetch from the API
title_links = list(zip(new_film_titles, new_film_links))

Number of Film Links with Wikipedia Pages: 1773
Number of Film Titles with Wikipedia Pages: 1773


In [31]:
title_links

[('Algol (film)', '/wiki/Algol_(film)'),
 ('Dr. Jekyll and Mr. Hyde (1920 Haydon film)',
  '/wiki/Dr._Jekyll_and_Mr._Hyde_(1920_Haydon_film)'),
 ('Dr. Jekyll and Mr. Hyde (1920 film)',
  '/wiki/Dr._Jekyll_and_Mr._Hyde_(1920_film)'),
 ('Figures of the Night', '/wiki/Figures_of_the_Night'),
 ('The Invisible Ray (1920 serial)', '/wiki/The_Invisible_Ray_(1920_serial)'),
 ('The Mechanical Man', '/wiki/The_Mechanical_Man'),
 ('The Man from Beyond', '/wiki/The_Man_from_Beyond'),
 ('Black Oxen', '/wiki/Black_Oxen'),
 ('Aelita', '/wiki/Aelita'),
 ('The Hands of Orlac (1924 film)', '/wiki/The_Hands_of_Orlac_(1924_film)'),
 ("L'Inhumaine", '/wiki/L%27Inhumaine'),
 ('The Last Man on Earth (1924 film)',
  '/wiki/The_Last_Man_on_Earth_(1924_film)'),
 ('The Lost World (1925 film)', '/wiki/The_Lost_World_(1925_film)'),
 ('Luch Smerti', '/wiki/Luch_Smerti'),
 ('Paris Qui Dort', '/wiki/Paris_Qui_Dort'),
 ('Metropolis (1927 film)', '/wiki/Metropolis_(1927_film)'),
 ('Alraune (1928 film)', '/wiki/Alraune_