In [1]:
import copy
import re
import urllib
import urllib.parse

import requests
from bs4 import BeautifulSoup

In [2]:
def __has_valid_parentheses(s):
    """Tell whether the round brackets in ``s`` are balanced.

    Walks ``s`` item by item while tracking the nesting depth: "(" opens a
    level, ")" closes one, everything else is ignored.

    Returns:
        False as soon as a ")" appears without a matching earlier "(";
        otherwise True only if no bracket is left open at the end.
    """
    depth = 0
    for ch in s:
        # +1 for an opening bracket, -1 for a closing one, 0 for anything else
        depth += (ch == "(") - (ch == ")")
        if depth < 0:
            return False
    return depth == 0

def __check_bad_movie_type(s):
    """Report whether ``s`` mentions any of the unwanted credit markers.

    The check is a plain, case-sensitive containment test (``marker in s``)
    against a fixed list of markers.

    Returns:
        True if at least one marker is contained in ``s``, False otherwise.
    """
    markers = (
        'TV Series', 'Short', 'Video Game', 'Video short', 'Video',
        'TV Movie', 'TV Mini-Series', 'TV Series short', 'TV Special',
        'Documentary short', 'Documentary', 'voice', 'uncredited',
        'unconfirmed',
    )
    return any(marker in s for marker in markers)

def __check_bad_release(el) -> bool:
    """Report whether ``el`` holds anything other than exactly one ``<a>`` tag.

    Args:
        el: element exposing BeautifulSoup's ``find_all``.

    Returns:
        False when ``el.find_all('a')`` yields exactly one link, True for
        zero or for several links.
        NOTE(review): presumably extra links mark unreleased or episodic
        credits - confirm against the page markup.
    """
    links = el.find_all('a')
    return len(links) != 1

def __extract_year(el) -> None:
    """Remove the year column from ``el`` in place, if there is one.

    Looks up the first ``<span class="year_column">`` below ``el`` and detaches
    it from the tree. Does nothing when no such span exists.

    Args:
        el: element exposing BeautifulSoup's ``find`` (the notebook passes
            rows returned by ``soup.find_all``).
    """
    year_span = el.find('span', attrs={'class': 'year_column'})
    # ``find`` yields None on a miss, and ``len()`` of a tag counts its
    # children, so test for presence explicitly instead of measuring the result.
    if year_span is not None:
        year_span.extract()

def __extract_movie_name_url(el) -> (str, str):
    """Detach the first ``<b>`` element of ``el`` and read the title data from it.

    The ``<b>`` element is removed from ``el`` as a side effect, so later reads
    of ``el.text`` no longer include it.

    Args:
        el: element exposing BeautifulSoup's ``find``.

    Returns:
        A pair ``(name, href)``: the stripped text of the ``<b>`` element and
        the ``href`` attribute of the first ``<a>`` inside it (returned as
        found, not resolved against any base URL).
    """
    title_tag = el.find('b')
    title_tag.extract()
    name = title_tag.text.strip()
    href = title_tag.find('a')['href']
    return name, href
    

In [34]:
# Site root; relative title links are joined onto it by the scraping cell.
_url = 'https://www.imdb.com/'

# Person page to scrape.
url = 'https://www.imdb.com/name/nm0000001/'

# Extra request headers sent with the page fetch.
headers = {
    # ask for English content
    'Accept-Language': 'en',
    # NOTE(review): presumably pins the region the site localises titles for - confirm
    'X-FORWARDED-FOR': '2.21.184.0',
}

In [35]:
# Fetch the person page. Without a timeout `requests` may wait indefinitely on a
# stalled connection, which would hang the whole notebook run.
response = requests.get(url, headers=headers, timeout=30)
# Stop on 4xx/5xx with the status code and URL in the error message.
response.raise_for_status()
# Parse with the parser bundled with the standard library.
soup = BeautifulSoup(response.text, "html.parser")

In [36]:
# Collect (title, absolute URL) pairs for the credits that survive all filters.
output = []
# Select filmography rows by id prefix. The pattern is anchored: a bare 'actor-*'
# is read as a regex ("actor" plus any number of dashes) and would match any id
# that merely contains "actor".
result = soup.find_all('div', attrs={'id': re.compile(r'^actor-'), 'class': 'filmo-row'})
# check if it is actor or actress
if len(result) == 0:
    result = soup.find_all('div', attrs={'id': re.compile(r'^actress-'), 'class': 'filmo-row'})
assert len(result) != 0, 'Something wrong with the actor/actress soup.'
for el in result:
    # The helpers below remove nodes from the element they are given. Work on a
    # detached copy so `soup` stays intact and this cell can be re-run as is.
    row = copy.copy(el)

    # get rid of year
    __extract_year(row)

    # check for the release; this must run while the title link is still in the row
    if __check_bad_release(row):
        continue

    # get movie name and url (this also removes the title element from the row)
    movie_name, movie_url = __extract_movie_name_url(row)
    # get the remaining element text, title excluded
    text = row.text.strip()

    # check the movie type on the remaining text only, so that a title which merely
    # contains one of the markers (e.g. the word "Short") is not discarded.
    # NOTE(review): assumes the type annotation sits outside the <b> title element - confirm
    if __check_bad_movie_type(text):
        continue

    # check if there are some unidentified brackets left
    if '(' in text and __has_valid_parentheses(text):
        print('='*25)
        print(f'{movie_name} has some unidentified brackets:\n{text}')

    output.append((movie_name, urllib.parse.urljoin(_url, movie_url)))

Ziegfeld Follies has some unidentified brackets:
Fred Astaire ('Here's to the Ladies') / Raffles ('This Heart of Mine') / Tai Long ('Limehouse Blues') / ...


In [37]:
# Display the collected (title, absolute URL) pairs.
output

[('Ghost Story', 'https://www.imdb.com/title/tt0082449/'),
 ('The Purple Taxi', 'https://www.imdb.com/title/tt0076851/'),
 ('The Amazing Dobermans', 'https://www.imdb.com/title/tt0074130/'),
 ('The Towering Inferno', 'https://www.imdb.com/title/tt0072308/'),
 ('Midas Run', 'https://www.imdb.com/title/tt0064664/'),
 ("Finian's Rainbow", 'https://www.imdb.com/title/tt0062974/'),
 ('The Notorious Landlady', 'https://www.imdb.com/title/tt0056289/'),
 ('The Pleasure of His Company', 'https://www.imdb.com/title/tt0055307/'),
 ('On the Beach', 'https://www.imdb.com/title/tt0053137/'),
 ('Silk Stockings', 'https://www.imdb.com/title/tt0050972/'),
 ('Funny Face', 'https://www.imdb.com/title/tt0050419/'),
 ('Daddy Long Legs', 'https://www.imdb.com/title/tt0047969/'),
 ('The Band Wagon', 'https://www.imdb.com/title/tt0045537/'),
 ('The Belle of New York', 'https://www.imdb.com/title/tt0044408/'),
 ('Royal Wedding', 'https://www.imdb.com/title/tt0043983/'),
 ("Let's Dance", 'https://www.imdb.com/t

In [9]:
# Reference list of (title, url) pairs used by the comparison cell, which only
# looks at the titles. The URLs are stored exactly as given (no "//" after the scheme).
# NOTE(review): these titles do not appear in the recorded output for the current
# `url`; presumably the list belongs to a different person page - confirm.
correct = [
    ('The Tomorrow War', 'https:www.imdb.com/title/tt9777666'),
    ('Avengers: Endgame', 'https:www.imdb.com/title/tt4154796'),
    ('The Kid', 'https:www.imdb.com/title/tt4975920'),
    ('Jurassic World: Fallen Kingdom', 'https:www.imdb.com/title/tt4881806'),
    ('Avengers: Infinity War', 'https:www.imdb.com/title/tt4154756'),
    ('Guardians of the Galaxy Vol. 2', 'https:www.imdb.com/title/tt3896198'),
    ('Passengers', 'https:www.imdb.com/title/tt1355644'),
    ('The Magnificent Seven', 'https:www.imdb.com/title/tt2404435'),
    ('Jem and the Holograms', 'https:www.imdb.com/title/tt3614530'),
    ('Jurassic World', 'https:www.imdb.com/title/tt0369610'),
    ('Guardians of the Galaxy', 'https:www.imdb.com/title/tt2015381'),
    ('Delivery Man', 'https:www.imdb.com/title/tt2387559'),
    ('Her', 'https:www.imdb.com/title/tt1798709'),
    ('Movie 43', 'https:www.imdb.com/title/tt1333125'),
    ('Zero Dark Thirty', 'https:www.imdb.com/title/tt1790885'),
    ('The Five-Year Engagement', 'https:www.imdb.com/title/tt1195478'),
    ("What's Your Number?", 'https:www.imdb.com/title/tt0770703'),
    ('10 Years', 'https:www.imdb.com/title/tt1715873'),
    ('Moneyball', 'https:www.imdb.com/title/tt1210166'),
    ('Take Me Home Tonight', 'https:www.imdb.com/title/tt0810922'),
    ("Jennifer's Body", 'https:www.imdb.com/title/tt1131734'),
    ('Deep in the Valley', 'https:www.imdb.com/title/tt1078885'),
    ('Bride Wars', 'https:www.imdb.com/title/tt0901476'),
    ('Wanted', 'https:www.imdb.com/title/tt0493464'),
    ('Wieners', 'https:www.imdb.com/title/tt0800268'),
    ('Walk the Talk', 'https:www.imdb.com/title/tt0462601'),
    ('Strangers with Candy', 'https:www.imdb.com/title/tt0369994'),
    ('The Extreme Team', 'https:www.imdb.com/title/tt0295249'),
]

In [11]:
# Titles gathered by the scraping cell.
output_set = {entry[0] for entry in output}
# Print the first reference entry whose title was not scraped, then stop looking.
for x in correct:
    if x[0] in output_set:
        continue
    print(x)
    break
# Cell result: number of distinct scraped titles vs. number of reference entries.
len(output_set), len(correct)

(28, 28)