In [1]:
import pandas as pd
import os
import urllib.request
import re
import json
import pandas as pd
import numpy as np 
import os
import networkx as nx
import seaborn as sns
import matplotlib.pyplot as plt
from fa2 import ForceAtlas2
from bs4 import BeautifulSoup
import ast
import urllib.request
import json
from imdb import IMDb
from difflib import SequenceMatcher
from urllib import request, parse
import wikipedia
import warnings
warnings.filterwarnings('ignore')
import imdb
import nltk
import pandasql as psql
from nltk.stem import WordNetLemmatizer
lmtzr = WordNetLemmatizer()
from nltk.corpus import stopwords
from nltk import word_tokenize
import itertools
from wordcloud import WordCloud

## 1. <font id = 'data_extraction'>Data Extraction</font>

*Note: long computation time (~2h)*

## <font color=blue id="film_extraction"> Extracting content from Moviepedia's pages </font>

We start with the [list of all titles](https://datasets.imdbws.com/title.basics.tsv.gz) present in IMDb database. Our objective is to find the top rated movies, from 1970 on, and with more than 100000 votes. To do so, we need to merge this dataset with [the one that contains number of votes and rating for each title](https://datasets.imdbws.com/title.ratings.tsv.gz).


In [2]:
# Load the IMDb catalogue of all titles (tab-separated dump).
title_basics = pd.read_csv(
    os.path.join('raw_IMDb', 'title_basics.tsv'),
    sep='\t',
)

# Preview the first rows to check the columns.
title_basics.head(n=5)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [3]:
# Load the IMDb ratings table (average rating and vote count per title).
title_ratings = pd.read_csv(
    os.path.join('raw_IMDb', 'title_ratings.tsv'),
    sep='\t',
)

# Preview the first rows to check the columns.
title_ratings.head(n=5)

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1832
1,tt0000002,6.0,236
2,tt0000003,6.5,1590
3,tt0000004,6.0,153
4,tt0000005,6.2,2407


In [4]:
# Join title metadata with ratings on the IMDb id; the inner join keeps only
# titles present in both tables.
ratings_merged = title_basics.merge(title_ratings, on='tconst', how='inner')

# The dump marks an unknown year with the literal string '\N'; map it to 0 so
# the column can be compared numerically.
ratings_merged['startYear'] = ratings_merged['startYear'].map(
    lambda year: 0 if year == '\\N' else int(year)
)

# Top movies: feature films released in 1970 or later, rated 7 or higher,
# with at least 100000 votes. Best-rated first.
top_movies = ratings_merged.loc[
    ratings_merged['titleType'].eq('movie')
    & ratings_merged['numVotes'].ge(100000)
    & ratings_merged['startYear'].ge(1970)
    & ratings_merged['averageRating'].ge(7)
].sort_values('averageRating', ascending=False)

# Keep the columns of interest under friendlier names.
top_movies = (
    top_movies[['tconst', 'primaryTitle', 'startYear', 'genres', 'averageRating']]
    .copy()
    .rename(columns={
        'tconst': 'Const',
        'primaryTitle': 'Title',
        'startYear': 'Release Date',
        'genres': 'Genres',
        'averageRating': 'IMDb Rating',
    })
)

top_movies

Unnamed: 0,Const,Title,Release Date,Genres,IMDb Rating
82237,tt0111161,The Shawshank Redemption,1994,Drama,9.3
45996,tt0068646,The Godfather,1972,"Crime,Drama",9.2
48538,tt0071562,The Godfather: Part II,1974,"Crime,Drama",9.0
249056,tt0468569,The Dark Knight,2008,"Action,Crime,Drama",9.0
82020,tt0110912,Pulp Fiction,1994,"Crime,Drama",8.9
...,...,...,...,...,...
228740,tt0418279,Transformers,2007,"Action,Adventure,Sci-Fi",7.0
225929,tt0411477,Hellboy II: The Golden Army,2008,"Action,Adventure,Fantasy",7.0
578777,tt1282140,Easy A,2010,"Comedy,Drama,Romance",7.0
89933,tt0120616,The Mummy,1999,"Action,Adventure,Fantasy",7.0


We must now move to Moviepedia in order to extract the plot of each film. Some movie titles will be spelled in the same way in Moviepedia, while others will have extra information like the year of release (e.g. Gladiator/Gladiator (2000)) or additional punctuation characters (e.g. The Godfather part II/ The Godfather: part II). As a result, we'll need to come up with an equivalent title for those that are spelled differently. To extract the content of each film's page from Moviepedia according to the list of top movies, we have designed an algorithm:

1. Make a query to Moviepedia api for a given top_movies title and extract the content of the page that corresponds to film's data:
    1. If the length of this content is greater than 1000 (length < 1000 includes the case in which the page doesn't exist, and also the case in which the page needs to be eliminated from the database), then store the content in movies_raw folder for that title.
    2. If length < 1000, then make a query for the given title + (release year). 
        1. Again, if length is > 1000, then store the content in movies_raw folder for that title.
        2. If length < 1000, then obtain the html content of Moviepedia's search page when searching for the movie title. Obtain the titles of the search result by manipulating DOM elements with the [**BeautifulSoup**](https://www.crummy.com/software/BeautifulSoup/bs4/doc/) python library. 
            1. Compare the similarity between the top_movies title and each of the titles from the search result with the [**SequenceMatcher**](https://docs.python.org/3.5/library/difflib.html#sequencematcher-examples) class provided by the *difflib* module.
            2. Get the most similar title and make a query to Moviepedia api with such title. 
                1. If length of the content > 1000, then store it in movies_raw.
                2. If length of the content < 1000, then do nothing. 

In [5]:
# IMDb ids that produce duplicates/errors when resolving an equivalent
# Moviepedia title. They were identified manually during data exploration.
titles_drop = [
    'tt0023427', 'tt0020686', 'tt0025509', 'tt0055824', 'tt0044926', 'tt1287878',
    'tt0063518', 'tt0028203', 'tt0028772', 'tt0024991', 'tt0056937',
]

# Remove those rows from top_movies in place.
top_movies.drop(index=top_movies.index[top_movies['Const'].isin(titles_drop)], inplace=True)

In [6]:
# Output folders, all located in the current working directory.
dir_path = os.getcwd()
movies_path, movies_plot, movies_cast = (
    os.path.join(dir_path, folder_name)
    for folder_name in ("movies_raw", "movies_plot", "movies_cast")
)

# Create each folder unless it is already there.
for folder in (movies_path, movies_plot, movies_cast):
    os.makedirs(folder, exist_ok=True)

In [7]:
#function to store content
def write_content(movie, text, path):
    """Write ``text`` to ``<path>/<movie>.txt``, lowercased and on a single line.

    Parameters
    ----------
    movie : str
        File stem (the IMDb id in this notebook).
    text : str
        Content to store. Newlines are replaced by spaces.
    path : str
        Existing directory in which the file is created (or overwritten).

    Raises
    ------
    UnicodeError
        If ``text`` contains an unpaired surrogate. The text is validated
        before the file is opened, so no empty file is left behind.
    """
    # Round-trip through UTF-16 so that a surrogate pair stored as two separate
    # code points is merged into the single character it represents.
    text = text.encode('utf-16', 'surrogatepass').decode('utf-16')

    # The context manager closes the handle even if the write fails.
    with open(os.path.join(path, "{}.txt".format(movie)), "w+") as handle:
        handle.write(text.lower().replace('\n', ' '))

In [8]:
def similar(a, b):
    """Return difflib's similarity ratio between ``a`` and ``b``.

    The ratio is a float in [0, 1]; 1.0 means the sequences are identical.
    """
    matcher = SequenceMatcher(isjunk=None, a=a, b=b)
    return matcher.ratio()

In [9]:
#function to extract title data from Moviepedia and return content if possible
def search_film(movie):
    """Fetch the wikitext of the Moviepedia page titled ``movie``.

    Parameters
    ----------
    movie : str
        Page title to query. It is URL-encoded before being sent.

    Returns
    -------
    str
        The content of the first revision returned by the API, or the string
        ``'None'`` when the page is missing, has no revision content, or its
        content is shorter than 1000 characters.
    """
    baseurl = "https://movies.fandom.com/api.php?"
    params = "&".join([
        "action=query",
        "prop=revisions&rvprop=content",
        "titles=" + urllib.parse.quote_plus(movie),
        "format=json",
    ])

    # Close the HTTP response deterministically instead of relying on the GC.
    with urllib.request.urlopen(baseurl + params) as wikiresponse:
        wikitext = wikiresponse.read().decode('utf-8')

    # The payload is JSON, not a Python literal: json.loads understands
    # true/false/null and merges escaped surrogate pairs into one character.
    pages = json.loads(wikitext)['query']['pages']
    page_id = next(iter(pages))

    # The API reports a missing page under the key '-1'.
    if page_id == '-1':
        return 'None'

    revisions = pages[page_id].get('revisions')
    if not revisions:
        return 'None'

    content = revisions[0]['*']

    # Pages under 1000 characters are treated as having no usable content.
    if len(content) < 1000:
        return 'None'

    return content

In [10]:
#function to extract search results for a given title and return the best matching title
def search_results(movie):
    """Search Moviepedia for ``movie`` and return the most similar page title.

    Only the first 6 result links are considered, and sub-pages whose title
    contains '/home media', '/transcript' or '/credits' are ignored. Titles are
    compared in lowercase with ``similar``.

    Parameters
    ----------
    movie : str
        Title to look for.

    Returns
    -------
    str
        The result title (original capitalization) with the highest similarity
        score, provided that score is at least 0.6; otherwise the string
        ``'None'``.
    """
    film = movie.strip().lower()

    url = "https://movies.fandom.com/wiki/Special:Search?query={}".format(urllib.parse.quote_plus(film))

    # Close the HTTP response deterministically instead of relying on the GC.
    with request.urlopen(url) as response:
        html = response.read().decode('utf8')

    soup = BeautifulSoup(html, 'html.parser')

    #find first 6 DOM elements containing title according to DOM element class
    titles = soup.find_all("a", class_='unified-search__result__title')[:6]

    #titles containing these strings do not correspond to movies
    excluded = ('/home media', '/transcript', '/credits')

    #list to store lowercase title
    ps = []

    #list to store original title
    bs = []

    for a in titles:

        s = a['data-title'].lower().strip()

        if not any(marker in s for marker in excluded):
            ps.append(s)
            bs.append(a['data-title'].strip())

    #no relevant title in the search result
    if len(ps) == 0:
        return 'None'

    #similarity of every candidate against the searched title (both lowercase)
    titles_scores = [similar(title, film) for title in ps]
    max_score = max(titles_scores)

    #return the best candidate only if the score is at least 0.6
    if max_score < 0.6:
        return 'None'

    return bs[titles_scores.index(max_score)]

In [11]:
#add column for new titles
top_movies['new_title'] = 'None'

#iterate over each top movie to retrieve Moviepedia's page content
# Rows are addressed by index label rather than by title: several films can
# share a title, and a title-based lookup would always pick the first row's id
# and year and overwrite 'new_title' for every row with that title.
rows = list(zip(top_movies.index, top_movies['Const'], top_movies['Title'], top_movies['Release Date']))

for row_label, id_movie, film, release_year in rows:

    title_new = film + ' ' + '(' + str(release_year) + ')'

    matched_title = 'None'
    content = 'None'

    # 1) exact title, 2) "title (year)" -- stop at the first usable page
    for candidate in (film, title_new):
        content = search_film(candidate)
        if content != 'None':
            matched_title = candidate
            break
    else:
        # 3) fall back to the most similar title from the search page
        search_result = search_results(film)
        if search_result != 'None':
            content = search_film(search_result)
            if content != 'None':
                matched_title = search_result

    if matched_title != 'None':
        top_movies.loc[row_label, 'new_title'] = matched_title
        write_content(id_movie, content, movies_path)

In [12]:
# Count the movies for which a Moviepedia page was retrieved
# (rows whose 'new_title' is no longer the 'None' placeholder).
int((top_movies['new_title'] != 'None').sum())

834

## <font color="blue" id = 'plot_extraction'> Extracting the plot </font>

Now that we have the raw content for each movie, we need to extract the plot. When analyzing the source text for each movie, we noticed that the name of the plot section can come in a variety of formats (e.g. Movie plot, plot, Characters and plot). 

Example source text, plot section of *The Godfather*:

<img src = "images/plot1.png" width = 1000>

Example source text, plot section of *Forrest Gump*: 

<img src = "images/plot2.png" width = 1000>

However, they all had the term "plot" in them. We created a regular expression that can retrieve the plot regardless of these inconsistencies in the section name.

## `plot[^=]*==([^=]*)=?` 

`plot[^=]*==` matches the existence or not of any character different than `=` between the word `plot` and the characters `==` . We did it this way because `=` marks the existence of a new section. Therefore, this part of the expression will allow us to find the starting point of the plot section, even if this section is not strictly named as `==Plot==`. 

`([^=]*)` this part of the expression will return us the plot of the movie. The whole part `==([^=]*)=?` matches a sequence of characters not containing the character `=` between `==` and `=`, with the possibility of the latter one being present or not. 

There will be some movie pages where plot is not present, or where the plot length is too small (just one sentence). We won't consider those cases for further analysis.


In [13]:
#set new column with plot
top_movies['plot'] = 'None'

for file_name in os.listdir(movies_path):

    if not file_name.endswith('.txt'):
        continue

    # the file stem is the IMDb id used in top_movies['Const']
    movie = file_name[:-len('.txt')]

    # Read-only access is enough here; the context manager closes the handle
    # even if reading fails.
    with open(os.path.join(movies_path, file_name), "r") as raw_file:
        text = raw_file.read().lower().replace('\n', ' ')

    # Evaluate the regular expression once; group 1 is the text that follows
    # the first heading containing the word "plot".
    match = re.search(r'plot[^=]*==([^=]*)=?', text)

    if match is None:
        continue

    plot = match.group(1)

    # plots of 250 characters or fewer are discarded
    if len(plot) > 250:

        write_content(movie, plot, movies_plot)

        top_movies.loc[top_movies['Const'] == movie, 'plot'] = plot

In [14]:
# Keep the movies whose plot could be extracted (independent copy), then
# report how many there are.
movies_plot_df = top_movies.loc[top_movies['plot'].ne('None')].copy()
movies_plot_df.shape[0]

513

## <font color="blue"> Extracting the cast </font>

We'll find the cast of those movies for which we could extract the plot. We proceeded in this order because there is a better and safer source for cast than Moviepedia, but Moviepedia was the only source for movie plots. 

The name of the cast section is contained in the general information box for each movie as it is shown in the following image:

<img src="images/cast.png" width = 300>


It is spelled differently throughout Moviepedia film pages, as happened with plots. Moreover, when extracting the cast, cast names come also in multiple and different formats. Therefore, the number of actors becomes substantially reduced when using Moviepedia as the source to get this information.

Example source text, cast section 

Fortunately, IMDb provides cast information for every movie. A library known as [**IMDbPY**](https://imdbpy.github.io) allows us to get all the information related to a title with the title's name or id: cast, directors, top reviews, crew, etc. 

We will compare the number of actors we get with each of these data sources to justify why we decided to proceed with IMDbPY for this purpose. 

We used the following regular expression to extract the cast from Moviepedia film pages:

## `\|[\s+]*(cast|starring)[\s+]*=(([^\|=])*)`

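This regular expression is written in lowercase; because the page content was converted to lowercase when it was stored, it matches the field name regardless of its original capitalization. The first part `\|[\s+]*(cast|starring)[\s+]*=` allows us to identify `|cast=` or `|starring=`, with a variable number of white spaces (or `+` characters, since `+` is inside the character class) between `|` and the field name, and between the field name and `=`. Then `(([^\|=])*)` will return the cast of a film by selecting every character up to the next `|` or `=`.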

In [16]:
# (movie id, actor name) pairs found in the stored Moviepedia pages
cast_mp = []

# only the movies for which a plot was extracted are scanned
for movie in movies_plot_df['Const']:

    with open(os.path.join(movies_path, movie + '.txt'), 'r') as r:
        content = r.read()

    # value of the "cast" / "starring" infobox field
    r_cast1 = r'\|[\s+]*(cast|starring)[\s+]*=(([^\|=])*)'
    rc1 = re.findall(r_cast1, content)

    if rc1:
        # names are the [[...]] links inside the first matching field
        r_names = re.findall(r"\[\[\s*([a-z\s']*)\s*\]\]", rc1[0][1])
        cast_mp.extend((movie.replace('.txt', ''), element) for element in r_names)

cast_dmp = pd.DataFrame(cast_mp, columns=['movie_id', 'actor_name'])

# the stored text is lowercase, so restore title case for display
cast_dmp['actor_name'] = cast_dmp['actor_name'].str.title()
cast_dmp

Unnamed: 0,movie_id,actor_name
0,tt0111161,Tim Robbins
1,tt0111161,Morgan Freeman
2,tt0111161,William Sadler
3,tt0111161,Clancy Brown
4,tt0111161,Gil Bellows
...,...,...
2140,tt0118655,Mindy Sterling
2141,tt0418763,Jake Gyllenhaal
2142,tt0418763,Jamie Foxx
2143,tt0418279,Shia Labeouf


In [17]:
# How many distinct actor names were extracted from Moviepedia
cast_dmp['actor_name'].nunique()

1296

Even though we get almost 1300 actors, we still need each actor's biography and birth information. This information is not provided by Moviepedia for most of the actors. We would need to move to another data source such as Wikipedia, and expect that there is a page for each of these actors. To do so, we could use the python library for [**Wikipedia**](https://pypi.org/project/wikipedia/) and proceed this way for each actor:

In [18]:
# actor name -> {'bday', 'bplace', 'bio', 'gender'} scraped from Wikipedia
actors_data = {}

#check only for some actors due to computation times
for actor in cast_dmp['actor_name'].unique()[:20]:

    # Resolve the actor to a page: first search hit, then the suggested
    # spelling. The page object is fetched once and reused for both the text
    # and the HTML. `except Exception` keeps the best-effort behaviour without
    # swallowing KeyboardInterrupt the way a bare `except:` does.
    try:
        page = wikipedia.page(wikipedia.search(actor, results = 1)[0])
        source_text, html = page.content, page.html()

    except Exception:

        try:
            page = wikipedia.page(wikipedia.suggest(actor))
            source_text, html = page.content, page.html()

        except Exception:
            source_text, html = 'None', 'None'

    if source_text != 'None' and html != 'None':

        soup = BeautifulSoup(html, 'html.parser')

        # birth date: first <span class="bday">, NaN when absent
        bday_tags = soup.find_all("span", class_='bday')

        if len(bday_tags) != 0:
            bday = pd.to_datetime(bday_tags[0].text, errors = 'coerce')
        else:
            bday = np.nan

        # birth country: text after the last comma of <div class="birthplace">
        bplace_tags = soup.find_all("div", class_='birthplace')

        if len(bplace_tags) != 0:
            country = bplace_tags[0].text.split(',')[-1]
        else:
            country = np.nan

        #remove section names and pre-process biographies
        bio = source_text.lower()
        bio = bio.replace('\n', ' ')

        #remove from 'references' section on
        bio = re.sub(r'=+[\s]*(references).*', ' ', bio)

        #remove from 'external links' section on
        bio = re.sub(r'=+[\s]*(external links).*', ' ', bio)

        #remove section names
        bio = re.sub(r'=+[^=]*=+', ' ', bio)

        #define gender from pronoun counts in the biography
        words = nltk.tokenize.word_tokenize(bio)
        female = words.count('her') + words.count('she')
        male = words.count('he') + words.count('him')

        if female > male:
            gender = 'female'
        elif male > female:
            gender = 'male'
        else:
            gender = 'Unknown'

        actors_data[actor] = {'bday': bday, 'bplace': country, 'bio': bio, 'gender': gender}

    else:
        # no page could be resolved for this actor
        actors_data[actor] = {'bday': np.nan, 'bplace': np.nan, 'bio': np.nan, 'gender': np.nan}


pd.DataFrame.from_dict(actors_data).transpose()
                                     

Unnamed: 0,bday,bplace,bio,gender
Tim Robbins,1932-07-22,,"thomas eugene robbins (born july 22, 1932) is ...",male
Morgan Freeman,1997-02-28,,"james morgan (born february 28, 1997) is an am...",male
William Sadler,1950-04-13,U.S.,"william thomas sadler (born april 13, 1950) is...",male
Clancy Brown,1959-01-05,U.S.,"clarence john brown iii (born january 5, 1959)...",male
Gil Bellows,NaT,,,
James Whitmore,1921-10-01,U.S.,"james allen whitmore jr. (october 1, 1921 – fe...",male
Marlon Brando,1924-04-03,U.S.,"marlon ernest brando jr. (april 3, 1924 – july...",male
Al Pacino,1940-04-25,U.S.,alfredo james pacino (; italian: [paˈtʃiːno]; ...,male
James Caan,NaT,,,
Richard Castellano,1933-09-04,U.S.,"richard salvatore castellano (september 4, 193...",male


We can get the majority of this information from IMDbPY for the 10 main actors of each film, which gives us several thousand actors in total (about 3,500 distinct ones) for the set of 513 films with a plot. That is why, rather than using Moviepedia or Wikipedia, we will use the IMDbPY api to obtain cast information.

In [19]:
# Collect the cast as plain records and build the dataframe once at the end:
# DataFrame.append copies the whole frame on every call and no longer exists
# in pandas 2.x.
cast_records = []

# one IMDb client is enough for all requests
ia = IMDb()

for film in movies_plot_df['Const']:

    # the IMDb client expects the numeric id without the 'tt' prefix
    id_movie = film.replace('tt','')

    # get a movie
    movie = ia.get_movie(id_movie)

    # Keep the first 10 cast members; 'relevance' is their 1-based position.
    # A movie without cast data (e.g. after a failed request) is skipped.
    for position, person in enumerate(movie.get('cast', [])[:10], start = 1):

        cast_records.append({'actor_name': person['name'], 'actor_id': person.personID, 'movie_id': film, 'relevance': position})

#define dataframe that contains the cast
actors_df = pd.DataFrame(cast_records, columns = ['actor_name', 'actor_id', 'movie_id', 'relevance'])

In [20]:
# Keep the first occurrence of every fully identical cast row.
actors_df = actors_df.loc[~actors_df.duplicated(keep='first')]

Now we have the most relevant 10 actors from each cast, we need to find the information for every member of the cast. We will again call the IMDb api to get this information.

We first create a dataframe that contains the name and id of each actor we extracted, as well as their number of appearances and the average rating of the movies they appear in.

In [21]:
# Build one row per actor (grouped by actor_id) by joining the cast table
# with the movie table on the IMDb id:
#   - appearances: number of joined cast rows for the actor
#   - rating: average 'IMDb Rating' of the movies the actor appears in
# birth_place, birth_date, gender and bio are created as NULL placeholders;
# nothing in this query fills them.
# The result is ordered by appearances, then rating, both descending.
# NOTE(review): sqldf is called without an explicit environment, so it
# presumably resolves actors_df and top_movies from the notebook namespace.

actors_data = psql.sqldf("""SELECT a.actor_name AS name, a.actor_id AS id, count(*) as appearances, 
                            null as birth_place, null as birth_date, 
                            null as gender, null as bio, AVG(m.'IMDb Rating') as rating
                                
                            FROM actors_df a INNER JOIN top_movies m
                            ON a.movie_id == m.Const
                          
                            GROUP BY (a.actor_id)
                            
                            ORDER BY appearances DESC, rating DESC
                            
                            """)

# display the resulting table
actors_data

Unnamed: 0,name,id,appearances,birth_place,birth_date,gender,bio,rating
0,Robert De Niro,0000134,16,,,,,7.918750
1,Tom Hanks,0000158,12,,,,,7.841667
2,Samuel L. Jackson,0000168,12,,,,,7.616667
3,Leonardo DiCaprio,0000138,10,,,,,8.090000
4,Al Pacino,0000199,10,,,,,8.060000
...,...,...,...,...,...,...,...,...
3537,Keith Lucas,5695570,1,,,,,7.000000
3538,Niall McNamee,6415305,1,,,,,7.000000
3539,Faithe Herman,7968936,1,,,,,7.000000
3540,Patrick L. Reyes,8423767,1,,,,,7.000000


In [22]:
def _guess_gender(bio):
    """Guess a gender from pronoun counts in a lowercase biography."""
    words = nltk.tokenize.word_tokenize(bio)
    female = words.count('her') + words.count('she')
    male = words.count('he') + words.count('him')

    if female > male:
        return 'female'
    if male > female:
        return 'male'
    return 'Unknown'


def actor_data(id_actor):
    """Fetch birth date, birth country, guessed gender and biography of a person.

    Parameters
    ----------
    id_actor : str
        IMDb person id as accepted by ``get_person``.

    Returns
    -------
    tuple
        ``(date, place, gender, bio)``. Any item that IMDb does not provide is
        the string ``'Unknown'``. ``place`` is the text after the last comma of
        the birth place, ``bio`` is the first mini biography in lowercase, and
        ``gender`` is derived from pronoun counts in ``bio``.
    """
    # creating instance of IMDb
    ia = imdb.IMDb()

    actor = ia.get_person(id_actor)
    available = actor.keys()

    # getting birth date
    date = actor['birth date'] if 'birth date' in available else 'Unknown'

    # getting birth place; 'birth info' may be present without a 'birth place'
    birth_info = actor['birth info'] if 'birth info' in available else {}
    birth_place = birth_info.get('birth place')
    place = birth_place.split(',')[-1].strip() if birth_place else 'Unknown'

    # bio and gender; an empty biography list is treated like a missing one
    biographies = actor['mini biography'] if 'mini biography' in available else []
    if biographies:
        bio = biographies[0].lower()
        gender = _guess_gender(bio)
    else:
        bio = 'Unknown'
        gender = 'Unknown'

    return date, place, gender, bio

In [23]:
# Query IMDb once per actor id and copy the four returned values into the
# matching row of actors_data.
for person_id in actors_data.id:

    details = dict(zip(('birth_date', 'birth_place', 'gender', 'bio'), actor_data(person_id)))

    row_mask = actors_data.id == person_id

    for column, value in details.items():
        actors_data.loc[row_mask, column] = value

2021-12-06 16:41:30,742 CRITICAL [imdbpy] /opt/anaconda3/lib/python3.8/site-packages/imdb/_exceptions.py:32: IMDbDataAccessError exception raised; args: ({'errcode': None, 'errmsg': 'None', 'url': 'https://www.imdb.com/name/nm0919798/', 'proxy': '', 'exception type': 'IOError', 'original exception': timeout('The read operation timed out')},); kwds: {}
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.8/site-packages/imdb/parser/http/__init__.py", line 220, in retrieve_unicode
    response = uopener.open(url)
  File "/opt/anaconda3/lib/python3.8/urllib/request.py", line 525, in open
    response = self._open(req, data)
  File "/opt/anaconda3/lib/python3.8/urllib/request.py", line 542, in _open
    result = self._call_chain(self.handle_open, protocol, protocol +
  File "/opt/anaconda3/lib/python3.8/urllib/request.py", line 502, in _call_chain
    result = func(*args)
  File "/opt/anaconda3/lib/python3.8/urllib/request.py", line 1393, in https_open
    return self.do_o

2021-12-06 16:46:42,049 ERROR [imdbpy.parser.http.build_movie] /opt/anaconda3/lib/python3.8/site-packages/imdb/parser/http/utils.py:335: empty title or movieID for " (7)
"
2021-12-06 17:28:58,525 CRITICAL [imdbpy] /opt/anaconda3/lib/python3.8/site-packages/imdb/_exceptions.py:32: IMDbDataAccessError exception raised; args: ({'errcode': None, 'errmsg': 'None', 'url': 'https://www.imdb.com/name/nm0005277/', 'proxy': '', 'exception type': 'IOError', 'original exception': timeout('The read operation timed out')},); kwds: {}
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.8/site-packages/imdb/parser/http/__init__.py", line 220, in retrieve_unicode
    response = uopener.open(url)
  File "/opt/anaconda3/lib/python3.8/urllib/request.py", line 525, in open
    response = self._open(req, data)
  File "/opt/anaconda3/lib/python3.8/urllib/request.py", line 542, in _open
    result = self._call_chain(self.handle_open, protocol, protocol +
  File "/opt/anaconda3/lib/python3.8/

2021-12-06 17:50:23,186 CRITICAL [imdbpy] /opt/anaconda3/lib/python3.8/site-packages/imdb/_exceptions.py:32: IMDbDataAccessError exception raised; args: ({'errcode': None, 'errmsg': 'None', 'url': 'https://www.imdb.com/name/nm0001856/', 'proxy': '', 'exception type': 'IOError', 'original exception': timeout('The read operation timed out')},); kwds: {}
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.8/site-packages/imdb/parser/http/__init__.py", line 220, in retrieve_unicode
    response = uopener.open(url)
  File "/opt/anaconda3/lib/python3.8/urllib/request.py", line 525, in open
    response = self._open(req, data)
  File "/opt/anaconda3/lib/python3.8/urllib/request.py", line 542, in _open
    result = self._call_chain(self.handle_open, protocol, protocol +
  File "/opt/anaconda3/lib/python3.8/urllib/request.py", line 502, in _call_chain
    result = func(*args)
  File "/opt/anaconda3/lib/python3.8/urllib/request.py", line 1393, in https_open
    return self.do_o

2021-12-06 18:01:41,860 CRITICAL [imdbpy] /opt/anaconda3/lib/python3.8/site-packages/imdb/_exceptions.py:32: IMDbDataAccessError exception raised; args: ({'errcode': None, 'errmsg': 'None', 'url': 'https://www.imdb.com/name/nm0054837/', 'proxy': '', 'exception type': 'IOError', 'original exception': timeout('The read operation timed out')},); kwds: {}
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.8/site-packages/imdb/parser/http/__init__.py", line 220, in retrieve_unicode
    response = uopener.open(url)
  File "/opt/anaconda3/lib/python3.8/urllib/request.py", line 525, in open
    response = self._open(req, data)
  File "/opt/anaconda3/lib/python3.8/urllib/request.py", line 542, in _open
    result = self._call_chain(self.handle_open, protocol, protocol +
  File "/opt/anaconda3/lib/python3.8/urllib/request.py", line 502, in _call_chain
    result = func(*args)
  File "/opt/anaconda3/lib/python3.8/urllib/request.py", line 1393, in https_open
    return self.do_o

In [24]:
#convert actor's birthdate to datetime (values that cannot be parsed become NaT)
actors_data['birth_date'] = actors_data['birth_date'].apply(lambda x: pd.to_datetime(x, errors = 'coerce'))


def _current_place_name(place):
    """Return the name inside a '[now ...]' annotation, or ``place`` unchanged.

    The captured name is stripped: the text after 'now' starts with a space,
    which would otherwise be kept in the result.
    """
    if not isinstance(place, str):
        return place
    match = re.search(r'\[now(.*)\]', place)
    return match.group(1).strip() if match else place


#pre-process birth place
actors_data['birth_place'] = actors_data['birth_place'].apply(_current_place_name)


## <font color="blue"> Extracting reviews </font>

The IMDb api allows us to extract the most relevant reviews for each film. Therefore we will make a call to extract this data for each movie. We could have done this step at the same time as we extracted the cast, but we wanted to explain it separately to make it clearer for the reader.

In [25]:
def extract_reviews(row):
    """Return the IMDb reviews of one movie joined into a single string.

    Parameters
    ----------
    row : mapping
        A row with a ``'Const'`` entry holding the IMDb id (``'tt...'``).

    Returns
    -------
    str
        The ``'content'`` of every review separated by single spaces, or an
        empty string when no review data is returned. Without this guard a
        single movie lacking reviews would abort the whole ``apply``.
    """
    film = row['Const']

    # create an instance of the IMDb class
    ia = IMDb()

    # the IMDb client expects the numeric id without the 'tt' prefix
    id_movie = film.replace('tt','')

    # get a movie, requesting only the reviews info set
    movie = ia.get_movie(id_movie, ['reviews'])

    review_items = movie.get('reviews') or []

    return ' '.join(item['content'] for item in review_items if item.get('content'))

In [26]:
# Fetch the reviews row by row and store them in a new 'reviews' column.
movies_plot_df['reviews'] = movies_plot_df.apply(extract_reviews, axis=1)


## <font color = blue> Save results for further analysis </font>

In [27]:
# Make sure the output folder exists: to_csv does not create missing
# directories, and failing here would lose the results of the long extraction.
os.makedirs('datasets', exist_ok=True)

# movies with plot and reviews, cast (top 10 per movie) and per-actor data
movies_plot_df.to_csv(os.path.join('datasets','top_movies_data.csv'), index = False)
actors_df.to_csv(os.path.join('datasets','cast_top_movies.csv'), index = False)
actors_data.to_csv(os.path.join('datasets','actors_top_movies.csv'), index = False)

## 2. Store results in the input format for the explainer notebook

We will first load the data we obtained from [data extraction](#data_extraction). 

In [29]:
# Reload the three tables written at the end of the data extraction section.
movies_df = pd.read_csv(os.path.join('datasets', 'top_movies_data.csv'))

# Ids are read as objects rather than parsed as numbers.
cast_df = pd.read_csv(
    os.path.join('datasets', 'cast_top_movies.csv'),
    dtype={'actor_id': object},
)
actors_df = pd.read_csv(
    os.path.join('datasets', 'actors_top_movies.csv'),
    dtype={'id': object},
)

In [30]:
# Preview the movies table.
movies_df.head(n=5)

Unnamed: 0,Const,Title,Release Date,Genres,IMDb Rating,new_title,plot,reviews
0,tt0111161,The Shawshank Redemption,1994,Drama,9.3,The Shawshank Redemption,"{{spoiler}} in 1947, andy dufresne (tim robbi...",Shawshank Redemption is without doubt one of t...
1,tt0068646,The Godfather,1972,"Crime,Drama",9.2,The Godfather,{{spoiler}}the film begins at the wedding of ...,"Up until today, I haven't bothered to review ""..."
2,tt0071562,The Godfather: Part II,1974,"Crime,Drama",9.0,The Godfather Part II,''the godfather part ii'' presents two parall...,"""The Godfather: Part II"" is seen, by many, as ..."
3,tt0468569,The Dark Knight,2008,"Action,Crime,Drama",9.0,The Dark Knight,"in gotham city, the joker ([[heath ledger]]) ...",I first need to point out that I generally hat...
4,tt0167260,The Lord of the Rings: The Return of the King,2003,"Action,Adventure,Drama",8.9,The Lord of the Rings: The Return of the King,{{spoiler}}''the lord of the rings; the retur...,"I admit it, I love all three Lord of the Rings..."


In [31]:
# Preview the cast table.
cast_df.head(n=5)

Unnamed: 0,actor_name,actor_id,movie_id,relevance
0,Tim Robbins,209,tt0111161,1
1,Morgan Freeman,151,tt0111161,2
2,Bob Gunton,348409,tt0111161,3
3,William Sadler,6669,tt0111161,4
4,Clancy Brown,317,tt0111161,5


In [32]:
# Preview the actors table.
actors_df.head(n=5)

Unnamed: 0,name,id,appearances,birth_place,birth_date,gender,bio,rating
0,Robert De Niro,134,16,USA,1943-08-17,male,"one of the greatest actors of all time, robert...",7.91875
1,Tom Hanks,158,12,USA,1956-07-09,male,"thomas jeffrey hanks was born in concord, cali...",7.841667
2,Samuel L. Jackson,168,12,USA,1948-12-21,male,samuel l. jackson is an american producer and ...,7.616667
3,Leonardo DiCaprio,138,10,USA,1974-11-11,male,few actors in the world have had a career quit...,8.09
4,Al Pacino,199,10,USA,1940-04-25,male,"alfredo james ""al"" 'pacino established himself...",8.06


In [33]:
# Attach each actor's number of appearances to every cast row, ordered by movie.
new_cast = pd.merge(cast_df, actors_df, left_on='actor_id', right_on='id')[['actor_name', 'actor_id', 'movie_id', 'appearances']].sort_values('movie_id')

# Look up each movie's rating through an id -> rating mapping instead of
# filtering movies_df once per cast row. The first row per id is used, as the
# per-row lookup did; an id absent from movies_df yields NaN.
rating_by_movie = movies_df.drop_duplicates('Const').set_index('Const')['IMDb Rating']
new_cast['rating'] = new_cast['movie_id'].map(rating_by_movie)


In [34]:
# to_csv does not create missing directories
os.makedirs('datasets_explainer_notebook', exist_ok=True)

# Write the frames loaded at the top of this section (movies_df, actors_df),
# so the section also runs on a fresh kernel without the in-memory variables
# of the data extraction section.
new_cast.to_csv(os.path.join('datasets_explainer_notebook','cast_top_movies_EN.csv'), index=False)
movies_df.to_csv(os.path.join('datasets_explainer_notebook','top_movies_data.csv'), index = False)
actors_df.to_csv(os.path.join('datasets_explainer_notebook','actors_top_movies.csv'), index = False)