### Part 1.1 - Extract_data_5000_movies

#### Import the libraries

In [None]:
# For cleaning and preparing the dataset
# -> dataframe manipulation
# -> text manipulation
# -> Web Scraping

import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import re
import os

# Module to serialize the content produced from the execution of the code

import pickle

# Module to monitor the progress of a python for loop

from tqdm import tqdm_notebook
# Example of Use: tqdm_notebook(examples, desc="Converting examples to features")

#### First 5000 movies

Extract data about:

* field 1: Plot Summary
* field 2: Actors
* field 3: Directors
* field 4: IMDB rating
* field 5: Plot Synopsis
* field 6: Reviews

In [None]:
"""
Open the movie_content urls for the first 5000 movies and create the souplist of those URLs
"""
content_one = []

with open(os.path.join(os.getcwd(), 'movie_content_url\\data_one_10012020.pkl'), 'rb') as f:
    
    content_one = pickle.load(f)

print("Number of URLs: {}".format(len(content_one)))

content_souplist_one = []

for i in tqdm_notebook(content_one):
    
    content_souplist_one.append(BeautifulSoup(i.text))

print("Number of souplists: {}".format(len(content_souplist_one)))

In [None]:
"""
Open the synopsis urls of the first 5000 movies and create the souplist of those URLs
"""
synopsis_one = []

with open(os.path.join(os.getcwd(), 'synopsis_url\\synopsis_one_12012020.pkl'), 'rb') as f:
    
    synopsis_one = pickle.load(f)

print("Number of URLs: {}".format(len(synopsis_one)))

synopsis_souplist_one = []

for i in tqdm_notebook(synopsis_one):
    
    synopsis_souplist_one.append(BeautifulSoup(i.text))

print("Number of souplists: {}".format(len(synopsis_souplist_one)))

In [None]:
"""
Open the reviews urls of the first 5000 movies and create the souplist of those URLs
"""
review_one = []

with open(os.path.join(os.getcwd(), 'reviews_url\\review_one_15012020.pkl'), 'rb') as f:
    
    review_one = pickle.load(f)

print("Number of URLs: {}".format(len(review_one)))

review_souplist_one = []

for i in tqdm_notebook(review_one):
    
    review_souplist_one.append(BeautifulSoup(i.text))

print("Number of review tags: {}".format(len(review_souplist_one)))

In [None]:
"""
Field 1: Extract plot summary
"""
myfield_plot = []
plot_summary = []
index_to_remove_no_plot = []

[myfield_plot.append(i.find_all('div', {'class':'plot_summary'})) for i in tqdm_notebook(content_souplist_one)]

[[[plot_summary.append(y.text) for y in x.find_all('div', {'class':'summary_text'})] for x in i] if len(i) !=0 else index_to_remove_no_plot.append(myfield_plot.index(i)) for i in myfield_plot]
      
print("Length of Plot Summary list: {}".format(len(plot_summary)))
print("Length of the list with Movies that don't have plot summary: {}".format(len(index_to_remove_no_plot)))
if len(index_to_remove_no_plot) == 0:
    print("None of the movie miss plot")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_plot))

In [None]:
"""
Field 2: Extract actors
"""
myfield_cast = []
phase_two = []
phase_three = []
actors_list = []
index_to_remove_no_actors = []

[myfield_cast.append(i.find_all('table', {'class':'cast_list'})) for i in tqdm_notebook(content_souplist_one)]
    
r_one = re.compile(".*name")

[[phase_two.append(j.find_all('a', {'href':r_one})) for j in i] for i in myfield_cast]

[phase_three.append(phase_two[i][1::2]) for i in range(len(phase_two))]

[actors_list.append(list(map(lambda x: x.text.strip(' ').replace('\n', ''), actors))) for actors in phase_three]            
    
index_to_remove_no_actors = [i for i,x in enumerate(myfield_cast) if not x]
        
print("Length of Actors list: {}".format(len(actors_list)))
print("Length of the list with Movies that don't have actors: {}".format(len(index_to_remove_no_actors)))
if len(index_to_remove_no_actors) == 0:
    print("None of the movie miss actors")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_actors))

In [None]:
"""
Field 3: Extract director name(s)
"""
myfield_director = []
director_name = []
index_to_remove_no_directors = []

[myfield_director.append(i.find_all('div', {'class':'plot_summary'})) for i in tqdm_notebook(content_souplist_one)]

r_name = re.compile(".*name")

[[director_name.append(j.find_all('a', {'href':r_name})) for j in i] for i in myfield_director]
    
director_names = [item[0].text for item in director_name]

index_to_remove_no_directors = [i for i,x in enumerate(myfield_director) if not x]

print("Length of Directors list: {}".format(len(director_names)))
print("Length of the list with Movies that don't have directors: {}".format(len(index_to_remove_no_directors)))
if len(index_to_remove_no_directors) == 0:
    print("None of the movie miss directors")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_directors))

In [None]:
"""
Field 4: Extract imdb movie rating
"""
myfield_rating = []
ratings = []
index_to_remove_no_rating = []

[myfield_rating.append(i.find_all('div', {'class':'ratingValue'})) for i in tqdm_notebook(content_souplist_one)]

[[[ratings.append(y.text) for y in x.find_all('span', {'itemprop':'ratingValue'})] for x in i] for i in myfield_rating]

index_to_remove_no_rating = [i for i,x in enumerate(myfield_rating) if not x]

print("Length of Ratings list: {}".format(len(ratings)))
print("Length of the list with Movies that are not rated: {}".format(len(index_to_remove_no_rating)))
if len(index_to_remove_no_rating) == 0:
    print("None of the movie miss ratings")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_rating))

In [None]:
"""
Field 5: Extract plot synopsis

Having extracted the synopsis content for each movie, we noticed that roughly more than 50% percent of the movies have a synopsis text.
Thus, in order not to erase almost half of the first 5,000 we will not further proceed in keeping the synopsis text
"""
synopsis_step_one = []
synopsis_step_two = []
synopsis_step_three = []

[synopsis_step_one.append(i.find_all('ul', {'class':'ipl-zebra-list', 'id':'plot-synopsis-content'})) for i in tqdm_notebook(synopsis_souplist_one)]

[[synopsis_step_two.append(j.find_all('li', {'class':'ipl-zebra-list__item'})) for j in i] for i in synopsis_step_one]

[[synopsis_step_three.append(j.text.strip(' ').replace('\n', '').replace('\\', '')) for j in i] for i in synopsis_step_two]

index_to_remove_no_synopsis = [i for i,x in enumerate(synopsis_step_one) if not x]

print("Length of Synopsis list: {}".format(len(synopsis_step_three)))
print("Length of the list with Movies that don't have directors: {}".format(len(index_to_remove_no_synopsis)))
if len(index_to_remove_no_synopsis) == 0:
    print("None of the movies miss a synopsis")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_synopsis))

In [None]:
"""
Print the movies with no synopsis text.
Result: We observed that many movies missed a synopsis text. Thus, we decided to not use synopsis for furhter analysis (although will be extracted)
"""
print([i for i,x in enumerate(synopsis_step_one) if not x])
print([i for i,x in enumerate(synopsis_step_two) if not x])
print([i for i,x in enumerate(synopsis_step_three) if not x])
# Many of the movies do not have a synopsis text, this is it won't be used for furhter analysis (although will be extracted)

In [None]:
"""
Field 6: Extract movie reviews
"""
myfield_review_step_one = []
myfield_review_step_two = []
myfield_review_step_three = []

[myfield_review_step_one.append(i.find_all('div', {'class':'lister-list'})) for i in tqdm_notebook(review_souplist_one)]

[[myfield_review_step_two.append(j.find_all('div', {'class':'text show-more__control'})) for j in i] for i in myfield_review_step_one]

[myfield_review_step_three.append(list(map(lambda x: x.text, reviews))) for reviews in myfield_review_step_two]

index_to_remove_no_review = [i for i,x in enumerate(myfield_review_step_one) if not x]

print("Length of Synopsis list: {}".format(len(myfield_review_step_three)))
print("Length of the list with Movies that don't have directors: {}".format(len(index_to_remove_no_review)))
if len(index_to_remove_no_review) == 0:
    print("None of the movies miss a synopsis")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_review))

In [None]:
"""
Print the movies with no user reviews text.
"""
print([i for i,x in enumerate(myfield_review_step_one) if not x])
print([i for i,x in enumerate(myfield_review_step_two) if not x])
print([i for i,x in enumerate(myfield_review_step_three) if not x])

<b> - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - </b>

#### Dataframe creation based on the movie content, synopsis and reviews extracted

In [None]:
"""
Import the movie_content (the extracted HTML document of the column imdb_url)
Import the reviews content (the extracted HTML document of the column reviews_url)
"""
with open(os.path.join(os.getcwd(), 'movie_content_url\\data_one_10012020.pkl'), 'rb') as f:
    
    content_one = pickle.load(f)

print("Number of URLs: {}".format(len(content_one)))

with open(os.path.join(os.getcwd(), 'reviews_url\\review_one_15012020.pkl'), 'rb') as f:
    
    review_one = pickle.load(f)

print("Number of URLs: {}".format(len(review_one)))

In [None]:
"""
Below we import the 5 fields extracted previously
"""
with open(os.path.join(os.getcwd(), '58,000 movies\\movies_one\\pre-indexed files\\plot_one_16012020.pkl'), 'rb') as f:
    
    plot = pickle.load(f)
    
with open(os.path.join(os.getcwd(), '58,000 movies\\movies_one\\pre-indexed files\\rating_one_16012020.pkl'), 'rb') as f:
    
    rating = pickle.load(f)
    
with open(os.path.join(os.getcwd(), '58,000 movies\\movies_one\\pre-indexed files\\actors_one_16012020.pkl'), 'rb') as f:
    
    actors = pickle.load(f)
    
with open(os.path.join(os.getcwd(), '58,000 movies\\movies_one\\pre-indexed files\\director_one_16012020.pkl'), 'rb') as f:
    
    director = pickle.load(f)
    
with open(os.path.join(os.getcwd(), '58,000 movies\\movies_one\\pre-indexed files\\synopsis_one_17012020.pkl'), 'rb') as f:
    
    synopsis = pickle.load(f)
    
with open(os.path.join(os.getcwd(), '58,000 movies\\movies_one\\pre-indexed files\\reviews_one_17012020.pkl'), 'rb') as f:
    
    reviews = pickle.load(f)
    
print(len(plot))
print(len(actors))
print(len(director))
print(len(synopsis))
print(len(reviews))

print(rating) #empty

In [None]:
"""
Running the commands below we can spot the number of indexes that should be removed from the data extracted.
"Add a Plot": Denoted that the movies does have a written plot on its IMDB website.
"It looks like we don\'t have a Synopsis for this title yet. Be the first to contribute! Just click the "Edit page" button at the bottom of the page or learn more in the Synopsis submission guide.": Also denotes a non-written synopsis.

Synopsis indexes are not further considered. There are too many movies with no synopsis text and there is no poin in erasing almost half of the first 5000 movies due to missing synopsis text.
"""
matching_add_plot = [s for s, x in enumerate(plot) if "Add a Plot" in x]

print("Number of movies with 'Add Plot': ", len(matching_add_plot), '\n')

matching_add_synopsis = [s for s, x in enumerate(synopsis) if 'It looks like we don\'t have a Synopsis for this title yet. Be the first to contribute! Just click the "Edit page" button at the bottom of the page or learn more in the Synopsis submission guide.' in x]

print("Number of movies with no synopsis text: ", len(matching_add_synopsis))

In [None]:
"""
Remove the faulty indexes from the data. As faulty is characterized an index with no data relevant to movie content or user review of a movie or with an error 404 web page.
Indexes collected during the data extraction of the 6 fields and from the above python cell that prints number of movies with no written plot.
"""
index_remove=[708,718,757,1287,708,718,1387,1705,3587,4360,598,635,636,637,646,663,717,726,730,755,756,773,810,843,856,981,1085,1109,1117,1118,1121,1131,1142,1286,1289,1386,1399,1412,1424,1575,1633,1675,1704,2145,2503,4760,4786,137,637,717,726,738,756,777,810,1131,1286,1289,1509,1599,1646,1704,2185,2503,2519,2823,3190,3192,3269,4362,4366,4765]
index_remove = list(dict.fromkeys(index_remove))
print(len(index_remove))
content_index_one = [i for j, i in enumerate(content_one) if j not in index_remove]
review_index_one = [i for j, i in enumerate(review_one) if j not in index_remove]
print(len(content_index_one))
print(len(review_index_one))

In [None]:
"""
Run this cell once! For every re-execution of this notebook, please use the already pickled file.
"""
# # Pickle the files for further use!

# with open(os.path.join(os.getcwd(),'58,000 movies\\movies_one\\content_index_one_20012020.pkl'), 'wb') as f:
#     pickle.dump(content_index_one, f)
    
# with open(os.path.join(os.getcwd(),'58,000 movies\\movies_one\\review_index_one_20012020.pkl'), 'wb') as f:
#     pickle.dump(review_index_one, f)

In [None]:
"""
Remove the 60 indexes from the dataframe with the first 5000 movies
"""
dataset = pd.read_pickle(os.path.join(os.getcwd(),"dataset_58,000_14012020_latest_version.pkl"))

dataset_one = dataset.iloc[:5000].reset_index()

dataset_one = dataset_one[~dataset_one.index.isin(index_remove)]

print(dataset_one.shape)

# dataset_one.to_pickle("dataset_one_20012020.pkl") #older version to not use!

In [None]:
"""
Import the pruned list of movie_content web pages with 4940 movies. The 60 indexes removes did not contain one more -
of the following fields: plot, actors, director(s), imdb_rating, review(s)

It is important to re-run the process of extracting the information in order to assure that we have extracted the correct information and text for each movie
Otherwise, we might observe mismatches in the fields assigned to each movie title.
"""
content_souplist_one = []

with open(os.path.join(os.getcwd(), '58,000 movies\\movies_one\\content_index_one_20012020.pkl'), 'rb') as f:
    content_index_one = pickle.load(f)

print("Number of URLs: {}".format(len(content_index_one)))

for i in tqdm_notebook(content_index_one):
    content_souplist_one.append(BeautifulSoup(i.text))

print("Number of souplists: {}".format(len(content_souplist_one)))

#-----------------------------------------------------------------------------------------------------

with open(os.path.join(os.getcwd(), '58,000 movies\\movies_one\\review_index_one_20012020.pkl'), 'rb') as f:
    review_one = pickle.load(f)

print("Number of URLs: {}".format(len(review_one)))

review_souplist_one = []

for i in tqdm_notebook(review_one):
    review_souplist_one.append(BeautifulSoup(i.text))

print("Number of review tags: {}".format(len(review_souplist_one)))

In [None]:
"""
Field 1: Extract plot summary
"""
myfield_plot = []
plot_summary = []
index_to_remove_no_plot = []

[myfield_plot.append(i.find_all('div', {'class':'plot_summary'})) for i in tqdm_notebook(content_souplist_one)]

[[[plot_summary.append(y.text) for y in x.find_all('div', {'class':'summary_text'})] for x in i] if len(i) !=0 else index_to_remove_no_plot.append(myfield_plot.index(i)) for i in myfield_plot]
      
print("Length of Plot Summary list: {}".format(len(plot_summary)))
print("Length of the list with Movies that don't have plot summary: {}".format(len(index_to_remove_no_plot)))
if len(index_to_remove_no_plot) == 0:
    print("None of the movie miss plot")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_plot))

In [None]:
"""
Field 2: Extract actors
"""
myfield_cast = []
phase_two = []
phase_three = []
actors_list = []
index_to_remove_no_actors = []

[myfield_cast.append(i.find_all('table', {'class':'cast_list'})) for i in tqdm_notebook(content_souplist_one)]
    
r_one = re.compile(".*name")

[[phase_two.append(j.find_all('a', {'href':r_one})) for j in i] for i in myfield_cast]

[phase_three.append(phase_two[i][1::2]) for i in range(len(phase_two))]

[actors_list.append(list(map(lambda x: x.text.strip(' ').replace('\n', ''), actors))) for actors in phase_three]            
    
index_to_remove_no_actors = [i for i,x in enumerate(myfield_cast) if not x]
        
print("Length of Actors list: {}".format(len(actors_list)))
print("Length of the list with Movies that don't have actors: {}".format(len(index_to_remove_no_actors)))
if len(index_to_remove_no_actors) == 0:
    print("None of the movie miss actors")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_actors))

In [None]:
"""
Field 3: Extract director name(s)
"""
myfield_director = []
director_name = []
index_to_remove_no_directors = []

[myfield_director.append(i.find_all('div', {'class':'plot_summary'})) for i in tqdm_notebook(content_souplist_one)]

r_name = re.compile(".*name")

[[director_name.append(j.find_all('a', {'href':r_name})) for j in i] for i in myfield_director]
    
director_names = [item[0].text for item in director_name]

index_to_remove_no_directors = [i for i,x in enumerate(myfield_director) if not x]

print("Length of Directors list: {}".format(len(director_names)))
print("Length of the list with Movies that don't have directors: {}".format(len(index_to_remove_no_directors)))
if len(index_to_remove_no_directors) == 0:
    print("None of the movie miss directors")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_directors))

In [None]:
"""
Field 4: Extract imdb movie rating
"""
myfield_rating = []
ratings = []
index_to_remove_no_rating = []

[myfield_rating.append(i.find_all('div', {'class':'ratingValue'})) for i in tqdm_notebook(content_souplist_one)]

[[[ratings.append(y.text) for y in x.find_all('span', {'itemprop':'ratingValue'})] for x in i] for i in myfield_rating]

index_to_remove_no_rating = [i for i,x in enumerate(myfield_rating) if not x]

print("Length of Ratings list: {}".format(len(ratings)))
print("Length of the list with Movies that are not rated: {}".format(len(index_to_remove_no_rating)))
if len(index_to_remove_no_rating) == 0:
    print("None of the movie miss ratings")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_rating))

In [None]:
"""
Field 5: Extract movie user reviews
"""
myfield_review_step_one = []
myfield_review_step_two = []
myfield_review_step_three = []

[myfield_review_step_one.append(i.find_all('div', {'class':'lister-list'})) for i in tqdm_notebook(review_souplist_one)]

[[myfield_review_step_two.append(j.find_all('div', {'class':'text show-more__control'})) for j in i] for i in myfield_review_step_one]

[myfield_review_step_three.append(list(map(lambda x: x.text, reviews))) for reviews in myfield_review_step_two]

index_to_remove_no_review = [i for i,x in enumerate(myfield_review_step_one) if not x]

print("Length of Synopsis list: {}".format(len(myfield_review_step_three)))
print("Length of the list with Movies that don't have directors: {}".format(len(index_to_remove_no_review)))
if len(index_to_remove_no_review) == 0:
    print("None of the movies miss a synopsis")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_review))

In [None]:
# Number of review pages without a review container, followed by the positions
# of the empty entries in the two later extraction steps.
print(sum(1 for containers in myfield_review_step_one if not containers))
for review_stage in (myfield_review_step_two, myfield_review_step_three):
    print([position for position, entry in enumerate(review_stage) if not entry])

In [None]:
"""
Run this cell once! For every re-execution of this notebook, please use the already pickled file.
"""
# # Pickle the files for further use!

# with open(os.path.join(os.getcwd(), '58,000 movies\\movies_one\\plot_one_20012020.pkl'), 'wb') as f:
#     pickle.dump(plot_summary, f)
    
# with open(os.path.join(os.getcwd(), '58,000 movies\\movies_one\\rating_one_20012020.pkl'), 'wb') as f:
#     pickle.dump(ratings, f)
    
# with open(os.path.join(os.getcwd(), '58,000 movies\\movies_one\\actors_one_20012020.pkl'), 'wb') as f:
#     pickle.dump(actors_list, f)
    
# with open(os.path.join(os.getcwd(), '58,000 movies\\movies_one\\director_one_20012020.pkl'), 'wb') as f:
#     pickle.dump(director_names, f)

# with open(os.path.join(os.getcwd(), '58,000 movies\\movies_one\\reviews_one_20012020.pkl'), 'wb') as f:
#     pickle.dump(myfield_review_step_three, f)

In [None]:
"""
Remove the faulty indexes from the data. As faulty is characterized an index with no data relevant to movie content or user review of a movie or with an error 404 web page.
Indexes collected during the data extraction of the 6 fields and from the above python cell that prints number of movies with no written plot.
"""
dataset=pd.read_pickle(os.path.join(os.getcwd(),"dataset_58,000_14012020_latest_version.pkl"))
dataset_one=dataset.iloc[:5000].reset_index(drop=True)
index_remove=[]
index_remove=[708,718,757,1287,708,718,1387,1705,3587,4360,598,635,636,637,646,663,717,726,730,755,756,773,810,843,856,981,1085,1109,1117,1118,1121,1131,1142,1286,1289,1386,1399,1412,1424,1575,1633,1675,1704,2145,2503,4760,4786,137,637,717,726,738,756,777,810,1131,1286,1289,1509,1599,1646,1704,2185,2503,2519,2823,3190,3192,3269,4362,4366,4765]
index_remove=list(dict.fromkeys(index_remove))
print(len(index_remove))
dataset_one=dataset_one[~dataset_one.index.isin(index_remove)]

In [None]:
"""
Loading the data extracted.
Those are the most important data source across all the notebook. 
The whole data extraction part was executed to end up with the following 6 files per mini-batch.

The examiner can directly import the files below to review the movie content extracted for the first mini-batch.
"""
import os
import pandas as pd
import pickle

with open(os.path.join(os.getcwd(), '58,000 movies\\movies_one\\plot_one_20012020.pkl'), 'rb') as f:
    
    plot = pickle.load(f)
    
with open(os.path.join(os.getcwd(), '58,000 movies\\movies_one\\rating_one_20012020.pkl'), 'rb') as f:
    
    rating = pickle.load(f)
    
with open(os.path.join(os.getcwd(), '58,000 movies\\movies_one\\actors_one_20012020.pkl'), 'rb') as f:
    
    actors = pickle.load(f)
    
with open(os.path.join(os.getcwd(), '58,000 movies\\movies_one\\director_one_20012020.pkl'), 'rb') as f:
    
    director = pickle.load(f)
    
with open(os.path.join(os.getcwd(), '58,000 movies\\movies_one\\synopsis_one_17012020.pkl'), 'rb') as f:
    
    synopsis = pickle.load(f)
    
with open(os.path.join(os.getcwd(), '58,000 movies\\movies_one\\reviews_one_20012020.pkl'), 'rb') as f:
    
    reviews = pickle.load(f)
    
assert len(plot) == len(rating) == len(actors) == len(director) == len(reviews)

In [None]:
# Attach the five extracted fields as new columns (in this order), then drop the
# identifier / url columns that are no longer needed.
dataset_one = (
    dataset_one
    .assign(actors=actors, plot=plot, imdb_rating=rating, director=director, reviews=reviews)
    .drop(["movieId", "imdbId", "synopsis_url"], axis=1)
)

In [None]:
# Index labels of the movies that are listed as having no reviews.
indexes_to_remove_no_reviews = [716,719,757,792,824,836,960,1063,1086,1093,1095,1104,1114,1258,1365,1377,1388,1537,1593,1633,2100,2456,4703,4727]

# First drop the listed rows ...
listed_without_reviews = dataset_one.index.isin(indexes_to_remove_no_reviews)
dataset_one = dataset_one[~listed_without_reviews]

# ... then drop every remaining row whose reviews value prints as an empty list.
has_review_text = dataset_one.astype(str)['reviews'] != '[]'
dataset_one = dataset_one[has_review_text]
dataset_one

In [None]:
"""
This is the table to use in Part 1, and the table that will be concatenated with the rest of the mini-batch tables.
"""
# dataset_one.to_pickle(os.path.join(os.getcwd(), "58,000 movies\\movies_one\\dataset_one_final_2512020.pkl"))