### Part 1 - Update, clean & transform the dataset (latest changes on 16.02.2020)

#### Import the libraries

In [1]:
# For cleaning and preparing the dataset
# -> dataframe manipulation
# -> text manipulation
# -> Web Scrapping

import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import re

# Module to serialize the content produced from the execution of the code

import pickle

# Module to monitor the progress of a python for loop

from tqdm import tqdm_notebook
# Example of Use: tqdm_notebook(examples, desc="Converting examples to features")

### Now that I have extracted the online HTML docs for each movie, it is time to extract the information I desire

Code Structure:
    
For each subset of my 58,098 movies:

* 11 sets of 5,000 movies
* 1 set of 3,098

I extract the following 6 fields:

* field 1: Plot Summary
* field 2: Actors
* field 3: Directors
* field 4: IMDB rating
* field 5: Plot Synopsis
* field 6: Reviews

To extract those 6 fields, I have first to transform the extracted urls list to a beautiful soup object. This process consumes an important amount of time and RAM capacity. So it should be done for each batch of 5000 movies explicitly.

After the transformation to a beautiful soup object each field is extracted with some for loops and the .find_all() method, which is an appropriate search tool of an HTML tag (i.e /href string).

#### 1) 5000 movies

In [None]:
content_one = []

with open('D:\\GitHub-Thesis\\movie_content_url\\data_one_10012020.pkl', 'rb') as f:
    
    content_one = pickle.load(f)

print("Number of URLs: {}".format(len(content_one)))

content_souplist_one = []

for i in tqdm_notebook(content_one):
    
    content_souplist_one.append(BeautifulSoup(i.text))

print("Number of souplists: {}".format(len(content_souplist_one)))

In [None]:
# Field 1: Extract plot summary

myfield_plot = []
plot_summary = []
index_to_remove_no_plot = []

[myfield_plot.append(i.find_all('div', {'class':'plot_summary'})) for i in tqdm_notebook(content_souplist_one)]

[[[plot_summary.append(y.text) for y in x.find_all('div', {'class':'summary_text'})] for x in i] if len(i) !=0 else index_to_remove_no_plot.append(myfield_plot.index(i)) for i in myfield_plot]
      
print("Length of Plot Summary list: {}".format(len(plot_summary)))
print("Length of the list with Movies that don't have plot summary: {}".format(len(index_to_remove_no_plot)))
if len(index_to_remove_no_plot) == 0:
    print("None of the movie miss plot")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_plot))

#------------------------------------------------------------------------------------------------

# Field 2: Extract Actors

myfield_cast = []
phase_two = []
phase_three = []
actors_list = []
index_to_remove_no_actors = []

[myfield_cast.append(i.find_all('table', {'class':'cast_list'})) for i in tqdm_notebook(content_souplist_one)]
    
r_one = re.compile(".*name")

[[phase_two.append(j.find_all('a', {'href':r_one})) for j in i] for i in myfield_cast]

[phase_three.append(phase_two[i][1::2]) for i in range(len(phase_two))]

[actors_list.append(list(map(lambda x: x.text.strip(' ').replace('\n', ''), actors))) for actors in phase_three]            
    
index_to_remove_no_actors = [i for i,x in enumerate(myfield_cast) if not x]
        
print("Length of Actors list: {}".format(len(actors_list)))
print("Length of the list with Movies that don't have actors: {}".format(len(index_to_remove_no_actors)))
if len(index_to_remove_no_actors) == 0:
    print("None of the movie miss actors")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_actors))

#------------------------------------------------------------------------------------------------

# Field 3: Extract Director Name

myfield_director = []
director_name = []
index_to_remove_no_directors = []

[myfield_director.append(i.find_all('div', {'class':'plot_summary'})) for i in tqdm_notebook(content_souplist_one)]

r_name = re.compile(".*name")

[[director_name.append(j.find_all('a', {'href':r_name})) for j in i] for i in myfield_director]
    
director_names = [item[0].text for item in director_name]

index_to_remove_no_directors = [i for i,x in enumerate(myfield_director) if not x]

print("Length of Directors list: {}".format(len(director_names)))
print("Length of the list with Movies that don't have directors: {}".format(len(index_to_remove_no_directors)))
if len(index_to_remove_no_directors) == 0:
    print("None of the movie miss directors")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_directors))

#------------------------------------------------------------------------------------------------

In [None]:
# Field 4: Extract IMDB rating

myfield_rating = []
ratings = []
index_to_remove_no_rating = []

[myfield_rating.append(i.find_all('div', {'class':'ratingValue'})) for i in tqdm_notebook(content_souplist_one)]

[[[ratings.append(y.text) for y in x.find_all('span', {'itemprop':'ratingValue'})] for x in i] for i in myfield_rating]

index_to_remove_no_rating = [i for i,x in enumerate(myfield_rating) if not x]

print("Length of Ratings list: {}".format(len(ratings)))
print("Length of the list with Movies that are not rated: {}".format(len(index_to_remove_no_rating)))
if len(index_to_remove_no_rating) == 0:
    print("None of the movie miss ratings")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_rating))

In [None]:
# Pickle plot_summary, actors_list, director_names files for further use!

with open('plot_one_16012020.pkl', 'wb') as f:
    
    pickle.dump(plot_summary, f)
    
with open('actors_one_16012020.pkl', 'wb') as f:
    
    pickle.dump(actors_list, f)
    
with open('director_one_16012020.pkl', 'wb') as f:
    
    pickle.dump(director_names, f)

# Indexes to remove: [708,718,1387,1705,3587,4360] (no actors)

# 708, no plot and directors

In [None]:
# Pickle ratings file for further use!

with open('D:\\GitHub-Thesis\\58,000 movies\\movies_one\\rating_one_16012020.pkl', 'wb') as f:
    
    pickle.dump(ratings, f)
    
# Indexes to remove: [708,718,757,1287] (not rated)

- - - - - - - - - - - - - - - - - - - - - - - - 

In [None]:
synopsis_one = []

with open('D:\\GitHub-Thesis\\synopsis_url\\synopsis_one_12012020.pkl', 'rb') as f:
    
    synopsis_one = pickle.load(f)

print("Number of URLs: {}".format(len(synopsis_one)))

synopsis_souplist_one = []

for i in tqdm_notebook(synopsis_one):
    
    synopsis_souplist_one.append(BeautifulSoup(i.text))

print("Number of souplists: {}".format(len(synopsis_souplist_one)))

In [None]:
# Field 5: Extract Plot Synopsis

synopsis_step_one = []
synopsis_step_two = []
synopsis_step_three = []

[synopsis_step_one.append(i.find_all('ul', {'class':'ipl-zebra-list', 'id':'plot-synopsis-content'})) for i in tqdm_notebook(synopsis_souplist_one)]

[[synopsis_step_two.append(j.find_all('li', {'class':'ipl-zebra-list__item'})) for j in i] for i in synopsis_step_one]

[[synopsis_step_three.append(j.text.strip(' ').replace('\n', '').replace('\\', '')) for j in i] for i in synopsis_step_two]

index_to_remove_no_synopsis = [i for i,x in enumerate(synopsis_step_one) if not x]

print("Length of Synopsis list: {}".format(len(synopsis_step_three)))
print("Length of the list with Movies that don't have directors: {}".format(len(index_to_remove_no_synopsis)))
if len(index_to_remove_no_synopsis) == 0:
    print("None of the movies miss a synopsis")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_synopsis))

In [None]:
print([i for i,x in enumerate(synopsis_step_one) if not x])

print([i for i,x in enumerate(synopsis_step_two) if not x])

print([i for i,x in enumerate(synopsis_step_three) if not x])

# Many of the movies do not have a synopsis text, this is it won't be used for furhter analysis (althpugh will be extracted)

In [None]:
# Pickle the synopsis file for further use!

with open('synopsis_one_17012020.pkl', 'wb') as f:
    
    pickle.dump(synopsis_step_three, f)
    
# Indexes to remove: [708]

- - - - - - - - - - - - - - - - - - - - - - - - 

In [None]:
review_one = []

with open('D:\\GitHub-Thesis\\reviews_url\\review_one_15012020.pkl', 'rb') as f:
    
    review_one = pickle.load(f)

print("Number of URLs: {}".format(len(review_one)))

review_souplist_one = []

for i in tqdm_notebook(review_one):
    
    review_souplist_one.append(BeautifulSoup(i.text))

print("Number of review tags: {}".format(len(review_souplist_one)))

In [None]:
#------------------------------------------------------------------------------------------------

# Field 6: Extract Movie Reviews

myfield_review_step_one = []
myfield_review_step_two = []
myfield_review_step_three = []

[myfield_review_step_one.append(i.find_all('div', {'class':'lister-list'})) for i in tqdm_notebook(review_souplist_one)]

[[myfield_review_step_two.append(j.find_all('div', {'class':'text show-more__control'})) for j in i] for i in myfield_review_step_one]

[myfield_review_step_three.append(list(map(lambda x: x.text, reviews))) for reviews in myfield_review_step_two]

index_to_remove_no_review = [i for i,x in enumerate(myfield_review_step_one) if not x]

print("Length of Synopsis list: {}".format(len(myfield_review_step_three)))
print("Length of the list with Movies that don't have directors: {}".format(len(index_to_remove_no_review)))
if len(index_to_remove_no_review) == 0:
    print("None of the movies miss a synopsis")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_review))

In [None]:
print([i for i,x in enumerate(myfield_review_step_one) if not x])

print([i for i,x in enumerate(myfield_review_step_two) if not x])

print([i for i,x in enumerate(myfield_review_step_three) if not x])

In [None]:
# Pickle the reviews file for further use!

with open('reviews_one_17012020.pkl', 'wb') as f:
    
    pickle.dump(myfield_review_step_three, f)
    
# Indexes to remove: [598,635,636,637,646,663,708,717,726,730,755,756,773,810,843,856,981,1085,1109,1117,1118,1121,1131,1142,1286,1289,1386,1399,1412,1424,1575,1633,1675,1704,2145,2503,4760,4786]

<b> - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - </b>

#### Dataframe creation based on the movie content extracted

In [None]:
with open('D:\\GitHub-Thesis\\movie_content_url\\data_one_10012020.pkl', 'rb') as f:
    
    content_one = pickle.load(f)

print("Number of URLs: {}".format(len(content_one)))

with open('D:\\GitHub-Thesis\\reviews_url\\review_one_15012020.pkl', 'rb') as f:
    
    review_one = pickle.load(f)

print("Number of URLs: {}".format(len(review_one)))

In [None]:
with open('D:\\GitHub-Thesis\\58,000 movies\\movies_one\\pre-indexed files\\plot_one_16012020.pkl', 'rb') as f:
    
    plot = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movies_one\\pre-indexed files\\rating_one_16012020.pkl', 'rb') as f:
    
    rating = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movies_one\\pre-indexed files\\actors_one_16012020.pkl', 'rb') as f:
    
    actors = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movies_one\\pre-indexed files\\director_one_16012020.pkl', 'rb') as f:
    
    director = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movies_one\\synopsis_one_17012020.pkl', 'rb') as f:
    
    synopsis = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movies_one\\reviews_one_17012020.pkl', 'rb') as f:
    
    reviews = pickle.load(f)
    
print(len(plot))
print(len(rating))
print(len(actors))
print(len(director))
print(len(synopsis))
print(len(reviews))

In [None]:
matching_add_plot = [s for s, x in enumerate(plot) if "Add a Plot" in x]

print(matching_add_plot)

matching_add_synopsis = [s for s, x in enumerate(synopsis) if 'It looks like we don\'t have a Synopsis for this title yet. Be the first to contribute! Just click the "Edit page" button at the bottom of the page or learn more in the Synopsis submission guide.' in x]

print(matching_add_synopsis)

print(len(matching_add_synopsis))

In [None]:
# REMOVE EMPTY INDICES

index_remove=[708,718,757,1287,708,718,1387,1705,3587,4360,598,635,636,637,646,663,717,726,730,755,756,773,8,843,856,981,1085,1109,1117,1118,1121,1131,1142,1286,1289,1386,1399,1412,1424,1575,1633,1675,1704,2145,2503,4760,4786,137,637,717,726,738,756,777,810,1131,1286,1289,1509,1599,1646,1704,2185,2503,2519,2823,3190,3192,3269,4362,4366,4765]

index_remove = list(dict.fromkeys(index_remove))

print(len(index_remove))

content_index_one = [i for j, i in enumerate(content_one) if j not in index_remove]

review_index_one = [i for j, i in enumerate(review_one) if j not in index_remove]

print(len(content_index_one))

print(len(review_index_one))

# The length of this content_one (4940) should now be extracted from souplist!

In [None]:
# Pickle the synopsis file for further use!

with open('D:\\GitHub-Thesis\\58,000 movies\\movies_one\\content_index_one_20012020.pkl', 'wb') as f:
    
    pickle.dump(content_index_one, f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movies_one\\review_index_one_20012020.pkl', 'wb') as f:
    
    pickle.dump(review_index_one, f)

In [None]:
# REMOVE THOSE 60 INDEXES NOT ONLY FROM REQUEST URLS (CONTENT, REVIEWS) BUT ALSO FROM MY DATASET

dataset = pd.read_pickle("D:\\GitHub-Thesis\\dataset_58,000_14012020_latest_version.pkl")

dataset_one = dataset.iloc[:5000].reset_index()

dataset_one = dataset_one[~dataset_one.index.isin(index_remove)]

dataset_one.shape

# HAVING DELETED THE PROPER INDICES I SHOULD NOW RE_EXTRACT THE CONTENT + REVIEW

# PICKLE THE DATASET!!

In [None]:
dataset_one.to_pickle("dataset_one_20012020.pkl")

# CLEANED FROM UNWANTED INDICES!

In [None]:
content_souplist_one = []

with open('D:\\GitHub-Thesis\\58,000 movies\\movies_one\\content_index_one_20012020.pkl', 'rb') as f:
    
    content_index_one = pickle.load(f)

print("Number of URLs: {}".format(len(content_index_one)))

for i in tqdm_notebook(content_index_one):
    
    content_souplist_one.append(BeautifulSoup(i.text))

print("Number of souplists: {}".format(len(content_souplist_one)))

In [None]:
# Field 1: Extract plot summary

# souplist_three = souplist_three[0:113] 

myfield_plot = []
plot_summary = []
index_to_remove_no_plot = []

[myfield_plot.append(i.find_all('div', {'class':'plot_summary'})) for i in tqdm_notebook(content_souplist_one)]

[[[plot_summary.append(y.text) for y in x.find_all('div', {'class':'summary_text'})] for x in i] if len(i) !=0 else index_to_remove_no_plot.append(myfield_plot.index(i)) for i in myfield_plot]
      
print("Length of Plot Summary list: {}".format(len(plot_summary)))
print("Length of the list with Movies that don't have plot summary: {}".format(len(index_to_remove_no_plot)))
if len(index_to_remove_no_plot) == 0:
    print("None of the movie miss plot")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_plot))

#------------------------------------------------------------------------------------------------

# Field 2: Extract IMDB rating

myfield_rating = []
ratings = []
index_to_remove_no_rating = []

[myfield_rating.append(i.find_all('div', {'class':'ratingValue'})) for i in tqdm_notebook(content_souplist_one)]

[[[ratings.append(y.text) for y in x.find_all('span', {'itemprop':'ratingValue'})] for x in i] for i in myfield_rating]

index_to_remove_no_rating = [i for i,x in enumerate(myfield_rating) if not x]

print("Length of Ratings list: {}".format(len(ratings)))
print("Length of the list with Movies that are not rated: {}".format(len(index_to_remove_no_rating)))
if len(index_to_remove_no_rating) == 0:
    print("None of the movie miss ratings")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_rating))

#------------------------------------------------------------------------------------------------

# Field 3: Extract Actors

myfield_cast = []
phase_two = []
phase_three = []
actors_list = []
index_to_remove_no_actors = []

[myfield_cast.append(i.find_all('table', {'class':'cast_list'})) for i in tqdm_notebook(content_souplist_one)]
    
r_one = re.compile(".*name")

[[phase_two.append(j.find_all('a', {'href':r_one})) for j in i] for i in myfield_cast]

[phase_three.append(phase_two[i][1::2]) for i in range(len(phase_two))]

[actors_list.append(list(map(lambda x: x.text.strip(' ').replace('\n', ''), actors))) for actors in phase_three]            
    
index_to_remove_no_actors = [i for i,x in enumerate(myfield_cast) if not x]
        
print("Length of Actors list: {}".format(len(actors_list)))
print("Length of the list with Movies that don't have actors: {}".format(len(index_to_remove_no_actors)))
if len(index_to_remove_no_actors) == 0:
    print("None of the movie miss actors")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_actors))

#------------------------------------------------------------------------------------------------

# Field 4: Extract Director Name

myfield_director = []
director_name = []
index_to_remove_no_directors = []

[myfield_director.append(i.find_all('div', {'class':'plot_summary'})) for i in tqdm_notebook(content_souplist_one)]

r_name = re.compile(".*name")

[[director_name.append(j.find_all('a', {'href':r_name})) for j in i] for i in myfield_director]
    
director_names = [item[0].text for item in director_name]

index_to_remove_no_directors = [i for i,x in enumerate(myfield_director) if not x]

print("Length of Directors list: {}".format(len(director_names)))
print("Length of the list with Movies that don't have directors: {}".format(len(index_to_remove_no_directors)))
if len(index_to_remove_no_directors) == 0:
    print("None of the movie miss directors")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_directors))

#------------------------------------------------------------------------------------------------

In [None]:
# Pickle the requests file for further use!

with open('D:\\GitHub-Thesis\\58,000 movies\\movies_one\\plot_one_20012020.pkl', 'wb') as f:
    
    pickle.dump(plot_summary, f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movies_one\\rating_one_20012020.pkl', 'wb') as f:
    
    pickle.dump(ratings, f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movies_one\\actors_one_20012020.pkl', 'wb') as f:
    
    pickle.dump(actors_list, f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movies_one\\director_one_20012020.pkl', 'wb') as f:
    
    pickle.dump(director_names, f)

In [None]:
with open('D:\\GitHub-Thesis\\58,000 movies\\movies_one\\review_index_one_20012020.pkl', 'rb') as f:
    
    review_one = pickle.load(f)

print("Number of URLs: {}".format(len(review_one)))

review_souplist_one = []

for i in tqdm_notebook(review_one):
    
    review_souplist_one.append(BeautifulSoup(i.text))

print("Number of review tags: {}".format(len(review_souplist_one)))

In [None]:
#------------------------------------------------------------------------------------------------

# Field 6: Extract Movie Reviews

myfield_review_step_one = []
myfield_review_step_two = []
myfield_review_step_three = []

[myfield_review_step_one.append(i.find_all('div', {'class':'lister-list'})) for i in tqdm_notebook(review_souplist_one)]

[[myfield_review_step_two.append(j.find_all('div', {'class':'text show-more__control'})) for j in i] for i in myfield_review_step_one]

[myfield_review_step_three.append(list(map(lambda x: x.text, reviews))) for reviews in myfield_review_step_two]

index_to_remove_no_review = [i for i,x in enumerate(myfield_review_step_one) if not x]

print("Length of Synopsis list: {}".format(len(myfield_review_step_three)))
print("Length of the list with Movies that don't have directors: {}".format(len(index_to_remove_no_review)))
if len(index_to_remove_no_review) == 0:
    print("None of the movies miss a synopsis")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_review))

In [None]:
print(len([i for i,x in enumerate(myfield_review_step_one) if not x]))

print([i for i,x in enumerate(myfield_review_step_two) if not x])

print([i for i,x in enumerate(myfield_review_step_three) if not x])

In [None]:
with open('D:\\GitHub-Thesis\\58,000 movies\\movies_one\\reviews_one_20012020.pkl', 'wb') as f:
    
    pickle.dump(myfield_review_step_three, f)

In [None]:
data_one = pd.read_pickle("D:\\GitHub-Thesis\\58,000 movies\\movies_one\\dataset_one_final_20012020.pkl")

In [None]:
index_remove = [716, 719, 757, 792, 824, 836, 960, 1063, 1086, 1093, 1095, 1104, 1114, 1258, 1365, 1377, 1388, 1537, 1593, 1633, 2100, 2456, 4703, 4727]

myfield_review_step_three = [i for j, i in enumerate(myfield_review_step_three) if j not in index_remove]

In [None]:
len(myfield_review_step_three)

In [None]:
data_one.reviews = myfield_review_step_three

In [None]:
data_one.iloc[1093]

In [None]:
# AFTER THE LISTS OF DATA HAVE BEEN CREATED, I SHOULD REMEMBER TO ADD THEM ON EACH DATASET BEFORE MOVING TO THE NEXT ONE!

In [None]:
dataset_one = pd.read_pickle("D:\\GitHub-Thesis\\58,000 movies\\movies_one\\dataset_one_20012020.pkl")
dataset_one = dataset_one.reset_index(drop=True)

dataset_one.head()

In [None]:
with open('D:\\GitHub-Thesis\\58,000 movies\\movies_one\\plot_one_20012020.pkl', 'rb') as f:
    
    plot = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movies_one\\rating_one_20012020.pkl', 'rb') as f:
    
    rating = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movies_one\\actors_one_20012020.pkl', 'rb') as f:
    
    actors = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movies_one\\director_one_20012020.pkl', 'rb') as f:
    
    director = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movies_one\\reviews_one_20012020.pkl', 'rb') as f:
    
    reviews = pickle.load(f)
    
assert len(plot) == len(rating) == len(actors) == len(director) == len(reviews)

In [19]:
with open('D:\\GitHub-Thesis\\58,000 movies\\movies_one\\rating_one_20012020.pkl', 'rb') as f:
    
    rating = pickle.load(f)

In [20]:
[i for i,x in enumerate(rating) if not x]

[]

In [None]:
len(reviews)

In [None]:
dataset = pd.read_pickle("D:\\GitHub-Thesis\\dataset_58,000_14012020_latest_version.pkl")

dataset_one = dataset.iloc[:5000].reset_index(drop=True)

index_remove = [708, 718, 757, 1287, 708, 718, 1387, 1705, 3587, 4360, 598, 635, 636, 637, 646, 663, 717, 726, 730, 755, 756, 773, 810, 843, 856, 981, 1085, 1109, 1117, 1118, 1121, 1131, 1142, 1286, 1289, 1386, 1399, 1412, 1424, 1575, 1633, 1675, 1704, 2145, 2503, 4760, 4786, 137,
                637,
                717,
                726,
                738,
                756,
                777,
                810,
                1131,
                1286,
                1289,
                1509,
                1599,
                1646,
                1704,
                2185,
                2503,
                2519,
                2823,
                3190,
                3192,
                3269,
                4362,
                4366,
                4765]

index_remove = list(dict.fromkeys(index_remove))

print(len(index_remove))

dataset_one = dataset_one[~dataset_one.index.isin(index_remove)]

In [None]:
dataset_one.columns

In [None]:
dataset_one['actors'] = actors

dataset_one['plot'] = plot

dataset_one['imdb_rating'] = rating

dataset_one['director'] = director

dataset_one['reviews'] = reviews

dataset_one = dataset_one.drop(["movieId", "imdbId", "synopsis_url"], axis=1)

In [None]:
dataset_one.head()

In [None]:
dataset_one.shape

In [None]:
indexes_to_remove_no_reviews = [716, 719, 757, 792, 824, 836, 960, 1063, 1086, 1093, 1095, 1104, 1114, 1258, 1365, 1377, 1388, 1537, 1593, 1633, 2100, 2456, 4703, 4727]

dataset_one = dataset_one[~dataset_one.index.isin(indexes_to_remove_no_reviews)]

In [None]:
dataset_one = dataset_one[dataset_one.astype(str)['reviews'] != '[]']

In [None]:
dataset_one.shape

In [None]:
dataset_one.shape

In [None]:
# dataset_one.to_pickle("D:\\GitHub-Thesis\\58,000 movies\\movies_one\\dataset_one_final_2012020.pkl") (old)

# data_one.to_pickle("D:\\GitHub-Thesis\\58,000 movies\\movies_one\\dataset_one_final_2412020.pkl") old

dataset_one.to_pickle("D:\\GitHub-Thesis\\58,000 movies\\movies_one\\dataset_one_final_2512020.pkl")

#### 2) 10000 movies

In [None]:
content_two = []

with open('D:\\GitHub-Thesis\\movie_content_url\\data_two_12012020.pkl', 'rb') as f:
    
    content_two = pickle.load(f)

print("Number of URLs: {}".format(len(content_two)))

In [None]:
content_souplist_two = []

for i in tqdm_notebook(content_two):
    
    content_souplist_two.append(BeautifulSoup(i.text))

print("Number of souplists: {}".format(len(content_souplist_two)))

In [None]:
# Field 1: Extract plot summary

# souplist_three = souplist_three[0:113] 

myfield_plot = []
plot_summary = []
index_to_remove_no_plot = []

[myfield_plot.append(i.find_all('div', {'class':'plot_summary'})) for i in tqdm_notebook(content_souplist_two)]

[[[plot_summary.append(y.text) for y in x.find_all('div', {'class':'summary_text'})] for x in i] if len(i) !=0 else index_to_remove_no_plot.append(myfield_plot.index(i)) for i in myfield_plot]
      
print("Length of Plot Summary list: {}".format(len(plot_summary)))
print("Length of the list with Movies that don't have plot summary: {}".format(len(index_to_remove_no_plot)))
if len(index_to_remove_no_plot) == 0:
    print("None of the movie miss plot")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_plot))

#------------------------------------------------------------------------------------------------

# Field 2: Extract IMDB rating

myfield_rating = []
ratings = []
index_to_remove_no_rating = []

[myfield_rating.append(i.find_all('div', {'class':'ratingValue'})) for i in tqdm_notebook(content_souplist_two)]

[[[ratings.append(y.text) for y in x.find_all('span', {'itemprop':'ratingValue'})] for x in i] for i in myfield_rating]

index_to_remove_no_rating = [i for i,x in enumerate(myfield_rating) if not x]

print("Length of Ratings list: {}".format(len(ratings)))
print("Length of the list with Movies that are not rated: {}".format(len(index_to_remove_no_rating)))
if len(index_to_remove_no_rating) == 0:
    print("None of the movie miss ratings")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_rating))

#------------------------------------------------------------------------------------------------

# Field 3: Extract Actors

myfield_cast = []
ratings = []
phase_two = []
phase_three = []
actors_list = []
index_to_remove_no_actors = []

[myfield_cast.append(i.find_all('table', {'class':'cast_list'})) for i in tqdm_notebook(content_souplist_two)]
    
r_one = re.compile(".*name")

[[phase_two.append(j.find_all('a', {'href':r_one})) for j in i] for i in myfield_cast]

[phase_three.append(phase_two[i][1::2]) for i in range(len(phase_two))]

[actors_list.append(list(map(lambda x: x.text.strip(' ').replace('\n', ''), actors))) for actors in phase_three]            
    
index_to_remove_no_actors = [i for i,x in enumerate(myfield_cast) if not x]
        
print("Length of Actors list: {}".format(len(actors_list)))
print("Length of the list with Movies that don't have actors: {}".format(len(index_to_remove_no_actors)))
if len(index_to_remove_no_actors) == 0:
    print("None of the movie miss actors")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_actors))

#------------------------------------------------------------------------------------------------

# Field 4: Extract Director Name

myfield_director = []
director_name = []
index_to_remove_no_directors = []

[myfield_director.append(i.find_all('div', {'class':'plot_summary'})) for i in tqdm_notebook(content_souplist_two)]

r_name = re.compile(".*name")

[[director_name.append(j.find_all('a', {'href':r_name})) for j in i] for i in myfield_director]
    
director_names = [item[0].text for item in director_name]

index_to_remove_no_directors = [i for i,x in enumerate(myfield_director) if not x]

print("Length of Directors list: {}".format(len(director_names)))
print("Length of the list with Movies that don't have directors: {}".format(len(index_to_remove_no_directors)))
if len(index_to_remove_no_directors) == 0:
    print("None of the movie miss directors")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_directors))

#------------------------------------------------------------------------------------------------

In [None]:
# Pickle the requests file for further use!

with open('plot_two_17012020.pkl', 'wb') as f:
    
    pickle.dump(plot_summary, f)
    
with open('rating_two_17012020.pkl', 'wb') as f:
    
    pickle.dump(ratings, f)
    
with open('actors_two_17012020.pkl', 'wb') as f:
    
    pickle.dump(actors_list, f)
    
with open('director_two_17012020.pkl', 'wb') as f:
    
    pickle.dump(director_names, f)
    
# Indexes to remove: [1819, 2408, 2942, 3666, 4454] (No actors)

In [None]:
synopsis_two = []

with open('D:\\GitHub-Thesis\\synopsis_url\\synopsis_two_12012020.pkl', 'rb') as f:
    
    synopsis_two = pickle.load(f)

print("Number of URLs: {}".format(len(synopsis_two)))

synopsis_souplist_two = []

for i in tqdm_notebook(synopsis_two):
    
    synopsis_souplist_two.append(BeautifulSoup(i.text))

print("Number of souplists: {}".format(len(synopsis_souplist_two)))

In [None]:
#------------------------------------------------------------------------------------------------

# Field 5: Extract Plot Synopsis

synopsis_step_one = []
synopsis_step_two = []
synopsis_step_three = []

[synopsis_step_one.append(i.find_all('ul', {'class':'ipl-zebra-list', 'id':'plot-synopsis-content'})) for i in tqdm_notebook(synopsis_souplist_two)]

[[synopsis_step_two.append(j.find_all('li', {'class':'ipl-zebra-list__item'})) for j in i] for i in synopsis_step_one]

[[synopsis_step_three.append(j.text.strip(' ').replace('\n', '').replace('\\', '')) for j in i] for i in synopsis_step_two]

index_to_remove_no_synopsis = [i for i,x in enumerate(synopsis_step_one) if not x]

print("Length of Synopsis list: {}".format(len(synopsis_step_three)))
print("Length of the list with Movies that don't have directors: {}".format(len(index_to_remove_no_synopsis)))
if len(index_to_remove_no_synopsis) == 0:
    print("None of the movies miss a synopsis")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_synopsis))

In [None]:
# Pickle the synopsis file for further use!

with open('synopsis_two_17012020.pkl', 'wb') as f:
    
    pickle.dump(synopsis_step_three, f)
    
# Indexes to remove: [2349]

In [None]:
review_two = []

with open('D:\\GitHub-Thesis\\reviews_url\\review_two_15012020.pkl', 'rb') as f:
    
    review_two = pickle.load(f)

print("Number of URLs: {}".format(len(review_two)))

review_souplist_two = []

for i in tqdm_notebook(review_two):
    
    review_souplist_two.append(BeautifulSoup(i.text))

print("Number of review tags: {}".format(len(review_souplist_two)))

In [None]:
#------------------------------------------------------------------------------------------------

# Field 6: Extract Movie Reviews

myfield_review_step_one = []
myfield_review_step_two = []
myfield_review_step_three = []

[myfield_review_step_one.append(i.find_all('div', {'class':'lister-list'})) for i in tqdm_notebook(review_souplist_two)]

[[myfield_review_step_two.append(j.find_all('div', {'class':'text show-more__control'})) for j in i] for i in myfield_review_step_one]

[myfield_review_step_three.append(list(map(lambda x: x.text, reviews))) for reviews in myfield_review_step_two]

index_to_remove_no_review = [i for i,x in enumerate(myfield_review_step_one) if not x]

print("Length of Reviews list: {}".format(len(myfield_review_step_three)))
print("Length of the list with Movies that don't have reviews: {}".format(len(index_to_remove_no_review)))
if len(index_to_remove_no_review) == 0:
    print("None of the movies miss reviews")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_review))

In [None]:
print(len([i for i,x in enumerate(myfield_review_step_one) if not x]))

print([i for i,x in enumerate(myfield_review_step_two) if not x])

print([i for i,x in enumerate(myfield_review_step_three) if not x])

In [None]:
# Pickle the reviews file for further use!

with open('reviews_two_17012020.pkl', 'wb') as f:
    
    pickle.dump(myfield_review_step_three, f)

### - - - - - - - - -  - - - - - - - - -  - - - - - - - - -  - - - - - - - - -  - - - - - - - - -  - - - - - - - - - 

#### Dataframe creation based on the movie content extracted

In [None]:
# STEP 1

with open('D:\\GitHub-Thesis\\movie_content_url\\data_two_12012020.pkl', 'rb') as f:
    
    content_two = pickle.load(f)

print("Number of URLs: {}".format(len(content_two)))

with open('D:\\GitHub-Thesis\\reviews_url\\review_two_15012020.pkl', 'rb') as f:
    
    review_two = pickle.load(f)

print("Number of URLs: {}".format(len(review_two)))

In [None]:
# STEP 2

with open('D:\\GitHub-Thesis\\58,000 movies\\movie_two\\pre-indexed files\\plot_two_17012020.pkl', 'rb') as f:
    
    plot = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_two\\pre-indexed files\\rating_two_17012020.pkl', 'rb') as f:
    
    rating = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_two\\pre-indexed files\\actors_two_17012020.pkl', 'rb') as f:
    
    actors = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_two\\pre-indexed files\\director_two_17012020.pkl', 'rb') as f:
    
    director = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_two\\synopsis_two_17012020.pkl', 'rb') as f:
    
    synopsis = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_two\\reviews_two_17012020.pkl', 'rb') as f:
    
    reviews = pickle.load(f)
    
print(len(plot))
print(len(rating))
print(len(actors))
print(len(director))
print(len(synopsis))
print(len(reviews))

In [None]:
# STEP 3

matching_add_plot = [s for s, x in enumerate(plot) if "Add a Plot" in x]

print(matching_add_plot)

matching_add_synopsis = [s for s, x in enumerate(synopsis) if 'It looks like we don\'t have a Synopsis for this title yet. Be the first to contribute! Just click the "Edit page" button at the bottom of the page or learn more in the Synopsis submission guide.' in x]

print(matching_add_synopsis)

print(len(matching_add_synopsis))

In [None]:
# STEP 4

index_remove = [1819, 
                2408, 
                2942, 
                3666, 
                4454, 
                920, 
                1830, 
                1844, 
                4117, 
                4224,
                642, 
                773, 
                1565, 
                2322, 
                2944, 
                3070, 
                4268, 
                4573]

index_remove = list(dict.fromkeys(index_remove))

print(len(index_remove))

content_index_two = [i for j, i in enumerate(content_two) if j not in index_remove]

review_index_two = [i for j, i in enumerate(review_two) if j not in index_remove]

print(len(content_index_two))

print(len(review_index_two))

# The length of this content_one (4940) should now be extracted from souplist!

In [None]:
# STEP 5

# Pickle the synopsis file for further use!

with open('D:\\GitHub-Thesis\\58,000 movies\\movie_two\\content_index_two_20012020.pkl', 'wb') as f:
    
    pickle.dump(content_index_two, f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_two\\review_index_two_20012020.pkl', 'wb') as f:
    
    pickle.dump(review_index_two, f)

In [None]:
# STEP 6

with open('D:\\GitHub-Thesis\\58,000 movies\\movie_two\\content_index_two_20012020.pkl', 'rb') as f:
    
    content_index_two = pickle.load(f)

print("Number of URLs: {}".format(len(content_index_two)))

content_souplist_two = []

for i in tqdm_notebook(content_index_two):
    
    content_souplist_two.append(BeautifulSoup(i.text))

print("Number of souplists: {}".format(len(content_souplist_two)))

In [None]:
# STEP 7

# Field 1: Extract plot summary

# souplist_three = souplist_three[0:113] 

myfield_plot = []
plot_summary = []
index_to_remove_no_plot = []

[myfield_plot.append(i.find_all('div', {'class':'plot_summary'})) for i in tqdm_notebook(content_souplist_two)]

[[[plot_summary.append(y.text) for y in x.find_all('div', {'class':'summary_text'})] for x in i] if len(i) !=0 else index_to_remove_no_plot.append(myfield_plot.index(i)) for i in myfield_plot]
      
print("Length of Plot Summary list: {}".format(len(plot_summary)))
print("Length of the list with Movies that don't have plot summary: {}".format(len(index_to_remove_no_plot)))
if len(index_to_remove_no_plot) == 0:
    print("None of the movie miss plot")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_plot))

#------------------------------------------------------------------------------------------------

# Field 2: Extract IMDB rating

myfield_rating = []
ratings = []
index_to_remove_no_rating = []

[myfield_rating.append(i.find_all('div', {'class':'ratingValue'})) for i in tqdm_notebook(content_souplist_two)]

[[[ratings.append(y.text) for y in x.find_all('span', {'itemprop':'ratingValue'})] for x in i] for i in myfield_rating]

index_to_remove_no_rating = [i for i,x in enumerate(myfield_rating) if not x]

print("Length of Ratings list: {}".format(len(ratings)))
print("Length of the list with Movies that are not rated: {}".format(len(index_to_remove_no_rating)))
if len(index_to_remove_no_rating) == 0:
    print("None of the movie miss ratings")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_rating))

#------------------------------------------------------------------------------------------------

# Field 3: Extract Actors

myfield_cast = []
ratings = []
phase_two = []
phase_three = []
actors_list = []
index_to_remove_no_actors = []

[myfield_cast.append(i.find_all('table', {'class':'cast_list'})) for i in tqdm_notebook(content_souplist_two)]
    
r_one = re.compile(".*name")

[[phase_two.append(j.find_all('a', {'href':r_one})) for j in i] for i in myfield_cast]

[phase_three.append(phase_two[i][1::2]) for i in range(len(phase_two))]

[actors_list.append(list(map(lambda x: x.text.strip(' ').replace('\n', ''), actors))) for actors in phase_three]            
    
index_to_remove_no_actors = [i for i,x in enumerate(myfield_cast) if not x]
        
print("Length of Actors list: {}".format(len(actors_list)))
print("Length of the list with Movies that don't have actors: {}".format(len(index_to_remove_no_actors)))
if len(index_to_remove_no_actors) == 0:
    print("None of the movie miss actors")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_actors))

#------------------------------------------------------------------------------------------------

# Field 4: Extract Director Name

myfield_director = []
director_name = []
index_to_remove_no_directors = []

[myfield_director.append(i.find_all('div', {'class':'plot_summary'})) for i in tqdm_notebook(content_souplist_two)]

r_name = re.compile(".*name")

[[director_name.append(j.find_all('a', {'href':r_name})) for j in i] for i in myfield_director]
    
director_names = [item[0].text for item in director_name]

index_to_remove_no_directors = [i for i,x in enumerate(myfield_director) if not x]

print("Length of Directors list: {}".format(len(director_names)))
print("Length of the list with Movies that don't have directors: {}".format(len(index_to_remove_no_directors)))
if len(index_to_remove_no_directors) == 0:
    print("None of the movie miss directors")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_directors))

#------------------------------------------------------------------------------------------------

In [None]:
myfield_rating = []
ratings = []
index_to_remove_no_rating = []

[myfield_rating.append(i.find_all('div', {'class':'ratingValue'})) for i in tqdm_notebook(content_souplist_two)]

[[[ratings.append(y.text) for y in x.find_all('span', {'itemprop':'ratingValue'})] for x in i] for i in myfield_rating]

index_to_remove_no_rating = [i for i,x in enumerate(myfield_rating) if not x]

print("Length of Ratings list: {}".format(len(ratings)))
print("Length of the list with Movies that are not rated: {}".format(len(index_to_remove_no_rating)))
if len(index_to_remove_no_rating) == 0:
    print("None of the movie miss ratings")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_rating))

In [None]:
# STEP 8

# Pickle the requests file for further use!

with open('D:\\GitHub-Thesis\\58,000 movies\\movie_two\\plot_two_20012020.pkl', 'wb') as f:
    
    pickle.dump(plot_summary, f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_two\\actors_two_20012020.pkl', 'wb') as f:
    
    pickle.dump(actors_list, f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_two\\director_two_20012020.pkl', 'wb') as f:
    
    pickle.dump(director_names, f)

In [None]:
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_two\\rating_two_20012020.pkl', 'wb') as f:
    
    pickle.dump(ratings, f)

In [None]:
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_two\\review_index_two_20012020.pkl', 'rb') as f:
    
    review_two = pickle.load(f)

print("Number of URLs: {}".format(len(review_two)))

review_souplist_two = []

for i in tqdm_notebook(review_two):
    
    review_souplist_two.append(BeautifulSoup(i.text))

print("Number of review tags: {}".format(len(review_souplist_two)))

In [None]:
# STEP 9

# Field 6: Extract Movie Reviews

myfield_review_step_one = []
myfield_review_step_two = []
myfield_review_step_three = []

[myfield_review_step_one.append(i.find_all('div', {'class':'lister-list'})) for i in tqdm_notebook(review_souplist_two)]

[[myfield_review_step_two.append(j.find_all('div', {'class':'text show-more__control'})) for j in i] for i in myfield_review_step_one]

[myfield_review_step_three.append(list(map(lambda x: x.text, reviews))) for reviews in myfield_review_step_two]

index_to_remove_no_review = [i for i,x in enumerate(myfield_review_step_one) if not x]

print("Length of Reviews list: {}".format(len(myfield_review_step_three)))
print("Length of the list with Movies that don't have reviews: {}".format(len(index_to_remove_no_review)))
if len(index_to_remove_no_review) == 0:
    print("None of the movies miss reviews")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_review))

In [None]:
print(len([i for i,x in enumerate(myfield_review_step_one) if not x]))

print([i for i,x in enumerate(myfield_review_step_two) if not x])

print([i for i,x in enumerate(myfield_review_step_three) if not x])

In [None]:
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_two\\reviews_two_20012020.pkl', 'wb') as f:
    
    pickle.dump(myfield_review_step_three, f)

In [None]:
data_two = pd.read_pickle("D:\\GitHub-Thesis\\58,000 movies\\movie_two\\dataset_two_final_20012020.pkl")

index_remove = []

myfield_review_step_three = [i for j, i in enumerate(myfield_review_step_three) if j not in index_remove]

print(len(myfield_review_step_three))

data_two.reviews = myfield_review_step_three

print(data_two.head())

In [None]:
# STEP 10

index_remove = [1819, 
                2408, 
                2942, 
                3666, 
                4454, 
                920, 
                1830, 
                1844, 
                4117, 
                4224,
                642, 
                773, 
                1565, 
                2322, 
                2944, 
                3070, 
                4268, 
                4573]

index_remove = list(dict.fromkeys(index_remove))

dataset = pd.read_pickle("D:\\GitHub-Thesis\\dataset_58,000_14012020_latest_version.pkl")

dataset_two = dataset.iloc[5000:10000].reset_index(drop=True)

dataset_two = dataset_two[~dataset_two.index.isin(index_remove)]

dataset_two.shape

In [None]:
dataset_two.columns

In [21]:
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_two\\plot_two_20012020.pkl', 'rb') as f:
    
    plot = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_two\\rating_two_20012020.pkl', 'rb') as f:
    
    rating = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_two\\actors_two_20012020.pkl', 'rb') as f:
    
    actors = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_two\\director_two_20012020.pkl', 'rb') as f:
    
    director = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_two\\reviews_two_20012020.pkl', 'rb') as f:
    
    reviews = pickle.load(f)
    
assert len(plot) == len(rating) == len(actors) == len(director) == len(reviews)

In [22]:
[i for i,x in enumerate(rating) if not x]

[]

In [None]:
dataset_two['actors'] = actors

dataset_two['plot'] = plot

dataset_two['imdb_rating'] = rating

dataset_two['director'] = director

dataset_two['reviews'] = reviews

dataset_two = dataset_two.drop(["movieId", "imdbId", "synopsis_url"], axis=1)

In [None]:
dataset_two.iloc[2005]

In [None]:
dataset_two.shape

In [None]:
dataset_two = dataset_two[dataset_two.astype(str)['reviews'] != '[]']

dataset_two.shape

In [None]:
# dataset_two.to_pickle("D:\\GitHub-Thesis\\58,000 movies\\movie_two\\dataset_two_final_20012020.pkl") old

# data_two.to_pickle("D:\\GitHub-Thesis\\58,000 movies\\movie_two\\dataset_two_final_24012020.pkl")

dataset_two.to_pickle("D:\\GitHub-Thesis\\58,000 movies\\movie_two\\dataset_two_final_25012020.pkl")

#### 3) 15000 movies

In [None]:
content_three = []

with open('D:\\GitHub-Thesis\\movie_content_url\\data_three_12012020.pkl', 'rb') as f:
    
    content_three = pickle.load(f)

print("Number of URLs: {}".format(len(content_three)))

In [None]:
content_souplist_three = []

for i in tqdm_notebook(content_three):
    
    content_souplist_three.append(BeautifulSoup(i.text))

print("Number of souplists: {}".format(len(content_souplist_three)))

In [None]:
# Field 1: Extract plot summary

# souplist_three = souplist_three[0:113] 

myfield_plot = []
plot_summary = []
index_to_remove_no_plot = []

[myfield_plot.append(i.find_all('div', {'class':'plot_summary'})) for i in tqdm_notebook(content_souplist_three)]

[[[plot_summary.append(y.text) for y in x.find_all('div', {'class':'summary_text'})] for x in i] if len(i) !=0 else index_to_remove_no_plot.append(myfield_plot.index(i)) for i in myfield_plot]
      
print("Length of Plot Summary list: {}".format(len(plot_summary)))
print("Length of the list with Movies that don't have plot summary: {}".format(len(index_to_remove_no_plot)))
if len(index_to_remove_no_plot) == 0:
    print("None of the movie miss plot")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_plot))

#------------------------------------------------------------------------------------------------

# Field 2: Extract IMDB rating

myfield_rating = []
ratings = []
index_to_remove_no_rating = []

[myfield_rating.append(i.find_all('div', {'class':'ratingValue'})) for i in tqdm_notebook(content_souplist_three)]

[[[ratings.append(y.text) for y in x.find_all('span', {'itemprop':'ratingValue'})] for x in i] for i in myfield_rating]

index_to_remove_no_rating = [i for i,x in enumerate(myfield_rating) if not x]

print("Length of Ratings list: {}".format(len(ratings)))
print("Length of the list with Movies that are not rated: {}".format(len(index_to_remove_no_rating)))
if len(index_to_remove_no_rating) == 0:
    print("None of the movie miss ratings")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_rating))

#------------------------------------------------------------------------------------------------

# Field 3: Extract Actors

myfield_cast = []
ratings = []
phase_two = []
phase_three = []
actors_list = []
index_to_remove_no_actors = []

[myfield_cast.append(i.find_all('table', {'class':'cast_list'})) for i in tqdm_notebook(content_souplist_three)]
    
r_one = re.compile(".*name")

[[phase_two.append(j.find_all('a', {'href':r_one})) for j in i] for i in myfield_cast]

[phase_three.append(phase_two[i][1::2]) for i in range(len(phase_two))]

[actors_list.append(list(map(lambda x: x.text.strip(' ').replace('\n', ''), actors))) for actors in phase_three]            
    
index_to_remove_no_actors = [i for i,x in enumerate(myfield_cast) if not x]
        
print("Length of Actors list: {}".format(len(actors_list)))
print("Length of the list with Movies that don't have actors: {}".format(len(index_to_remove_no_actors)))
if len(index_to_remove_no_actors) == 0:
    print("None of the movie miss actors")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_actors))

#------------------------------------------------------------------------------------------------

# Field 4: Extract Director Name

myfield_director = []
director_name = []
index_to_remove_no_directors = []

[myfield_director.append(i.find_all('div', {'class':'plot_summary'})) for i in tqdm_notebook(content_souplist_three)]

r_name = re.compile(".*name")

[[director_name.append(j.find_all('a', {'href':r_name})) for j in i] for i in myfield_director]
    
director_names = [item[0].text for item in director_name]

index_to_remove_no_directors = [i for i,x in enumerate(myfield_director) if not x]

print("Length of Directors list: {}".format(len(director_names)))
print("Length of the list with Movies that don't have directors: {}".format(len(index_to_remove_no_directors)))
if len(index_to_remove_no_directors) == 0:
    print("None of the movie miss directors")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_directors))

#------------------------------------------------------------------------------------------------

In [None]:
# Pickle the requests file for further use!

with open('plot_three_17012020.pkl', 'wb') as f:
    
    pickle.dump(plot_summary, f)
    
with open('rating_three_17012020.pkl', 'wb') as f:
    
    pickle.dump(ratings, f)
    
with open('actors_three_17012020.pkl', 'wb') as f:
    
    pickle.dump(actors_list, f)
    
with open('director_three_17012020.pkl', 'wb') as f:
    
    pickle.dump(director_names, f)
    
# Indexes to remove: [577, 657, 725, 902, 1224, 1540, 1599, 2425, 2570, 2617, 2642, 2690, 3316, 3598, 3649, 4037, 
# 4047, 4093, 4128, 4316, 4439, 4440, 4451, 4493, 4554, 4556, 4563, 4570, 4576, 4627, 4640, 4757, 4832, 4994] (No actors)

# Indexes to remove: [4503] (not rated)

In [None]:
synopsis_three = []

with open('D:\\GitHub-Thesis\\synopsis_url\\synopsis_three_12012020.pkl', 'rb') as f:
    
    synopsis_three = pickle.load(f)

print("Number of URLs: {}".format(len(synopsis_three)))

synopsis_souplist_three = []

for i in tqdm_notebook(synopsis_three):
    
    synopsis_souplist_three.append(BeautifulSoup(i.text))

print("Number of souplists: {}".format(len(synopsis_souplist_three)))

In [None]:
#------------------------------------------------------------------------------------------------

# Field 5: Extract Plot Synopsis

synopsis_step_one = []
synopsis_step_two = []
synopsis_step_three = []

[synopsis_step_one.append(i.find_all('ul', {'class':'ipl-zebra-list', 'id':'plot-synopsis-content'})) for i in tqdm_notebook(synopsis_souplist_three)]

[[synopsis_step_two.append(j.find_all('li', {'class':'ipl-zebra-list__item'})) for j in i] for i in synopsis_step_one]

[[synopsis_step_three.append(j.text.strip(' ').replace('\n', '').replace('\\', '')) for j in i] for i in synopsis_step_two]

index_to_remove_no_synopsis = [i for i,x in enumerate(synopsis_step_one) if not x]

print("Length of Synopsis list: {}".format(len(synopsis_step_three)))
print("Length of the list with Movies that don't have synopsis: {}".format(len(index_to_remove_no_synopsis)))
if len(index_to_remove_no_synopsis) == 0:
    print("None of the movies miss a synopsis")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_synopsis))

In [None]:
# Pickle the synopsis file for further use!

with open('synopsis_three_18012020.pkl', 'wb') as f:
    
    pickle.dump(synopsis_step_three, f)
    
# Indexes to remove: 0

In [None]:
review_three = []

with open('D:\\GitHub-Thesis\\reviews_url\\review_three_15012020.pkl', 'rb') as f:
    
    review_three = pickle.load(f)

print("Number of URLs: {}".format(len(review_three)))

review_souplist_three = []

for i in tqdm_notebook(review_three):
    
    review_souplist_three.append(BeautifulSoup(i.text))

print("Number of review tags: {}".format(len(review_souplist_three)))

#------------------------------------------------------------------------------------------------

In [None]:
# Field 6: Extract Movie Reviews

myfield_review_step_one = []
myfield_review_step_two = []
myfield_review_step_three = []

[myfield_review_step_one.append(i.find_all('div', {'class':'lister-list'})) for i in tqdm_notebook(review_souplist_three)]

[[myfield_review_step_two.append(j.find_all('div', {'class':'text show-more__control'})) for j in i] for i in myfield_review_step_one]

[myfield_review_step_three.append(list(map(lambda x: x.text, reviews))) for reviews in myfield_review_step_two]

index_to_remove_no_review = [i for i,x in enumerate(myfield_review_step_one) if not x]

print("Length of Reviews list: {}".format(len(myfield_review_step_three)))
print("Length of the list with Movies that don't have reviews: {}".format(len(index_to_remove_no_review)))
if len(index_to_remove_no_review) == 0:
    print("None of the movies miss reviews")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_review))

In [None]:
print(len([i for i,x in enumerate(myfield_review_step_one) if not x]))

print([i for i,x in enumerate(myfield_review_step_two) if not x])

print([i for i,x in enumerate(myfield_review_step_three) if not x])

In [None]:
# Pickle the reviews file for further use!

with open('reviews_three_18012020.pkl', 'wb') as f:
    
    pickle.dump(myfield_review_step_three, f)

In [None]:
myfield_review_step_three[104]

In [None]:
dataset = pd.read_pickle('D://GitHub-Thesis//dataset_58,000_14012020_latest_version.pkl') 

In [None]:
dataset_two = dataset.iloc[10000:15000].reset_index()

In [None]:
dataset_two.iloc[104]

### - - - - - - - - -  - - - - - - - - -  - - - - - - - - -  - - - - - - - - -  - - - - - - - - -  - - - - - - - - - 

#### Dataframe creation based on the movie content extracted

In [None]:
# STEP 1

with open('D:\\GitHub-Thesis\\movie_content_url\\data_three_12012020.pkl', 'rb') as f:
    
    content_three = pickle.load(f)

print("Number of URLs: {}".format(len(content_three)))

with open('D:\\GitHub-Thesis\\reviews_url\\review_three_15012020.pkl', 'rb') as f:
    
    review_three = pickle.load(f)

print("Number of URLs: {}".format(len(review_three)))

In [None]:
# STEP 2

with open('D:\\GitHub-Thesis\\58,000 movies\\movie_three\\pre-indexed files\\plot_three_17012020.pkl', 'rb') as f:
    
    plot = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_three\\pre-indexed files\\rating_three_17012020.pkl', 'rb') as f:
    
    rating = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_three\\pre-indexed files\\actors_three_17012020.pkl', 'rb') as f:
    
    actors = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_three\\pre-indexed files\\director_three_17012020.pkl', 'rb') as f:
    
    director = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_three\\synopsis_three_18012020.pkl', 'rb') as f:
    
    synopsis = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_three\\reviews_three_18012020.pkl', 'rb') as f:
    
    reviews = pickle.load(f)
    
print(len(plot))
print(len(rating))
print(len(actors))
print(len(director))
print(len(synopsis))
print(len(reviews))

In [None]:
# STEP 3

matching_add_plot = [s for s, x in enumerate(plot) if "Add a Plot" in x]

print(matching_add_plot)

matching_add_synopsis = [s for s, x in enumerate(synopsis) if 'It looks like we don\'t have a Synopsis for this title yet. Be the first to contribute! Just click the "Edit page" button at the bottom of the page or learn more in the Synopsis submission guide.' in x]

print(matching_add_synopsis)

print(len(matching_add_synopsis))

In [None]:
# STEP 4

index_remove = [868, 
                1418, 
                2611, 
                2807, 
                3208, 
                3387, 
                3390, 3391, 3441, 3501, 
                3557, 3588, 3646, 3649, 
                3661, 3709, 3751, 3755, 
                3845, 4012, 4121, 4157, 
                4553, 4646, 4757, 4772, 
                4804, 4977, 32, 924, 1264, 
                1265, 2570, 2611, 3010, 
                3155, 3304, 3374, 3390, 
                3428, 3441, 3505, 3547, 
                3646, 3649, 3650, 3741, 
                3751, 3755, 3792, 3945, 
                3948, 3950, 3992, 4037, 
                4120, 4157, 4187, 4214, 
                4238, 4381, 4499, 4503, 
                4553, 4566, 4627, 4646, 
                4706, 4751, 4772, 4832, 
                4898, 4946, 4974, 4976, 4977, 4997,
                577, 657, 725, 902, 1224, 1540, 1599, 2425, 2570, 2617, 2642, 2690, 3316, 3598, 3649, 4037, 
                4047, 4093, 4128, 4316, 4439, 4440, 4451, 4493, 4554, 4556, 4563, 4570, 4576, 4627, 4640, 4757, 4832, 4994,
                4503]

index_remove = list(dict.fromkeys(index_remove))

print(len(index_remove))

content_index_three = [i for j, i in enumerate(content_three) if j not in index_remove]

review_index_three = [i for j, i in enumerate(review_three) if j not in index_remove]

print(len(content_index_three))

print(len(review_index_three))

# The length of this content_one (4907) should now be extracted from souplist!

In [None]:
# STEP 5

# Pickle the synopsis file for further use!

with open('D:\\GitHub-Thesis\\58,000 movies\\movie_three\\content_index_three_20012020.pkl', 'wb') as f:
    
    pickle.dump(content_index_three, f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_three\\review_index_three_20012020.pkl', 'wb') as f:
    
    pickle.dump(review_index_three, f)

In [None]:
# STEP 6

with open('D:\\GitHub-Thesis\\58,000 movies\\movie_three\\content_index_three_20012020.pkl', 'rb') as f:
    
    content_index_three = pickle.load(f)

print("Number of URLs: {}".format(len(content_index_three)))

content_souplist_three = []

for i in tqdm_notebook(content_index_three):
    
    content_souplist_three.append(BeautifulSoup(i.text))

print("Number of souplists: {}".format(len(content_souplist_three)))

In [None]:
# STEP 7

# Field 1: Extract plot summary

# souplist_three = souplist_three[0:113] 

myfield_plot = []
plot_summary = []
index_to_remove_no_plot = []

[myfield_plot.append(i.find_all('div', {'class':'plot_summary'})) for i in tqdm_notebook(content_souplist_three)]

[[[plot_summary.append(y.text) for y in x.find_all('div', {'class':'summary_text'})] for x in i] if len(i) !=0 else index_to_remove_no_plot.append(myfield_plot.index(i)) for i in myfield_plot]
      
print("Length of Plot Summary list: {}".format(len(plot_summary)))
print("Length of the list with Movies that don't have plot summary: {}".format(len(index_to_remove_no_plot)))
if len(index_to_remove_no_plot) == 0:
    print("None of the movie miss plot")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_plot))

#------------------------------------------------------------------------------------------------

# Field 2: Extract IMDB rating

# myfield_rating = []
# ratings = []
# index_to_remove_no_rating = []

# [myfield_rating.append(i.find_all('div', {'class':'ratingValue'})) for i in tqdm_notebook(content_souplist_three)]

# [[[ratings.append(y.text) for y in x.find_all('span', {'itemprop':'ratingValue'})] for x in i] for i in myfield_rating]

# index_to_remove_no_rating = [i for i,x in enumerate(myfield_rating) if not x]

# print("Length of Ratings list: {}".format(len(ratings)))
# print("Length of the list with Movies that are not rated: {}".format(len(index_to_remove_no_rating)))
# if len(index_to_remove_no_rating) == 0:
#     print("None of the movie miss ratings")
# else:
#     print("Indexes to remove: {}".format(index_to_remove_no_rating))

#------------------------------------------------------------------------------------------------

# Field 3: Extract Actors

myfield_cast = []
ratings = []
phase_two = []
phase_three = []
actors_list = []
index_to_remove_no_actors = []

[myfield_cast.append(i.find_all('table', {'class':'cast_list'})) for i in tqdm_notebook(content_souplist_three)]
    
r_one = re.compile(".*name")

[[phase_two.append(j.find_all('a', {'href':r_one})) for j in i] for i in myfield_cast]

[phase_three.append(phase_two[i][1::2]) for i in range(len(phase_two))]

[actors_list.append(list(map(lambda x: x.text.strip(' ').replace('\n', ''), actors))) for actors in phase_three]            
    
index_to_remove_no_actors = [i for i,x in enumerate(myfield_cast) if not x]
        
print("Length of Actors list: {}".format(len(actors_list)))
print("Length of the list with Movies that don't have actors: {}".format(len(index_to_remove_no_actors)))
if len(index_to_remove_no_actors) == 0:
    print("None of the movie miss actors")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_actors))

#------------------------------------------------------------------------------------------------

# Field 4: Extract Director Name

myfield_director = []
director_name = []
index_to_remove_no_directors = []

[myfield_director.append(i.find_all('div', {'class':'plot_summary'})) for i in tqdm_notebook(content_souplist_three)]

r_name = re.compile(".*name")

[[director_name.append(j.find_all('a', {'href':r_name})) for j in i] for i in myfield_director]
    
director_names = [item[0].text for item in director_name]

index_to_remove_no_directors = [i for i,x in enumerate(myfield_director) if not x]

print("Length of Directors list: {}".format(len(director_names)))
print("Length of the list with Movies that don't have directors: {}".format(len(index_to_remove_no_directors)))
if len(index_to_remove_no_directors) == 0:
    print("None of the movie miss directors")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_directors))

In [None]:
#------------------------------------------------------------------------------------------------

myfield_rating = []
ratings = []
index_to_remove_no_rating = []

[myfield_rating.append(i.find_all('div', {'class':'ratingValue'})) for i in tqdm_notebook(content_souplist_three)]

[[[ratings.append(y.text) for y in x.find_all('span', {'itemprop':'ratingValue'})] for x in i] for i in myfield_rating]

index_to_remove_no_rating = [i for i,x in enumerate(myfield_rating) if not x]

print("Length of Ratings list: {}".format(len(ratings)))
print("Length of the list with Movies that are not rated: {}".format(len(index_to_remove_no_rating)))
if len(index_to_remove_no_rating) == 0:
    print("None of the movie miss ratings")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_rating))

In [None]:
# STEP 8

# Pickle the requests file for further use!

with open('D:\\GitHub-Thesis\\58,000 movies\\movie_three\\plot_three_20012020.pkl', 'wb') as f:
    
    pickle.dump(plot_summary, f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_three\\actors_three_20012020.pkl', 'wb') as f:
    
    pickle.dump(actors_list, f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_three\\director_three_20012020.pkl', 'wb') as f:
    
    pickle.dump(director_names, f)

In [None]:
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_three\\rating_three_20012020.pkl', 'wb') as f:
    
    pickle.dump(ratings, f)

In [None]:
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_three\\review_index_three_20012020.pkl', 'rb') as f:
    
    review_three = pickle.load(f)

print("Number of URLs: {}".format(len(review_three)))

review_souplist_three = []

for i in tqdm_notebook(review_three):
    
    review_souplist_three.append(BeautifulSoup(i.text))

print("Number of review tags: {}".format(len(review_souplist_three)))

In [None]:
# STEP 9

# Field 6: Extract Movie Reviews

myfield_review_step_one = []
myfield_review_step_two = []
myfield_review_step_three = []

[myfield_review_step_one.append(i.find_all('div', {'class':'lister-list'})) for i in tqdm_notebook(review_souplist_three)]

[[myfield_review_step_two.append(j.find_all('div', {'class':'text show-more__control'})) for j in i] for i in myfield_review_step_one]

[myfield_review_step_three.append(list(map(lambda x: x.text, reviews))) for reviews in myfield_review_step_two]

index_to_remove_no_review = [i for i,x in enumerate(myfield_review_step_one) if not x]

print("Length of Reviews list: {}".format(len(myfield_review_step_three)))
print("Length of the list with Movies that don't have reviews: {}".format(len(index_to_remove_no_review)))
if len(index_to_remove_no_review) == 0:
    print("None of the movies miss reviews")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_review))
    
print(len([i for i,x in enumerate(myfield_review_step_one) if not x]))

print([i for i,x in enumerate(myfield_review_step_two) if not x])

print([i for i,x in enumerate(myfield_review_step_three) if not x])

In [None]:
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_three\\reviews_three_20012020.pkl', 'wb') as f:
    
    pickle.dump(myfield_review_step_three, f)

In [None]:
data_three = pd.read_pickle("D:\\GitHub-Thesis\\58,000 movies\\movie_three\\dataset_three_final_20012020.pkl")

In [None]:
index_remove = []

myfield_review_step_three = [i for j, i in enumerate(myfield_review_step_three) if j not in index_remove]

print(len(myfield_review_step_three))

data_three.reviews = myfield_review_step_three

print(data_three.head())

In [None]:
# STEP 10

index_remove = [868, 
                1418, 
                2611, 
                2807, 
                3208, 
                3387, 
                3390, 3391, 3441, 3501, 
                3557, 3588, 3646, 3649, 
                3661, 3709, 3751, 3755, 
                3845, 4012, 4121, 4157, 
                4553, 4646, 4757, 4772, 
                4804, 4977, 32, 924, 1264, 
                1265, 2570, 2611, 3010, 
                3155, 3304, 3374, 3390, 
                3428, 3441, 3505, 3547, 
                3646, 3649, 3650, 3741, 
                3751, 3755, 3792, 3945, 
                3948, 3950, 3992, 4037, 
                4120, 4157, 4187, 4214, 
                4238, 4381, 4499, 4503, 
                4553, 4566, 4627, 4646, 
                4706, 4751, 4772, 4832, 
                4898, 4946, 4974, 4976, 
                4977, 4997, 577, 657, 
                725, 902, 1224, 1540, 
                1599, 2425, 2570, 2617, 
                2642, 2690, 3316, 3598, 
                3649, 4037, 4047, 4093, 
                4128, 4316, 4439, 4440, 
                4451, 4493, 4554, 4556, 
                4563, 4570, 4576, 4627, 
                4640, 4757, 4832, 4994,
                4503]

index_remove = list(dict.fromkeys(index_remove))

dataset = pd.read_pickle("D:\\GitHub-Thesis\\dataset_58,000_14012020_latest_version.pkl")

dataset_three = dataset.iloc[10000:15000].reset_index(drop=True)

dataset_three = dataset_three[~dataset_three.index.isin(index_remove)]

dataset_three.shape

In [23]:
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_three\\plot_three_20012020.pkl', 'rb') as f:
    
    plot = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_three\\rating_three_20012020.pkl', 'rb') as f:
    
    rating = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_three\\actors_three_20012020.pkl', 'rb') as f:
    
    actors = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_three\\director_three_20012020.pkl', 'rb') as f:
    
    director = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_three\\reviews_three_20012020.pkl', 'rb') as f:
    
    reviews = pickle.load(f)
    
assert len(plot) == len(rating) == len(actors) == len(director) == len(reviews)

In [24]:
[i for i,x in enumerate(rating) if not x]

[]

In [None]:
dataset_three['actors'] = actors

dataset_three['plot'] = plot

dataset_three['imdb_rating'] = rating

dataset_three['director'] = director

dataset_three['reviews'] = reviews

dataset_three = dataset_three.drop(["movieId", "imdbId", "synopsis_url"], axis=1)

dataset_three.iloc[2005]

In [None]:
dataset_three.shape

In [None]:
dataset_three = dataset_three[dataset_three.astype(str)['reviews'] != '[]']

dataset_three.shape

In [None]:
# dataset_three.to_pickle("D:\\GitHub-Thesis\\58,000 movies\\movie_three\\dataset_three_final_20012020.pkl") old

# data_three.to_pickle("D:\\GitHub-Thesis\\58,000 movies\\movie_three\\dataset_three_final_24012020.pkl") old

dataset_three.to_pickle("D:\\GitHub-Thesis\\58,000 movies\\movie_three\\dataset_three_final_25012020.pkl")

#### 4) 20000 movies

In [None]:
content_four = []

with open('D:\\GitHub-Thesis\\movie_content_url\\data_four_12012020.pkl', 'rb') as f:
    
    content_four = pickle.load(f)

print("Number of URLs: {}".format(len(content_four)))

content_souplist_four = []

for i in tqdm_notebook(content_four):
    
    content_souplist_four.append(BeautifulSoup(i.text))

print("Number of souplists: {}".format(len(content_souplist_four)))

In [None]:
# Field 1: Extract plot summary

# souplist_three = souplist_three[0:113] 

myfield_plot = []
plot_summary = []
index_to_remove_no_plot = []

[myfield_plot.append(i.find_all('div', {'class':'plot_summary'})) for i in tqdm_notebook(content_souplist_four)]

[[[plot_summary.append(y.text) for y in x.find_all('div', {'class':'summary_text'})] for x in i] if len(i) !=0 else index_to_remove_no_plot.append(myfield_plot.index(i)) for i in myfield_plot]
      
print("Length of Plot Summary list: {}".format(len(plot_summary)))
print("Length of the list with Movies that don't have plot summary: {}".format(len(index_to_remove_no_plot)))
if len(index_to_remove_no_plot) == 0:
    print("None of the movie miss plot")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_plot))

#------------------------------------------------------------------------------------------------

# Field 2: Extract IMDB rating

myfield_rating = []
ratings = []
index_to_remove_no_rating = []

[myfield_rating.append(i.find_all('div', {'class':'ratingValue'})) for i in tqdm_notebook(content_souplist_four)]

[[[ratings.append(y.text) for y in x.find_all('span', {'itemprop':'ratingValue'})] for x in i] for i in myfield_rating]

index_to_remove_no_rating = [i for i,x in enumerate(myfield_rating) if not x]

print("Length of Ratings list: {}".format(len(ratings)))
print("Length of the list with Movies that are not rated: {}".format(len(index_to_remove_no_rating)))
if len(index_to_remove_no_rating) == 0:
    print("None of the movie miss ratings")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_rating))

#------------------------------------------------------------------------------------------------

# Field 3: Extract Actors

myfield_cast = []
ratings = []
phase_two = []
phase_three = []
actors_list = []
index_to_remove_no_actors = []

[myfield_cast.append(i.find_all('table', {'class':'cast_list'})) for i in tqdm_notebook(content_souplist_four)]
    
r_one = re.compile(".*name")

[[phase_two.append(j.find_all('a', {'href':r_one})) for j in i] for i in myfield_cast]

[phase_three.append(phase_two[i][1::2]) for i in range(len(phase_two))]

[actors_list.append(list(map(lambda x: x.text.strip(' ').replace('\n', ''), actors))) for actors in phase_three]            
    
index_to_remove_no_actors = [i for i,x in enumerate(myfield_cast) if not x]
        
print("Length of Actors list: {}".format(len(actors_list)))
print("Length of the list with Movies that don't have actors: {}".format(len(index_to_remove_no_actors)))
if len(index_to_remove_no_actors) == 0:
    print("None of the movie miss actors")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_actors))

#------------------------------------------------------------------------------------------------

# Field 4: Extract Director Name

myfield_director = []
director_name = []
index_to_remove_no_directors = []

[myfield_director.append(i.find_all('div', {'class':'plot_summary'})) for i in tqdm_notebook(content_souplist_four)]

r_name = re.compile(".*name")

[[director_name.append(j.find_all('a', {'href':r_name})) for j in i] for i in myfield_director]
    
director_names = [item[0].text for item in director_name]

index_to_remove_no_directors = [i for i,x in enumerate(myfield_director) if not x]

print("Length of Directors list: {}".format(len(director_names)))
print("Length of the list with Movies that don't have directors: {}".format(len(index_to_remove_no_directors)))
if len(index_to_remove_no_directors) == 0:
    print("None of the movie miss directors")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_directors))

#------------------------------------------------------------------------------------------------

# Field 4: Extract Director Name

# myfield_director = []
director_name = []
index_to_remove_no_directors = []

# [myfield_director.append(i.find_all('div', {'class':'plot_summary'})) for i in tqdm_notebook(content_souplist_four)]

r_name = re.compile(".*name")

[[director_name.append(j.find_all('a', {'href':r_name})) for j in i] for i in myfield_director]
    
director_names = [item[0].text for item in director_name if len(item)!=0]

index_to_remove_no_directors = [i for i,x in enumerate(myfield_director) if not x]

print("Length of Directors list: {}".format(len(director_names)))
print("Length of the list with Movies that don't have directors: {}".format(len(index_to_remove_no_directors)))
if len(index_to_remove_no_directors) == 0:
    print("None of the movie miss directors")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_directors))

#------------------------------------------------------------------------------------------------

In [None]:
# Pickle the requests file for further use!

with open('plot_four_18012020.pkl', 'wb') as f:
    
    pickle.dump(plot_summary, f)
    
with open('rating_four_18012020.pkl', 'wb') as f:
    
    pickle.dump(ratings, f)
    
with open('actors_four_18012020.pkl', 'wb') as f:
    
    pickle.dump(actors_list, f)
    
with open('director_four_18012020.pkl', 'wb') as f:
    
    pickle.dump(director_names, f)
    
# Indexes to remove: [152, 261, 316, 343, 381, 391, 529, 778, 793, 876, 938, 1031, 1097, 1148, 1153, 1170, 1238, 1300, 1381, 
# 1482, 1497, 1568, 1639, 1902, 1910, 2046, 2086, 2189, 2190, 2224, 2234, 2235, 2426, 2439, 2502, 2504, 2578, 2671, 2715, 2730, 
#2749, 2880, 2976, 2985, 2996, 3061, 3073, 3138, 3271, 3283, 3364, 3371, 3538, 3613, 3653, 3688, 3689, 3742, 3759, 3774, 3777, 
#3809, 3831, 3841, 3951, 4049, 4086, 4106, 4171, 4208, 4322, 4327, 4381, 4433, 4457, 4465, 4618, 4672] (No actors)

# Indexes to remove: [1482, 1902, 2224, 4180, 4672] (not rated)

# Indexes to remove: [1482, 1902, 2224, 4180, 4672] (no summary)

# Indexes to remove: [2224, 4170, 4464] (no directors)

In [None]:
synopsis_four = []

with open('D:\\GitHub-Thesis\\synopsis_url\\synopsis_four_12012020.pkl', 'rb') as f:
    
    synopsis_four = pickle.load(f)

print("Number of URLs: {}".format(len(synopsis_four)))

synopsis_souplist_four = []

for i in tqdm_notebook(synopsis_four):
    
    synopsis_souplist_four.append(BeautifulSoup(i.text))

print("Number of souplists: {}".format(len(synopsis_souplist_four)))

In [None]:
#------------------------------------------------------------------------------------------------

# Field 5: Extract Plot Synopsis

synopsis_step_one = []
synopsis_step_two = []
synopsis_step_three = []

[synopsis_step_one.append(i.find_all('ul', {'class':'ipl-zebra-list', 'id':'plot-synopsis-content'})) for i in tqdm_notebook(synopsis_souplist_four)]

[[synopsis_step_two.append(j.find_all('li', {'class':'ipl-zebra-list__item'})) for j in i] for i in synopsis_step_one]

[[synopsis_step_three.append(j.text.strip(' ').replace('\n', '').replace('\\', '')) for j in i] for i in synopsis_step_two]

index_to_remove_no_synopsis = [i for i,x in enumerate(synopsis_step_one) if not x]

print("Length of Synopsis list: {}".format(len(synopsis_step_three)))
print("Length of the list with Movies that don't have synopsis: {}".format(len(index_to_remove_no_synopsis)))
if len(index_to_remove_no_synopsis) == 0:
    print("None of the movies miss a synopsis")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_synopsis))

In [None]:
# Pickle the synopsis file for further use!

with open('synopsis_four_18012020.pkl', 'wb') as f:
    
    pickle.dump(synopsis_step_three, f)
    
# Indexes to remove: [2224]

In [None]:
review_four = []

with open('D:\\GitHub-Thesis\\reviews_url\\review_four_15012020.pkl', 'rb') as f:
    
    review_four = pickle.load(f)

print("Number of URLs: {}".format(len(review_four)))

review_souplist_four = []

for i in tqdm_notebook(review_four):
    
    review_souplist_four.append(BeautifulSoup(i.text))

print("Number of review tags: {}".format(len(review_souplist_four)))

In [None]:
#------------------------------------------------------------------------------------------------

# Field 6: Extract Movie Reviews

myfield_review_step_one = []
myfield_review_step_two = []
myfield_review_step_three = []

[myfield_review_step_one.append(i.find_all('div', {'class':'lister-list'})) for i in tqdm_notebook(review_souplist_four)]

[[myfield_review_step_two.append(j.find_all('div', {'class':'text show-more__control'})) for j in i] for i in myfield_review_step_one]

[myfield_review_step_three.append(list(map(lambda x: x.text, reviews))) for reviews in myfield_review_step_two]

index_to_remove_no_review = [i for i,x in enumerate(myfield_review_step_one) if not x]

print("Length of Reviews list: {}".format(len(myfield_review_step_three)))
print("Length of the list with Movies that don't have reviews: {}".format(len(index_to_remove_no_review)))
if len(index_to_remove_no_review) == 0:
    print("None of the movies miss reviews")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_review))
    
print(len([i for i,x in enumerate(myfield_review_step_one) if not x]))

print([i for i,x in enumerate(myfield_review_step_two) if not x])

print([i for i,x in enumerate(myfield_review_step_three) if not x])

In [None]:
myfield_review_step_three[4947]

In [None]:
# Pickle the reviews file for further use!

with open('reviews_four_18012020.pkl', 'wb') as f:
    
    pickle.dump(myfield_review_step_three, f)
    
# Indexes to remove: [2224]

### - - - - - - - - -  - - - - - - - - -  - - - - - - - - -  - - - - - - - - -  - - - - - - - - -  - - - - - - - - - 

#### Dataframe creation based on the movie content extracted

In [None]:
# STEP 1

with open('D:\\GitHub-Thesis\\movie_content_url\\data_four_12012020.pkl', 'rb') as f:
    
    content_four = pickle.load(f)

print("Number of URLs: {}".format(len(content_four)))

with open('D:\\GitHub-Thesis\\reviews_url\\review_four_15012020.pkl', 'rb') as f:
    
    review_four = pickle.load(f)

print("Number of URLs: {}".format(len(review_four)))

In [None]:
# STEP 2

with open('D:\\GitHub-Thesis\\58,000 movies\\movie_four\\pre-indexed files\\plot_four_18012020.pkl', 'rb') as f:
    
    plot = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_four\\pre-indexed files\\rating_four_18012020.pkl', 'rb') as f:
    
    rating = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_four\\pre-indexed files\\actors_four_18012020.pkl', 'rb') as f:
    
    actors = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_four\\pre-indexed files\\director_four_18012020.pkl', 'rb') as f:
    
    director = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_four\\synopsis_four_18012020.pkl', 'rb') as f:
    
    synopsis = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_four\\reviews_four_18012020.pkl', 'rb') as f:
    
    reviews = pickle.load(f)
    
print(len(plot))
print(len(rating))
print(len(actors))
print(len(director))
print(len(synopsis))
print(len(reviews))

In [None]:
# STEP 3

matching_add_plot = [s for s, x in enumerate(plot) if "Add a Plot" in x]

print(matching_add_plot)

matching_add_synopsis = [s for s, x in enumerate(synopsis) if 'It looks like we don\'t have a Synopsis for this title yet. Be the first to contribute! Just click the "Edit page" button at the bottom of the page or learn more in the Synopsis submission guide.' in x]

print(matching_add_synopsis)

print(len(matching_add_synopsis))

In [None]:
# STEP 4

index_remove = [35, 110, 170, 290, 365, 366, 449, 573, 642, 643, 714, 866, 929, 1011, 1170, 1257, 1474, 1488, 1639, 1812, 1902, 1994, 2036, 2046, 2180, 2189, 2259, 2343, 2357, 2422, 2438, 2494, 2503, 2507, 2584, 2661, 2857, 2867, 3040, 3117, 3124, 3184, 3241, 3326, 3368, 3506, 3688, 3735, 3763, 3766, 3772, 3869, 3928, 3947, 4004, 4008, 4020, 4034, 4049, 4119, 4153, 4179, 4197, 4287, 4301, 4411, 4456, 4508, 4526, 4537, 4541, 4609, 4649, 4659, 4714, 4727, 4774, 4795, 4908, 4948, 4955,
                152, 261, 316, 343, 381, 391, 529, 778, 793, 876, 938, 1031, 1097, 1148, 1153, 1170, 1238, 1300, 1381, 
                1482, 1497, 1568, 1639, 1902, 1910, 2046, 2086, 2189, 2190, 2224, 2234, 2235, 2426, 2439, 2502, 2504, 2578, 2671, 2715, 2730, 
                2749, 2880, 2976, 2985, 2996, 3061, 3073, 3138, 3271, 3283, 3364, 3371, 3538, 3613, 3653, 3688, 3689, 3742, 3759, 3774, 3777, 
                3809, 3831, 3841, 3951, 4049, 4086, 4106, 4171, 4208, 4322, 4327, 4381, 4433, 4457, 4465, 4618, 4672,
                1482, 1902, 2224, 4180, 4672,
                1482, 1902, 2224, 4180, 4672,
                2224, 4170, 4464,
                45, 146, 150, 153, 160, 170, 236, 290, 307, 327, 343, 365, 366, 369, 410, 520, 539, 545, 570, 575, 643, 705, 714, 792, 864, 866, 876, 881, 889, 1086, 1101, 1142, 1148, 1151, 1173, 1202, 1254, 1257, 1286, 1412, 1414, 1482, 1488, 1522, 1639, 1664, 1738, 1759, 1813, 1856, 1902, 1933, 1934, 1950, 1989, 1994, 1995, 2023, 2028, 2036, 2046, 2086, 2189, 2193, 2259, 2268, 2321, 2344, 2355, 2368, 2414, 2422, 2438, 2439, 2442, 2443, 2486, 2494, 2507, 2508, 2517, 2518, 2578, 2584, 2596, 2605, 2670, 2786, 2810, 2852, 2913, 2975, 2995, 3071, 3084, 3117, 3119, 3124, 3130, 3146, 3173, 3174, 3184, 3229, 3238, 3281, 3282, 3290, 3368, 3369, 3466, 3489, 3506, 3518, 3519, 3535, 3537, 3538, 3576, 3610, 3631, 3666, 3688, 3695, 3699, 3715, 3759, 3762, 3767, 3774, 3789, 3809, 3830, 3887, 3899, 3902, 3910, 3928, 3936, 3947, 3977, 3979, 3983, 4008, 4020, 4049, 4066, 4119, 4137, 4159, 4167, 4170, 4179, 4184, 4185, 4186, 4216, 4222, 4231, 4250, 4290, 4338, 4362, 4363, 4364, 4376, 4399, 4405, 4411, 4464, 4508, 4515, 4518, 4540, 4541, 4546, 4573, 4576, 4584, 4587, 4588, 4594, 4649, 4659, 4666, 4671, 4675, 4695, 4716, 4727, 4736, 4775, 4793, 4862, 4872, 4881, 4892, 4910, 4919, 4937, 4947]

index_remove = list(dict.fromkeys(index_remove))

print(len(index_remove))

content_index_four = [i for j, i in enumerate(content_four) if j not in index_remove]

review_index_four = [i for j, i in enumerate(review_four) if j not in index_remove]

print(len(content_index_four))

print(len(review_index_four))

# The length of this content_one (4697) should now be extracted from souplist!

In [None]:
# STEP 5

# Pickle the synopsis file for further use!

with open('D:\\GitHub-Thesis\\58,000 movies\\movie_four\\content_index_four_20012020.pkl', 'wb') as f:
    
    pickle.dump(content_index_four, f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_four\\review_index_four_20012020.pkl', 'wb') as f:
    
    pickle.dump(review_index_four, f)

In [None]:
# STEP 6

with open('D:\\GitHub-Thesis\\58,000 movies\\movie_four\\content_index_four_20012020.pkl', 'rb') as f:
    
    content_index_four = pickle.load(f)

print("Number of URLs: {}".format(len(content_index_four)))

content_souplist_four = []

for i in tqdm_notebook(content_index_four):
    
    content_souplist_four.append(BeautifulSoup(i.text))

print("Number of souplists: {}".format(len(content_souplist_four)))

In [None]:
# STEP 7

# Field 1: Extract plot summary

# souplist_three = souplist_three[0:113] 

myfield_plot = []
plot_summary = []
index_to_remove_no_plot = []

[myfield_plot.append(i.find_all('div', {'class':'plot_summary'})) for i in tqdm_notebook(content_souplist_four)]

[[[plot_summary.append(y.text) for y in x.find_all('div', {'class':'summary_text'})] for x in i] if len(i) !=0 else index_to_remove_no_plot.append(myfield_plot.index(i)) for i in myfield_plot]
      
print("Length of Plot Summary list: {}".format(len(plot_summary)))
print("Length of the list with Movies that don't have plot summary: {}".format(len(index_to_remove_no_plot)))
if len(index_to_remove_no_plot) == 0:
    print("None of the movie miss plot")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_plot))

#------------------------------------------------------------------------------------------------

# Field 2: Extract IMDB rating

# myfield_rating = []
# ratings = []
# index_to_remove_no_rating = []

# [myfield_rating.append(i.find_all('div', {'class':'ratingValue'})) for i in tqdm_notebook(content_souplist_three)]

# [[[ratings.append(y.text) for y in x.find_all('span', {'itemprop':'ratingValue'})] for x in i] for i in myfield_rating]

# index_to_remove_no_rating = [i for i,x in enumerate(myfield_rating) if not x]

# print("Length of Ratings list: {}".format(len(ratings)))
# print("Length of the list with Movies that are not rated: {}".format(len(index_to_remove_no_rating)))
# if len(index_to_remove_no_rating) == 0:
#     print("None of the movie miss ratings")
# else:
#     print("Indexes to remove: {}".format(index_to_remove_no_rating))

#------------------------------------------------------------------------------------------------

# Field 3: Extract Actors

myfield_cast = []
ratings = []
phase_two = []
phase_three = []
actors_list = []
index_to_remove_no_actors = []

[myfield_cast.append(i.find_all('table', {'class':'cast_list'})) for i in tqdm_notebook(content_souplist_four)]
    
r_one = re.compile(".*name")

[[phase_two.append(j.find_all('a', {'href':r_one})) for j in i] for i in myfield_cast]

[phase_three.append(phase_two[i][1::2]) for i in range(len(phase_two))]

[actors_list.append(list(map(lambda x: x.text.strip(' ').replace('\n', ''), actors))) for actors in phase_three]            
    
index_to_remove_no_actors = [i for i,x in enumerate(myfield_cast) if not x]
        
print("Length of Actors list: {}".format(len(actors_list)))
print("Length of the list with Movies that don't have actors: {}".format(len(index_to_remove_no_actors)))
if len(index_to_remove_no_actors) == 0:
    print("None of the movie miss actors")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_actors))

#------------------------------------------------------------------------------------------------

# Field 4: Extract Director Name

myfield_director = []
director_name = []
index_to_remove_no_directors = []

[myfield_director.append(i.find_all('div', {'class':'plot_summary'})) for i in tqdm_notebook(content_souplist_four)]

r_name = re.compile(".*name")

[[director_name.append(j.find_all('a', {'href':r_name})) for j in i] for i in myfield_director]
    
director_names = [item[0].text for item in director_name]

index_to_remove_no_directors = [i for i,x in enumerate(myfield_director) if not x]

print("Length of Directors list: {}".format(len(director_names)))
print("Length of the list with Movies that don't have directors: {}".format(len(index_to_remove_no_directors)))
if len(index_to_remove_no_directors) == 0:
    print("None of the movie miss directors")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_directors))

In [None]:
#------------------------------------------------------------------------------------------------

myfield_rating = []
ratings = []
index_to_remove_no_rating = []

[myfield_rating.append(i.find_all('div', {'class':'ratingValue'})) for i in tqdm_notebook(content_souplist_four)]

[[[ratings.append(y.text) for y in x.find_all('span', {'itemprop':'ratingValue'})] for x in i] for i in myfield_rating]

index_to_remove_no_rating = [i for i,x in enumerate(myfield_rating) if not x]

print("Length of Ratings list: {}".format(len(ratings)))
print("Length of the list with Movies that are not rated: {}".format(len(index_to_remove_no_rating)))
if len(index_to_remove_no_rating) == 0:
    print("None of the movie miss ratings")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_rating))

In [None]:
# STEP 8

# Pickle the requests file for further use!

with open('D:\\GitHub-Thesis\\58,000 movies\\movie_four\\plot_four_20012020.pkl', 'wb') as f:
    
    pickle.dump(plot_summary, f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_four\\actors_four_20012020.pkl', 'wb') as f:
    
    pickle.dump(actors_list, f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_four\\director_four_20012020.pkl', 'wb') as f:
    
    pickle.dump(director_names, f)

with open('D:\\GitHub-Thesis\\58,000 movies\\movie_four\\rating_four_20012020.pkl', 'wb') as f:
    
    pickle.dump(ratings, f)

In [None]:
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_four\\review_index_four_20012020.pkl', 'rb') as f:
    
    review_four = pickle.load(f)

print("Number of URLs: {}".format(len(review_four)))

review_souplist_four = []

for i in tqdm_notebook(review_four):
    
    review_souplist_four.append(BeautifulSoup(i.text))

print("Number of review tags: {}".format(len(review_souplist_four)))

In [None]:
# STEP 9

# Field 6: Extract Movie Reviews

myfield_review_step_one = []
myfield_review_step_two = []
myfield_review_step_three = []

[myfield_review_step_one.append(i.find_all('div', {'class':'lister-list'})) for i in tqdm_notebook(review_souplist_four)]

[[myfield_review_step_two.append(j.find_all('div', {'class':'text show-more__control'})) for j in i] for i in myfield_review_step_one]

[myfield_review_step_three.append(list(map(lambda x: x.text, reviews))) for reviews in myfield_review_step_two]

index_to_remove_no_review = [i for i,x in enumerate(myfield_review_step_one) if not x]

print("Length of Reviews list: {}".format(len(myfield_review_step_three)))
print("Length of the list with Movies that don't have reviews: {}".format(len(index_to_remove_no_review)))
if len(index_to_remove_no_review) == 0:
    print("None of the movies miss reviews")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_review))
    
print(len([i for i,x in enumerate(myfield_review_step_one) if not x]))

print([i for i,x in enumerate(myfield_review_step_two) if not x])

print([i for i,x in enumerate(myfield_review_step_three) if not x])

In [None]:
data_four = pd.read_pickle("D:\\GitHub-Thesis\\58,000 movies\\movie_four\\dataset_four_final_20012020.pkl")

print(data_four.shape)

print(len(myfield_review_step_three))

print(len([i for i,x in enumerate(myfield_review_step_three) if not x]))

In [None]:
index_remove = [2162, 2170, 2222, 2243, 2253, 2264, 2309, 2316, 2330, 2332, 2374, 2381, 2390, 2398, 2457, 2462, 2473, 2481, 2655, 2678, 2719, 2776, 2926, 2937, 2969, 2970, 2974, 2979, 2993, 3019, 3028, 3072, 3080, 3126, 3201, 3296, 3318, 3334, 3345, 3360, 3361, 3398, 3431, 3450, 3483, 3509, 3512, 3527, 3568, 3572, 3577, 3590, 3609, 3682, 3693, 3695, 3702, 3719, 3726, 3736, 3764, 3765, 3768, 3791, 3802, 3829, 3845, 3895, 3912, 3932, 3939, 3951, 3978, 3983, 3991, 4009, 4047, 4091, 4114, 4125, 4146, 4151, 4156, 4247, 4253, 4255, 4274, 4278, 4304, 4306, 4313, 4315, 4320, 4372, 4381, 4387, 4393, 4412, 4431, 4441, 4449, 4486, 4503, 4570, 4579, 4587, 4597, 4613, 4621, 4638]

myfield_review_step_three = [i for j, i in enumerate(myfield_review_step_three) if j not in index_remove]

print(len(myfield_review_step_three))

data_four.reviews = myfield_review_step_three

print(data_four.head())

In [None]:
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_four\\reviews_four_24012020.pkl', 'wb') as f:
    
    pickle.dump(myfield_review_step_three, f)

In [None]:
# STEP 10

index_remove = [35, 110, 170, 290, 365, 366, 449, 573, 642, 643, 714, 866, 929, 1011, 1170, 1257, 1474, 1488, 1639, 1812, 1902, 1994, 2036, 2046, 2180, 2189, 2259, 2343, 2357, 2422, 2438, 2494, 2503, 2507, 2584, 2661, 2857, 2867, 3040, 3117, 3124, 3184, 3241, 3326, 3368, 3506, 3688, 3735, 3763, 3766, 3772, 3869, 3928, 3947, 4004, 4008, 4020, 4034, 4049, 4119, 4153, 4179, 4197, 4287, 4301, 4411, 4456, 4508, 4526, 4537, 4541, 4609, 4649, 4659, 4714, 4727, 4774, 4795, 4908, 4948, 4955,
                152, 261, 316, 343, 381, 391, 529, 778, 793, 876, 938, 1031, 1097, 1148, 1153, 1170, 1238, 1300, 1381, 
                1482, 1497, 1568, 1639, 1902, 1910, 2046, 2086, 2189, 2190, 2224, 2234, 2235, 2426, 2439, 2502, 2504, 2578, 2671, 2715, 2730, 
                2749, 2880, 2976, 2985, 2996, 3061, 3073, 3138, 3271, 3283, 3364, 3371, 3538, 3613, 3653, 3688, 3689, 3742, 3759, 3774, 3777, 
                3809, 3831, 3841, 3951, 4049, 4086, 4106, 4171, 4208, 4322, 4327, 4381, 4433, 4457, 4465, 4618, 4672,
                1482, 1902, 2224, 4180, 4672,
                1482, 1902, 2224, 4180, 4672,
                2224, 4170, 4464,
                45, 146, 150, 153, 160, 170, 236, 290, 307, 327, 343, 365, 366, 369, 410, 520, 539, 545, 570, 575, 643, 705, 714, 792, 864, 866, 876, 881, 889, 1086, 1101, 1142, 1148, 1151, 1173, 1202, 1254, 1257, 1286, 1412, 1414, 1482, 1488, 1522, 1639, 1664, 1738, 1759, 1813, 1856, 1902, 1933, 1934, 1950, 1989, 1994, 1995, 2023, 2028, 2036, 2046, 2086, 2189, 2193, 2259, 2268, 2321, 2344, 2355, 2368, 2414, 2422, 2438, 2439, 2442, 2443, 2486, 2494, 2507, 2508, 2517, 2518, 2578, 2584, 2596, 2605, 2670, 2786, 2810, 2852, 2913, 2975, 2995, 3071, 3084, 3117, 3119, 3124, 3130, 3146, 3173, 3174, 3184, 3229, 3238, 3281, 3282, 3290, 3368, 3369, 3466, 3489, 3506, 3518, 3519, 3535, 3537, 3538, 3576, 3610, 3631, 3666, 3688, 3695, 3699, 3715, 3759, 3762, 3767, 3774, 3789, 3809, 3830, 3887, 3899, 3902, 3910, 3928, 3936, 3947, 3977, 3979, 3983, 4008, 4020, 4049, 4066, 4119, 4137, 4159, 4167, 4170, 4179, 4184, 4185, 4186, 4216, 4222, 4231, 4250, 4290, 4338, 4362, 4363, 4364, 4376, 4399, 4405, 4411, 4464, 4508, 4515, 4518, 4540, 4541, 4546, 4573, 4576, 4584, 4587, 4588, 4594, 4649, 4659, 4666, 4671, 4675, 4695, 4716, 4727, 4736, 4775, 4793, 4862, 4872, 4881, 4892, 4910, 4919, 4937, 4947
                ]

index_remove = list(dict.fromkeys(index_remove))

dataset = pd.read_pickle("D:\\GitHub-Thesis\\dataset_58,000_14012020_latest_version.pkl")

dataset_four = dataset.iloc[15000:20000].reset_index(drop=True)

dataset_four = dataset_four[~dataset_four.index.isin(index_remove)]

dataset_four.shape

In [25]:
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_four\\plot_four_20012020.pkl', 'rb') as f:
    
    plot = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_four\\rating_four_20012020.pkl', 'rb') as f:
    
    rating = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_four\\actors_four_20012020.pkl', 'rb') as f:
    
    actors = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_four\\director_four_20012020.pkl', 'rb') as f:
    
    director = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_four\\reviews_four_24012020.pkl', 'rb') as f:
    
    reviews = pickle.load(f)
    
assert len(plot) == len(rating) == len(actors) == len(director) == len(reviews)

In [26]:
[i for i,x in enumerate(rating) if not x]

[]

In [None]:
index_remove = [2162, 2170, 2222, 2243, 2253, 2264, 2309, 2316, 2330, 2332, 2374, 2381, 2390, 2398, 2457, 2462, 2473, 2481, 2655, 2678, 2719, 2776, 2926, 2937, 2969, 2970, 2974, 2979, 2993, 3019, 3028, 3072, 3080, 3126, 3201, 3296, 3318, 3334, 3345, 3360, 3361, 3398, 3431, 3450, 3483, 3509, 3512, 3527, 3568, 3572, 3577, 3590, 3609, 3682, 3693, 3695, 3702, 3719, 3726, 3736, 3764, 3765, 3768, 3791, 3802, 3829, 3845, 3895, 3912, 3932, 3939, 3951, 3978, 3983, 3991, 4009, 4047, 4091, 4114, 4125, 4146, 4151, 4156, 4247, 4253, 4255, 4274, 4278, 4304, 4306, 4313, 4315, 4320, 4372, 4381, 4387, 4393, 4412, 4431, 4441, 4449, 4486, 4503, 4570, 4579, 4587, 4597, 4613, 4621, 4638]

len(index_remove)

In [None]:
dataset_four['actors'] = actors

dataset_four['plot'] = plot

dataset_four['imdb_rating'] = rating

dataset_four['director'] = director

dataset_four['reviews'] = reviews

dataset_four = dataset_four.drop(["movieId", "imdbId", "synopsis_url"], axis=1)

dataset_four.iloc[2005]

In [None]:
dataset_four.shape

In [None]:
index_remove = [2162, 2170, 2222, 2243, 2253, 2264, 2309, 2316, 2330, 2332, 2374, 2381, 2390, 2398, 2457, 2462, 2473, 2481, 2655, 2678, 2719, 2776, 2926, 2937, 2969, 2970, 2974, 2979, 2993, 3019, 3028, 3072, 3080, 3126, 3201, 3296, 3318, 3334, 3345, 3360, 3361, 3398, 3431, 3450, 3483, 3509, 3512, 3527, 3568, 3572, 3577, 3590, 3609, 3682, 3693, 3695, 3702, 3719, 3726, 3736, 3764, 3765, 3768, 3791, 3802, 3829, 3845, 3895, 3912, 3932, 3939, 3951, 3978, 3983, 3991, 4009, 4047, 4091, 4114, 4125, 4146, 4151, 4156, 4247, 4253, 4255, 4274, 4278, 4304, 4306, 4313, 4315, 4320, 4372, 4381, 4387, 4393, 4412, 4431, 4441, 4449, 4486, 4503, 4570, 4579, 4587, 4597, 4613, 4621, 4638]

index_remove = list(dict.fromkeys(index_remove))

dataset_four = dataset_four[~dataset_four.index.isin(index_remove)]

In [None]:
dataset_four.shape

In [None]:
# dataset_four.to_pickle("D:\\GitHub-Thesis\\58,000 movies\\movie_four\\dataset_four_final_24012020.pkl") (old)

dataset_four.to_pickle("D:\\GitHub-Thesis\\58,000 movies\\movie_four\\dataset_four_final_25012020.pkl")

#### 5) 25000 movies

In [None]:
content_five = []

with open('D:\\GitHub-Thesis\\movie_content_url\\data_five_13012020.pkl', 'rb') as f:
    
    content_five = pickle.load(f)

print("Number of URLs: {}".format(len(content_five)))

content_souplist_five = []

for i in tqdm_notebook(content_five):
    
    content_souplist_five.append(BeautifulSoup(i.text))

print("Number of souplists: {}".format(len(content_souplist_five)))

In [None]:
# Field 1: Extract plot summary

# souplist_three = souplist_three[0:113] 

myfield_plot = []
plot_summary = []
index_to_remove_no_plot = []

[myfield_plot.append(i.find_all('div', {'class':'plot_summary'})) for i in tqdm_notebook(content_souplist_five)]

[[[plot_summary.append(y.text) for y in x.find_all('div', {'class':'summary_text'})] for x in i] if len(i) !=0 else index_to_remove_no_plot.append(myfield_plot.index(i)) for i in myfield_plot]
      
print("Length of Plot Summary list: {}".format(len(plot_summary)))
print("Length of the list with Movies that don't have plot summary: {}".format(len(index_to_remove_no_plot)))
if len(index_to_remove_no_plot) == 0:
    print("None of the movie miss plot")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_plot))

#------------------------------------------------------------------------------------------------

# Field 2: Extract IMDB rating

myfield_rating = []
ratings = []
index_to_remove_no_rating = []

[myfield_rating.append(i.find_all('div', {'class':'ratingValue'})) for i in tqdm_notebook(content_souplist_five)]

[[[ratings.append(y.text) for y in x.find_all('span', {'itemprop':'ratingValue'})] for x in i] for i in myfield_rating]

index_to_remove_no_rating = [i for i,x in enumerate(myfield_rating) if not x]

print("Length of Ratings list: {}".format(len(ratings)))
print("Length of the list with Movies that are not rated: {}".format(len(index_to_remove_no_rating)))
if len(index_to_remove_no_rating) == 0:
    print("None of the movie miss ratings")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_rating))

#------------------------------------------------------------------------------------------------

# Field 3: Extract Actors

myfield_cast = []
ratings = []
phase_two = []
phase_three = []
actors_list = []
index_to_remove_no_actors = []

[myfield_cast.append(i.find_all('table', {'class':'cast_list'})) for i in tqdm_notebook(content_souplist_five)]
    
r_one = re.compile(".*name")

[[phase_two.append(j.find_all('a', {'href':r_one})) for j in i] for i in myfield_cast]

[phase_three.append(phase_two[i][1::2]) for i in range(len(phase_two))]

[actors_list.append(list(map(lambda x: x.text.strip(' ').replace('\n', ''), actors))) for actors in phase_three]            
    
index_to_remove_no_actors = [i for i,x in enumerate(myfield_cast) if not x]
        
print("Length of Actors list: {}".format(len(actors_list)))
print("Length of the list with Movies that don't have actors: {}".format(len(index_to_remove_no_actors)))
if len(index_to_remove_no_actors) == 0:
    print("None of the movie miss actors")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_actors))

#------------------------------------------------------------------------------------------------

# Field 4: Extract Director Name

myfield_director = []
director_name = []
index_to_remove_no_directors = []

[myfield_director.append(i.find_all('div', {'class':'plot_summary'})) for i in tqdm_notebook(content_souplist_five)]

r_name = re.compile(".*name")

[[director_name.append(j.find_all('a', {'href':r_name})) for j in i] for i in myfield_director]
    
director_names = [item[0].text for item in director_name if len(item)!=0]

index_to_remove_no_directors = [i for i,x in enumerate(myfield_director) if not x]

print("Length of Directors list: {}".format(len(director_names)))
print("Length of the list with Movies that don't have directors: {}".format(len(index_to_remove_no_directors)))
if len(index_to_remove_no_directors) == 0:
    print("None of the movie miss directors")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_directors))

In [None]:
#------------------------------------------------------------------------------------------------

# Pickle the requests file for further use!

with open('plot_five_18012020.pkl', 'wb') as f:
    
    pickle.dump(plot_summary, f)
    
with open('rating_five_18012020.pkl', 'wb') as f:
    
    pickle.dump(ratings, f)
    
with open('actors_five_18012020.pkl', 'wb') as f:
    
    pickle.dump(actors_list, f)
    
with open('director_five_18012020.pkl', 'wb') as f:
    
    pickle.dump(director_names, f)
    
# Indexes to remove: [183, 366, 374, 465, 484, 522, 567, 571, 580, 594, 606, 623, 630, 707, 711, 729, 733, 735, 747, 750, 753, 
# 780, 864, 875, 1041, 1043, 1050, 1093, 1249, 1250, 1274, 1373, 1427, 1457, 1486, 1503, 1508, 1553, 1610, 1662, 1679, 1693, 
# 1748, 1820, 1871, 1893, 1990, 2036, 2133, 2158, 2166, 2231, 2250, 2395, 2510, 2886, 2963, 2972, 3071, 3189, 3243, 3283, 3295, 
# 3298, 3329, 3396, 3410, 3429, 3451, 3485, 3555, 3672, 3754, 3819, 3834, 3934, 3936, 3973, 4068, 4077, 4216, 4227, 4551, 4564, 
# 4614, 4645, 4646, 4674, 4712, 4720, 4746, 4787, 4852, 4858, 4882, 4889, 4942, 4957, 4978, 4995] (No actors)

# Indexes to remove: [112, 115, 183, 366, 374, 501, 747, 1487, 1674, 2400, 4089, 4843] (not rated)

In [None]:
synopsis_five = []

with open('D:\\GitHub-Thesis\\synopsis_url\\synopsis_five_13012020.pkl', 'rb') as f:
    
    synopsis_five = pickle.load(f)

print("Number of URLs: {}".format(len(synopsis_five)))

synopsis_souplist_five = []

for i in tqdm_notebook(synopsis_five):
    
    synopsis_souplist_five.append(BeautifulSoup(i.text))

print("Number of souplists: {}".format(len(synopsis_souplist_five)))

In [None]:
#------------------------------------------------------------------------------------------------

# Field 5: Extract Plot Synopsis

synopsis_step_one = []
synopsis_step_two = []
synopsis_step_three = []

[synopsis_step_one.append(i.find_all('ul', {'class':'ipl-zebra-list', 'id':'plot-synopsis-content'})) for i in tqdm_notebook(synopsis_souplist_five)]

[[synopsis_step_two.append(j.find_all('li', {'class':'ipl-zebra-list__item'})) for j in i] for i in synopsis_step_one]

[[synopsis_step_three.append(j.text.strip(' ').replace('\n', '').replace('\\', '')) for j in i] for i in synopsis_step_two]

index_to_remove_no_synopsis = [i for i,x in enumerate(synopsis_step_one) if not x]

print("Length of Synopsis list: {}".format(len(synopsis_step_three)))
print("Length of the list with Movies that don't have synopsis: {}".format(len(index_to_remove_no_synopsis)))
if len(index_to_remove_no_synopsis) == 0:
    print("None of the movies miss a synopsis")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_synopsis))

In [None]:
# Pickle the synopsis file for further use!

with open('synopsis_five_18012020.pkl', 'wb') as f:
    
    pickle.dump(synopsis_step_three, f)
    
# Indexes to remove: [2224]

In [None]:
review_five = []

with open('D:\\GitHub-Thesis\\reviews_url\\review_five_15012020.pkl', 'rb') as f:
    
    review_five = pickle.load(f)

print("Number of URLs: {}".format(len(review_five)))

review_souplist_five = []

for i in tqdm_notebook(review_five):
    
    review_souplist_five.append(BeautifulSoup(i.text))

print("Number of review tags: {}".format(len(review_souplist_five)))

In [None]:
#------------------------------------------------------------------------------------------------

# Field 6: Extract Movie Reviews

myfield_review_step_one = []
myfield_review_step_two = []
myfield_review_step_three = []

[myfield_review_step_one.append(i.find_all('div', {'class':'lister-list'})) for i in tqdm_notebook(review_souplist_five)]

[[myfield_review_step_two.append(j.find_all('div', {'class':'text show-more__control'})) for j in i] for i in myfield_review_step_one]

[myfield_review_step_three.append(list(map(lambda x: x.text, reviews))) for reviews in myfield_review_step_two]

index_to_remove_no_review = [i for i,x in enumerate(myfield_review_step_one) if not x]

print("Length of Reviews list: {}".format(len(myfield_review_step_three)))
print("Length of the list with Movies that don't have reviews: {}".format(len(index_to_remove_no_review)))
if len(index_to_remove_no_review) == 0:
    print("None of the movies miss reviews")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_review))

In [None]:
# Pickle the reviews file for further use!

with open('reviews_five_18012020.pkl', 'wb') as f:
    
    pickle.dump(myfield_review_step_three, f)
    
# Indexes to remove: 0

### - - - - - - - - -  - - - - - - - - -  - - - - - - - - -  - - - - - - - - -  - - - - - - - - -  - - - - - - - - - 

#### Dataframe creation based on the movie content extracted

In [None]:
# STEP 1

with open('D:\\GitHub-Thesis\\movie_content_url\\data_five_13012020.pkl', 'rb') as f:
    
    content_five = pickle.load(f)

print("Number of URLs: {}".format(len(content_five)))

with open('D:\\GitHub-Thesis\\reviews_url\\review_five_15012020.pkl', 'rb') as f:
    
    review_five = pickle.load(f)

print("Number of URLs: {}".format(len(review_five)))

In [None]:
# STEP 2

with open('D:\\GitHub-Thesis\\58,000 movies\\movie_five\\pre-indexed files\\plot_five_18012020.pkl', 'rb') as f:
    
    plot = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_five\\pre-indexed files\\rating_five_18012020.pkl', 'rb') as f:
    
    rating = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_five\\pre-indexed files\\actors_five_18012020.pkl', 'rb') as f:
    
    actors = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_five\\pre-indexed files\\director_five_18012020.pkl', 'rb') as f:
    
    director = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_five\\synopsis_five_18012020.pkl', 'rb') as f:
    
    synopsis = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_five\\reviews_five_18012020.pkl', 'rb') as f:
    
    reviews = pickle.load(f)
    
print(len(plot))
print(len(rating))
print(len(actors))
print(len(director))
print(len(synopsis))
print(len(reviews))

In [None]:
# STEP 3

matching_add_plot = [s for s, x in enumerate(plot) if "Add a Plot" in x]

print(matching_add_plot)

matching_add_synopsis = [s for s, x in enumerate(synopsis) if 'It looks like we don\'t have a Synopsis for this title yet. Be the first to contribute! Just click the "Edit page" button at the bottom of the page or learn more in the Synopsis submission guide.' in x]

print(matching_add_synopsis)

print(len(matching_add_synopsis))

In [None]:
# STEP 4

index_remove = [183, 366, 374, 465, 484, 522, 567, 571, 580, 594, 606, 623, 630, 707, 711, 729, 733, 735, 747, 750, 753, 
                780, 864, 875, 1041, 1043, 1050, 1093, 1249, 1250, 1274, 1373, 1427, 1457, 1486, 1503, 1508, 1553, 1610, 1662, 
                1679, 1693, 
                1748, 1820, 1871, 1893, 1990, 2036, 2133, 2158, 2166, 2231, 2250, 2395, 2510, 2886, 2963, 2972, 3071, 3189, 3243, 3283, 3295, 
                3298, 3329, 3396, 3410, 3429, 3451, 3485, 3555, 3672, 3754, 3819, 3834, 3934, 3936, 3973, 4068, 4077, 4216, 4227, 4551, 4564, 
                4614, 4645, 4646, 4674, 4712, 4720, 4746, 4787, 4852, 4858, 4882, 4889, 4942, 4957, 4978, 4995,
                112, 115, 183, 366, 374, 501, 747, 1487, 1674, 2400, 4089, 4843,
                12, 70, 89, 90, 91, 196, 214, 215, 242, 248, 319, 330, 366, 381, 403, 410, 432, 505, 518, 550, 581, 624, 684, 
                744, 746, 789, 823, 824, 829, 908, 921, 968, 1097, 1166, 1265, 1288, 1325, 1363, 1381, 1422, 1527, 1535, 1575, 
                1622, 1674, 1936, 1989, 2026, 2067, 2122, 2220, 2299, 2340, 2396, 2400, 2434, 2435, 2517, 2685, 2686, 2693, 
                2704, 2724, 2731, 2748, 2770, 2907, 2949, 2953, 3117, 3126, 3189, 3280, 3293, 3337, 3398, 3435, 3533, 3555, 
                3584, 3645, 3709, 3756, 3844, 3845, 3863, 4016, 4122, 4172, 4213, 4225, 4369, 4450, 4457, 4476, 4489, 4498, 
                4539, 4551, 4578, 4646, 4689, 4691, 4732, 4843, 4855, 4860, 4927, 4942, 4954, 4962, 4978]

index_remove = list(dict.fromkeys(index_remove))

print(len(index_remove))

content_index_five = [i for j, i in enumerate(content_five) if j not in index_remove]

review_index_five = [i for j, i in enumerate(review_five) if j not in index_remove]

print(len(content_index_five))

print(len(review_index_five))

# The length of this content_one (4697) should now be extracted from souplist!

In [None]:
# STEP 5

# Pickle the synopsis file for further use!

with open('D:\\GitHub-Thesis\\58,000 movies\\movie_five\\content_index_five_20012020.pkl', 'wb') as f:
    
    pickle.dump(content_index_five, f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_five\\review_index_five_20012020.pkl', 'wb') as f:
    
    pickle.dump(review_index_five, f)

In [None]:
# STEP 6

with open('D:\\GitHub-Thesis\\58,000 movies\\movie_five\\content_index_five_20012020.pkl', 'rb') as f:
    
    content_index_five = pickle.load(f)

print("Number of URLs: {}".format(len(content_index_five)))

content_souplist_five = []

for i in tqdm_notebook(content_index_five):
    
    content_souplist_five.append(BeautifulSoup(i.text))

print("Number of souplists: {}".format(len(content_souplist_five)))

In [None]:
# STEP 7

# Field 1: Extract plot summary

# souplist_three = souplist_three[0:113] 

myfield_plot = []
plot_summary = []
index_to_remove_no_plot = []

[myfield_plot.append(i.find_all('div', {'class':'plot_summary'})) for i in tqdm_notebook(content_souplist_five)]

[[[plot_summary.append(y.text) for y in x.find_all('div', {'class':'summary_text'})] for x in i] if len(i) !=0 else index_to_remove_no_plot.append(myfield_plot.index(i)) for i in myfield_plot]
      
print("Length of Plot Summary list: {}".format(len(plot_summary)))
print("Length of the list with Movies that don't have plot summary: {}".format(len(index_to_remove_no_plot)))
if len(index_to_remove_no_plot) == 0:
    print("None of the movie miss plot")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_plot))

#------------------------------------------------------------------------------------------------

# Field 2: Extract IMDB rating

# myfield_rating = []
# ratings = []
# index_to_remove_no_rating = []

# [myfield_rating.append(i.find_all('div', {'class':'ratingValue'})) for i in tqdm_notebook(content_souplist_three)]

# [[[ratings.append(y.text) for y in x.find_all('span', {'itemprop':'ratingValue'})] for x in i] for i in myfield_rating]

# index_to_remove_no_rating = [i for i,x in enumerate(myfield_rating) if not x]

# print("Length of Ratings list: {}".format(len(ratings)))
# print("Length of the list with Movies that are not rated: {}".format(len(index_to_remove_no_rating)))
# if len(index_to_remove_no_rating) == 0:
#     print("None of the movie miss ratings")
# else:
#     print("Indexes to remove: {}".format(index_to_remove_no_rating))

#------------------------------------------------------------------------------------------------

# Field 3: Extract Actors

myfield_cast = []
ratings = []
phase_two = []
phase_three = []
actors_list = []
index_to_remove_no_actors = []

[myfield_cast.append(i.find_all('table', {'class':'cast_list'})) for i in tqdm_notebook(content_souplist_five)]
    
r_one = re.compile(".*name")

[[phase_two.append(j.find_all('a', {'href':r_one})) for j in i] for i in myfield_cast]

[phase_three.append(phase_two[i][1::2]) for i in range(len(phase_two))]

[actors_list.append(list(map(lambda x: x.text.strip(' ').replace('\n', ''), actors))) for actors in phase_three]            
    
index_to_remove_no_actors = [i for i,x in enumerate(myfield_cast) if not x]
        
print("Length of Actors list: {}".format(len(actors_list)))
print("Length of the list with Movies that don't have actors: {}".format(len(index_to_remove_no_actors)))
if len(index_to_remove_no_actors) == 0:
    print("None of the movie miss actors")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_actors))

#------------------------------------------------------------------------------------------------

# Field 4: Extract Director Name

myfield_director = []
director_name = []
index_to_remove_no_directors = []

[myfield_director.append(i.find_all('div', {'class':'plot_summary'})) for i in tqdm_notebook(content_souplist_five)]

r_name = re.compile(".*name")

[[director_name.append(j.find_all('a', {'href':r_name})) for j in i] for i in myfield_director]
    
director_names = [item[0].text for item in director_name]

index_to_remove_no_directors = [i for i,x in enumerate(myfield_director) if not x]

print("Length of Directors list: {}".format(len(director_names)))
print("Length of the list with Movies that don't have directors: {}".format(len(index_to_remove_no_directors)))
if len(index_to_remove_no_directors) == 0:
    print("None of the movie miss directors")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_directors))

In [None]:
#------------------------------------------------------------------------------------------------

myfield_rating = []
ratings = []
index_to_remove_no_rating = []

[myfield_rating.append(i.find_all('div', {'class':'ratingValue'})) for i in tqdm_notebook(content_souplist_five)]

[[[ratings.append(y.text) for y in x.find_all('span', {'itemprop':'ratingValue'})] for x in i] for i in myfield_rating]

index_to_remove_no_rating = [i for i,x in enumerate(myfield_rating) if not x]

print("Length of Ratings list: {}".format(len(ratings)))
print("Length of the list with Movies that are not rated: {}".format(len(index_to_remove_no_rating)))
if len(index_to_remove_no_rating) == 0:
    print("None of the movie miss ratings")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_rating))

In [None]:
# STEP 8

# Pickle the requests file for further use!

with open('D:\\GitHub-Thesis\\58,000 movies\\movie_five\\plot_five_20012020.pkl', 'wb') as f:
    
    pickle.dump(plot_summary, f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_five\\actors_five_20012020.pkl', 'wb') as f:
    
    pickle.dump(actors_list, f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_five\\director_five_20012020.pkl', 'wb') as f:
    
    pickle.dump(director_names, f)

In [None]:
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_five\\rating_five_20012020.pkl', 'wb') as f:
    
    pickle.dump(ratings, f)

In [None]:
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_five\\review_index_five_20012020.pkl', 'rb') as f:
    
    review_five = pickle.load(f)

print("Number of URLs: {}".format(len(review_five)))

review_souplist_five = []

for i in tqdm_notebook(review_five):
    
    review_souplist_five.append(BeautifulSoup(i.text))

print("Number of review tags: {}".format(len(review_souplist_five)))

In [None]:
# STEP 9

# Field 6: Extract Movie Reviews

myfield_review_step_one = []
myfield_review_step_two = []
myfield_review_step_three = []

[myfield_review_step_one.append(i.find_all('div', {'class':'lister-list'})) for i in tqdm_notebook(review_souplist_five)]

[[myfield_review_step_two.append(j.find_all('div', {'class':'text show-more__control'})) for j in i] for i in myfield_review_step_one]

[myfield_review_step_three.append(list(map(lambda x: x.text, reviews))) for reviews in myfield_review_step_two]

index_to_remove_no_review = [i for i,x in enumerate(myfield_review_step_one) if not x]

print("Length of Reviews list: {}".format(len(myfield_review_step_three)))
print("Length of the list with Movies that don't have reviews: {}".format(len(index_to_remove_no_review)))
if len(index_to_remove_no_review) == 0:
    print("None of the movies miss reviews")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_review))
    
print(len([i for i,x in enumerate(myfield_review_step_one) if not x]))

print([i for i,x in enumerate(myfield_review_step_two) if not x])

print([i for i,x in enumerate(myfield_review_step_three) if not x])

In [None]:
data_five = pd.read_pickle("D:\\GitHub-Thesis\\58,000 movies\\movie_five\\dataset_five_final_20012020.pkl")

print(data_five.shape)

print(len(myfield_review_step_three))

print(len([i for i,x in enumerate(myfield_review_step_three) if not x]))

index_remove = [2162, 2170, 2222, 2243, 2253, 2264, 2309, 2316, 2330, 2332, 2374, 2381, 2390, 2398, 2457, 2462, 2473, 2481, 2655, 2678, 2719, 2776, 2926, 2937, 2969, 2970, 2974, 2979, 2993, 3019, 3028, 3072, 3080, 3126, 3201, 3296, 3318, 3334, 3345, 3360, 3361, 3398, 3431, 3450, 3483, 3509, 3512, 3527, 3568, 3572, 3577, 3590, 3609, 3682, 3693, 3695, 3702, 3719, 3726, 3736, 3764, 3765, 3768, 3791, 3802, 3829, 3845, 3895, 3912, 3932, 3939, 3951, 3978, 3983, 3991, 4009, 4047, 4091, 4114, 4125, 4146, 4151, 4156, 4247, 4253, 4255, 4274, 4278, 4304, 4306, 4313, 4315, 4320, 4372, 4381, 4387, 4393, 4412, 4431, 4441, 4449, 4486, 4503, 4570, 4579, 4587, 4597, 4613, 4621, 4638]

myfield_review_step_three = [i for j, i in enumerate(myfield_review_step_three) if j not in index_remove]

print(len(myfield_review_step_three))

data_four.reviews = myfield_review_step_three

print(data_four.head())

dataset_five = dataset_five[dataset_five.astype(str)['reviews'] != '[]']

In [None]:
index_remove = [2162, 2170, 2222, 2243, 2253, 2264, 2309, 2316, 2330, 2332, 2374, 2381, 2390, 2398, 2457, 2462, 2473, 2481, 2655, 2678, 2719, 2776, 2926, 2937, 2969, 2970, 2974, 2979, 2993, 3019, 3028, 3072, 3080, 3126, 3201, 3296, 3318, 3334, 3345, 3360, 3361, 3398, 3431, 3450, 3483, 3509, 3512, 3527, 3568, 3572, 3577, 3590, 3609, 3682, 3693, 3695, 3702, 3719, 3726, 3736, 3764, 3765, 3768, 3791, 3802, 3829, 3845, 3895, 3912, 3932, 3939, 3951, 3978, 3983, 3991, 4009, 4047, 4091, 4114, 4125, 4146, 4151, 4156, 4247, 4253, 4255, 4274, 4278, 4304, 4306, 4313, 4315, 4320, 4372, 4381, 4387, 4393, 4412, 4431, 4441, 4449, 4486, 4503, 4570, 4579, 4587, 4597, 4613, 4621, 4638]

myfield_review_step_three = [i for j, i in enumerate(myfield_review_step_three) if j not in index_remove]

print(len(myfield_review_step_three))

data_four.reviews = myfield_review_step_three

print(data_four.head())

In [None]:
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_five\\reviews_five_24012020.pkl', 'wb') as f:
    
    pickle.dump(myfield_review_step_three, f)

In [None]:
# STEP 10

index_remove = [183, 366, 374, 465, 484, 522, 567, 571, 580, 594, 606, 623, 630, 707, 711, 729, 733, 735, 747, 750, 753, 
                780, 864, 875, 1041, 1043, 1050, 1093, 1249, 1250, 1274, 1373, 1427, 1457, 1486, 1503, 1508, 1553, 1610, 1662, 
                1679, 1693, 
                1748, 1820, 1871, 1893, 1990, 2036, 2133, 2158, 2166, 2231, 2250, 2395, 2510, 2886, 2963, 2972, 3071, 3189, 3243, 3283, 3295, 
                3298, 3329, 3396, 3410, 3429, 3451, 3485, 3555, 3672, 3754, 3819, 3834, 3934, 3936, 3973, 4068, 4077, 4216, 4227, 4551, 4564, 
                4614, 4645, 4646, 4674, 4712, 4720, 4746, 4787, 4852, 4858, 4882, 4889, 4942, 4957, 4978, 4995,
                112, 115, 183, 366, 374, 501, 747, 1487, 1674, 2400, 4089, 4843,
                12, 70, 89, 90, 91, 196, 214, 215, 242, 248, 319, 330, 366, 381, 403, 410, 432, 505, 518, 550, 581, 624, 684, 
                744, 746, 789, 823, 824, 829, 908, 921, 968, 1097, 1166, 1265, 1288, 1325, 1363, 1381, 1422, 1527, 1535, 1575, 
                1622, 1674, 1936, 1989, 2026, 2067, 2122, 2220, 2299, 2340, 2396, 2400, 2434, 2435, 2517, 2685, 2686, 2693, 
                2704, 2724, 2731, 2748, 2770, 2907, 2949, 2953, 3117, 3126, 3189, 3280, 3293, 3337, 3398, 3435, 3533, 3555, 
                3584, 3645, 3709, 3756, 3844, 3845, 3863, 4016, 4122, 4172, 4213, 4225, 4369, 4450, 4457, 4476, 4489, 4498, 
                4539, 4551, 4578, 4646, 4689, 4691, 4732, 4843, 4855, 4860, 4927, 4942, 4954, 4962, 4978]

index_remove = list(dict.fromkeys(index_remove))

dataset = pd.read_pickle("D:\\GitHub-Thesis\\dataset_58,000_14012020_latest_version.pkl")

dataset_five = dataset.iloc[20000:25000].reset_index(drop=True)

dataset_five = dataset_five[~dataset_five.index.isin(index_remove)]

dataset_five.shape

In [27]:
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_five\\plot_five_20012020.pkl', 'rb') as f:
    
    plot = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_five\\rating_five_20012020.pkl', 'rb') as f:
    
    rating = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_five\\actors_five_20012020.pkl', 'rb') as f:
    
    actors = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_five\\director_five_20012020.pkl', 'rb') as f:
    
    director = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_five\\reviews_five_24012020.pkl', 'rb') as f:
    
    reviews = pickle.load(f)
    
assert len(plot) == len(rating) == len(actors) == len(director) == len(reviews)

In [28]:
[i for i,x in enumerate(rating) if not x]

[]

In [None]:
len(reviews)

In [None]:
dataset_five['actors'] = actors

dataset_five['plot'] = plot

dataset_five['imdb_rating'] = rating

dataset_five['director'] = director

dataset_five['reviews'] = reviews

dataset_five = dataset_five.drop(["movieId", "imdbId", "synopsis_url"], axis=1)

dataset_five.iloc[2005]

In [None]:
index_remove = [47, 52, 57, 110, 111, 149, 204, 305, 344, 361, 375, 438, 491, 542, 564, 568, 570, 573, 595, 597, 610, 630, 631, 635, 682, 689, 711, 712, 715, 732, 734, 793, 861, 893, 897, 971, 974, 975, 1070, 1082, 1110, 1135, 1137, 1156, 1170, 1199, 1204, 1209, 1284, 1294, 1301, 1307, 1386, 1404, 1453, 1471, 1483, 1516, 1523, 1551, 1576, 1652, 1656, 1676, 1699, 1700, 1724, 1769, 1778, 1787, 1816, 1818, 1835, 1852, 1869, 1887, 1910, 1942, 1970, 1988, 1996, 2003, 2018, 2032, 2068, 2078, 2090, 2094, 2208, 2220, 2242, 2244, 2263, 2264, 2312, 2353, 2356, 2360, 2385, 2400, 2445, 2462, 2463, 2535, 2570, 2572, 2573, 2576, 2577, 2584, 2603, 2620, 2628, 2633, 2683, 2694, 2727, 2764, 2820, 2868, 2876, 2904, 2946, 2966, 2977, 2984, 3003, 3006, 3023, 3067, 3075, 3081, 3092, 3105, 3147, 3211, 3280, 3295, 3296, 3326, 3358, 3359, 3363, 3365, 3367, 3374, 3394, 3405, 3432, 3433, 3447, 3523, 3524, 3526, 3564, 3589, 3731, 3790, 3815, 3842, 3845, 3851, 3902, 3903, 3928, 3935, 3958, 3976, 3980, 3987, 4038, 4108, 4150, 4152, 4189, 4222, 4244, 4263, 4270, 4273, 4295, 4297, 4304, 4323, 4324, 4358, 4393, 4435, 4490, 4493, 4496, 4501, 4535, 4555, 4576, 4656, 4669, 4724, 4738, 4747, 4761, 4778]

dataset_five = dataset_five[~dataset_five.index.isin(index_remove)]

dataset_five.shape

In [None]:
dataset_five = dataset_five[dataset_five.astype(str)['reviews'] != '[]']

In [None]:
dataset_five.shape

In [None]:
# dataset_five.to_pickle("D:\\GitHub-Thesis\\58,000 movies\\movie_five\\dataset_five_final_24012020.pkl") old

dataset_five.to_pickle("D:\\GitHub-Thesis\\58,000 movies\\movie_five\\dataset_five_final_25012020.pkl")

#### 6) 30000 movies

In [None]:
content_six = []

with open('D:\\GitHub-Thesis\\movie_content_url\\data_six_13012020.pkl', 'rb') as f:
    
    content_six = pickle.load(f)

print("Number of URLs: {}".format(len(content_six)))

content_souplist_six = []

for i in tqdm_notebook(content_six):
    
    content_souplist_six.append(BeautifulSoup(i.text))

print("Number of souplists: {}".format(len(content_souplist_six)))

In [None]:
# Field 1: Extract plot summary

# souplist_three = souplist_three[0:113] 

myfield_plot = []
plot_summary = []
index_to_remove_no_plot = []

[myfield_plot.append(i.find_all('div', {'class':'plot_summary'})) for i in tqdm_notebook(content_souplist_six)]

[[[plot_summary.append(y.text) for y in x.find_all('div', {'class':'summary_text'})] for x in i] if len(i) !=0 else index_to_remove_no_plot.append(myfield_plot.index(i)) for i in myfield_plot]
      
print("Length of Plot Summary list: {}".format(len(plot_summary)))
print("Length of the list with Movies that don't have plot summary: {}".format(len(index_to_remove_no_plot)))
if len(index_to_remove_no_plot) == 0:
    print("None of the movie miss plot")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_plot))

#------------------------------------------------------------------------------------------------

# Field 2: Extract IMDB rating

myfield_rating = []
ratings = []
index_to_remove_no_rating = []

[myfield_rating.append(i.find_all('div', {'class':'ratingValue'})) for i in tqdm_notebook(content_souplist_six)]

[[[ratings.append(y.text) for y in x.find_all('span', {'itemprop':'ratingValue'})] for x in i] for i in myfield_rating]

index_to_remove_no_rating = [i for i,x in enumerate(myfield_rating) if not x]

print("Length of Ratings list: {}".format(len(ratings)))
print("Length of the list with Movies that are not rated: {}".format(len(index_to_remove_no_rating)))
if len(index_to_remove_no_rating) == 0:
    print("None of the movie miss ratings")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_rating))

#------------------------------------------------------------------------------------------------

# Field 3: Extract Actors

myfield_cast = []
ratings = []
phase_two = []
phase_three = []
actors_list = []
index_to_remove_no_actors = []

[myfield_cast.append(i.find_all('table', {'class':'cast_list'})) for i in tqdm_notebook(content_souplist_six)]
    
r_one = re.compile(".*name")

[[phase_two.append(j.find_all('a', {'href':r_one})) for j in i] for i in myfield_cast]

[phase_three.append(phase_two[i][1::2]) for i in range(len(phase_two))]

[actors_list.append(list(map(lambda x: x.text.strip(' ').replace('\n', ''), actors))) for actors in phase_three]            
    
index_to_remove_no_actors = [i for i,x in enumerate(myfield_cast) if not x]
        
print("Length of Actors list: {}".format(len(actors_list)))
print("Length of the list with Movies that don't have actors: {}".format(len(index_to_remove_no_actors)))
if len(index_to_remove_no_actors) == 0:
    print("None of the movie miss actors")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_actors))

#------------------------------------------------------------------------------------------------

# Field 4: Extract Director Name

myfield_director = []
director_name = []
index_to_remove_no_directors = []

[myfield_director.append(i.find_all('div', {'class':'plot_summary'})) for i in tqdm_notebook(content_souplist_six)]

r_name = re.compile(".*name")

[[director_name.append(j.find_all('a', {'href':r_name})) for j in i] for i in myfield_director]
    
director_names = [item[0].text for item in director_name if len(item)!=0]

index_to_remove_no_directors = [i for i,x in enumerate(myfield_director) if not x]

print("Length of Directors list: {}".format(len(director_names)))
print("Length of the list with Movies that don't have directors: {}".format(len(index_to_remove_no_directors)))
if len(index_to_remove_no_directors) == 0:
    print("None of the movie miss directors")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_directors))

In [None]:
#------------------------------------------------------------------------------------------------

# Pickle the requests file for further use!

with open('plot_six_18012020.pkl', 'wb') as f:
    
    pickle.dump(plot_summary, f)
    
with open('rating_six_18012020.pkl', 'wb') as f:
    
    pickle.dump(ratings, f)
    
with open('actors_six_18012020.pkl', 'wb') as f:
    
    pickle.dump(actors_list, f)
    
with open('director_six_18012020.pkl', 'wb') as f:
    
    pickle.dump(director_names, f)
    
# Indexes to remove: [45, 56, 72, 74, 169, 313, 532, 585, 689, 805, 838, 977, 1041, 1042, 1043, 1157, 1158, 1159, 1160, 1161, 
# 1164, 1165, 1168, 1170, 1171, 1214, 1228, 1229, 1258, 1442, 1445, 1491, 1670, 1671, 1678, 1706, 1995, 2078, 2217, 2622, 2783,
# 3248, 3349, 3522, 3523, 3784, 3846, 3851, 3853, 3863, 3871, 3890, 3900, 3908, 3912, 3913, 3918, 3919, 3923, 3929, 3932, 4018,
# 4027, 4038, 4062, 4122, 4136, 4142, 4145, 4153, 4165, 4166, 4188, 4192, 4193, 4206, 4241, 4248, 4256, 4352, 4354, 4355, 4356, 
#4437, 4607, 4733, 4808, 4809, 4852, 4883]

# Indexes to remove: [1987, 2551, 3482, 3981, 4887] (not rated)

In [None]:
synopsis_six = []

with open('D:\\GitHub-Thesis\\synopsis_url\\synopsis_six_13012020.pkl', 'rb') as f:
    
    synopsis_six = pickle.load(f)

print("Number of URLs: {}".format(len(synopsis_six)))

synopsis_souplist_six = []

for i in tqdm_notebook(synopsis_six):
    
    synopsis_souplist_six.append(BeautifulSoup(i.text))

print("Number of souplists: {}".format(len(synopsis_souplist_six)))

In [None]:
#------------------------------------------------------------------------------------------------

# Field 5: Extract Plot Synopsis

synopsis_step_one = []
synopsis_step_two = []
synopsis_step_three = []

[synopsis_step_one.append(i.find_all('ul', {'class':'ipl-zebra-list', 'id':'plot-synopsis-content'})) for i in tqdm_notebook(synopsis_souplist_six)]

[[synopsis_step_two.append(j.find_all('li', {'class':'ipl-zebra-list__item'})) for j in i] for i in synopsis_step_one]

[[synopsis_step_three.append(j.text.strip(' ').replace('\n', '').replace('\\', '')) for j in i] for i in synopsis_step_two]

index_to_remove_no_synopsis = [i for i,x in enumerate(synopsis_step_one) if not x]

print("Length of Synopsis list: {}".format(len(synopsis_step_three)))
print("Length of the list with Movies that don't have synopsis: {}".format(len(index_to_remove_no_synopsis)))
if len(index_to_remove_no_synopsis) == 0:
    print("None of the movies miss a synopsis")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_synopsis))

In [None]:
# Pickle the synopsis file for further use!

with open('synopsis_six_18012020.pkl', 'wb') as f:
    
    pickle.dump(synopsis_step_three, f)

# Indexes to remove: 0

In [None]:
review_six = []

with open('D:\\GitHub-Thesis\\reviews_url\\review_six_15012020.pkl', 'rb') as f:
    
    review_six = pickle.load(f)

print("Number of URLs: {}".format(len(review_six)))

review_souplist_six = []

for i in tqdm_notebook(review_six):
    
    review_souplist_six.append(BeautifulSoup(i.text))

print("Number of review tags: {}".format(len(review_souplist_six)))

In [None]:
#------------------------------------------------------------------------------------------------

# Field 6: Extract Movie Reviews

myfield_review_step_one = []
myfield_review_step_two = []
myfield_review_step_three = []

[myfield_review_step_one.append(i.find_all('div', {'class':'lister-list'})) for i in tqdm_notebook(review_souplist_six)]

[[myfield_review_step_two.append(j.find_all('div', {'class':'text show-more__control'})) for j in i] for i in myfield_review_step_one]

[myfield_review_step_three.append(list(map(lambda x: x.text, reviews))) for reviews in myfield_review_step_two]

index_to_remove_no_review = [i for i,x in enumerate(myfield_review_step_one) if not x]

print("Length of Reviews list: {}".format(len(myfield_review_step_three)))
print("Length of the list with Movies that don't have reviews: {}".format(len(index_to_remove_no_review)))
if len(index_to_remove_no_review) == 0:
    print("None of the movies miss reviews")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_review))

In [None]:
# Pickle the reviews file for further use!

with open('reviews_six_18012020.pkl', 'wb') as f:
    
    pickle.dump(myfield_review_step_three, f)
    
# Indexes to remove: 0

### - - - - - - - - -  - - - - - - - - -  - - - - - - - - -  - - - - - - - - -  - - - - - - - - -  - - - - - - - - - 

#### Dataframe creation based on the movie content extracted

In [None]:
# STEP 1

with open('D:\\GitHub-Thesis\\movie_content_url\\data_six_13012020.pkl', 'rb') as f:
    
    content_six = pickle.load(f)

print("Number of URLs: {}".format(len(content_six)))

with open('D:\\GitHub-Thesis\\reviews_url\\review_six_15012020.pkl', 'rb') as f:
    
    review_six = pickle.load(f)

print("Number of URLs: {}".format(len(review_six)))

In [None]:
# STEP 2

with open('D:\\GitHub-Thesis\\58,000 movies\\movie_six\\pre-indexed files\\plot_six_18012020.pkl', 'rb') as f:
    
    plot = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_six\\pre-indexed files\\rating_six_18012020.pkl', 'rb') as f:
    
    rating = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_six\\pre-indexed files\\actors_six_18012020.pkl', 'rb') as f:
    
    actors = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_six\\pre-indexed files\\director_six_18012020.pkl', 'rb') as f:
    
    director = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_six\\synopsis_six_18012020.pkl', 'rb') as f:
    
    synopsis = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_six\\reviews_six_18012020.pkl', 'rb') as f:
    
    reviews = pickle.load(f)
    
print(len(plot))
print(len(rating))
print(len(actors))
print(len(director))
print(len(synopsis))
print(len(reviews))

In [None]:
# STEP 3

matching_add_plot = [s for s, x in enumerate(plot) if "Add a Plot" in x]

print(matching_add_plot)

matching_add_synopsis = [s for s, x in enumerate(synopsis) if 'It looks like we don\'t have a Synopsis for this title yet. Be the first to contribute! Just click the "Edit page" button at the bottom of the page or learn more in the Synopsis submission guide.' in x]

print(matching_add_synopsis)

print(len(matching_add_synopsis))

In [None]:
# STEP 4

index_remove = [45, 56, 72, 74, 169, 313, 532, 585, 689, 805, 838, 977, 1041, 1042, 1043, 1157, 1158, 1159, 1160, 1161, 
                1164, 1165, 1168, 1170, 1171, 1214, 1228, 1229, 1258, 1442, 1445, 1491, 1670, 1671, 1678, 1706, 1995, 2078, 
                2217, 2622, 2783,
                3248, 3349, 3522, 3523, 3784, 3846, 3851, 3853, 3863, 3871, 3890, 3900, 3908, 3912, 3913, 3918, 3919, 3923, 
                3929, 3932, 4018,
                4027, 4038, 4062, 4122, 4136, 4142, 4145, 4153, 4165, 4166, 4188, 4192, 4193, 4206, 4241, 4248, 4256, 4352, 
                4354, 4355, 4356, 
                4437, 4607, 4733, 4808, 4809, 4852, 4883,
                1987, 2551, 3482, 3981, 4887,
                68, 176, 177, 178, 179, 180, 299, 309, 391, 509, 528, 531, 585, 658, 815, 977, 1001, 1088, 1099, 1109, 1156, 
                1178, 1180, 1185, 1190, 1195, 1197, 1198, 1210, 1229, 1396, 1438, 1451, 1629, 1676, 1698, 1706, 1944, 1964, 
                2002, 2021, 2043, 2618, 2743, 3482, 3800, 3819, 3838, 3896, 4038, 4099, 4147, 4179, 4356, 4446, 4496, 4575, 
                4576, 4592, 4606, 4829, 4846, 4855, 4887, 4894, 4952, 4986, 4990]

index_remove = list(dict.fromkeys(index_remove))

print(len(index_remove))

content_index_six = [i for j, i in enumerate(content_six) if j not in index_remove]

review_index_six = [i for j, i in enumerate(review_six) if j not in index_remove]

print(len(content_index_six))

print(len(review_index_six))

# The length of this content_one (4845) should now be extracted from souplist!

In [None]:
# STEP 5

# Pickle the synopsis file for further use!

with open('D:\\GitHub-Thesis\\58,000 movies\\movie_six\\content_index_six_20012020.pkl', 'wb') as f:
    
    pickle.dump(content_index_six, f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_six\\review_index_six_20012020.pkl', 'wb') as f:
    
    pickle.dump(review_index_six, f)

In [None]:
# STEP 6

with open('D:\\GitHub-Thesis\\58,000 movies\\movie_six\\content_index_six_20012020.pkl', 'rb') as f:
    
    content_index_six = pickle.load(f)

print("Number of URLs: {}".format(len(content_index_six)))

content_souplist_six = []

for i in tqdm_notebook(content_index_six):
    
    content_souplist_six.append(BeautifulSoup(i.text))

print("Number of souplists: {}".format(len(content_souplist_six)))

In [None]:
# STEP 7

# Field 1: Extract plot summary

# souplist_three = souplist_three[0:113] 

myfield_plot = []
plot_summary = []
index_to_remove_no_plot = []

[myfield_plot.append(i.find_all('div', {'class':'plot_summary'})) for i in tqdm_notebook(content_souplist_six)]

[[[plot_summary.append(y.text) for y in x.find_all('div', {'class':'summary_text'})] for x in i] if len(i) !=0 else index_to_remove_no_plot.append(myfield_plot.index(i)) for i in myfield_plot]
      
print("Length of Plot Summary list: {}".format(len(plot_summary)))
print("Length of the list with Movies that don't have plot summary: {}".format(len(index_to_remove_no_plot)))
if len(index_to_remove_no_plot) == 0:
    print("None of the movie miss plot")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_plot))

#------------------------------------------------------------------------------------------------

# Field 2: Extract IMDB rating

# myfield_rating = []
# ratings = []
# index_to_remove_no_rating = []

# [myfield_rating.append(i.find_all('div', {'class':'ratingValue'})) for i in tqdm_notebook(content_souplist_three)]

# [[[ratings.append(y.text) for y in x.find_all('span', {'itemprop':'ratingValue'})] for x in i] for i in myfield_rating]

# index_to_remove_no_rating = [i for i,x in enumerate(myfield_rating) if not x]

# print("Length of Ratings list: {}".format(len(ratings)))
# print("Length of the list with Movies that are not rated: {}".format(len(index_to_remove_no_rating)))
# if len(index_to_remove_no_rating) == 0:
#     print("None of the movie miss ratings")
# else:
#     print("Indexes to remove: {}".format(index_to_remove_no_rating))

#------------------------------------------------------------------------------------------------

# Field 3: Extract Actors

myfield_cast = []
ratings = []
phase_two = []
phase_three = []
actors_list = []
index_to_remove_no_actors = []

[myfield_cast.append(i.find_all('table', {'class':'cast_list'})) for i in tqdm_notebook(content_souplist_six)]
    
r_one = re.compile(".*name")

[[phase_two.append(j.find_all('a', {'href':r_one})) for j in i] for i in myfield_cast]

[phase_three.append(phase_two[i][1::2]) for i in range(len(phase_two))]

[actors_list.append(list(map(lambda x: x.text.strip(' ').replace('\n', ''), actors))) for actors in phase_three]            
    
index_to_remove_no_actors = [i for i,x in enumerate(myfield_cast) if not x]
        
print("Length of Actors list: {}".format(len(actors_list)))
print("Length of the list with Movies that don't have actors: {}".format(len(index_to_remove_no_actors)))
if len(index_to_remove_no_actors) == 0:
    print("None of the movie miss actors")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_actors))

#------------------------------------------------------------------------------------------------

# Field 4: Extract Director Name

myfield_director = []
director_name = []
index_to_remove_no_directors = []

[myfield_director.append(i.find_all('div', {'class':'plot_summary'})) for i in tqdm_notebook(content_souplist_six)]

r_name = re.compile(".*name")

[[director_name.append(j.find_all('a', {'href':r_name})) for j in i] for i in myfield_director]
    
director_names = [item[0].text for item in director_name]

index_to_remove_no_directors = [i for i,x in enumerate(myfield_director) if not x]

print("Length of Directors list: {}".format(len(director_names)))
print("Length of the list with Movies that don't have directors: {}".format(len(index_to_remove_no_directors)))
if len(index_to_remove_no_directors) == 0:
    print("None of the movie miss directors")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_directors))

In [None]:
#------------------------------------------------------------------------------------------------

myfield_rating = []
ratings = []
index_to_remove_no_rating = []

[myfield_rating.append(i.find_all('div', {'class':'ratingValue'})) for i in tqdm_notebook(content_souplist_six)]

[[[ratings.append(y.text) for y in x.find_all('span', {'itemprop':'ratingValue'})] for x in i] for i in myfield_rating]

index_to_remove_no_rating = [i for i,x in enumerate(myfield_rating) if not x]

print("Length of Ratings list: {}".format(len(ratings)))
print("Length of the list with Movies that are not rated: {}".format(len(index_to_remove_no_rating)))
if len(index_to_remove_no_rating) == 0:
    print("None of the movie miss ratings")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_rating))

In [None]:
# STEP 8

# Pickle the requests file for further use!

with open('D:\\GitHub-Thesis\\58,000 movies\\movie_six\\plot_six_20012020.pkl', 'wb') as f:
    
    pickle.dump(plot_summary, f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_six\\actors_six_20012020.pkl', 'wb') as f:
    
    pickle.dump(actors_list, f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_six\\director_six_20012020.pkl', 'wb') as f:
    
    pickle.dump(director_names, f)

In [None]:
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_six\\rating_six_20012020.pkl', 'wb') as f:
    
    pickle.dump(ratings, f)

In [None]:
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_six\\review_index_six_20012020.pkl', 'rb') as f:
    
    review_six = pickle.load(f)

print("Number of URLs: {}".format(len(review_six)))

review_souplist_six = []

for i in tqdm_notebook(review_six):
    
    review_souplist_six.append(BeautifulSoup(i.text))

print("Number of review tags: {}".format(len(review_souplist_six)))

In [None]:
# STEP 9

# Field 6: Extract Movie Reviews

myfield_review_step_one = []
myfield_review_step_two = []
myfield_review_step_three = []

[myfield_review_step_one.append(i.find_all('div', {'class':'lister-list'})) for i in tqdm_notebook(review_souplist_six)]

[[myfield_review_step_two.append(j.find_all('div', {'class':'text show-more__control'})) for j in i] for i in myfield_review_step_one]

[myfield_review_step_three.append(list(map(lambda x: x.text, reviews))) for reviews in myfield_review_step_two]

index_to_remove_no_review = [i for i,x in enumerate(myfield_review_step_one) if not x]

print("Length of Reviews list: {}".format(len(myfield_review_step_three)))
print("Length of the list with Movies that don't have reviews: {}".format(len(index_to_remove_no_review)))
if len(index_to_remove_no_review) == 0:
    print("None of the movies miss reviews")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_review))

print(len([i for i,x in enumerate(myfield_review_step_one) if not x]))

print([i for i,x in enumerate(myfield_review_step_two) if not x])

print([i for i,x in enumerate(myfield_review_step_three) if not x])

In [None]:
data_six = pd.read_pickle("D:\\GitHub-Thesis\\58,000 movies\\movie_six\\dataset_six_final_20012020.pkl")

print(data_six.shape)

print(len(myfield_review_step_three))

print(len([i for i,x in enumerate(myfield_review_step_three) if not x]))

with open('D:\\GitHub-Thesis\\58,000 movies\\movie_six\\reviews_six_24012020.pkl', 'wb') as f:
    
    pickle.dump(myfield_review_step_three, f)
    
index_remove = [2162, 2170, 2222, 2243, 2253, 2264, 2309, 2316, 2330, 2332, 2374, 2381, 2390, 2398, 2457, 2462, 2473, 2481, 2655, 2678, 2719, 2776, 2926, 2937, 2969, 2970, 2974, 2979, 2993, 3019, 3028, 3072, 3080, 3126, 3201, 3296, 3318, 3334, 3345, 3360, 3361, 3398, 3431, 3450, 3483, 3509, 3512, 3527, 3568, 3572, 3577, 3590, 3609, 3682, 3693, 3695, 3702, 3719, 3726, 3736, 3764, 3765, 3768, 3791, 3802, 3829, 3845, 3895, 3912, 3932, 3939, 3951, 3978, 3983, 3991, 4009, 4047, 4091, 4114, 4125, 4146, 4151, 4156, 4247, 4253, 4255, 4274, 4278, 4304, 4306, 4313, 4315, 4320, 4372, 4381, 4387, 4393, 4412, 4431, 4441, 4449, 4486, 4503, 4570, 4579, 4587, 4597, 4613, 4621, 4638]

myfield_review_step_three = [i for j, i in enumerate(myfield_review_step_three) if j not in index_remove]

print(len(myfield_review_step_three))

data_four.reviews = myfield_review_step_three

print(data_four.head())

dataset_six = dataset_six[dataset_six.astype(str)['reviews'] != '[]']

dataset_six.shape

In [None]:
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_six\\reviews_six_24012020.pkl', 'wb') as f:
    
    pickle.dump(myfield_review_step_three, f)

In [None]:
index_remove = [2162, 2170, 2222, 2243, 2253, 2264, 2309, 2316, 2330, 2332, 2374, 2381, 2390, 2398, 2457, 2462, 2473, 2481, 2655, 2678, 2719, 2776, 2926, 2937, 2969, 2970, 2974, 2979, 2993, 3019, 3028, 3072, 3080, 3126, 3201, 3296, 3318, 3334, 3345, 3360, 3361, 3398, 3431, 3450, 3483, 3509, 3512, 3527, 3568, 3572, 3577, 3590, 3609, 3682, 3693, 3695, 3702, 3719, 3726, 3736, 3764, 3765, 3768, 3791, 3802, 3829, 3845, 3895, 3912, 3932, 3939, 3951, 3978, 3983, 3991, 4009, 4047, 4091, 4114, 4125, 4146, 4151, 4156, 4247, 4253, 4255, 4274, 4278, 4304, 4306, 4313, 4315, 4320, 4372, 4381, 4387, 4393, 4412, 4431, 4441, 4449, 4486, 4503, 4570, 4579, 4587, 4597, 4613, 4621, 4638]

myfield_review_step_three = [i for j, i in enumerate(myfield_review_step_three) if j not in index_remove]

print(len(myfield_review_step_three))

data_four.reviews = myfield_review_step_three

print(data_four.head())

In [None]:
# STEP 10

index_remove = [45, 56, 72, 74, 169, 313, 532, 585, 689, 805, 838, 977, 1041, 1042, 1043, 1157, 1158, 1159, 1160, 1161, 
                1164, 1165, 1168, 1170, 1171, 1214, 1228, 1229, 1258, 1442, 1445, 1491, 1670, 1671, 1678, 1706, 1995, 2078, 
                2217, 2622, 2783,
                3248, 3349, 3522, 3523, 3784, 3846, 3851, 3853, 3863, 3871, 3890, 3900, 3908, 3912, 3913, 3918, 3919, 3923, 
                3929, 3932, 4018,
                4027, 4038, 4062, 4122, 4136, 4142, 4145, 4153, 4165, 4166, 4188, 4192, 4193, 4206, 4241, 4248, 4256, 4352, 
                4354, 4355, 4356, 
                4437, 4607, 4733, 4808, 4809, 4852, 4883,
                1987, 2551, 3482, 3981, 4887,
                68, 176, 177, 178, 179, 180, 299, 309, 391, 509, 528, 531, 585, 658, 815, 977, 1001, 1088, 1099, 1109, 1156, 
                1178, 1180, 1185, 1190, 1195, 1197, 1198, 1210, 1229, 1396, 1438, 1451, 1629, 1676, 1698, 1706, 1944, 1964, 
                2002, 2021, 2043, 2618, 2743, 3482, 3800, 3819, 3838, 3896, 4038, 4099, 4147, 4179, 4356, 4446, 4496, 4575, 
                4576, 4592, 4606, 4829, 4846, 4855, 4887, 4894, 4952, 4986, 4990]

index_remove = list(dict.fromkeys(index_remove))

dataset = pd.read_pickle("D:\\GitHub-Thesis\\dataset_58,000_14012020_latest_version.pkl")

dataset_six = dataset.iloc[25000:30000].reset_index(drop=True)

dataset_six = dataset_six[~dataset_six.index.isin(index_remove)]

dataset_six.shape

In [29]:
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_six\\plot_six_20012020.pkl', 'rb') as f:
    
    plot = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_six\\rating_six_20012020.pkl', 'rb') as f:
    
    rating = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_six\\actors_six_20012020.pkl', 'rb') as f:
    
    actors = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_six\\director_six_20012020.pkl', 'rb') as f:
    
    director = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_six\\reviews_six_24012020.pkl', 'rb') as f:
    
    reviews = pickle.load(f)
    
assert len(plot) == len(rating) == len(actors) == len(director) == len(reviews)

In [30]:
[i for i,x in enumerate(rating) if not x]

[]

In [None]:
dataset_six['actors'] = actors

dataset_six['plot'] = plot

dataset_six['imdb_rating'] = rating

dataset_six['director'] = director

dataset_six['reviews'] = reviews

dataset_six = dataset_six.drop(["movieId", "imdbId", "synopsis_url"], axis=1)

dataset_six.iloc[2005]

In [None]:
index_remove = [8, 29, 91, 92, 132, 143, 171, 186, 249, 286, 299, 300, 321, 329, 337, 406, 435, 448, 457, 459, 491, 506, 511, 513, 562, 631, 639, 649, 650, 669, 672, 693, 697, 783, 787, 788, 793, 800, 808, 833, 861, 862, 870, 907, 933, 976, 984, 992, 995, 996, 999, 1033, 1038, 1103, 1135, 1146, 1148, 1150, 1151, 1155, 1159, 1161, 1168, 1191, 1221, 1258, 1261, 1317, 1333, 1335, 1347, 1379, 1493, 1536, 1538, 1540, 1543, 1566, 1570, 1586, 1606, 1608, 1617, 1621, 1623, 1631, 1648, 1666, 1674, 1780, 1844, 1867, 1885, 1914, 1956, 1972, 1996, 1998, 2026, 2047, 2073, 2080, 2101, 2125, 2134, 2157, 2317, 2328, 2330, 2366, 2492, 2535, 2536, 2572, 2677, 2814, 2836, 2865, 2867, 2928, 2936, 3082, 3147, 3154, 3217, 3229, 3310, 3364, 3395, 3647, 3691, 3695, 3697, 3707, 3714, 3718, 3746, 3751, 3758, 3769, 3792, 3794, 3796, 3797, 3804, 3817, 3818, 3827, 3829, 3837, 3851, 3852, 3861, 3924, 3931, 3944, 3956, 3957, 3972, 3979, 4007, 4010, 4029, 4031, 4039, 4040, 4041, 4057, 4078, 4083, 4117, 4146, 4151, 4165, 4189, 4214, 4215, 4221, 4234, 4299, 4309, 4323, 4357, 4404, 4445, 4448, 4451, 4468, 4472, 4486, 4489, 4523, 4525, 4527, 4529, 4533, 4546, 4552, 4577, 4579, 4597, 4603, 4625, 4644, 4698, 4716, 4768, 4798, 4800, 4836]

dataset_six = dataset_six[~dataset_six.index.isin(index_remove)]

dataset_six.shape

In [None]:
dataset_six = dataset_six[dataset_six.astype(str)['reviews'] != '[]']

dataset_six.shape

In [None]:
# dataset_six.to_pickle("D:\\GitHub-Thesis\\58,000 movies\\movie_six\\dataset_six_final_20012020.pkl") old

# dataset_six.to_pickle("D:\\GitHub-Thesis\\58,000 movies\\movie_six\\dataset_six_final_24012020.pkl") old

dataset_six.to_pickle("D:\\GitHub-Thesis\\58,000 movies\\movie_six\\dataset_six_final_25012020.pkl")

#### 7) 35000 movies

In [None]:
content_seven = []

with open('D:\\GitHub-Thesis\\movie_content_url\\data_seven_13012020.pkl', 'rb') as f:
    
    content_seven = pickle.load(f)

print("Number of URLs: {}".format(len(content_seven)))

content_souplist_seven = []

for i in tqdm_notebook(content_seven):
    
    content_souplist_seven.append(BeautifulSoup(i.text))

print("Number of souplists: {}".format(len(content_souplist_seven)))

In [None]:
# Field 1: Extract plot summary

# souplist_three = souplist_three[0:113] 

myfield_plot = []
plot_summary = []
index_to_remove_no_plot = []

[myfield_plot.append(i.find_all('div', {'class':'plot_summary'})) for i in tqdm_notebook(content_souplist_seven)]

[[[plot_summary.append(y.text) for y in x.find_all('div', {'class':'summary_text'})] for x in i] if len(i) !=0 else index_to_remove_no_plot.append(myfield_plot.index(i)) for i in myfield_plot]
      
print("Length of Plot Summary list: {}".format(len(plot_summary)))
print("Length of the list with Movies that don't have plot summary: {}".format(len(index_to_remove_no_plot)))
if len(index_to_remove_no_plot) == 0:
    print("None of the movie miss plot")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_plot))

#------------------------------------------------------------------------------------------------

# Field 2: Extract IMDB rating

myfield_rating = []
ratings = []
index_to_remove_no_rating = []

[myfield_rating.append(i.find_all('div', {'class':'ratingValue'})) for i in tqdm_notebook(content_souplist_seven)]

[[[ratings.append(y.text) for y in x.find_all('span', {'itemprop':'ratingValue'})] for x in i] for i in myfield_rating]

index_to_remove_no_rating = [i for i,x in enumerate(myfield_rating) if not x]

print("Length of Ratings list: {}".format(len(ratings)))
print("Length of the list with Movies that are not rated: {}".format(len(index_to_remove_no_rating)))
if len(index_to_remove_no_rating) == 0:
    print("None of the movie miss ratings")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_rating))

#------------------------------------------------------------------------------------------------

# Field 3: Extract Actors

myfield_cast = []
ratings = []
phase_two = []
phase_three = []
actors_list = []
index_to_remove_no_actors = []

[myfield_cast.append(i.find_all('table', {'class':'cast_list'})) for i in tqdm_notebook(content_souplist_seven)]
    
r_one = re.compile(".*name")

[[phase_two.append(j.find_all('a', {'href':r_one})) for j in i] for i in myfield_cast]

[phase_three.append(phase_two[i][1::2]) for i in range(len(phase_two))]

[actors_list.append(list(map(lambda x: x.text.strip(' ').replace('\n', ''), actors))) for actors in phase_three]            
    
index_to_remove_no_actors = [i for i,x in enumerate(myfield_cast) if not x]
        
print("Length of Actors list: {}".format(len(actors_list)))
print("Length of the list with Movies that don't have actors: {}".format(len(index_to_remove_no_actors)))
if len(index_to_remove_no_actors) == 0:
    print("None of the movie miss actors")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_actors))

#------------------------------------------------------------------------------------------------

# Field 4: Extract Director Name

myfield_director = []
director_name = []
index_to_remove_no_directors = []

[myfield_director.append(i.find_all('div', {'class':'plot_summary'})) for i in tqdm_notebook(content_souplist_seven)]

r_name = re.compile(".*name")

[[director_name.append(j.find_all('a', {'href':r_name})) for j in i] for i in myfield_director]
    
director_names = [item[0].text for item in director_name if len(item)!=0]

index_to_remove_no_directors = [i for i,x in enumerate(myfield_director) if not x]

print("Length of Directors list: {}".format(len(director_names)))
print("Length of the list with Movies that don't have directors: {}".format(len(index_to_remove_no_directors)))
if len(index_to_remove_no_directors) == 0:
    print("None of the movie miss directors")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_directors))

In [None]:
[i for i,x in enumerate(director_name) if not x]

In [None]:
#------------------------------------------------------------------------------------------------

# Pickle the requests file for further use!

with open('plot_seven_18012020.pkl', 'wb') as f:
    
    pickle.dump(plot_summary, f)
    
with open('rating_seven_18012020.pkl', 'wb') as f:
    
    pickle.dump(ratings, f)
    
with open('actors_seven_18012020.pkl', 'wb') as f:
    
    pickle.dump(actors_list, f)
    
with open('director_seven_18012020.pkl', 'wb') as f:
    
    pickle.dump(director_names, f)
    
# Indexes to remove: [44, 79, 170, 176, 233, 237, 315, 328, 347, 638, 831, 855, 870, 938, 952, 1115, 1125, 1176, 1600, 1611, 
# 1619, 1620, 1705, 1868, 1940, 2183, 2245, 2277, 2322, 2337, 2382, 2402, 2420, 2429, 2525, 2534, 2569, 2581, 2714, 2957, 
# 3332, 3622, 3898, 4058, 4063, 4107, 4251, 4267, 4309, 4334, 4489, 4524, 4673, 4675, 4676, 4677, 4681, 4690, 4699, 4703, 
# 4711, 4880] (no actors)

# Indexes to remove: [1687, 2402, 2405, 2527, 2528, 2561, 2569, 3825, 4524] (not rated)

# Indexes to remove: [938, 2402, 2569] (no directors)

In [None]:
synopsis_seven = []

with open('D:\\GitHub-Thesis\\synopsis_url\\synopsis_seven_13012020.pkl', 'rb') as f:
    
    synopsis_seven = pickle.load(f)

print("Number of URLs: {}".format(len(synopsis_seven)))

synopsis_souplist_seven = []

for i in tqdm_notebook(synopsis_seven):
    
    synopsis_souplist_seven.append(BeautifulSoup(i.text))

print("Number of souplists: {}".format(len(synopsis_souplist_seven)))

In [None]:
#------------------------------------------------------------------------------------------------

# Field 5: Extract Plot Synopsis

synopsis_step_one = []
synopsis_step_two = []
synopsis_step_three = []

[synopsis_step_one.append(i.find_all('ul', {'class':'ipl-zebra-list', 'id':'plot-synopsis-content'})) for i in tqdm_notebook(synopsis_souplist_seven)]

[[synopsis_step_two.append(j.find_all('li', {'class':'ipl-zebra-list__item'})) for j in i] for i in synopsis_step_one]

[[synopsis_step_three.append(j.text.strip(' ').replace('\n', '').replace('\\', '')) for j in i] for i in synopsis_step_two]

index_to_remove_no_synopsis = [i for i,x in enumerate(synopsis_step_one) if not x]

print("Length of Synopsis list: {}".format(len(synopsis_step_three)))
print("Length of the list with Movies that don't have synopsis: {}".format(len(index_to_remove_no_synopsis)))
if len(index_to_remove_no_synopsis) == 0:
    print("None of the movies miss a synopsis")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_synopsis))

In [None]:
# Pickle the synopsis file for further use!

with open('synopsis_seven_18012020.pkl', 'wb') as f:
    
    pickle.dump(synopsis_step_three, f)

# Indexes to remove: [2402, 2569]

In [None]:
review_seven = []

with open('D:\\GitHub-Thesis\\reviews_url\\review_seven_15012020.pkl', 'rb') as f:
    
    review_seven = pickle.load(f)

print("Number of URLs: {}".format(len(review_seven)))

review_souplist_seven = []

for i in tqdm_notebook(review_seven):
    
    review_souplist_seven.append(BeautifulSoup(i.text))

print("Number of review tags: {}".format(len(review_souplist_seven)))

In [None]:
#------------------------------------------------------------------------------------------------

# Field 6: Extract Movie Reviews

myfield_review_step_one = []
myfield_review_step_two = []
myfield_review_step_three = []

[myfield_review_step_one.append(i.find_all('div', {'class':'lister-list'})) for i in tqdm_notebook(review_souplist_seven)]

[[myfield_review_step_two.append(j.find_all('div', {'class':'text show-more__control'})) for j in i] for i in myfield_review_step_one]

[myfield_review_step_three.append(list(map(lambda x: x.text, reviews))) for reviews in myfield_review_step_two]

index_to_remove_no_review = [i for i,x in enumerate(myfield_review_step_one) if not x]

print("Length of Reviews list: {}".format(len(myfield_review_step_three)))
print("Length of the list with Movies that don't have reviews: {}".format(len(index_to_remove_no_review)))
if len(index_to_remove_no_review) == 0:
    print("None of the movies miss reviews")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_review))

In [None]:
# Pickle the reviews file for further use!

with open('reviews_seven_18012020.pkl', 'wb') as f:
    
    pickle.dump(myfield_review_step_three, f)
    
# Indexes to remove: [2402, 2569]

### - - - - - - - - -  - - - - - - - - -  - - - - - - - - -  - - - - - - - - -  - - - - - - - - -  - - - - - - - - - 

#### Dataframe creation based on the movie content extracted

In [None]:
# STEP 1

with open('D:\\GitHub-Thesis\\movie_content_url\\data_seven_13012020.pkl', 'rb') as f:
    
    content_seven = pickle.load(f)

print("Number of URLs: {}".format(len(content_seven)))

with open('D:\\GitHub-Thesis\\reviews_url\\review_seven_15012020.pkl', 'rb') as f:
    
    review_seven = pickle.load(f)

print("Number of URLs: {}".format(len(review_seven)))

In [None]:
# STEP 2

with open('D:\\GitHub-Thesis\\58,000 movies\\movie_seven\\pre-indexed files\\plot_seven_18012020.pkl', 'rb') as f:
    
    plot = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_seven\\pre-indexed files\\rating_seven_18012020.pkl', 'rb') as f:
    
    rating = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_seven\\pre-indexed files\\actors_seven_18012020.pkl', 'rb') as f:
    
    actors = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_seven\\pre-indexed files\\director_seven_18012020.pkl', 'rb') as f:
    
    director = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_seven\\synopsis_seven_18012020.pkl', 'rb') as f:
    
    synopsis = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_seven\\reviews_seven_18012020.pkl', 'rb') as f:
    
    reviews = pickle.load(f)
    
print(len(plot))
print(len(rating))
print(len(actors))
print(len(director))
print(len(synopsis))
print(len(reviews))

In [None]:
# STEP 3

matching_add_plot = [s for s, x in enumerate(plot) if "Add a Plot" in x]

print(matching_add_plot)

matching_add_synopsis = [s for s, x in enumerate(synopsis) if 'It looks like we don\'t have a Synopsis for this title yet. Be the first to contribute! Just click the "Edit page" button at the bottom of the page or learn more in the Synopsis submission guide.' in x]

print(matching_add_synopsis)

print(len(matching_add_synopsis))

In [None]:
# STEP 4

index_remove = [44, 79, 170, 176, 233, 237, 315, 328, 347, 638, 831, 855, 870, 938, 952, 1115, 1125, 1176, 1600, 1611, 
                1619, 1620, 1705, 1868, 1940, 2183, 2245, 2277, 2322, 2337, 2382, 2402, 2420, 2429, 2525, 2534, 2569, 2581, 
                2714, 2957, 
                3332, 3622, 3898, 4058, 4063, 4107, 4251, 4267, 4309, 4334, 4489, 4524, 4673, 4675, 4676, 4677, 4681, 4690, 
                4699, 4703, 4711, 4880, 1687, 2402, 2405, 2527, 2528, 2561, 2569, 3825, 4524, 938, 2402, 2569,
                36, 52, 88, 93, 107, 149, 174, 227, 323, 395, 589, 621, 623, 631, 635, 636, 640, 641, 652, 655, 664, 670, 
                846, 847, 966, 967, 1076, 1079, 1102, 1122, 1238, 1249, 1256, 1266, 1310, 1408, 1526, 1527, 1571, 1576, 1687, 
                1951, 1952, 1972, 2043, 2115, 2121, 2156, 2183, 2198, 2250, 2256, 2325, 2341, 2350, 2351, 2356, 2377, 2386, 
                2389, 2494, 2502, 2539, 2582, 2708, 2743, 2799, 2817, 2829, 2870, 2889, 2892, 2894, 3068, 3069, 3070, 3084, 
                3089, 3123, 3130, 3323, 3325, 3380, 3438, 3581, 3723, 3732, 3733, 3741, 3748, 3751, 3756, 3758, 3760, 3761, 
                3768, 3772, 3776, 3790, 3791, 3807, 3808, 3823, 3899, 3976, 4023, 4037, 4061, 4089, 4090, 4092, 4098, 4147, 
                4286, 4335, 4438, 4439, 4472, 4499, 4667, 4676, 4680, 4731, 4801, 4855, 4865, 4866, 4890, 4902]

index_remove = list(dict.fromkeys(index_remove))

print(len(index_remove))

content_index_seven = [i for j, i in enumerate(content_seven) if j not in index_remove]

review_index_seven = [i for j, i in enumerate(review_seven) if j not in index_remove]

print(len(content_index_seven))

print(len(review_index_seven))

# The length of this content_one (4806) should now be extracted from souplist!

In [None]:
# STEP 5

# Pickle the synopsis file for further use!

with open('D:\\GitHub-Thesis\\58,000 movies\\movie_seven\\content_index_seven_20012020.pkl', 'wb') as f:
    
    pickle.dump(content_index_seven, f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_seven\\review_index_seven_20012020.pkl', 'wb') as f:
    
    pickle.dump(review_index_seven, f)

In [None]:
# STEP 6

with open('D:\\GitHub-Thesis\\58,000 movies\\movie_seven\\content_index_seven_20012020.pkl', 'rb') as f:
    
    content_index_seven = pickle.load(f)

print("Number of URLs: {}".format(len(content_index_seven)))

content_souplist_seven = []

for i in tqdm_notebook(content_index_seven):
    
    content_souplist_seven.append(BeautifulSoup(i.text))

print("Number of souplists: {}".format(len(content_souplist_seven)))

In [None]:
# STEP 7

# Field 1: Extract plot summary

# souplist_three = souplist_three[0:113] 

myfield_plot = []
plot_summary = []
index_to_remove_no_plot = []

[myfield_plot.append(i.find_all('div', {'class':'plot_summary'})) for i in tqdm_notebook(content_souplist_seven)]

[[[plot_summary.append(y.text) for y in x.find_all('div', {'class':'summary_text'})] for x in i] if len(i) !=0 else index_to_remove_no_plot.append(myfield_plot.index(i)) for i in myfield_plot]
      
print("Length of Plot Summary list: {}".format(len(plot_summary)))
print("Length of the list with Movies that don't have plot summary: {}".format(len(index_to_remove_no_plot)))
if len(index_to_remove_no_plot) == 0:
    print("None of the movie miss plot")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_plot))

#------------------------------------------------------------------------------------------------

# Field 2: Extract IMDB rating

# myfield_rating = []
# ratings = []
# index_to_remove_no_rating = []

# [myfield_rating.append(i.find_all('div', {'class':'ratingValue'})) for i in tqdm_notebook(content_souplist_three)]

# [[[ratings.append(y.text) for y in x.find_all('span', {'itemprop':'ratingValue'})] for x in i] for i in myfield_rating]

# index_to_remove_no_rating = [i for i,x in enumerate(myfield_rating) if not x]

# print("Length of Ratings list: {}".format(len(ratings)))
# print("Length of the list with Movies that are not rated: {}".format(len(index_to_remove_no_rating)))
# if len(index_to_remove_no_rating) == 0:
#     print("None of the movie miss ratings")
# else:
#     print("Indexes to remove: {}".format(index_to_remove_no_rating))

#------------------------------------------------------------------------------------------------

# Field 3: Extract Actors

myfield_cast = []
ratings = []
phase_two = []
phase_three = []
actors_list = []
index_to_remove_no_actors = []

[myfield_cast.append(i.find_all('table', {'class':'cast_list'})) for i in tqdm_notebook(content_souplist_seven)]
    
r_one = re.compile(".*name")

[[phase_two.append(j.find_all('a', {'href':r_one})) for j in i] for i in myfield_cast]

[phase_three.append(phase_two[i][1::2]) for i in range(len(phase_two))]

[actors_list.append(list(map(lambda x: x.text.strip(' ').replace('\n', ''), actors))) for actors in phase_three]            
    
index_to_remove_no_actors = [i for i,x in enumerate(myfield_cast) if not x]
        
print("Length of Actors list: {}".format(len(actors_list)))
print("Length of the list with Movies that don't have actors: {}".format(len(index_to_remove_no_actors)))
if len(index_to_remove_no_actors) == 0:
    print("None of the movie miss actors")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_actors))

#------------------------------------------------------------------------------------------------

# Field 4: Extract Director Name

myfield_director = []
director_name = []
index_to_remove_no_directors = []

[myfield_director.append(i.find_all('div', {'class':'plot_summary'})) for i in tqdm_notebook(content_souplist_seven)]

r_name = re.compile(".*name")

[[director_name.append(j.find_all('a', {'href':r_name})) for j in i] for i in myfield_director]
    
director_names = [item[0].text for item in director_name]

index_to_remove_no_directors = [i for i,x in enumerate(myfield_director) if not x]

print("Length of Directors list: {}".format(len(director_names)))
print("Length of the list with Movies that don't have directors: {}".format(len(index_to_remove_no_directors)))
if len(index_to_remove_no_directors) == 0:
    print("None of the movie miss directors")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_directors))

In [None]:
# STEP 8

#------------------------------------------------------------------------------------------------

myfield_rating = []
ratings = []
index_to_remove_no_rating = []

[myfield_rating.append(i.find_all('div', {'class':'ratingValue'})) for i in tqdm_notebook(content_souplist_seven)]

[[[ratings.append(y.text) for y in x.find_all('span', {'itemprop':'ratingValue'})] for x in i] for i in myfield_rating]

index_to_remove_no_rating = [i for i,x in enumerate(myfield_rating) if not x]

print("Length of Ratings list: {}".format(len(ratings)))
print("Length of the list with Movies that are not rated: {}".format(len(index_to_remove_no_rating)))
if len(index_to_remove_no_rating) == 0:
    print("None of the movie miss ratings")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_rating))

In [None]:
# STEP 9

# Pickle the requests file for further use!

with open('D:\\GitHub-Thesis\\58,000 movies\\movie_seven\\plot_seven_20012020.pkl', 'wb') as f:
    
    pickle.dump(plot_summary, f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_seven\\actors_seven_20012020.pkl', 'wb') as f:
    
    pickle.dump(actors_list, f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_seven\\director_seven_20012020.pkl', 'wb') as f:
    
    pickle.dump(director_names, f)

In [None]:
# STEP 10

with open('D:\\GitHub-Thesis\\58,000 movies\\movie_seven\\rating_seven_20012020.pkl', 'wb') as f:
    
    pickle.dump(ratings, f)

In [None]:
# STEP 11

with open('D:\\GitHub-Thesis\\58,000 movies\\movie_seven\\review_index_seven_20012020.pkl', 'rb') as f:
    
    review_seven = pickle.load(f)
    
print("Number of URLs: {}".format(len(review_seven)))

review_souplist_seven = []

for i in tqdm_notebook(review_seven):
    
    review_souplist_seven.append(BeautifulSoup(i.text))

print("Number of review tags: {}".format(len(review_souplist_seven)))

In [None]:
# STEP 12

# Field 6: Extract Movie Reviews

myfield_review_step_one = []
myfield_review_step_two = []
myfield_review_step_three = []

[myfield_review_step_one.append(i.find_all('div', {'class':'lister-list'})) for i in tqdm_notebook(review_souplist_seven)]

[[myfield_review_step_two.append(j.find_all('div', {'class':'text show-more__control'})) for j in i] for i in myfield_review_step_one]

[myfield_review_step_three.append(list(map(lambda x: x.text, reviews))) for reviews in myfield_review_step_two]

index_to_remove_no_review = [i for i,x in enumerate(myfield_review_step_one) if not x]

print("Length of Reviews list: {}".format(len(myfield_review_step_three)))
print("Length of the list with Movies that don't have reviews: {}".format(len(index_to_remove_no_review)))
if len(index_to_remove_no_review) == 0:
    print("None of the movies miss reviews")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_review))

print(len([i for i,x in enumerate(myfield_review_step_one) if not x]))

print([i for i,x in enumerate(myfield_review_step_two) if not x])

print([i for i,x in enumerate(myfield_review_step_three) if not x])

In [None]:
data_seven = pd.read_pickle("D:\\GitHub-Thesis\\58,000 movies\\movie_seven\\dataset_seven_final_20012020.pkl")

print(data_seven.shape)

print(len(myfield_review_step_three))

print(len([i for i,x in enumerate(myfield_review_step_three) if not x]))

In [None]:
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_seven\\reviews_seven_24012020.pkl', 'wb') as f:
    
    pickle.dump(myfield_review_step_three, f)

In [None]:
index_remove = [2162, 2170, 2222, 2243, 2253, 2264, 2309, 2316, 2330, 2332, 2374, 2381, 2390, 2398, 2457, 2462, 2473, 2481, 2655, 2678, 2719, 2776, 2926, 2937, 2969, 2970, 2974, 2979, 2993, 3019, 3028, 3072, 3080, 3126, 3201, 3296, 3318, 3334, 3345, 3360, 3361, 3398, 3431, 3450, 3483, 3509, 3512, 3527, 3568, 3572, 3577, 3590, 3609, 3682, 3693, 3695, 3702, 3719, 3726, 3736, 3764, 3765, 3768, 3791, 3802, 3829, 3845, 3895, 3912, 3932, 3939, 3951, 3978, 3983, 3991, 4009, 4047, 4091, 4114, 4125, 4146, 4151, 4156, 4247, 4253, 4255, 4274, 4278, 4304, 4306, 4313, 4315, 4320, 4372, 4381, 4387, 4393, 4412, 4431, 4441, 4449, 4486, 4503, 4570, 4579, 4587, 4597, 4613, 4621, 4638]

myfield_review_step_three = [i for j, i in enumerate(myfield_review_step_three) if j not in index_remove]

print(len(myfield_review_step_three))

data_four.reviews = myfield_review_step_three

print(data_four.head())

In [None]:
# STEP 13

index_remove = [44, 79, 170, 176, 233, 237, 315, 328, 347, 638, 831, 855, 870, 938, 952, 1115, 1125, 1176, 1600, 1611, 
                1619, 1620, 1705, 1868, 1940, 2183, 2245, 2277, 2322, 2337, 2382, 2402, 2420, 2429, 2525, 2534, 2569, 2581, 
                2714, 2957, 
                3332, 3622, 3898, 4058, 4063, 4107, 4251, 4267, 4309, 4334, 4489, 4524, 4673, 4675, 4676, 4677, 4681, 4690, 
                4699, 4703, 4711, 4880, 1687, 2402, 2405, 2527, 2528, 2561, 2569, 3825, 4524, 938, 2402, 2569,
                36, 52, 88, 93, 107, 149, 174, 227, 323, 395, 589, 621, 623, 631, 635, 636, 640, 641, 652, 655, 664, 670, 
                846, 847, 966, 967, 1076, 1079, 1102, 1122, 1238, 1249, 1256, 1266, 1310, 1408, 1526, 1527, 1571, 1576, 1687, 
                1951, 1952, 1972, 2043, 2115, 2121, 2156, 2183, 2198, 2250, 2256, 2325, 2341, 2350, 2351, 2356, 2377, 2386, 
                2389, 2494, 2502, 2539, 2582, 2708, 2743, 2799, 2817, 2829, 2870, 2889, 2892, 2894, 3068, 3069, 3070, 3084, 
                3089, 3123, 3130, 3323, 3325, 3380, 3438, 3581, 3723, 3732, 3733, 3741, 3748, 3751, 3756, 3758, 3760, 3761, 
                3768, 3772, 3776, 3790, 3791, 3807, 3808, 3823, 3899, 3976, 4023, 4037, 4061, 4089, 4090, 4092, 4098, 4147, 
                4286, 4335, 4438, 4439, 4472, 4499, 4667, 4676, 4680, 4731, 4801, 4855, 4865, 4866, 4890, 4902]

index_remove = list(dict.fromkeys(index_remove))

dataset = pd.read_pickle("D:\\GitHub-Thesis\\dataset_58,000_14012020_latest_version.pkl")

dataset_seven = dataset.iloc[30000:35000].reset_index(drop=True)

dataset_seven = dataset_seven[~dataset_seven.index.isin(index_remove)]

dataset_seven.shape

In [31]:
# STEP 14

with open('D:\\GitHub-Thesis\\58,000 movies\\movie_seven\\plot_seven_20012020.pkl', 'rb') as f:
    
    plot = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_seven\\rating_seven_20012020.pkl', 'rb') as f:
    
    rating = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_seven\\actors_seven_20012020.pkl', 'rb') as f:
    
    actors = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_seven\\director_seven_20012020.pkl', 'rb') as f:
    
    director = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_seven\\reviews_seven_24012020.pkl', 'rb') as f:
    
    reviews = pickle.load(f)
    
assert len(plot) == len(rating) == len(actors) == len(director) == len(reviews)

In [32]:
[i for i,x in enumerate(rating) if not x]

[]

In [None]:
# STEP 15

dataset_seven['actors'] = actors

dataset_seven['plot'] = plot

dataset_seven['imdb_rating'] = rating

dataset_seven['director'] = director

dataset_seven['reviews'] = reviews

dataset_seven = dataset_seven.drop(["movieId", "imdbId", "synopsis_url"], axis=1)

dataset_seven.iloc[2005]

In [None]:
# STEP 16

index_remove = [7, 19, 35, 58, 80, 91, 93, 114, 141, 146, 155, 162, 163, 165, 188, 211, 212, 213, 215, 221, 246, 251, 296, 299, 303, 305, 357, 367, 417, 431, 437, 534, 535, 549, 574, 614, 619, 625, 627, 639, 666, 671, 675, 679, 722, 770, 793, 794, 799, 800, 847, 890, 902, 981, 990, 1026, 1030, 1033, 1034, 1036, 1039, 1052, 1056, 1075, 1111, 1113, 1173, 1186, 1188, 1189, 1223, 1255, 1268, 1326, 1331, 1332, 1349, 1357, 1382, 1386, 1401, 1403, 1416, 1419, 1471, 1473, 1475, 1518, 1519, 1540, 1551, 1563, 1594, 1597, 1599, 1609, 1629, 1635, 1638, 1681, 1696, 1719, 1725, 1732, 1746, 1749, 1759, 1769, 1800, 1871, 1901, 1909, 1940, 1955, 2023, 2026, 2027, 2033, 2036, 2037, 2040, 2041, 2043, 2045, 2056, 2057, 2058, 2085, 2086, 2087, 2109, 2111, 2115, 2118, 2120, 2121, 2126, 2127, 2130, 2193, 2208, 2222, 2230, 2237, 2245, 2246, 2261, 2263, 2265, 2267, 2269, 2272, 2273, 2275, 2281, 2283, 2284, 2289, 2292, 2294, 2299, 2300, 2316, 2357, 2365, 2383, 2384, 2386, 2387, 2388, 2390, 2396, 2399, 2400, 2401, 2404, 2405, 2407, 2413, 2415, 2417, 2437, 2439, 2511, 2519, 2526, 2532, 2548, 2608, 2613, 2637, 2640, 2676, 2684, 2692, 2697, 2704, 2714, 2718, 2719, 2721, 2764, 2766, 2777, 2813, 2853, 2860, 2883, 2886, 2889, 2894, 2953, 2954, 2966, 2975, 3003, 3031, 3039, 3110, 3139, 3149, 3155, 3158, 3175, 3202, 3203, 3210, 3230, 3266, 3279, 3328, 3344, 3345, 3346, 3349, 3352, 3354, 3359, 3373, 3374, 3406, 3422, 3424, 3434, 3456, 3467, 3468, 3481, 3521, 3525, 3548, 3581, 3582, 3583, 3587, 3596, 3598, 3600, 3601, 3604, 3605, 3608, 3615, 3620, 3622, 3623, 3624, 3625, 3626, 3627, 3628, 3629, 3630, 3631, 3632, 3633, 3637, 3641, 3647, 3648, 3650, 3651, 3654, 3655, 3669, 3696, 3711, 3718, 3729, 3730, 3734, 3750, 3764, 3766, 3774, 3832, 3869, 3872, 3875, 3915, 3921, 3922, 3926, 3934, 3951, 3971, 3997, 4012, 4013, 4039, 4047, 4048, 4051, 4074, 4091, 4094, 4098, 4112, 4122, 4148, 4165, 4168, 4196, 4202, 4212, 4215, 4236, 4237, 4240, 4242, 4243, 4247, 4248, 4249, 4250, 4251, 4252, 4254, 4257, 4258, 4263, 4270, 4281, 4284, 4290, 4303, 4317, 4321, 4326, 4327, 4334, 4336, 4339, 4350, 4361, 4383, 4404, 4412, 4485, 4495, 4516, 4517, 4521, 4550, 4641, 4646, 4650, 4671, 4681, 4691, 4692, 4695, 4699, 4700, 4705, 4710, 4720, 4721, 4800]

dataset_seven = dataset_seven[~dataset_seven.index.isin(index_remove)]

dataset_seven.shape

In [None]:
dataset_seven = dataset_seven[dataset_seven.astype(str)['reviews'] != '[]']

dataset_seven.shape

In [None]:
# dataset_seven.to_pickle("D:\\GitHub-Thesis\\58,000 movies\\movie_seven\\dataset_seven_final_20012020.pkl") old

# dataset_seven.to_pickle("D:\\GitHub-Thesis\\58,000 movies\\movie_seven\\dataset_seven_final_24012020.pkl") old

dataset_seven.to_pickle("D:\\GitHub-Thesis\\58,000 movies\\movie_seven\\dataset_seven_final_25012020.pkl")

In [None]:
dataset_seven.iloc[146]

In [None]:
# STEP 17

dataset_seven.shape

#### 8) 40000 movies

In [None]:
content_eight = []

with open('D:\\GitHub-Thesis\\movie_content_url\\data_eight_13012020.pkl', 'rb') as f:
    
    content_eight = pickle.load(f)

print("Number of URLs: {}".format(len(content_eight)))

content_souplist_eight = []

for i in tqdm_notebook(content_eight):
    
    content_souplist_eight.append(BeautifulSoup(i.text))

print("Number of souplists: {}".format(len(content_souplist_eight)))

In [None]:
# Field 1: Extract plot summary

# souplist_three = souplist_three[0:113] 

myfield_plot = []
plot_summary = []
index_to_remove_no_plot = []

[myfield_plot.append(i.find_all('div', {'class':'plot_summary'})) for i in tqdm_notebook(content_souplist_eight)]

[[[plot_summary.append(y.text) for y in x.find_all('div', {'class':'summary_text'})] for x in i] if len(i) !=0 else index_to_remove_no_plot.append(myfield_plot.index(i)) for i in myfield_plot]
      
print("Length of Plot Summary list: {}".format(len(plot_summary)))
print("Length of the list with Movies that don't have plot summary: {}".format(len(index_to_remove_no_plot)))
if len(index_to_remove_no_plot) == 0:
    print("None of the movie miss plot")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_plot))

#------------------------------------------------------------------------------------------------

# Field 2: Extract IMDB rating

myfield_rating = []
ratings = []
index_to_remove_no_rating = []

[myfield_rating.append(i.find_all('div', {'class':'ratingValue'})) for i in tqdm_notebook(content_souplist_eight)]

[[[ratings.append(y.text) for y in x.find_all('span', {'itemprop':'ratingValue'})] for x in i] for i in myfield_rating]

index_to_remove_no_rating = [i for i,x in enumerate(myfield_rating) if not x]

print("Length of Ratings list: {}".format(len(ratings)))
print("Length of the list with Movies that are not rated: {}".format(len(index_to_remove_no_rating)))
if len(index_to_remove_no_rating) == 0:
    print("None of the movie miss ratings")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_rating))

#------------------------------------------------------------------------------------------------

# Field 3: Extract Actors

myfield_cast = []
ratings = []
phase_two = []
phase_three = []
actors_list = []
index_to_remove_no_actors = []

[myfield_cast.append(i.find_all('table', {'class':'cast_list'})) for i in tqdm_notebook(content_souplist_eight)]
    
r_one = re.compile(".*name")

[[phase_two.append(j.find_all('a', {'href':r_one})) for j in i] for i in myfield_cast]

[phase_three.append(phase_two[i][1::2]) for i in range(len(phase_two))]

[actors_list.append(list(map(lambda x: x.text.strip(' ').replace('\n', ''), actors))) for actors in phase_three]            
    
index_to_remove_no_actors = [i for i,x in enumerate(myfield_cast) if not x]
        
print("Length of Actors list: {}".format(len(actors_list)))
print("Length of the list with Movies that don't have actors: {}".format(len(index_to_remove_no_actors)))
if len(index_to_remove_no_actors) == 0:
    print("None of the movie miss actors")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_actors))

#------------------------------------------------------------------------------------------------

# Field 4: Extract Director Name

myfield_director = []
director_name = []
index_to_remove_no_directors = []

[myfield_director.append(i.find_all('div', {'class':'plot_summary'})) for i in tqdm_notebook(content_souplist_eight)]

r_name = re.compile(".*name")

[[director_name.append(j.find_all('a', {'href':r_name})) for j in i] for i in myfield_director]
    
director_names = [item[0].text for item in director_name if len(item)!=0]

index_to_remove_no_directors = [i for i,x in enumerate(myfield_director) if not x]

print("Length of Directors list: {}".format(len(director_names)))
print("Length of the list with Movies that don't have directors: {}".format(len(index_to_remove_no_directors)))
if len(index_to_remove_no_directors) == 0:
    print("None of the movie miss directors")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_directors))

In [None]:
# myfield_plot = []
plot_summary = []
index_to_remove_no_plot = []

# [myfield_plot.append(i.find_all('div', {'class':'plot_summary'})) for i in tqdm_notebook(content_souplist_eight)]

[[[plot_summary.append(y.text) for y in x.find_all('div', {'class':'summary_text'})] for x in i] for i in myfield_plot]
      
print("Length of Plot Summary list: {}".format(len(plot_summary)))
print("Length of the list with Movies that don't have plot summary: {}".format(len(index_to_remove_no_plot)))
if len(index_to_remove_no_plot) == 0:
    print("None of the movie miss plot")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_plot))

In [None]:
plot_summary_two = []
[[plot_summary_two.append(x.find_all('div', {'class':'summary_text'})) for x in i] for i in myfield_plot]

In [None]:
plot_summary_two[908]

In [None]:
myfield_plot[913]

In [None]:
[i for i,x in enumerate(plot_summary_two) if not x]

In [None]:
dataset = pd.read_pickle("D:\\GitHub-Thesis\\dataset_58,000_14012020_latest_version.pkl")

In [None]:
dataset_two = dataset.iloc[35000:40000].reset_index()

In [None]:
dataset_two.iloc[908]

In [None]:
#------------------------------------------------------------------------------------------------

# Pickle the requests file for further use!

with open('plot_eight_18012020.pkl', 'wb') as f:
    
    pickle.dump(plot_summary, f)
    
with open('rating_eight_18012020.pkl', 'wb') as f:
    
    pickle.dump(ratings, f)
    
with open('actors_eight_18012020.pkl', 'wb') as f:
    
    pickle.dump(actors_list, f)
    
with open('director_eight_18012020.pkl', 'wb') as f:
    
    pickle.dump(director_names, f)
    
# Indexes to remove: [44, 70, 82, 94, 106, 260, 263, 547, 602, 603, 690, 693, 694, 695, 811, 906, 909, 910, 939, 980, 991, 
# 994, 1124, 1236, 1276, 1469, 1526, 1625, 2160, 2165, 2181, 2190, 2235, 2241, 2298, 2317, 2347, 2423, 2511, 2603, 2766, 2772, 
# 2793, 2879, 2884, 2885, 2886, 2891, 2933, 3053, 3054, 3139, 3152, 3181, 3192, 3196, 3213, 3384, 3387, 3393, 3538, 3630, 3818,
# 3828, 3927, 3964, 3974, 3977, 4027, 4186, 4248, 4308, 4342, 4371, 4504, 4568, 4628, 4657, 4740, 4756, 4787, 4828, 4861, 4862,
#4863, 4864, 4865, 4866, 4867, 4868, 4869, 4871, 4873, 4904, 4932, 4933, 4935, 4936, 4942, 4949, 4968, 4973, 4975] (no actors)

# Indexes to remove: [905, 906, 907, 909, 910, 1963, 3849, 4191] (not rated)

# Indexes to remove: [906, 908, 909, 4864, 4866, 4867] (no directors)

# Indexes to remove: [906, 909] (no plot) -> Movie 909 can be replaced with the Independence day 2 (2016)

In [None]:
synopsis_eight = []

with open('D:\\GitHub-Thesis\\synopsis_url\\synopsis_eight_14012020.pkl', 'rb') as f:
    
    synopsis_eight = pickle.load(f)

print("Number of URLs: {}".format(len(synopsis_eight)))

synopsis_souplist_eight = []

for i in tqdm_notebook(synopsis_eight):
    
    synopsis_souplist_eight.append(BeautifulSoup(i.text))

print("Number of souplists: {}".format(len(synopsis_souplist_eight)))

In [None]:
#------------------------------------------------------------------------------------------------

# Field 5: Extract Plot Synopsis

synopsis_step_one = []
synopsis_step_two = []
synopsis_step_three = []

[synopsis_step_one.append(i.find_all('ul', {'class':'ipl-zebra-list', 'id':'plot-synopsis-content'})) for i in tqdm_notebook(synopsis_souplist_eight)]

[[synopsis_step_two.append(j.find_all('li', {'class':'ipl-zebra-list__item'})) for j in i] for i in synopsis_step_one]

[[synopsis_step_three.append(j.text.strip(' ').replace('\n', '').replace('\\', '')) for j in i] for i in synopsis_step_two]

index_to_remove_no_synopsis = [i for i,x in enumerate(synopsis_step_one) if not x]

print("Length of Synopsis list: {}".format(len(synopsis_step_three)))
print("Length of the list with Movies that don't have synopsis: {}".format(len(index_to_remove_no_synopsis)))
if len(index_to_remove_no_synopsis) == 0:
    print("None of the movies miss a synopsis")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_synopsis))

In [None]:
# Pickle the synopsis file for further use!

with open('synopsis_eight_18012020.pkl', 'wb') as f:
    
    pickle.dump(synopsis_step_three, f)

# Indexes to remove: [906, 909]

In [None]:
review_eight = []

with open('D:\\GitHub-Thesis\\reviews_url\\review_eight_15012020.pkl', 'rb') as f:
    
    review_eight = pickle.load(f)

print("Number of URLs: {}".format(len(review_eight)))

review_souplist_eight = []

for i in tqdm_notebook(review_eight):
    
    review_souplist_eight.append(BeautifulSoup(i.text))

print("Number of review tags: {}".format(len(review_souplist_eight)))

In [None]:
#------------------------------------------------------------------------------------------------

# Field 6: Extract Movie Reviews

myfield_review_step_one = []
myfield_review_step_two = []
myfield_review_step_three = []

[myfield_review_step_one.append(i.find_all('div', {'class':'lister-list'})) for i in tqdm_notebook(review_souplist_eight)]

[[myfield_review_step_two.append(j.find_all('div', {'class':'text show-more__control'})) for j in i] for i in myfield_review_step_one]

[myfield_review_step_three.append(list(map(lambda x: x.text, reviews))) for reviews in myfield_review_step_two]

index_to_remove_no_review = [i for i,x in enumerate(myfield_review_step_one) if not x]

print("Length of Reviews list: {}".format(len(myfield_review_step_three)))
print("Length of the list with Movies that don't have reviews: {}".format(len(index_to_remove_no_review)))
if len(index_to_remove_no_review) == 0:
    print("None of the movies miss reviews")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_review))

In [None]:
# Pickle the reviews file for further use!

with open('reviews_eight_18012020.pkl', 'wb') as f:
    
    pickle.dump(myfield_review_step_three, f)
    
# Indexes to remove: [906, 909]

### - - - - - - - - -  - - - - - - - - -  - - - - - - - - -  - - - - - - - - -  - - - - - - - - -  - - - - - - - - - 

#### Dataframe creation based on the movie content extracted

In [None]:
# STEP 1

with open('D:\\GitHub-Thesis\\movie_content_url\\data_eight_13012020.pkl', 'rb') as f:
    
    content_eight = pickle.load(f)

print("Number of URLs: {}".format(len(content_eight)))

with open('D:\\GitHub-Thesis\\reviews_url\\review_eight_15012020.pkl', 'rb') as f:
    
    review_eight = pickle.load(f)

print("Number of URLs: {}".format(len(review_eight)))

In [None]:
# STEP 2

with open('D:\\GitHub-Thesis\\58,000 movies\\movie_eight\\pre-indexed files\\plot_eight_18012020.pkl', 'rb') as f:
    
    plot = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_eight\\pre-indexed files\\rating_eight_18012020.pkl', 'rb') as f:
    
    rating = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_eight\\pre-indexed files\\actors_eight_18012020.pkl', 'rb') as f:
    
    actors = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_eight\\pre-indexed files\\director_eight_18012020.pkl', 'rb') as f:
    
    director = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_eight\\synopsis_eight_18012020.pkl', 'rb') as f:
    
    synopsis = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_eight\\reviews_eight_18012020.pkl', 'rb') as f:
    
    reviews = pickle.load(f)
    
print(len(plot))
print(len(rating))
print(len(actors))
print(len(director))
print(len(synopsis))
print(len(reviews))

In [None]:
# STEP 3

matching_add_plot = [s for s, x in enumerate(plot) if "Add a Plot" in x]

print(matching_add_plot)

matching_add_synopsis = [s for s, x in enumerate(synopsis) if 'It looks like we don\'t have a Synopsis for this title yet. Be the first to contribute! Just click the "Edit page" button at the bottom of the page or learn more in the Synopsis submission guide.' in x]

print(matching_add_synopsis)

print(len(matching_add_synopsis))

In [None]:
# STEP 4

index_remove = [44, 70, 82, 94, 106, 260, 263, 547, 602, 603, 690, 693, 694, 695, 811, 906, 909, 910, 939, 980, 991, 
                994, 1124, 1236, 1276, 1469, 1526, 1625, 2160, 2165, 2181, 2190, 2235, 2241, 2298, 2317, 2347, 2423, 2511, 
                2603, 2766, 2772, 
                2793, 2879, 2884, 2885, 2886, 2891, 2933, 3053, 3054, 3139, 3152, 3181, 3192, 3196, 3213, 3384, 3387, 3393, 
                3538, 3630, 3818,
                3828, 3927, 3964, 3974, 3977, 4027, 4186, 4248, 4308, 4342, 4371, 4504, 4568, 4628, 4657, 4740, 4756, 4787, 
                4828, 4861, 4862,
                4863, 4864, 4865, 4866, 4867, 4868, 4869, 4871, 4873, 4904, 4932, 4933, 4935, 4936, 4942, 4949, 4968, 4973, 
                4975, 905, 906, 907, 909, 910, 1963, 3849, 4191, 906, 908, 909, 4864, 4866, 4867, 906, 909,
                82, 122, 145, 169, 316, 330, 334, 438, 493, 943, 945, 968, 987, 996, 1073, 1086, 1254, 1274, 1276, 1308, 1315, 1352, 1354, 1388, 1562, 1614, 1641, 1653, 1654, 1661, 1954, 1958, 1960, 1967, 1990, 2029, 2030, 2031, 2192, 2347, 2383, 2391, 2488, 2553, 2558, 2565, 2572, 2576, 2607, 2817, 2833, 2855, 2893, 2963, 2969, 3012, 3044, 3080, 3154, 3211, 3225, 3426, 3434, 3555, 3591, 3611, 3614, 3634, 3745, 3749, 3821, 3823, 3834, 3837, 3856, 3885, 3992, 4003, 4009, 4020, 4021, 4094, 4127, 4147, 4150, 4188, 4200, 4240, 4244, 4314, 4387, 4447, 4515, 4530, 4591, 4593, 4599, 4601, 4667, 4732, 4762, 4830, 4875, 4882, 4917, 4919, 4938, 4966, 4970, 4972, 4974, 4978, 4988, 4990]

index_remove = list(dict.fromkeys(index_remove))

print(len(index_remove))

content_index_eight = [i for j, i in enumerate(content_eight) if j not in index_remove]

review_index_eight = [i for j, i in enumerate(review_eight) if j not in index_remove]

print(len(content_index_eight))

print(len(review_index_eight))

# The length of this content_one (4780) should now be extracted from souplist!

In [None]:
# STEP 5

# Pickle the synopsis file for further use!

with open('D:\\GitHub-Thesis\\58,000 movies\\movie_eight\\content_index_eight_20012020.pkl', 'wb') as f:
    
    pickle.dump(content_index_eight, f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_eight\\review_index_eight_20012020.pkl', 'wb') as f:
    
    pickle.dump(review_index_eight, f)

In [None]:
# STEP 6

with open('D:\\GitHub-Thesis\\58,000 movies\\movie_eight\\content_index_eight_20012020.pkl', 'rb') as f:
    
    content_index_eight = pickle.load(f)

print("Number of URLs: {}".format(len(content_index_eight)))

content_souplist_eight = []

for i in tqdm_notebook(content_index_eight):
    
    content_souplist_eight.append(BeautifulSoup(i.text))

print("Number of souplists: {}".format(len(content_souplist_eight)))

In [None]:
# STEP 7

# Field 1: Extract plot summary

# souplist_three = souplist_three[0:113] 

myfield_plot = []
plot_summary = []
index_to_remove_no_plot = []

[myfield_plot.append(i.find_all('div', {'class':'plot_summary'})) for i in tqdm_notebook(content_souplist_eight)]

[[[plot_summary.append(y.text) for y in x.find_all('div', {'class':'summary_text'})] for x in i] if len(i) !=0 else index_to_remove_no_plot.append(myfield_plot.index(i)) for i in myfield_plot]
      
print("Length of Plot Summary list: {}".format(len(plot_summary)))
print("Length of the list with Movies that don't have plot summary: {}".format(len(index_to_remove_no_plot)))
if len(index_to_remove_no_plot) == 0:
    print("None of the movie miss plot")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_plot))

#------------------------------------------------------------------------------------------------

# Field 2: Extract IMDB rating

# myfield_rating = []
# ratings = []
# index_to_remove_no_rating = []

# [myfield_rating.append(i.find_all('div', {'class':'ratingValue'})) for i in tqdm_notebook(content_souplist_three)]

# [[[ratings.append(y.text) for y in x.find_all('span', {'itemprop':'ratingValue'})] for x in i] for i in myfield_rating]

# index_to_remove_no_rating = [i for i,x in enumerate(myfield_rating) if not x]

# print("Length of Ratings list: {}".format(len(ratings)))
# print("Length of the list with Movies that are not rated: {}".format(len(index_to_remove_no_rating)))
# if len(index_to_remove_no_rating) == 0:
#     print("None of the movie miss ratings")
# else:
#     print("Indexes to remove: {}".format(index_to_remove_no_rating))

#------------------------------------------------------------------------------------------------

# Field 3: Extract Actors

myfield_cast = []
ratings = []
phase_two = []
phase_three = []
actors_list = []
index_to_remove_no_actors = []

[myfield_cast.append(i.find_all('table', {'class':'cast_list'})) for i in tqdm_notebook(content_souplist_eight)]
    
r_one = re.compile(".*name")

[[phase_two.append(j.find_all('a', {'href':r_one})) for j in i] for i in myfield_cast]

[phase_three.append(phase_two[i][1::2]) for i in range(len(phase_two))]

[actors_list.append(list(map(lambda x: x.text.strip(' ').replace('\n', ''), actors))) for actors in phase_three]            
    
index_to_remove_no_actors = [i for i,x in enumerate(myfield_cast) if not x]
        
print("Length of Actors list: {}".format(len(actors_list)))
print("Length of the list with Movies that don't have actors: {}".format(len(index_to_remove_no_actors)))
if len(index_to_remove_no_actors) == 0:
    print("None of the movie miss actors")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_actors))

#------------------------------------------------------------------------------------------------

# Field 4: Extract Director Name

myfield_director = []
director_name = []
index_to_remove_no_directors = []

[myfield_director.append(i.find_all('div', {'class':'plot_summary'})) for i in tqdm_notebook(content_souplist_eight)]

r_name = re.compile(".*name")

[[director_name.append(j.find_all('a', {'href':r_name})) for j in i] for i in myfield_director]
    
director_names = [item[0].text for item in director_name]

index_to_remove_no_directors = [i for i,x in enumerate(myfield_director) if not x]

print("Length of Directors list: {}".format(len(director_names)))
print("Length of the list with Movies that don't have directors: {}".format(len(index_to_remove_no_directors)))
if len(index_to_remove_no_directors) == 0:
    print("None of the movie miss directors")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_directors))

In [None]:
# STEP 8

#------------------------------------------------------------------------------------------------

myfield_rating = []
ratings = []
index_to_remove_no_rating = []

[myfield_rating.append(i.find_all('div', {'class':'ratingValue'})) for i in tqdm_notebook(content_souplist_eight)]

[[[ratings.append(y.text) for y in x.find_all('span', {'itemprop':'ratingValue'})] for x in i] for i in myfield_rating]

index_to_remove_no_rating = [i for i,x in enumerate(myfield_rating) if not x]

print("Length of Ratings list: {}".format(len(ratings)))
print("Length of the list with Movies that are not rated: {}".format(len(index_to_remove_no_rating)))
if len(index_to_remove_no_rating) == 0:
    print("None of the movie miss ratings")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_rating))

In [None]:
# STEP 9

# Pickle the requests file for further use!

with open('D:\\GitHub-Thesis\\58,000 movies\\movie_eight\\plot_eight_20012020.pkl', 'wb') as f:
    
    pickle.dump(plot_summary, f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_eight\\actors_eight_20012020.pkl', 'wb') as f:
    
    pickle.dump(actors_list, f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_eight\\director_eight_20012020.pkl', 'wb') as f:
    
    pickle.dump(director_names, f)

In [None]:
# STEP 10

with open('D:\\GitHub-Thesis\\58,000 movies\\movie_eight\\rating_eight_20012020.pkl', 'wb') as f:
    
    pickle.dump(ratings, f)

In [None]:
# STEP 11

with open('D:\\GitHub-Thesis\\58,000 movies\\movie_eight\\review_index_eight_20012020.pkl', 'rb') as f:
    
    review_eight = pickle.load(f)
    
print("Number of URLs: {}".format(len(review_eight)))

review_souplist_eight = []

for i in tqdm_notebook(review_eight):
    
    review_souplist_eight.append(BeautifulSoup(i.text))

print("Number of review tags: {}".format(len(review_souplist_eight)))

In [None]:
# STEP 12

# Field 6: Extract Movie Reviews

myfield_review_step_one = []
myfield_review_step_two = []
myfield_review_step_three = []

[myfield_review_step_one.append(i.find_all('div', {'class':'lister-list'})) for i in tqdm_notebook(review_souplist_eight)]

[[myfield_review_step_two.append(j.find_all('div', {'class':'text show-more__control'})) for j in i] for i in myfield_review_step_one]

[myfield_review_step_three.append(list(map(lambda x: x.text, reviews))) for reviews in myfield_review_step_two]

index_to_remove_no_review = [i for i,x in enumerate(myfield_review_step_one) if not x]

print("Length of Reviews list: {}".format(len(myfield_review_step_three)))
print("Length of the list with Movies that don't have reviews: {}".format(len(index_to_remove_no_review)))
if len(index_to_remove_no_review) == 0:
    print("None of the movies miss reviews")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_review))

print(len([i for i,x in enumerate(myfield_review_step_one) if not x]))

print([i for i,x in enumerate(myfield_review_step_two) if not x])

print([i for i,x in enumerate(myfield_review_step_three) if not x])

In [None]:
data_eight = pd.read_pickle("D:\\GitHub-Thesis\\58,000 movies\\movie_eight\\dataset_eight_final_20012020.pkl")

print(data_eight.shape)

print(len(myfield_review_step_three))

print(len([i for i,x in enumerate(myfield_review_step_three) if not x]))

In [None]:
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_eight\\reviews_eight_24012020.pkl', 'wb') as f:
    
    pickle.dump(myfield_review_step_three, f)

In [None]:
index_remove = [2162, 2170, 2222, 2243, 2253, 2264, 2309, 2316, 2330, 2332, 2374, 2381, 2390, 2398, 2457, 2462, 2473, 2481, 2655, 2678, 2719, 2776, 2926, 2937, 2969, 2970, 2974, 2979, 2993, 3019, 3028, 3072, 3080, 3126, 3201, 3296, 3318, 3334, 3345, 3360, 3361, 3398, 3431, 3450, 3483, 3509, 3512, 3527, 3568, 3572, 3577, 3590, 3609, 3682, 3693, 3695, 3702, 3719, 3726, 3736, 3764, 3765, 3768, 3791, 3802, 3829, 3845, 3895, 3912, 3932, 3939, 3951, 3978, 3983, 3991, 4009, 4047, 4091, 4114, 4125, 4146, 4151, 4156, 4247, 4253, 4255, 4274, 4278, 4304, 4306, 4313, 4315, 4320, 4372, 4381, 4387, 4393, 4412, 4431, 4441, 4449, 4486, 4503, 4570, 4579, 4587, 4597, 4613, 4621, 4638]

myfield_review_step_three = [i for j, i in enumerate(myfield_review_step_three) if j not in index_remove]

print(len(myfield_review_step_three))

data_four.reviews = myfield_review_step_three

print(data_four.head())

In [None]:
# STEP 13

index_remove = [44, 70, 82, 94, 106, 260, 263, 547, 602, 603, 690, 693, 694, 695, 811, 906, 909, 910, 939, 980, 991, 
                994, 1124, 1236, 1276, 1469, 1526, 1625, 2160, 2165, 2181, 2190, 2235, 2241, 2298, 2317, 2347, 2423, 2511, 
                2603, 2766, 2772, 
                2793, 2879, 2884, 2885, 2886, 2891, 2933, 3053, 3054, 3139, 3152, 3181, 3192, 3196, 3213, 3384, 3387, 3393, 
                3538, 3630, 3818,
                3828, 3927, 3964, 3974, 3977, 4027, 4186, 4248, 4308, 4342, 4371, 4504, 4568, 4628, 4657, 4740, 4756, 4787, 
                4828, 4861, 4862,
                4863, 4864, 4865, 4866, 4867, 4868, 4869, 4871, 4873, 4904, 4932, 4933, 4935, 4936, 4942, 4949, 4968, 4973, 
                4975, 905, 906, 907, 909, 910, 1963, 3849, 4191, 906, 908, 909, 4864, 4866, 4867, 906, 909,
                82, 122, 145, 169, 316, 330, 334, 438, 493, 943, 945, 968, 987, 996, 1073, 1086, 1254, 1274, 1276, 1308, 1315, 1352, 1354, 1388, 1562, 1614, 1641, 1653, 1654, 1661, 1954, 1958, 1960, 1967, 1990, 2029, 2030, 2031, 2192, 2347, 2383, 2391, 2488, 2553, 2558, 2565, 2572, 2576, 2607, 2817, 2833, 2855, 2893, 2963, 2969, 3012, 3044, 3080, 3154, 3211, 3225, 3426, 3434, 3555, 3591, 3611, 3614, 3634, 3745, 3749, 3821, 3823, 3834, 3837, 3856, 3885, 3992, 4003, 4009, 4020, 4021, 4094, 4127, 4147, 4150, 4188, 4200, 4240, 4244, 4314, 4387, 4447, 4515, 4530, 4591, 4593, 4599, 4601, 4667, 4732, 4762, 4830, 4875, 4882, 4917, 4919, 4938, 4966, 4970, 4972, 4974, 4978, 4988, 4990]

index_remove = list(dict.fromkeys(index_remove))

dataset = pd.read_pickle("D:\\GitHub-Thesis\\dataset_58,000_14012020_latest_version.pkl")

dataset_eight = dataset.iloc[35000:40000].reset_index(drop=True)

dataset_eight = dataset_eight[~dataset_eight.index.isin(index_remove)]

dataset_eight.shape

In [None]:
len(index_remove)

In [33]:
# STEP 14

with open('D:\\GitHub-Thesis\\58,000 movies\\movie_eight\\plot_eight_20012020.pkl', 'rb') as f:
    
    plot = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_eight\\rating_eight_20012020.pkl', 'rb') as f:
    
    rating = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_eight\\actors_eight_20012020.pkl', 'rb') as f:
    
    actors = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_eight\\director_eight_20012020.pkl', 'rb') as f:
    
    director = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_eight\\reviews_eight_24012020.pkl', 'rb') as f:
    
    reviews = pickle.load(f)
    
assert len(plot) == len(rating) == len(actors) == len(director) == len(reviews)

In [34]:
[i for i,x in enumerate(rating) if not x]

[]

In [None]:
# STEP 15

dataset_eight['actors'] = actors

dataset_eight['plot'] = plot

dataset_eight['imdb_rating'] = rating

dataset_eight['director'] = director

dataset_eight['reviews'] = reviews

dataset_eight = dataset_eight.drop(["movieId", "imdbId", "synopsis_url"], axis=1)

dataset_eight.iloc[2005]

In [None]:
# STEP 16

index_remove = [54, 58, 117, 122, 151, 172, 186, 191, 203, 214, 224, 228, 238, 271, 273, 300, 314, 320, 377, 435, 475, 478, 492, 493, 508, 515, 522, 523, 546, 548, 553, 562, 581, 584, 675, 702, 730, 735, 738, 739, 742, 838, 853, 857, 858, 869, 876, 909, 914, 953, 956, 961, 973, 983, 984, 1033, 1037, 1049, 1052, 1056, 1062, 1064, 1068, 1072, 1082, 1100, 1130, 1180, 1182, 1183, 1184, 1214, 1227, 1283, 1304, 1306, 1369, 1405, 1417, 1423, 1439, 1441, 1464, 1467, 1491, 1507, 1513, 1536, 1563, 1564, 1567, 1568, 1578, 1588, 1590, 1636, 1691, 1726, 1727, 1732, 1734, 1741, 1756, 1768, 1770, 1773, 1780, 1790, 1801, 1802, 1804, 1805, 1806, 1807, 1809, 1826, 1835, 1857, 1859, 1873, 1879, 1895, 1897, 1899, 1901, 1905, 1906, 1917, 1929, 1936, 1950, 1964, 1965, 1968, 1971, 1976, 1979, 1981, 1988, 1994, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2008, 2009, 2010, 2012, 2043, 2059, 2109, 2117, 2118, 2120, 2121, 2122, 2125, 2126, 2127, 2137, 2151, 2170, 2174, 2179, 2203, 2220, 2229, 2235, 2289, 2291, 2292, 2293, 2294, 2315, 2324, 2329, 2330, 2332, 2341, 2359, 2361, 2363, 2365, 2366, 2368, 2369, 2370, 2371, 2388, 2401, 2409, 2428, 2432, 2433, 2435, 2461, 2462, 2463, 2466, 2468, 2471, 2472, 2476, 2477, 2482, 2484, 2485, 2488, 2490, 2491, 2492, 2494, 2496, 2499, 2501, 2504, 2507, 2521, 2626, 2641, 2656, 2667, 2670, 2702, 2705, 2750, 2762, 2803, 2805, 2817, 2829, 2867, 2889, 2907, 2915, 2951, 2962, 2966, 2973, 2975, 3030, 3045, 3047, 3048, 3064, 3065, 3078, 3090, 3095, 3096, 3101, 3109, 3119, 3148, 3162, 3174, 3179, 3188, 3189, 3197, 3198, 3206, 3210, 3211, 3215, 3216, 3217, 3218, 3221, 3222, 3223, 3224, 3226, 3294, 3299, 3305, 3320, 3323, 3324, 3329, 3340, 3345, 3361, 3371, 3381, 3383, 3385, 3388, 3398, 3399, 3404, 3409, 3432, 3433, 3439, 3444, 3446, 3467, 3470, 3475, 3484, 3485, 3489, 3506, 3536, 3537, 3546, 3557, 3562, 3567, 3575, 3580, 3582, 3584, 3588, 3589, 3595, 3596, 3607, 3610, 3615, 3616, 3645, 3649, 3674, 3681, 3688, 3690, 3704, 3718, 3730, 3742, 3746, 3762, 3768, 3770, 3777, 3789, 3790, 3794, 3804, 3809, 3810, 3826, 3858, 3863, 3866, 3867, 3868, 3873, 3949, 3966, 3967, 3975, 3997, 3999, 4001, 4021, 4029, 4050, 4060, 4064, 4069, 4075, 4076, 4085, 4086, 4089, 4110, 4129, 4143, 4171, 4173, 4176, 4204, 4208, 4210, 4214, 4227, 4229, 4243, 4246, 4265, 4288, 4289, 4341, 4383, 4394, 4396, 4421, 4422, 4424, 4425, 4429, 4489, 4497, 4510, 4521, 4537, 4542, 4581, 4595, 4625, 4642, 4676, 4713, 4717, 4726, 4733, 4760, 4764]

dataset_eight = dataset_eight[~dataset_eight.index.isin(index_remove)]

In [None]:
# dataset_eight.to_pickle("D:\\GitHub-Thesis\\58,000 movies\\movie_eight\\dataset_eight_final_20012020.pkl") old

# dataset_eight.to_pickle("D:\\GitHub-Thesis\\58,000 movies\\movie_eight\\dataset_eight_final_24012020.pkl") old

dataset_eight.to_pickle("D:\\GitHub-Thesis\\58,000 movies\\movie_eight\\dataset_eight_final_25012020.pkl")

In [None]:
dataset_eight = dataset_eight[dataset_eight.astype(str)['reviews'] != '[]']

In [None]:
# STEP 17

dataset_eight.shape

#### 9) 45000 movies

In [None]:
content_nine = []

with open('D:\\GitHub-Thesis\\movie_content_url\\data_nine_14012020.pkl', 'rb') as f:
    
    content_nine = pickle.load(f)

print("Number of URLs: {}".format(len(content_nine)))

content_souplist_nine = []

for i in tqdm_notebook(content_nine):
    
    content_souplist_nine.append(BeautifulSoup(i.text))

print("Number of souplists: {}".format(len(content_souplist_nine)))

In [None]:
# Field 1: Extract plot summary

# souplist_three = souplist_three[0:113] 

myfield_plot = []
plot_summary = []
index_to_remove_no_plot = []

[myfield_plot.append(i.find_all('div', {'class':'plot_summary'})) for i in tqdm_notebook(content_souplist_nine)]

[[[plot_summary.append(y.text) for y in x.find_all('div', {'class':'summary_text'})] for x in i] if len(i) !=0 else index_to_remove_no_plot.append(myfield_plot.index(i)) for i in myfield_plot]
      
print("Length of Plot Summary list: {}".format(len(plot_summary)))
print("Length of the list with Movies that don't have plot summary: {}".format(len(index_to_remove_no_plot)))
if len(index_to_remove_no_plot) == 0:
    print("None of the movie miss plot")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_plot))

#------------------------------------------------------------------------------------------------

# Field 2: Extract IMDB rating

myfield_rating = []
ratings = []
index_to_remove_no_rating = []

[myfield_rating.append(i.find_all('div', {'class':'ratingValue'})) for i in tqdm_notebook(content_souplist_nine)]

[[[ratings.append(y.text) for y in x.find_all('span', {'itemprop':'ratingValue'})] for x in i] for i in myfield_rating]

index_to_remove_no_rating = [i for i,x in enumerate(myfield_rating) if not x]

print("Length of Ratings list: {}".format(len(ratings)))
print("Length of the list with Movies that are not rated: {}".format(len(index_to_remove_no_rating)))
if len(index_to_remove_no_rating) == 0:
    print("None of the movie miss ratings")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_rating))

#------------------------------------------------------------------------------------------------

# Field 3: Extract Actors

myfield_cast = []
ratings = []
phase_two = []
phase_three = []
actors_list = []
index_to_remove_no_actors = []

[myfield_cast.append(i.find_all('table', {'class':'cast_list'})) for i in tqdm_notebook(content_souplist_nine)]
    
r_one = re.compile(".*name")

[[phase_two.append(j.find_all('a', {'href':r_one})) for j in i] for i in myfield_cast]

[phase_three.append(phase_two[i][1::2]) for i in range(len(phase_two))]

[actors_list.append(list(map(lambda x: x.text.strip(' ').replace('\n', ''), actors))) for actors in phase_three]            
    
index_to_remove_no_actors = [i for i,x in enumerate(myfield_cast) if not x]
        
print("Length of Actors list: {}".format(len(actors_list)))
print("Length of the list with Movies that don't have actors: {}".format(len(index_to_remove_no_actors)))
if len(index_to_remove_no_actors) == 0:
    print("None of the movie miss actors")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_actors))

#------------------------------------------------------------------------------------------------

# Field 4: Extract Director Name

myfield_director = []
director_name = []
index_to_remove_no_directors = []

[myfield_director.append(i.find_all('div', {'class':'plot_summary'})) for i in tqdm_notebook(content_souplist_nine)]

r_name = re.compile(".*name")

[[director_name.append(j.find_all('a', {'href':r_name})) for j in i] for i in myfield_director]
    
director_names = [item[0].text for item in director_name if len(item)!=0]

index_to_remove_no_directors = [i for i,x in enumerate(myfield_director) if not x]

print("Length of Directors list: {}".format(len(director_names)))
print("Length of the list with Movies that don't have directors: {}".format(len(index_to_remove_no_directors)))
if len(index_to_remove_no_directors) == 0:
    print("None of the movie miss directors")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_directors))

In [None]:
#------------------------------------------------------------------------------------------------

# Pickle the requests file for further use!

with open('plot_nine_18012020.pkl', 'wb') as f:
    
    pickle.dump(plot_summary, f)
    
with open('rating_nine_18012020.pkl', 'wb') as f:
    
    pickle.dump(ratings, f)
    
with open('actors_nine_18012020.pkl', 'wb') as f:
    
    pickle.dump(actors_list, f)
    
with open('director_nine_18012020.pkl', 'wb') as f:
    
    pickle.dump(director_names, f)
    
# Indexes to remove: [26, 30, 53, 71, 99, 181, 203, 238, 262, 264, 308, 311, 326, 327, 377, 446, 497, 521, 554, 665, 667, 668, 
# 669, 676, 719, 733, 736, 737, 738, 751, 757, 758, 762, 765, 767, 768, 770, 771, 772, 773, 774, 775, 776, 777, 779, 780, 782, 
# 783, 839, 840, 894, 903, 953, 959, 962, 1037, 1044, 1057, 1216, 1242, 1355, 1360, 1436, 1437, 1438, 1439, 1444, 1445, 1448, 
# 1449, 1454, 1491, 1592, 1609, 1610, 1647, 1691, 1787, 1790, 1859, 1875, 1886, 1887, 1993, 2013, 2090, 2094, 2102, 2116, 2120, 
# 2123, 2128, 2406, 2437, 2450, 2554, 2558, 2559, 2577, 2584, 2590, 2593, 2611, 2613, 2711, 2765, 2836, 3052, 3211, 3228, 3299, 
# 3300, 3425, 3534, 3780, 3781, 4021, 4153, 4258, 4507, 4549, 4577, 4583, 4586, 4617, 4724, 4725, 4733, 4786, 4951, 4971, 4990, 
# 4991] (no actors)

# Indexes to remove: [1028, 1790, 2043, 3497, 4491, 4536, 4562, 4910] (not rated)

In [None]:
synopsis_nine = []

with open('D:\\GitHub-Thesis\\synopsis_url\\synopsis_nine_14012020.pkl', 'rb') as f:
    
    synopsis_nine = pickle.load(f)

print("Number of URLs: {}".format(len(synopsis_nine)))

synopsis_souplist_nine = []

for i in tqdm_notebook(synopsis_nine):
    
    synopsis_souplist_nine.append(BeautifulSoup(i.text))

print("Number of souplists: {}".format(len(synopsis_souplist_nine)))

In [None]:
#------------------------------------------------------------------------------------------------

# Field 5: Extract Plot Synopsis

synopsis_step_one = []
synopsis_step_two = []
synopsis_step_three = []

[synopsis_step_one.append(i.find_all('ul', {'class':'ipl-zebra-list', 'id':'plot-synopsis-content'})) for i in tqdm_notebook(synopsis_souplist_nine)]

[[synopsis_step_two.append(j.find_all('li', {'class':'ipl-zebra-list__item'})) for j in i] for i in synopsis_step_one]

[[synopsis_step_three.append(j.text.strip(' ').replace('\n', '').replace('\\', '')) for j in i] for i in synopsis_step_two]

index_to_remove_no_synopsis = [i for i,x in enumerate(synopsis_step_one) if not x]

print("Length of Synopsis list: {}".format(len(synopsis_step_three)))
print("Length of the list with Movies that don't have synopsis: {}".format(len(index_to_remove_no_synopsis)))
if len(index_to_remove_no_synopsis) == 0:
    print("None of the movies miss a synopsis")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_synopsis))

In [None]:
# Pickle the synopsis file for further use!

with open('synopsis_nine_18012020.pkl', 'wb') as f:
    
    pickle.dump(synopsis_step_three, f)

# Indexes to remove: 0

In [None]:
review_nine = []

with open('D:\\GitHub-Thesis\\reviews_url\\review_nine_16012020.pkl', 'rb') as f:
    
    review_nine = pickle.load(f)

print("Number of URLs: {}".format(len(review_nine)))

review_souplist_nine = []

for i in tqdm_notebook(review_nine):
    
    review_souplist_nine.append(BeautifulSoup(i.text))

print("Number of review tags: {}".format(len(review_souplist_nine)))

In [None]:
#------------------------------------------------------------------------------------------------

# Field 6: Extract Movie Reviews

myfield_review_step_one = []
myfield_review_step_two = []
myfield_review_step_three = []

[myfield_review_step_one.append(i.find_all('div', {'class':'lister-list'})) for i in tqdm_notebook(review_souplist_nine)]

[[myfield_review_step_two.append(j.find_all('div', {'class':'text show-more__control'})) for j in i] for i in myfield_review_step_one]

[myfield_review_step_three.append(list(map(lambda x: x.text, reviews))) for reviews in myfield_review_step_two]

index_to_remove_no_review = [i for i,x in enumerate(myfield_review_step_one) if not x]

print("Length of Reviews list: {}".format(len(myfield_review_step_three)))
print("Length of the list with Movies that don't have reviews: {}".format(len(index_to_remove_no_review)))
if len(index_to_remove_no_review) == 0:
    print("None of the movies miss reviews")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_review))

In [None]:
# Pickle the reviews file for further use!

with open('reviews_nine_18012020.pkl', 'wb') as f:
    
    pickle.dump(myfield_review_step_three, f)
    
# Indexes to remove: 0

In [None]:
# myfield_plot = []
plot_summary = []
index_to_remove_no_plot = []

# [myfield_plot.append(i.find_all('div', {'class':'plot_summary'})) for i in tqdm_notebook(content_souplist_eight)]

[[[plot_summary.append(y.text) for y in x.find_all('div', {'class':'summary_text'})] for x in i] for i in myfield_plot]
      
print("Length of Plot Summary list: {}".format(len(plot_summary)))
print("Length of the list with Movies that don't have plot summary: {}".format(len(index_to_remove_no_plot)))
if len(index_to_remove_no_plot) == 0:
    print("None of the movie miss plot")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_plot))

plot_summary_two = []
[[plot_summary_two.append(x.find_all('div', {'class':'summary_text'})) for x in i] for i in myfield_plot]

plot_summary_two[908]

myfield_plot[913]

[i for i,x in enumerate(plot_summary_two) if not x]

dataset = pd.read_pickle("D:\\GitHub-Thesis\\dataset_58,000_14012020_latest_version.pkl")

dataset_two = dataset.iloc[35000:40000].reset_index()

dataset_two.iloc[908]

### - - - - - - - - -  - - - - - - - - -  - - - - - - - - -  - - - - - - - - -  - - - - - - - - -  - - - - - - - - - 

#### Dataframe creation based on the movie content extracted

In [None]:
# STEP 1

with open('D:\\GitHub-Thesis\\movie_content_url\\data_nine_14012020.pkl', 'rb') as f:
    
    content_nine = pickle.load(f)

print("Number of URLs: {}".format(len(content_nine)))

with open('D:\\GitHub-Thesis\\reviews_url\\review_nine_16012020.pkl', 'rb') as f:
    
    review_nine = pickle.load(f)

print("Number of URLs: {}".format(len(review_nine)))

In [None]:
# STEP 2

with open('D:\\GitHub-Thesis\\58,000 movies\\movie_nine\\pre-indexed files\\plot_nine_18012020.pkl', 'rb') as f:
    
    plot = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_nine\\pre-indexed files\\rating_nine_18012020.pkl', 'rb') as f:
    
    rating = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_nine\\pre-indexed files\\actors_nine_18012020.pkl', 'rb') as f:
    
    actors = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_nine\\pre-indexed files\\director_nine_18012020.pkl', 'rb') as f:
    
    director = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_nine\\synopsis_nine_18012020.pkl', 'rb') as f:
    
    synopsis = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_nine\\reviews_nine_18012020.pkl', 'rb') as f:
    
    reviews = pickle.load(f)
    
print(len(plot))
print(len(rating))
print(len(actors))
print(len(director))
print(len(synopsis))
print(len(reviews))

In [None]:
# STEP 3

matching_add_plot = [s for s, x in enumerate(plot) if "Add a Plot" in x]

print(matching_add_plot)

matching_add_synopsis = [s for s, x in enumerate(synopsis) if 'It looks like we don\'t have a Synopsis for this title yet. Be the first to contribute! Just click the "Edit page" button at the bottom of the page or learn more in the Synopsis submission guide.' in x]

print(matching_add_synopsis)

print(len(matching_add_synopsis))

In [None]:
# STEP 4

index_remove = [26, 30, 53, 71, 99, 181, 203, 238, 262, 264, 308, 311, 326, 327, 377, 446, 497, 521, 554, 665, 667, 668, 
                669, 676, 719, 733, 736, 737, 738, 751, 757, 758, 762, 765, 767, 768, 770, 771, 772, 773, 774, 775, 776, 777, 779, 780, 782, 
                783, 839, 840, 894, 903, 953, 959, 962, 1037, 1044, 1057, 1216, 1242, 1355, 1360, 1436, 1437, 1438, 1439, 1444, 1445, 1448, 
                1449, 1454, 1491, 1592, 1609, 1610, 1647, 1691, 1787, 1790, 1859, 1875, 1886, 1887, 1993, 2013, 2090, 2094, 2102, 2116, 2120, 
                2123, 2128, 2406, 2437, 2450, 2554, 2558, 2559, 2577, 2584, 2590, 2593, 2611, 2613, 2711, 2765, 2836, 3052, 3211, 3228, 3299, 
                3300, 3425, 3534, 3780, 3781, 4021, 4153, 4258, 4507, 4549, 4577, 4583, 4586, 4617, 4724, 4725, 4733, 4786, 4951, 4971, 4990, 
                4991, 1028, 1790, 2043, 3497, 4491, 4536, 4562, 4910,
                17, 18, 38, 45, 55, 63, 68, 83, 92, 125, 130, 142, 147, 154, 186, 193, 202, 206, 210, 241, 242, 251, 255, 267, 313, 328, 377, 386, 401, 402, 453, 455, 468, 469, 473, 485, 529, 535, 537, 549, 554, 569, 598, 609, 626, 629, 642, 645, 660, 661, 667, 670, 676, 678, 686, 709, 722, 724, 734, 754, 757, 765, 771, 773, 778, 793, 812, 833, 838, 871, 906, 907, 908, 910, 911, 926, 998, 1028, 1147, 1160, 1270, 1278, 1368, 1525, 1650, 1674, 1717, 1842, 1845, 1859, 1886, 1887, 1901, 1931, 1942, 1993, 1999, 2051, 2055, 2056, 2066, 2091, 2115, 2117, 2119, 2120, 2167, 2297, 2358, 2386, 2417, 2420, 2569, 2575, 2577, 2629, 2943, 2966, 3034, 3108, 3187, 3199, 3202, 3204, 3300, 3315, 3316, 3331, 3335, 3337, 3349, 3351, 3529, 3531, 3594, 3635, 3636, 3702, 4032, 4208, 4223, 4224, 4227, 4231, 4262, 4264, 4265, 4266, 4288, 4304, 4415, 4470, 4516, 4572, 4576, 4581, 4595, 4707, 4764, 4767, 4945, 4972, 4975]

index_remove = list(dict.fromkeys(index_remove))

print(len(index_remove))

content_index_nine = [i for j, i in enumerate(content_nine) if j not in index_remove]

review_index_nine = [i for j, i in enumerate(review_nine) if j not in index_remove]

print(len(content_index_nine))

print(len(review_index_nine))

# The length of this content_one (4713) should now be extracted from souplist!

In [None]:
# STEP 5

# Pickle the synopsis file for further use!

with open('D:\\GitHub-Thesis\\58,000 movies\\movie_nine\\content_index_nine_20012020.pkl', 'wb') as f:
    
    pickle.dump(content_index_nine, f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_nine\\review_index_nine_20012020.pkl', 'wb') as f:
    
    pickle.dump(review_index_nine, f)

In [None]:
# STEP 6

with open('D:\\GitHub-Thesis\\58,000 movies\\movie_nine\\content_index_nine_20012020.pkl', 'rb') as f:
    
    content_index_nine = pickle.load(f)

print("Number of URLs: {}".format(len(content_index_nine)))

content_souplist_nine = []

for i in tqdm_notebook(content_index_nine):
    
    content_souplist_nine.append(BeautifulSoup(i.text))

print("Number of souplists: {}".format(len(content_souplist_nine)))

In [None]:
# STEP 7

# Field 1: Extract plot summary

# souplist_three = souplist_three[0:113] 

myfield_plot = []
plot_summary = []
index_to_remove_no_plot = []

[myfield_plot.append(i.find_all('div', {'class':'plot_summary'})) for i in tqdm_notebook(content_souplist_nine)]

[[[plot_summary.append(y.text) for y in x.find_all('div', {'class':'summary_text'})] for x in i] if len(i) !=0 else index_to_remove_no_plot.append(myfield_plot.index(i)) for i in myfield_plot]
      
print("Length of Plot Summary list: {}".format(len(plot_summary)))
print("Length of the list with Movies that don't have plot summary: {}".format(len(index_to_remove_no_plot)))
if len(index_to_remove_no_plot) == 0:
    print("None of the movie miss plot")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_plot))

#------------------------------------------------------------------------------------------------

# Field 2: Extract IMDB rating

# myfield_rating = []
# ratings = []
# index_to_remove_no_rating = []

# [myfield_rating.append(i.find_all('div', {'class':'ratingValue'})) for i in tqdm_notebook(content_souplist_three)]

# [[[ratings.append(y.text) for y in x.find_all('span', {'itemprop':'ratingValue'})] for x in i] for i in myfield_rating]

# index_to_remove_no_rating = [i for i,x in enumerate(myfield_rating) if not x]

# print("Length of Ratings list: {}".format(len(ratings)))
# print("Length of the list with Movies that are not rated: {}".format(len(index_to_remove_no_rating)))
# if len(index_to_remove_no_rating) == 0:
#     print("None of the movie miss ratings")
# else:
#     print("Indexes to remove: {}".format(index_to_remove_no_rating))

#------------------------------------------------------------------------------------------------

# Field 3: Extract Actors

myfield_cast = []
ratings = []
phase_two = []
phase_three = []
actors_list = []
index_to_remove_no_actors = []

[myfield_cast.append(i.find_all('table', {'class':'cast_list'})) for i in tqdm_notebook(content_souplist_nine)]
    
r_one = re.compile(".*name")

[[phase_two.append(j.find_all('a', {'href':r_one})) for j in i] for i in myfield_cast]

[phase_three.append(phase_two[i][1::2]) for i in range(len(phase_two))]

[actors_list.append(list(map(lambda x: x.text.strip(' ').replace('\n', ''), actors))) for actors in phase_three]            
    
index_to_remove_no_actors = [i for i,x in enumerate(myfield_cast) if not x]
        
print("Length of Actors list: {}".format(len(actors_list)))
print("Length of the list with Movies that don't have actors: {}".format(len(index_to_remove_no_actors)))
if len(index_to_remove_no_actors) == 0:
    print("None of the movie miss actors")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_actors))

#------------------------------------------------------------------------------------------------

# Field 4: Extract Director Name

myfield_director = []
director_name = []
index_to_remove_no_directors = []

[myfield_director.append(i.find_all('div', {'class':'plot_summary'})) for i in tqdm_notebook(content_souplist_nine)]

r_name = re.compile(".*name")

[[director_name.append(j.find_all('a', {'href':r_name})) for j in i] for i in myfield_director]
    
director_names = [item[0].text for item in director_name]

index_to_remove_no_directors = [i for i,x in enumerate(myfield_director) if not x]

print("Length of Directors list: {}".format(len(director_names)))
print("Length of the list with Movies that don't have directors: {}".format(len(index_to_remove_no_directors)))
if len(index_to_remove_no_directors) == 0:
    print("None of the movie miss directors")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_directors))

In [None]:
# STEP 8

#------------------------------------------------------------------------------------------------

myfield_rating = []
ratings = []
index_to_remove_no_rating = []

[myfield_rating.append(i.find_all('div', {'class':'ratingValue'})) for i in tqdm_notebook(content_souplist_nine)]

[[[ratings.append(y.text) for y in x.find_all('span', {'itemprop':'ratingValue'})] for x in i] for i in myfield_rating]

index_to_remove_no_rating = [i for i,x in enumerate(myfield_rating) if not x]

print("Length of Ratings list: {}".format(len(ratings)))
print("Length of the list with Movies that are not rated: {}".format(len(index_to_remove_no_rating)))
if len(index_to_remove_no_rating) == 0:
    print("None of the movie miss ratings")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_rating))

In [None]:
# STEP 9

# Pickle the requests file for further use!

with open('D:\\GitHub-Thesis\\58,000 movies\\movie_nine\\plot_nine_20012020.pkl', 'wb') as f:
    
    pickle.dump(plot_summary, f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_nine\\actors_nine_20012020.pkl', 'wb') as f:
    
    pickle.dump(actors_list, f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_nine\\director_nine_20012020.pkl', 'wb') as f:
    
    pickle.dump(director_names, f)

In [None]:
# STEP 10

with open('D:\\GitHub-Thesis\\58,000 movies\\movie_nine\\rating_nine_20012020.pkl', 'wb') as f:
    
    pickle.dump(ratings, f)

In [None]:
# STEP 11

with open('D:\\GitHub-Thesis\\58,000 movies\\movie_nine\\review_index_nine_20012020.pkl', 'rb') as f:
    
    review_nine = pickle.load(f)
    
print("Number of URLs: {}".format(len(review_nine)))

review_souplist_nine = []

for i in tqdm_notebook(review_nine):
    
    review_souplist_nine.append(BeautifulSoup(i.text))

print("Number of review tags: {}".format(len(review_souplist_nine)))

In [None]:
# STEP 12

# Field 6: Extract Movie Reviews

myfield_review_step_one = []
myfield_review_step_two = []
myfield_review_step_three = []

[myfield_review_step_one.append(i.find_all('div', {'class':'lister-list'})) for i in tqdm_notebook(review_souplist_nine)]

[[myfield_review_step_two.append(j.find_all('div', {'class':'text show-more__control'})) for j in i] for i in myfield_review_step_one]

[myfield_review_step_three.append(list(map(lambda x: x.text, reviews))) for reviews in myfield_review_step_two]

index_to_remove_no_review = [i for i,x in enumerate(myfield_review_step_one) if not x]

print("Length of Reviews list: {}".format(len(myfield_review_step_three)))
print("Length of the list with Movies that don't have reviews: {}".format(len(index_to_remove_no_review)))
if len(index_to_remove_no_review) == 0:
    print("None of the movies miss reviews")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_review))

print(len([i for i,x in enumerate(myfield_review_step_one) if not x]))

print([i for i,x in enumerate(myfield_review_step_two) if not x])

print([i for i,x in enumerate(myfield_review_step_three) if not x])

In [None]:
data_nine = pd.read_pickle("D:\\GitHub-Thesis\\58,000 movies\\movie_nine\\dataset_nine_final_20012020.pkl")

print(data_nine.shape)

print(len(myfield_review_step_three))

print(len([i for i,x in enumerate(myfield_review_step_three) if not x]))

In [None]:
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_nine\\reviews_nine_24012020.pkl', 'wb') as f:
    
    pickle.dump(myfield_review_step_three, f)

In [None]:
index_remove = [2162, 2170, 2222, 2243, 2253, 2264, 2309, 2316, 2330, 2332, 2374, 2381, 2390, 2398, 2457, 2462, 2473, 2481, 2655, 2678, 2719, 2776, 2926, 2937, 2969, 2970, 2974, 2979, 2993, 3019, 3028, 3072, 3080, 3126, 3201, 3296, 3318, 3334, 3345, 3360, 3361, 3398, 3431, 3450, 3483, 3509, 3512, 3527, 3568, 3572, 3577, 3590, 3609, 3682, 3693, 3695, 3702, 3719, 3726, 3736, 3764, 3765, 3768, 3791, 3802, 3829, 3845, 3895, 3912, 3932, 3939, 3951, 3978, 3983, 3991, 4009, 4047, 4091, 4114, 4125, 4146, 4151, 4156, 4247, 4253, 4255, 4274, 4278, 4304, 4306, 4313, 4315, 4320, 4372, 4381, 4387, 4393, 4412, 4431, 4441, 4449, 4486, 4503, 4570, 4579, 4587, 4597, 4613, 4621, 4638]

myfield_review_step_three = [i for j, i in enumerate(myfield_review_step_three) if j not in index_remove]

print(len(myfield_review_step_three))

data_four.reviews = myfield_review_step_three

print(data_four.head())

In [None]:
# STEP 13

index_remove = [26, 30, 53, 71, 99, 181, 203, 238, 262, 264, 308, 311, 326, 327, 377, 446, 497, 521, 554, 665, 667, 668, 
                669, 676, 719, 733, 736, 737, 738, 751, 757, 758, 762, 765, 767, 768, 770, 771, 772, 773, 774, 775, 776, 777, 779, 780, 782, 
                783, 839, 840, 894, 903, 953, 959, 962, 1037, 1044, 1057, 1216, 1242, 1355, 1360, 1436, 1437, 1438, 1439, 1444, 1445, 1448, 
                1449, 1454, 1491, 1592, 1609, 1610, 1647, 1691, 1787, 1790, 1859, 1875, 1886, 1887, 1993, 2013, 2090, 2094, 2102, 2116, 2120, 
                2123, 2128, 2406, 2437, 2450, 2554, 2558, 2559, 2577, 2584, 2590, 2593, 2611, 2613, 2711, 2765, 2836, 3052, 3211, 3228, 3299, 
                3300, 3425, 3534, 3780, 3781, 4021, 4153, 4258, 4507, 4549, 4577, 4583, 4586, 4617, 4724, 4725, 4733, 4786, 4951, 4971, 4990, 
                4991, 1028, 1790, 2043, 3497, 4491, 4536, 4562, 4910,
                17, 18, 38, 45, 55, 63, 68, 83, 92, 125, 130, 142, 147, 154, 186, 193, 202, 206, 210, 241, 242, 251, 255, 267, 313, 328, 377, 386, 401, 402, 453, 455, 468, 469, 473, 485, 529, 535, 537, 549, 554, 569, 598, 609, 626, 629, 642, 645, 660, 661, 667, 670, 676, 678, 686, 709, 722, 724, 734, 754, 757, 765, 771, 773, 778, 793, 812, 833, 838, 871, 906, 907, 908, 910, 911, 926, 998, 1028, 1147, 1160, 1270, 1278, 1368, 1525, 1650, 1674, 1717, 1842, 1845, 1859, 1886, 1887, 1901, 1931, 1942, 1993, 1999, 2051, 2055, 2056, 2066, 2091, 2115, 2117, 2119, 2120, 2167, 2297, 2358, 2386, 2417, 2420, 2569, 2575, 2577, 2629, 2943, 2966, 3034, 3108, 3187, 3199, 3202, 3204, 3300, 3315, 3316, 3331, 3335, 3337, 3349, 3351, 3529, 3531, 3594, 3635, 3636, 3702, 4032, 4208, 4223, 4224, 4227, 4231, 4262, 4264, 4265, 4266, 4288, 4304, 4415, 4470, 4516, 4572, 4576, 4581, 4595, 4707, 4764, 4767, 4945, 4972, 4975]

index_remove = list(dict.fromkeys(index_remove))

dataset = pd.read_pickle("D:\\GitHub-Thesis\\dataset_58,000_14012020_latest_version.pkl")

dataset_nine = dataset.iloc[40000:45000].reset_index(drop=True)

dataset_nine = dataset_nine[~dataset_nine.index.isin(index_remove)]

dataset_nine.shape

In [35]:
# STEP 14

with open('D:\\GitHub-Thesis\\58,000 movies\\movie_nine\\plot_nine_20012020.pkl', 'rb') as f:
    
    plot = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_nine\\rating_nine_20012020.pkl', 'rb') as f:
    
    rating = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_nine\\actors_nine_20012020.pkl', 'rb') as f:
    
    actors = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_nine\\director_nine_20012020.pkl', 'rb') as f:
    
    director = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_nine\\reviews_nine_24012020.pkl', 'rb') as f:
    
    reviews = pickle.load(f)
    
assert len(plot) == len(rating) == len(actors) == len(director) == len(reviews)

In [36]:
[i for i,x in enumerate(rating) if not x]

[]

In [None]:
# STEP 15

dataset_nine['actors'] = actors

dataset_nine['plot'] = plot

dataset_nine['imdb_rating'] = rating

dataset_nine['director'] = director

dataset_nine['reviews'] = reviews

dataset_nine = dataset_nine.drop(["movieId", "imdbId", "synopsis_url"], axis=1)

dataset_nine.iloc[2005]

In [None]:
# STEP 16

index_remove = [2, 10, 21, 28, 33, 41, 42, 77, 81, 93, 95, 97, 99, 100, 102, 127, 142, 166, 186, 209, 214, 256, 257, 260, 270, 282, 312, 319, 335, 337, 340, 344, 363, 374, 375, 376, 379, 381, 382, 395, 403, 404, 420, 436, 438, 443, 453, 455, 469, 476, 478, 480, 486, 493, 503, 506, 511, 530, 545, 560, 573, 575, 584, 597, 599, 601, 603, 642, 643, 649, 654, 655, 659, 664, 666, 667, 691, 701, 707, 725, 728, 739, 745, 763, 787, 791, 797, 798, 819, 849, 859, 872, 895, 921, 922, 928, 935, 940, 941, 946, 950, 959, 965, 969, 971, 981, 983, 985, 986, 988, 1011, 1018, 1022, 1023, 1025, 1026, 1031, 1070, 1074, 1078, 1082, 1088, 1091, 1092, 1110, 1118, 1143, 1164, 1165, 1207, 1211, 1214, 1225, 1229, 1236, 1238, 1306, 1321, 1362, 1391, 1418, 1429, 1493, 1505, 1517, 1524, 1538, 1564, 1565, 1603, 1616, 1617, 1632, 1637, 1650, 1651, 1680, 1681, 1682, 1687, 1690, 1717, 1718, 1723, 1725, 1728, 1734, 1755, 1780, 1799, 1800, 1810, 1814, 1816, 1825, 1826, 1831, 1833, 1834, 1844, 1862, 1868, 1878, 1879, 1880, 1881, 1893, 1903, 1907, 1911, 1914, 1919, 1953, 1959, 1970, 1993, 2008, 2009, 2024, 2053, 2081, 2089, 2106, 2119, 2154, 2155, 2159, 2160, 2163, 2188, 2215, 2216, 2226, 2248, 2309, 2329, 2349, 2350, 2382, 2413, 2415, 2417, 2418, 2419, 2424, 2426, 2429, 2430, 2435, 2436, 2437, 2446, 2454, 2457, 2468, 2474, 2475, 2483, 2484, 2493, 2506, 2539, 2574, 2628, 2630, 2638, 2678, 2689, 2695, 2696, 2697, 2711, 2716, 2720, 2730, 2734, 2737, 2750, 2762, 2777, 2779, 2781, 2782, 2784, 2805, 2872, 2877, 2881, 2887, 2919, 2965, 2972, 2984, 3012, 3034, 3060, 3065, 3067, 3077, 3086, 3092, 3095, 3099, 3100, 3103, 3104, 3107, 3108, 3112, 3120, 3121, 3141, 3144, 3148, 3149, 3150, 3151, 3155, 3168, 3171, 3195, 3202, 3206, 3211, 3221, 3241, 3293, 3359, 3360, 3394, 3409, 3419, 3434, 3469, 3489, 3490, 3491, 3494, 3495, 3499, 3512, 3564, 3566, 3570, 3608, 3610, 3611, 3613, 3614, 3619, 3642, 3648, 3661, 3662, 3666, 3670, 3677, 3700, 3701, 3708, 3712, 3732, 3752, 3753, 3759, 3770, 3771, 3835, 3882, 3911, 3916, 3917, 3922, 3950, 3951, 3963, 3969, 3974, 3980, 3982, 3992, 3993, 3995, 3997, 4009, 4010, 4011, 4012, 4016, 4030, 4031, 4036, 4039, 4043, 4068, 4088, 4090, 4119, 4127, 4162, 4202, 4206, 4209, 4211, 4250, 4261, 4274, 4280, 4287, 4295, 4314, 4317, 4323, 4333, 4348, 4350, 4364, 4383, 4388, 4426, 4456, 4460, 4461, 4471, 4475, 4476, 4479, 4482, 4483, 4490, 4491, 4497, 4500, 4534, 4550, 4551, 4560, 4561, 4564, 4572, 4586, 4602, 4617, 4622, 4627, 4632, 4634, 4642, 4651, 4663, 4673, 4675, 4681, 4688, 4689, 4690, 4704, 4710]

dataset_nine = dataset_nine[~dataset_nine.index.isin(index_remove)]

In [None]:
dataset_nine = dataset_nine[dataset_nine.astype(str)['reviews'] != '[]']

In [None]:
# dataset_nine.to_pickle("D:\\GitHub-Thesis\\58,000 movies\\movie_nine\\dataset_nine_final_20012020.pkl") old

# dataset_nine.to_pickle("D:\\GitHub-Thesis\\58,000 movies\\movie_nine\\dataset_nine_final_24012020.pkl") old

dataset_nine.to_pickle("D:\\GitHub-Thesis\\58,000 movies\\movie_nine\\dataset_nine_final_25012020.pkl")

In [None]:
# STEP 17

dataset_nine.shape

In [None]:
dataset_nine.shape

#### 10) 50000 movies

In [None]:
content_ten = []

with open('D:\\GitHub-Thesis\\movie_content_url\\data_ten_14012020.pkl', 'rb') as f:
    
    content_ten = pickle.load(f)

print("Number of URLs: {}".format(len(content_ten)))

content_souplist_ten = []

for i in tqdm_notebook(content_ten):
    
    content_souplist_ten.append(BeautifulSoup(i.text))

print("Number of souplists: {}".format(len(content_souplist_ten)))

In [None]:
# Field 1: Extract plot summary

# souplist_three = souplist_three[0:113] 

myfield_plot = []
plot_summary = []
index_to_remove_no_plot = []

[myfield_plot.append(i.find_all('div', {'class':'plot_summary'})) for i in tqdm_notebook(content_souplist_ten)]

[[[plot_summary.append(y.text) for y in x.find_all('div', {'class':'summary_text'})] for x in i] if len(i) !=0 else index_to_remove_no_plot.append(myfield_plot.index(i)) for i in myfield_plot]
      
print("Length of Plot Summary list: {}".format(len(plot_summary)))
print("Length of the list with Movies that don't have plot summary: {}".format(len(index_to_remove_no_plot)))
if len(index_to_remove_no_plot) == 0:
    print("None of the movie miss plot")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_plot))

#------------------------------------------------------------------------------------------------

# Field 2: Extract IMDB rating

myfield_rating = []
ratings = []
index_to_remove_no_rating = []

[myfield_rating.append(i.find_all('div', {'class':'ratingValue'})) for i in tqdm_notebook(content_souplist_ten)]

[[[ratings.append(y.text) for y in x.find_all('span', {'itemprop':'ratingValue'})] for x in i] for i in myfield_rating]

index_to_remove_no_rating = [i for i,x in enumerate(myfield_rating) if not x]

print("Length of Ratings list: {}".format(len(ratings)))
print("Length of the list with Movies that are not rated: {}".format(len(index_to_remove_no_rating)))
if len(index_to_remove_no_rating) == 0:
    print("None of the movie miss ratings")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_rating))

#------------------------------------------------------------------------------------------------

# Field 3: Extract Actors

myfield_cast = []
ratings = []
phase_two = []
phase_three = []
actors_list = []
index_to_remove_no_actors = []

[myfield_cast.append(i.find_all('table', {'class':'cast_list'})) for i in tqdm_notebook(content_souplist_ten)]
    
r_one = re.compile(".*name")

[[phase_two.append(j.find_all('a', {'href':r_one})) for j in i] for i in myfield_cast]

[phase_three.append(phase_two[i][1::2]) for i in range(len(phase_two))]

[actors_list.append(list(map(lambda x: x.text.strip(' ').replace('\n', ''), actors))) for actors in phase_three]            
    
index_to_remove_no_actors = [i for i,x in enumerate(myfield_cast) if not x]
        
print("Length of Actors list: {}".format(len(actors_list)))
print("Length of the list with Movies that don't have actors: {}".format(len(index_to_remove_no_actors)))
if len(index_to_remove_no_actors) == 0:
    print("None of the movie miss actors")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_actors))

#------------------------------------------------------------------------------------------------

# Field 4: Extract Director Name

myfield_director = []
director_name = []
index_to_remove_no_directors = []

[myfield_director.append(i.find_all('div', {'class':'plot_summary'})) for i in tqdm_notebook(content_souplist_ten)]

r_name = re.compile(".*name")

[[director_name.append(j.find_all('a', {'href':r_name})) for j in i] for i in myfield_director]
    
director_names = [item[0].text for item in director_name if len(item)!=0]

index_to_remove_no_directors = [i for i,x in enumerate(myfield_director) if not x]

print("Length of Directors list: {}".format(len(director_names)))
print("Length of the list with Movies that don't have directors: {}".format(len(index_to_remove_no_directors)))
if len(index_to_remove_no_directors) == 0:
    print("None of the movie miss directors")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_directors))

In [None]:
[i for i,x in enumerate(director_name) if not x]

In [None]:
#------------------------------------------------------------------------------------------------

# Pickle the requests file for further use!

with open('plot_ten_18012020.pkl', 'wb') as f:
    
    pickle.dump(plot_summary, f)
    
with open('rating_ten_18012020.pkl', 'wb') as f:
    
    pickle.dump(ratings, f)
    
with open('actors_ten_18012020.pkl', 'wb') as f:
    
    pickle.dump(actors_list, f)
    
with open('director_ten_18012020.pkl', 'wb') as f:
    
    pickle.dump(director_names, f)
    
# Indexes to remove:  [18, 147, 184, 224, 252, 414, 466, 517, 567, 574, 597, 626, 661, 795, 806, 814, 821, 884, 904, 905, 927, 
# 940, 973, 989, 1032, 1047, 1063, 1068, 1136, 1143, 1174, 1281, 1308, 1310, 1312, 1317, 1321, 1322, 1323, 1325, 1326, 1337, 
# 1339, 1408, 1409, 1426, 1437, 1654, 1685, 1703, 1772, 1777, 1784, 1807, 1813, 1850, 1882, 1893, 1931, 1938, 1963, 1964, 2025, 
# 2033, 2099, 2133, 2169, 2202, 2247, 2248, 2337, 2419, 2468, 2485, 2510, 2574, 2575, 2576, 2633, 2668, 2797, 2847, 2868, 2888, 
# 2917, 2923, 2951, 2992, 3009, 3051, 3094, 3153, 3154, 3194, 3231, 3304, 3306, 3350, 3485, 3606, 3686, 3713, 3736, 3804, 3818,
# 3996, 4055, 4056, 4093, 4098, 4119, 4335, 4385, 4400, 4405, 4418, 4422, 4431, 4439, 4440, 4441, 4442, 4443, 4447, 4449, 4454, 
# 4463, 4467, 4469, 4470, 4471, 4472, 4473, 4475, 4476, 4477, 4479, 4484, 4493, 4495, 4496, 4497, 4500, 4501, 4503, 4505, 4507, 
# 4508, 4509, 4510, 4512, 4514, 4515, 4622, 4652, 4715, 4718, 4719, 4720, 4721, 4722, 4723, 4724, 4725, 4770, 4873, 4874, 4881, 
# 4887, 4920, 4943, 4976, 4986, 4987] (no actors)

# Indexes to remove: [585, 942, 1229, 1340, 1954, 2270, 2282, 2296, 2816, 3188, 4034, 4395] (not rated)

# Indexes to remove: [4431, 4439, 4440, 4441, 4449, 4467, 4503] )no directors

In [None]:
synopsis_ten = []

with open('D:\\GitHub-Thesis\\synopsis_url\\synopsis_ten_14012020.pkl', 'rb') as f:
    
    synopsis_ten = pickle.load(f)

print("Number of URLs: {}".format(len(synopsis_ten)))

synopsis_souplist_ten = []

for i in tqdm_notebook(synopsis_ten):
    
    synopsis_souplist_ten.append(BeautifulSoup(i.text))

print("Number of souplists: {}".format(len(synopsis_souplist_ten)))

In [None]:
#------------------------------------------------------------------------------------------------

# Field 5: Extract Plot Synopsis

synopsis_step_one = []
synopsis_step_two = []
synopsis_step_three = []

[synopsis_step_one.append(i.find_all('ul', {'class':'ipl-zebra-list', 'id':'plot-synopsis-content'})) for i in tqdm_notebook(synopsis_souplist_ten)]

[[synopsis_step_two.append(j.find_all('li', {'class':'ipl-zebra-list__item'})) for j in i] for i in synopsis_step_one]

[[synopsis_step_three.append(j.text.strip(' ').replace('\n', '').replace('\\', '')) for j in i] for i in synopsis_step_two]

index_to_remove_no_synopsis = [i for i,x in enumerate(synopsis_step_one) if not x]

print("Length of Synopsis list: {}".format(len(synopsis_step_three)))
print("Length of the list with Movies that don't have synopsis: {}".format(len(index_to_remove_no_synopsis)))
if len(index_to_remove_no_synopsis) == 0:
    print("None of the movies miss a synopsis")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_synopsis))

In [None]:
# Pickle the synopsis file for further use!

with open('synopsis_ten_18012020.pkl', 'wb') as f:
    
    pickle.dump(synopsis_step_three, f)

# Indexes to remove: 0

In [None]:
review_ten = []

with open('D:\\GitHub-Thesis\\reviews_url\\review_ten_16012020.pkl', 'rb') as f:
    
    review_ten = pickle.load(f)

print("Number of URLs: {}".format(len(review_ten)))

review_souplist_ten = []

for i in tqdm_notebook(review_ten):
    
    review_souplist_ten.append(BeautifulSoup(i.text))

print("Number of review tags: {}".format(len(review_souplist_ten)))

In [None]:
#------------------------------------------------------------------------------------------------

# Field 6: Extract Movie Reviews

myfield_review_step_one = []
myfield_review_step_two = []
myfield_review_step_three = []

[myfield_review_step_one.append(i.find_all('div', {'class':'lister-list'})) for i in tqdm_notebook(review_souplist_ten)]

[[myfield_review_step_two.append(j.find_all('div', {'class':'text show-more__control'})) for j in i] for i in myfield_review_step_one]

[myfield_review_step_three.append(list(map(lambda x: x.text, reviews))) for reviews in myfield_review_step_two]

index_to_remove_no_review = [i for i,x in enumerate(myfield_review_step_one) if not x]

print("Length of Reviews list: {}".format(len(myfield_review_step_three)))
print("Length of the list with Movies that don't have reviews: {}".format(len(index_to_remove_no_review)))
if len(index_to_remove_no_review) == 0:
    print("None of the movies miss reviews")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_review))

In [None]:
# Pickle the reviews file for further use!

with open('reviews_ten_18012020.pkl', 'wb') as f:
    
    pickle.dump(myfield_review_step_three, f)
    
# Indexes to remove: 0

### - - - - - - - - -  - - - - - - - - -  - - - - - - - - -  - - - - - - - - -  - - - - - - - - -  - - - - - - - - - 

#### Dataframe creation based on the movie content extracted

In [None]:
# STEP 1

with open('D:\\GitHub-Thesis\\movie_content_url\\data_ten_14012020.pkl', 'rb') as f:
    
    content_ten = pickle.load(f)

print("Number of URLs: {}".format(len(content_ten)))

with open('D:\\GitHub-Thesis\\reviews_url\\review_ten_16012020.pkl', 'rb') as f:
    
    review_ten = pickle.load(f)

print("Number of URLs: {}".format(len(review_ten)))

In [None]:
# STEP 2

with open('D:\\GitHub-Thesis\\58,000 movies\\movie_ten\\pre-indexed files\\plot_ten_18012020.pkl', 'rb') as f:
    
    plot = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_ten\\pre-indexed files\\rating_ten_18012020.pkl', 'rb') as f:
    
    rating = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_ten\\pre-indexed files\\actors_ten_18012020.pkl', 'rb') as f:
    
    actors = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_ten\\pre-indexed files\\director_ten_18012020.pkl', 'rb') as f:
    
    director = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_ten\\synopsis_ten_18012020.pkl', 'rb') as f:
    
    synopsis = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_ten\\reviews_ten_18012020.pkl', 'rb') as f:
    
    reviews = pickle.load(f)
    
print(len(plot))
print(len(rating))
print(len(actors))
print(len(director))
print(len(synopsis))
print(len(reviews))

In [None]:
# STEP 3

matching_add_plot = [s for s, x in enumerate(plot) if "Add a Plot" in x]

print(matching_add_plot)

matching_add_synopsis = [s for s, x in enumerate(synopsis) if 'It looks like we don\'t have a Synopsis for this title yet. Be the first to contribute! Just click the "Edit page" button at the bottom of the page or learn more in the Synopsis submission guide.' in x]

print(matching_add_synopsis)

print(len(matching_add_synopsis))

In [None]:
# STEP 4

index_remove = [18, 147, 184, 224, 252, 414, 466, 517, 567, 574, 597, 626, 661, 795, 806, 814, 821, 884, 904, 905, 927, 
                940, 973, 989, 1032, 1047, 1063, 1068, 1136, 1143, 1174, 1281, 1308, 1310, 1312, 1317, 1321, 1322, 1323, 1325, 1326, 1337, 
                1339, 1408, 1409, 1426, 1437, 1654, 1685, 1703, 1772, 1777, 1784, 1807, 1813, 1850, 1882, 1893, 1931, 1938, 1963, 1964, 2025, 
                2033, 2099, 2133, 2169, 2202, 2247, 2248, 2337, 2419, 2468, 2485, 2510, 2574, 2575, 2576, 2633, 2668, 2797, 2847, 2868, 2888, 
                2917, 2923, 2951, 2992, 3009, 3051, 3094, 3153, 3154, 3194, 3231, 3304, 3306, 3350, 3485, 3606, 3686, 3713, 3736, 3804, 3818,
                3996, 4055, 4056, 4093, 4098, 4119, 4335, 4385, 4400, 4405, 4418, 4422, 4431, 4439, 4440, 4441, 4442, 4443, 4447, 4449, 4454, 
                4463, 4467, 4469, 4470, 4471, 4472, 4473, 4475, 4476, 4477, 4479, 4484, 4493, 4495, 4496, 4497, 4500, 4501, 4503, 4505, 4507, 
                4508, 4509, 4510, 4512, 4514, 4515, 4622, 4652, 4715, 4718, 4719, 4720, 4721, 4722, 4723, 4724, 4725, 4770, 4873, 4874, 4881, 
                4887, 4920, 4943, 4976, 4986, 4987,
                585, 942, 1229, 1340, 1954, 2270, 2282, 2296, 2816, 3188, 4034, 4395, 4431, 4439, 4440, 4441, 4449, 4467, 4503,
                10, 110, 145, 372, 505, 600, 621, 672, 729, 731, 774, 848, 914, 932, 1085, 1107, 1111, 1120, 1132, 1185, 1246, 1256, 1340, 1368, 1438, 1620, 1643, 1741, 1784, 1867, 1881, 1890, 1954, 2069, 2071, 2094, 2178, 2250, 2251, 2252, 2253, 2270, 2277, 2279, 2282, 2314, 2319, 2327, 2400, 2419, 2488, 2536, 2608, 2611, 2612, 2617, 2658, 2676, 2685, 2751, 2766, 2768, 2799, 2816, 2846, 2944, 2947, 2952, 2961, 3063, 3064, 3067, 3099, 3160, 3191, 3243, 3269, 3275, 3282, 3288, 3334, 3337, 3339, 3365, 3378, 3413, 3414, 3449, 3452, 3460, 3496, 3551, 3606, 3616, 3617, 3619, 3627, 3650, 3696, 3753, 3764, 3765, 3774, 3776, 3781, 3784, 3786, 3790, 3797, 3812, 3824, 3827, 3830, 3833, 3836, 3855, 3856, 3877, 3990, 4018, 4032, 4035, 4085, 4103, 4130, 4141, 4166, 4176, 4212, 4225, 4238, 4259, 4287, 4290, 4300, 4339, 4349, 4391, 4418, 4419, 4422, 4424, 4431, 4486, 4487, 4573, 4574, 4577, 4630, 4631, 4648, 4649, 4655, 4677, 4856, 4971, 4976]

index_remove = list(dict.fromkeys(index_remove))

print(len(index_remove))

content_index_ten = [i for j, i in enumerate(content_ten) if j not in index_remove]

review_index_ten = [i for j, i in enumerate(review_ten) if j not in index_remove]

print(len(content_index_ten))

print(len(review_index_ten))

# The length of this content_one (4669) should now be extracted from souplist!

In [None]:
# STEP 5

# Pickle the synopsis file for further use!

with open('D:\\GitHub-Thesis\\58,000 movies\\movie_ten\\content_index_ten_20012020.pkl', 'wb') as f:
    
    pickle.dump(content_index_ten, f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_ten\\review_index_ten_20012020.pkl', 'wb') as f:
    
    pickle.dump(review_index_ten, f)

In [None]:
# STEP 6

with open('D:\\GitHub-Thesis\\58,000 movies\\movie_ten\\content_index_ten_20012020.pkl', 'rb') as f:
    
    content_index_ten = pickle.load(f)

print("Number of URLs: {}".format(len(content_index_ten)))

content_souplist_ten = []

for i in tqdm_notebook(content_index_ten):
    
    content_souplist_ten.append(BeautifulSoup(i.text))

print("Number of souplists: {}".format(len(content_souplist_ten)))

In [None]:
# STEP 7

# Field 1: Extract plot summary

# souplist_three = souplist_three[0:113] 

myfield_plot = []
plot_summary = []
index_to_remove_no_plot = []

[myfield_plot.append(i.find_all('div', {'class':'plot_summary'})) for i in tqdm_notebook(content_souplist_ten)]

[[[plot_summary.append(y.text) for y in x.find_all('div', {'class':'summary_text'})] for x in i] if len(i) !=0 else index_to_remove_no_plot.append(myfield_plot.index(i)) for i in myfield_plot]
      
print("Length of Plot Summary list: {}".format(len(plot_summary)))
print("Length of the list with Movies that don't have plot summary: {}".format(len(index_to_remove_no_plot)))
if len(index_to_remove_no_plot) == 0:
    print("None of the movie miss plot")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_plot))

#------------------------------------------------------------------------------------------------

# Field 2: Extract IMDB rating

# myfield_rating = []
# ratings = []
# index_to_remove_no_rating = []

# [myfield_rating.append(i.find_all('div', {'class':'ratingValue'})) for i in tqdm_notebook(content_souplist_three)]

# [[[ratings.append(y.text) for y in x.find_all('span', {'itemprop':'ratingValue'})] for x in i] for i in myfield_rating]

# index_to_remove_no_rating = [i for i,x in enumerate(myfield_rating) if not x]

# print("Length of Ratings list: {}".format(len(ratings)))
# print("Length of the list with Movies that are not rated: {}".format(len(index_to_remove_no_rating)))
# if len(index_to_remove_no_rating) == 0:
#     print("None of the movie miss ratings")
# else:
#     print("Indexes to remove: {}".format(index_to_remove_no_rating))

#------------------------------------------------------------------------------------------------

# Field 3: Extract Actors

myfield_cast = []
ratings = []
phase_two = []
phase_three = []
actors_list = []
index_to_remove_no_actors = []

[myfield_cast.append(i.find_all('table', {'class':'cast_list'})) for i in tqdm_notebook(content_souplist_ten)]
    
r_one = re.compile(".*name")

[[phase_two.append(j.find_all('a', {'href':r_one})) for j in i] for i in myfield_cast]

[phase_three.append(phase_two[i][1::2]) for i in range(len(phase_two))]

[actors_list.append(list(map(lambda x: x.text.strip(' ').replace('\n', ''), actors))) for actors in phase_three]            
    
index_to_remove_no_actors = [i for i,x in enumerate(myfield_cast) if not x]
        
print("Length of Actors list: {}".format(len(actors_list)))
print("Length of the list with Movies that don't have actors: {}".format(len(index_to_remove_no_actors)))
if len(index_to_remove_no_actors) == 0:
    print("None of the movie miss actors")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_actors))

#------------------------------------------------------------------------------------------------

# Field 4: Extract Director Name

myfield_director = []
director_name = []
index_to_remove_no_directors = []

[myfield_director.append(i.find_all('div', {'class':'plot_summary'})) for i in tqdm_notebook(content_souplist_ten)]

r_name = re.compile(".*name")

[[director_name.append(j.find_all('a', {'href':r_name})) for j in i] for i in myfield_director]
    
director_names = [item[0].text for item in director_name]

index_to_remove_no_directors = [i for i,x in enumerate(myfield_director) if not x]

print("Length of Directors list: {}".format(len(director_names)))
print("Length of the list with Movies that don't have directors: {}".format(len(index_to_remove_no_directors)))
if len(index_to_remove_no_directors) == 0:
    print("None of the movie miss directors")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_directors))

In [None]:
# STEP 8

#------------------------------------------------------------------------------------------------

myfield_rating = []
ratings = []
index_to_remove_no_rating = []

[myfield_rating.append(i.find_all('div', {'class':'ratingValue'})) for i in tqdm_notebook(content_souplist_ten)]

[[[ratings.append(y.text) for y in x.find_all('span', {'itemprop':'ratingValue'})] for x in i] for i in myfield_rating]

index_to_remove_no_rating = [i for i,x in enumerate(myfield_rating) if not x]

print("Length of Ratings list: {}".format(len(ratings)))
print("Length of the list with Movies that are not rated: {}".format(len(index_to_remove_no_rating)))
if len(index_to_remove_no_rating) == 0:
    print("None of the movie miss ratings")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_rating))

In [None]:
# STEP 9

# Pickle the requests file for further use!

with open('D:\\GitHub-Thesis\\58,000 movies\\movie_ten\\plot_ten_20012020.pkl', 'wb') as f:
    
    pickle.dump(plot_summary, f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_ten\\actors_ten_20012020.pkl', 'wb') as f:
    
    pickle.dump(actors_list, f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_ten\\director_ten_20012020.pkl', 'wb') as f:
    
    pickle.dump(director_names, f)

In [None]:
# STEP 10

with open('D:\\GitHub-Thesis\\58,000 movies\\movie_ten\\rating_ten_20012020.pkl', 'wb') as f:
    
    pickle.dump(ratings, f)

In [None]:
# STEP 11

with open('D:\\GitHub-Thesis\\58,000 movies\\movie_ten\\review_index_ten_20012020.pkl', 'rb') as f:
    
    review_ten = pickle.load(f)
    
print("Number of URLs: {}".format(len(review_ten)))

review_souplist_ten = []

for i in tqdm_notebook(review_ten):
    
    review_souplist_ten.append(BeautifulSoup(i.text))

print("Number of review tags: {}".format(len(review_souplist_ten)))

In [None]:
# STEP 12

# Field 6: Extract Movie Reviews

myfield_review_step_one = []
myfield_review_step_two = []
myfield_review_step_three = []

[myfield_review_step_one.append(i.find_all('div', {'class':'lister-list'})) for i in tqdm_notebook(review_souplist_ten)]

[[myfield_review_step_two.append(j.find_all('div', {'class':'text show-more__control'})) for j in i] for i in myfield_review_step_one]

[myfield_review_step_three.append(list(map(lambda x: x.text, reviews))) for reviews in myfield_review_step_two]

index_to_remove_no_review = [i for i,x in enumerate(myfield_review_step_one) if not x]

print("Length of Reviews list: {}".format(len(myfield_review_step_three)))
print("Length of the list with Movies that don't have reviews: {}".format(len(index_to_remove_no_review)))
if len(index_to_remove_no_review) == 0:
    print("None of the movies miss reviews")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_review))

print(len([i for i,x in enumerate(myfield_review_step_one) if not x]))

print([i for i,x in enumerate(myfield_review_step_two) if not x])

print([i for i,x in enumerate(myfield_review_step_three) if not x])

In [None]:
data_ten = pd.read_pickle("D:\\GitHub-Thesis\\58,000 movies\\movie_ten\\dataset_ten_final_20012020.pkl")

print(data_ten.shape)

print(len(myfield_review_step_three))

print(len([i for i,x in enumerate(myfield_review_step_three) if not x]))

In [None]:
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_ten\\reviews_ten_24012020.pkl', 'wb') as f:
    
    pickle.dump(myfield_review_step_three, f)

In [None]:
index_remove = [2162, 2170, 2222, 2243, 2253, 2264, 2309, 2316, 2330, 2332, 2374, 2381, 2390, 2398, 2457, 2462, 2473, 2481, 2655, 2678, 2719, 2776, 2926, 2937, 2969, 2970, 2974, 2979, 2993, 3019, 3028, 3072, 3080, 3126, 3201, 3296, 3318, 3334, 3345, 3360, 3361, 3398, 3431, 3450, 3483, 3509, 3512, 3527, 3568, 3572, 3577, 3590, 3609, 3682, 3693, 3695, 3702, 3719, 3726, 3736, 3764, 3765, 3768, 3791, 3802, 3829, 3845, 3895, 3912, 3932, 3939, 3951, 3978, 3983, 3991, 4009, 4047, 4091, 4114, 4125, 4146, 4151, 4156, 4247, 4253, 4255, 4274, 4278, 4304, 4306, 4313, 4315, 4320, 4372, 4381, 4387, 4393, 4412, 4431, 4441, 4449, 4486, 4503, 4570, 4579, 4587, 4597, 4613, 4621, 4638]

myfield_review_step_three = [i for j, i in enumerate(myfield_review_step_three) if j not in index_remove]

print(len(myfield_review_step_three))

data_four.reviews = myfield_review_step_three

print(data_four.head())

In [None]:
# STEP 13

index_remove = [18, 147, 184, 224, 252, 414, 466, 517, 567, 574, 597, 626, 661, 795, 806, 814, 821, 884, 904, 905, 927, 
                940, 973, 989, 1032, 1047, 1063, 1068, 1136, 1143, 1174, 1281, 1308, 1310, 1312, 1317, 1321, 1322, 1323, 1325, 1326, 1337, 
                1339, 1408, 1409, 1426, 1437, 1654, 1685, 1703, 1772, 1777, 1784, 1807, 1813, 1850, 1882, 1893, 1931, 1938, 1963, 1964, 2025, 
                2033, 2099, 2133, 2169, 2202, 2247, 2248, 2337, 2419, 2468, 2485, 2510, 2574, 2575, 2576, 2633, 2668, 2797, 2847, 2868, 2888, 
                2917, 2923, 2951, 2992, 3009, 3051, 3094, 3153, 3154, 3194, 3231, 3304, 3306, 3350, 3485, 3606, 3686, 3713, 3736, 3804, 3818,
                3996, 4055, 4056, 4093, 4098, 4119, 4335, 4385, 4400, 4405, 4418, 4422, 4431, 4439, 4440, 4441, 4442, 4443, 4447, 4449, 4454, 
                4463, 4467, 4469, 4470, 4471, 4472, 4473, 4475, 4476, 4477, 4479, 4484, 4493, 4495, 4496, 4497, 4500, 4501, 4503, 4505, 4507, 
                4508, 4509, 4510, 4512, 4514, 4515, 4622, 4652, 4715, 4718, 4719, 4720, 4721, 4722, 4723, 4724, 4725, 4770, 4873, 4874, 4881, 
                4887, 4920, 4943, 4976, 4986, 4987,
                585, 942, 1229, 1340, 1954, 2270, 2282, 2296, 2816, 3188, 4034, 4395, 4431, 4439, 4440, 4441, 4449, 4467, 4503,
                10, 110, 145, 372, 505, 600, 621, 672, 729, 731, 774, 848, 914, 932, 1085, 1107, 1111, 1120, 1132, 1185, 1246, 1256, 1340, 1368, 1438, 1620, 1643, 1741, 1784, 1867, 1881, 1890, 1954, 2069, 2071, 2094, 2178, 2250, 2251, 2252, 2253, 2270, 2277, 2279, 2282, 2314, 2319, 2327, 2400, 2419, 2488, 2536, 2608, 2611, 2612, 2617, 2658, 2676, 2685, 2751, 2766, 2768, 2799, 2816, 2846, 2944, 2947, 2952, 2961, 3063, 3064, 3067, 3099, 3160, 3191, 3243, 3269, 3275, 3282, 3288, 3334, 3337, 3339, 3365, 3378, 3413, 3414, 3449, 3452, 3460, 3496, 3551, 3606, 3616, 3617, 3619, 3627, 3650, 3696, 3753, 3764, 3765, 3774, 3776, 3781, 3784, 3786, 3790, 3797, 3812, 3824, 3827, 3830, 3833, 3836, 3855, 3856, 3877, 3990, 4018, 4032, 4035, 4085, 4103, 4130, 4141, 4166, 4176, 4212, 4225, 4238, 4259, 4287, 4290, 4300, 4339, 4349, 4391, 4418, 4419, 4422, 4424, 4431, 4486, 4487, 4573, 4574, 4577, 4630, 4631, 4648, 4649, 4655, 4677, 4856, 4971, 4976]

index_remove = list(dict.fromkeys(index_remove))

dataset = pd.read_pickle("D:\\GitHub-Thesis\\dataset_58,000_14012020_latest_version.pkl")

dataset_ten = dataset.iloc[45000:50000].reset_index(drop=True)

dataset_ten = dataset_ten[~dataset_ten.index.isin(index_remove)]

dataset_ten.shape

In [37]:
# STEP 14

with open('D:\\GitHub-Thesis\\58,000 movies\\movie_ten\\plot_ten_20012020.pkl', 'rb') as f:
    
    plot = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_ten\\rating_ten_20012020.pkl', 'rb') as f:
    
    rating = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_ten\\actors_ten_20012020.pkl', 'rb') as f:
    
    actors = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_ten\\director_ten_20012020.pkl', 'rb') as f:
    
    director = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_ten\\reviews_ten_24012020.pkl', 'rb') as f:
    
    reviews = pickle.load(f)
    
assert len(plot) == len(rating) == len(actors) == len(director) == len(reviews)

In [38]:
[i for i,x in enumerate(rating) if not x]

[]

In [None]:
# STEP 15

dataset_ten['actors'] = actors

dataset_ten['plot'] = plot

dataset_ten['imdb_rating'] = rating

dataset_ten['director'] = director

dataset_ten['reviews'] = reviews

dataset_ten = dataset_ten.drop(["movieId", "imdbId", "synopsis_url"], axis=1)

dataset_ten.iloc[253]

In [None]:
dataset_ten.iloc[253]

In [None]:
# STEP 16

index_remove = [5, 19, 21, 25, 26, 27, 31, 32, 56, 102, 124, 126, 131, 143, 144, 155, 250, 253, 274, 303, 324, 325, 327, 332, 334, 351, 352, 353, 354, 361, 366, 392, 417, 433, 434, 441, 447, 452, 453, 490, 504, 506, 507, 518, 521, 527, 561, 571, 601, 602, 615, 619, 629, 632, 642, 663, 689, 703, 705, 709, 710, 711, 726, 727, 731, 732, 739, 765, 767, 771, 772, 788, 789, 795, 822, 825, 828, 838, 845, 889, 909, 911, 916, 932, 934, 985, 990, 992, 996, 1003, 1007, 1010, 1016, 1030, 1031, 1047, 1063, 1064, 1065, 1080, 1083, 1084, 1093, 1100, 1102, 1107, 1110, 1112, 1123, 1133, 1135, 1140, 1147, 1148, 1149, 1166, 1167, 1179, 1190, 1201, 1213, 1223, 1225, 1232, 1240, 1245, 1249, 1277, 1279, 1283, 1285, 1289, 1290, 1295, 1299, 1315, 1331, 1336, 1343, 1370, 1388, 1389, 1393, 1453, 1457, 1460, 1467, 1468, 1473, 1474, 1477, 1496, 1498, 1514, 1516, 1557, 1574, 1594, 1595, 1607, 1611, 1613, 1626, 1628, 1634, 1635, 1668, 1676, 1679, 1682, 1684, 1685, 1689, 1690, 1698, 1710, 1713, 1714, 1731, 1750, 1764, 1769, 1773, 1774, 1778, 1782, 1792, 1810, 1818, 1829, 1847, 1848, 1849, 1854, 1869, 1875, 1877, 1885, 1887, 1900, 1904, 1917, 1928, 1929, 1939, 1940, 1942, 1952, 1960, 1963, 1965, 1986, 1996, 2002, 2006, 2010, 2015, 2024, 2032, 2033, 2045, 2047, 2048, 2049, 2051, 2054, 2055, 2058, 2059, 2061, 2065, 2066, 2071, 2072, 2075, 2079, 2081, 2082, 2083, 2084, 2087, 2089, 2097, 2101, 2105, 2107, 2108, 2144, 2147, 2153, 2154, 2155, 2156, 2157, 2161, 2164, 2168, 2169, 2172, 2174, 2177, 2180, 2184, 2189, 2190, 2191, 2193, 2196, 2198, 2199, 2235, 2265, 2282, 2284, 2290, 2291, 2298, 2321, 2326, 2327, 2330, 2345, 2357, 2358, 2361, 2363, 2374, 2377, 2382, 2389, 2403, 2407, 2428, 2434, 2435, 2474, 2479, 2491, 2501, 2525, 2539, 2562, 2572, 2580, 2601, 2603, 2607, 2616, 2661, 2670, 2697, 2707, 2708, 2718, 2737, 2765, 2767, 2769, 2778, 2781, 2783, 2793, 2802, 2812, 2828, 2834, 2838, 2866, 2896, 2913, 2920, 2926, 2932, 2934, 2941, 2944, 2946, 2950, 2963, 2964, 2966, 2970, 2974, 2976, 2977, 2991, 3003, 3009, 3017, 3022, 3024, 3025, 3026, 3028, 3031, 3044, 3053, 3054, 3064, 3073, 3074, 3089, 3095, 3110, 3116, 3141, 3147, 3153, 3156, 3162, 3182, 3188, 3194, 3200, 3202, 3204, 3206, 3210, 3223, 3233, 3235, 3243, 3256, 3262, 3264, 3284, 3285, 3286, 3287, 3294, 3295, 3302, 3349, 3376, 3414, 3416, 3422, 3424, 3456, 3460, 3463, 3465, 3467, 3468, 3476, 3478, 3484, 3487, 3493, 3498, 3499, 3501, 3513, 3517, 3519, 3523, 3529, 3539, 3540, 3548, 3557, 3564, 3579, 3580, 3583, 3585, 3601, 3602, 3612, 3616, 3619, 3625, 3635, 3636, 3670, 3759, 3764, 3800, 3801, 3805, 3820, 3824, 3835, 3838, 3852, 3858, 3861, 3878, 3882, 3893, 3894, 3945, 3963, 3965, 3966, 3968, 3969, 3973, 3974, 3975, 3977, 3978, 3979, 3981, 3982, 3986, 3996, 4002, 4013, 4026, 4039, 4040, 4042, 4043, 4048, 4051, 4065, 4066, 4079, 4083, 4084, 4096, 4117, 4163, 4217, 4235, 4237, 4258, 4266, 4272, 4275, 4277, 4289, 4290, 4291, 4299, 4306, 4319, 4336, 4337, 4338, 4339, 4342, 4343, 4345, 4367, 4368, 4392, 4394, 4400, 4410, 4414, 4425, 4444, 4447, 4455, 4457, 4463, 4468, 4470, 4472, 4475, 4480, 4485, 4505, 4506, 4510, 4519, 4539, 4541, 4553, 4570, 4578, 4602, 4607, 4613, 4622, 4624, 4626, 4629, 4645, 4649, 4650, 4653, 4655, 4656, 4658, 4660, 4661, 4663, 4666, 4667]

dataset_ten = dataset_ten[~dataset_ten.index.isin(index_remove)]

In [None]:
dataset_ten = dataset_ten[dataset_ten.astype(str)['reviews'] != '[]']

In [None]:
# dataset_ten.to_pickle("D:\\GitHub-Thesis\\58,000 movies\\movie_ten\\dataset_ten_final_20012020.pkl") old

# dataset_ten.to_pickle("D:\\GitHub-Thesis\\58,000 movies\\movie_ten\\dataset_ten_final_24012020.pkl") old

dataset_ten.to_pickle("D:\\GitHub-Thesis\\58,000 movies\\movie_ten\\dataset_ten_final_25012020.pkl")

In [None]:
# STEP 17

dataset_ten.shape

In [None]:
dataset_ten.shape

#### 11) 55000 movies

In [None]:
content_eleven = []

with open('D:\\GitHub-Thesis\\movie_content_url\\data_eleven_14012020.pkl', 'rb') as f:
    
    content_eleven = pickle.load(f)

print("Number of URLs: {}".format(len(content_eleven)))

content_souplist_eleven = []

for i in tqdm_notebook(content_eleven):
    
    content_souplist_eleven.append(BeautifulSoup(i.text))

print("Number of souplists: {}".format(len(content_souplist_eleven)))

In [None]:
# Field 1: Extract plot summary

# souplist_three = souplist_three[0:113] 

myfield_plot = []
plot_summary = []
index_to_remove_no_plot = []

[myfield_plot.append(i.find_all('div', {'class':'plot_summary'})) for i in tqdm_notebook(content_souplist_eleven)]

[[[plot_summary.append(y.text) for y in x.find_all('div', {'class':'summary_text'})] for x in i] if len(i) !=0 else index_to_remove_no_plot.append(myfield_plot.index(i)) for i in myfield_plot]
      
print("Length of Plot Summary list: {}".format(len(plot_summary)))
print("Length of the list with Movies that don't have plot summary: {}".format(len(index_to_remove_no_plot)))
if len(index_to_remove_no_plot) == 0:
    print("None of the movie miss plot")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_plot))

#------------------------------------------------------------------------------------------------

# Field 2: Extract IMDB rating

myfield_rating = []
ratings = []
index_to_remove_no_rating = []

[myfield_rating.append(i.find_all('div', {'class':'ratingValue'})) for i in tqdm_notebook(content_souplist_eleven)]

[[[ratings.append(y.text) for y in x.find_all('span', {'itemprop':'ratingValue'})] for x in i] for i in myfield_rating]

index_to_remove_no_rating = [i for i,x in enumerate(myfield_rating) if not x]

print("Length of Ratings list: {}".format(len(ratings)))
print("Length of the list with Movies that are not rated: {}".format(len(index_to_remove_no_rating)))
if len(index_to_remove_no_rating) == 0:
    print("None of the movie miss ratings")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_rating))

#------------------------------------------------------------------------------------------------

# Field 3: Extract Actors

myfield_cast = []
ratings = []
phase_two = []
phase_three = []
actors_list = []
index_to_remove_no_actors = []

[myfield_cast.append(i.find_all('table', {'class':'cast_list'})) for i in tqdm_notebook(content_souplist_eleven)]
    
r_one = re.compile(".*name")

[[phase_two.append(j.find_all('a', {'href':r_one})) for j in i] for i in myfield_cast]

[phase_three.append(phase_two[i][1::2]) for i in range(len(phase_two))]

[actors_list.append(list(map(lambda x: x.text.strip(' ').replace('\n', ''), actors))) for actors in phase_three]            
    
index_to_remove_no_actors = [i for i,x in enumerate(myfield_cast) if not x]
        
print("Length of Actors list: {}".format(len(actors_list)))
print("Length of the list with Movies that don't have actors: {}".format(len(index_to_remove_no_actors)))
if len(index_to_remove_no_actors) == 0:
    print("None of the movie miss actors")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_actors))

#------------------------------------------------------------------------------------------------

# Field 4: Extract Director Name

myfield_director = []
director_name = []
index_to_remove_no_directors = []

[myfield_director.append(i.find_all('div', {'class':'plot_summary'})) for i in tqdm_notebook(content_souplist_eleven)]

r_name = re.compile(".*name")

[[director_name.append(j.find_all('a', {'href':r_name})) for j in i] for i in myfield_director]
    
director_names = [item[0].text for item in director_name if len(item)!=0]

index_to_remove_no_directors = [i for i,x in enumerate(myfield_director) if not x]

print("Length of Directors list: {}".format(len(director_names)))
print("Length of the list with Movies that don't have directors: {}".format(len(index_to_remove_no_directors)))
if len(index_to_remove_no_directors) == 0:
    print("None of the movie miss directors")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_directors))

In [None]:
[i for i,x in enumerate(myfield_plot) if not x]

In [None]:
dataset = pd.read_pickle("D:\\GitHub-Thesis\\dataset_58,000_14012020_latest_version.pkl")

In [None]:
dataset_two = dataset.iloc[50000:55000].reset_index()

In [None]:
dataset_two.iloc[874]

In [None]:
#------------------------------------------------------------------------------------------------

# Pickle the requests file for further use!

with open('plot_eleven_18012020.pkl', 'wb') as f:
    
    pickle.dump(plot_summary, f)
    
with open('rating_eleven_18012020.pkl', 'wb') as f:
    
    pickle.dump(ratings, f)
    
with open('actors_eleven_18012020.pkl', 'wb') as f:
    
    pickle.dump(actors_list, f)
    
with open('director_eleven_18012020.pkl', 'wb') as f:
    
    pickle.dump(director_names, f)
    
# Indexes to remove:  [4, 5, 15, 20, 24, 37, 105, 121, 128, 195, 236, 271, 321, 322, 323, 324, 361, 366, 375, 376, 386, 414, 
# 415, 416, 455, 504, 528, 532, 612, 631, 632, 633, 634, 635, 724, 752, 874, 963, 982, 1031, 1073, 1119, 1145, 1164, 1178, 
# 1226, 1227, 1228, 1229, 1230, 1273, 1282, 1295, 1296, 1330, 1348, 1363, 1464, 1470, 1473, 1837, 1838, 1841, 1928, 2068, 2073, 
# 2074, 2076, 2077, 2091, 2092, 2094, 2107, 2123, 2149, 2180, 2194, 2243, 2268, 2277, 2281, 2283, 2312, 2315, 2321, 2327, 2501, 
# 2534, 2593, 2636, 2698, 2738, 2759, 2761, 2864, 2875, 3008, 3021, 3022, 3027, 3031, 3034, 3046, 3047, 3051, 3056, 3058, 3061, 
# 3071, 3072, 3073, 3074, 3075, 3076, 3077, 3080, 3081, 3084, 3087, 3089, 3090, 3091, 3092, 3093, 3094, 3095, 3096, 3119, 3121, 
# 3134, 3145, 3190, 3226, 3227, 3228, 3229, 3230, 3231, 3233, 3285, 3316, 3332, 3458, 3519, 3520, 3521, 3527, 3586, 3621, 3666, 
# 3739, 3759, 3770, 3787, 3809, 3832, 3842, 3938, 3942, 3960, 4010, 4039, 4048, 4081, 4083, 4120, 4129, 4190, 4222, 4223, 4227, 
# 4284, 4306, 4317, 4335, 4429, 4454, 4464, 4522, 4531, 4596, 4629, 4638, 4642, 4671, 4697, 4698, 4737, 4757, 4788, 4795, 4920, 
# 4921, 4926, 4927, 4928, 4933, 4963, 4964, 4976, 4977, 4995] (no actors)

# Indexes to remove: [874, 1031, 1079, 1273, 1932, 2177, 2269, 2283, 2336, 2419, 3455, 3581, 3740, 4585, 4834] (not rated)

# Indexes to remove: [612, 874, 1031, 2281, 2698, 3008] )no directors

# Indexes to remove: [874] -> it has class = credit_summary_item instead of class = summary_text

In [None]:
synopsis_eleven = []

with open('D:\\GitHub-Thesis\\synopsis_url\\synopsis_eleven_14012020.pkl', 'rb') as f:
    
    synopsis_eleven = pickle.load(f)

print("Number of URLs: {}".format(len(synopsis_eleven)))

synopsis_souplist_eleven = []

for i in tqdm_notebook(synopsis_eleven):
    
    synopsis_souplist_eleven.append(BeautifulSoup(i.text))

print("Number of souplists: {}".format(len(synopsis_souplist_eleven)))

In [None]:
#------------------------------------------------------------------------------------------------

# Field 5: Extract Plot Synopsis

synopsis_step_one = []
synopsis_step_two = []
synopsis_step_three = []

[synopsis_step_one.append(i.find_all('ul', {'class':'ipl-zebra-list', 'id':'plot-synopsis-content'})) for i in tqdm_notebook(synopsis_souplist_eleven)]

[[synopsis_step_two.append(j.find_all('li', {'class':'ipl-zebra-list__item'})) for j in i] for i in synopsis_step_one]

[[synopsis_step_three.append(j.text.strip(' ').replace('\n', '').replace('\\', '')) for j in i] for i in synopsis_step_two]

index_to_remove_no_synopsis = [i for i,x in enumerate(synopsis_step_one) if not x]

print("Length of Synopsis list: {}".format(len(synopsis_step_three)))
print("Length of the list with Movies that don't have synopsis: {}".format(len(index_to_remove_no_synopsis)))
if len(index_to_remove_no_synopsis) == 0:
    print("None of the movies miss a synopsis")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_synopsis))

In [None]:
# Pickle the synopsis file for further use!

with open('synopsis_eleven_18012020.pkl', 'wb') as f:
    
    pickle.dump(synopsis_step_three, f)

# Indexes to remove: 0

In [None]:
review_eleven = []

with open('D:\\GitHub-Thesis\\reviews_url\\review_eleven_16012020.pkl', 'rb') as f:
    
    review_eleven = pickle.load(f)

print("Number of URLs: {}".format(len(review_eleven)))

review_souplist_eleven = []

for i in tqdm_notebook(review_eleven):
    
    review_souplist_eleven.append(BeautifulSoup(i.text))

print("Number of review tags: {}".format(len(review_souplist_eleven)))

In [None]:
#------------------------------------------------------------------------------------------------

# Field 6: Extract Movie Reviews

myfield_review_step_one = []
myfield_review_step_two = []
myfield_review_step_three = []

[myfield_review_step_one.append(i.find_all('div', {'class':'lister-list'})) for i in tqdm_notebook(review_souplist_eleven)]

[[myfield_review_step_two.append(j.find_all('div', {'class':'text show-more__control'})) for j in i] for i in myfield_review_step_one]

[myfield_review_step_three.append(list(map(lambda x: x.text, reviews))) for reviews in myfield_review_step_two]

index_to_remove_no_review = [i for i,x in enumerate(myfield_review_step_one) if not x]

print("Length of Reviews list: {}".format(len(myfield_review_step_three)))
print("Length of the list with Movies that don't have reviews: {}".format(len(index_to_remove_no_review)))
if len(index_to_remove_no_review) == 0:
    print("None of the movies miss reviews")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_review))

In [None]:
# Pickle the reviews file for further use!

with open('reviews_eleven_18012020.pkl', 'wb') as f:
    
    pickle.dump(myfield_review_step_three, f)
    
# Indexes to remove: [7, 8, 19, 22,27,49,63,89,99,103,109,123,127,130,141, 146, 148, 149, 164, 166, 171, 191, 193, 199, 209, 216,
# 217, 220, 221, 233, 236, 237, 241, 242, 244, 247, 249, 260, 271, 292, 304, 311, 314, 322, 323, 336, 340, 341, 342, 344, 348,
# 349,354, 355, 358, 360, 365, 375,376, 382, 397, 410, 414, 416, 423, 425, 431, 433, 436, 439, 451, 455, 456, 464, 476, 487, 521,
# 526, 528, 534, 553, 564, 574, 585, 592, 593, 595, 612, 623, 624, 638, 671, 680, 699, 704, 724, 735, 747, 751, 762, 767, 784,
# 793, 799, 805, 807, 828, 847, 867, 874, 880, 892, 896, 911, 913, 914, 949, 956, 960, 965, 968, 972, 975, 977, 978, 981, 982, 
# 987, 994, 998, 1013, 1017, 1029, 1031, 1032, 1033, 1035, 1038, 1048, 1049, 1055, 1057, 1069, 1071, 1072, 1075, 1079, 1084,
# 1085, 1107 1109, 1112,1116, 1131, 1162, 1171, 1175, 1176, 1200, 1228, 1229, 1231, 1235, 1238, 1239, 1243, 1257, 1258, 1259, 
# 1260, 1269, 1273, 1282, 1284, 1286, 1288, 1290, 1295, 1302, 1330, 1332, 1345, 1355, 1358, 1369, 1377, 1389, 1395, 1421, 1426,
# 1434, 1435, 1440, 1454, 1456, 1463, 1485, 1487, 1493, 1495, 1507, 1512, 1513, 1514, 1515, 1516, 1517, 1518, 1519, 1522, 1526,
# 1531, 1540, 1558, 1563, 1564, 1569, 1611, 1615, 1639, 1644, 1654, 1656, 1660, 1710, 1714, 1715, 1726, 1739, 1740, 1746, 1747,
#  1749,
#  1761,
#  1766,
#  1777,
#  1778,
#  1796,
#  1815,
#  1838,
#  1844,
#  1873,
#  1874,
#  1875,
#  1876,
#  1895,
#  1909,
#  1910,
#  1911,
#  1912,
#  1913,
#  1914,
#  1915,
#  1916,
#  1917,
#  1918,
#  1920,
#  1921,
#  1922,
#  1930,
#  1932,
#  1937,
#  1938,
#  1960,
#  1975,
#  1976,
#  1978,
#  1982,
#  1997,
#  2019,
#  2022,
#  2055,
#  2075,
#  2076,
#  2078,
#  2082,
#  2085,
#  2086,
#  2088,
#  2089,
#  2095,
#  2098,
#  2104,
#  2123,
#  2126,
#  2127,
#  2144,
#  2147,
#  2155,
#  2162,
#  2165,
#  2166,
#  2168,
#  2173,
#  2177,
#  2179,
#  2180,
#  2184,
#  2185,
#  2186,
#  2198,
#  2199,
#  2209,
#  2217,
#  2240,
#  2241,
#  2251,
#  2254,
#  2261,
#  2268,
#  2269,
#  2273,
#  2275,
#  2276,
#  2281,
#  2283,
#  2284,
#  2307,
#  2308,
#  2312,
#  2320,
#  2330,
#  2336,
#  2358,
#  2378,
#  2398,
#  2413,
#  2419,
#  2449,
#  2487,
#  2488,
#  2493,
#  2529,
#  2533,
#  2534,
#  2539,
#  2549,
#  2553,
#  2555,
#  2575,
#  2593,
#  2608,
#  2615,
#  2621,
#  2623,
#  2625,
#  2626,
#  2633,
#  2634,
#  2635,
#  2636,
#  2638,
#  2639,
#  2641,
#  2644,
#  2646,
#  2647,
#  2651,
#  2655,
#  2667,
#  2669,
#  2670,
#  2673,
#  2674,
#  2676,
#  2677,
#  2683,
#  2684,
#  2685,
#  2694,
#  2705,
#  2706,
#  2712,
#  2714,
#  2722,
#  2724,
#  2729,
#  2731,
#  2733,
#  2734,
#  2739,
#  2747,
#  2759,
#  2761,
#  2767,
#  2768,
#  2773,
#  2774,
#  2775,
#  2776,
#  2781,
#  2784,
#  2785,
#  2795,
#  2799,
#  2803,
#  2804,
#  2808,
#  2811,
#  2813,
#  2822,
#  2960,
#  3008,
#  3122,
#  3125,
#  3128,
#  3130,
#  3132,
#  3134,
#  3139,
#  3145,
#  3150,
#  3159,
#  3163,
#  3168,
#  3173,
#  3175,
#  3176,
#  3178,
#  3187,
#  3188,
#  3192,
#  3193,
#  3197,
#  3210,
#  3220,
#  3221,
#  3222,
#  3225,
#  3242,
#  3313,
#  3320,
#  3321,
#  3329,
#  3331,
#  3343,
#  3346,
#  3347,
#  3348,
#  3350,
#  3352,
#  3355,
#  3370,
#  3371,
#  3387,
#  3390,
#  3413,
#  3416,
#  3423,
#  3426,
#  3430,
#  3434,
#  3438,
#  3445,
#  3448,
#  3455,
#  3457,
#  3458,
#  3465,
#  3466,
#  3484,
#  3486,
#  3487,
#  3502,
#  3519,
#  3520,
#  3521,
#  3524,
#  3529,
#  3530,
#  3534,
#  3550,
#  3558,
#  3559,
#  3567,
#  3571,
#  3574,
#  3581,
#  3587,
#  3588,
#  3590,
#  3593,
#  3599,
#  3606,
#  3611,
#  3614,
#  3640,
#  3660,
#  3663,
#  3665,
#  3670,
#  3679,
#  3680,
#  3684,
#  3690,
#  3692,
#  3697,
#  3700,
#  3703,
#  3713,
#  3717,
#  3725,
#  3727,
#  3731,
#  3740,
#  3751,
#  3759,
#  3760,
#  3768,
#  3779,
#  3785,
#  3817,
#  3831,
#  3873,
#  3874,
#  3875,
#  3876,
#  3879,
#  3883,
#  3886,
#  3895,
#  3897,
#  3898,
#  3902,
#  3910,
#  3922,
#  3934,
#  3937,
#  3938,
#  3948,
#  3952,
#  3954,
#  3966,
#  3973,
#  3983,
#  4010,
#  4015,
#  4023,
#  4024,
#  4026,
#  4036,
#  4037,
#  4039,
#  4053,
#  4076,
#  4085,
#  4096,
#  4121,
#  4125,
#  4140,
#  4143,
#  4168,
#  4178,
#  4190,
#  4201,
#  4206,
#  4213,
#  4233,
#  4251,
#  4256,
#  4257,
#  4275,
#  4284,
#  4286,
#  4290,
#  4291,
#  4317,
#  4320,
#  4330,
#  4342,
#  4343,
#  4359,
#  4361,
#  4366,
#  4398,
#  4399,
#  4405,
#  4413,
#  4415,
#  4416,
#  4420,
#  4421,
#  4422,
#  4423,
#  4427,
#  4428,
#  4430,
#  4433,
#  4438,
#  4440,
#  4443,
#  4446,
#  4448,
#  4455,
#  4470,
#  4472,
#  4474,
#  4475,
#  4476,
#  4477,
#  4478,
#  4480,
#  4503,
#  4512,
#  4532,
#  4536,
#  4540,
#  4543,
#  4546,
#  4547,
#  4550,
#  4552,
#  4573,
#  4574,
#  4585,
#  4586,
#  4588,
#  4589,
#  4595,
#  4613,
#  4617,
#  4621,
#  4624,
#  4625,
#  4629,
#  4635,
#  4638,
#  4643,
#  4647,
#  4648,
#  4652,
#  4657,
#  4667,
#  4669,
#  4676,
#  4677,
#  4683,
#  4691,
#  4693,
#  4697,
#  4730,
#  4732,
#  4733,
#  4738,
#  4751,
#  4763,
#  4764,
#  4777,
#  4781,
#  4782,
#  4783,
#  4786,
#  4816,
#  4825,
#  4843,
#  4847,
#  4864,
#  4872,
#  4885,
#  4901,
#  4914,
#  4916,
#  4918,
#  4920,
#  4922,
#  4924,
#  4927,
#  4933,
#  4934,
#  4941,
#  4951,
#  4960,
#  4961,
#  4964,
#  4987,
#  4990,
#  4991,
#  4996] #678 indexes

### - - - - - - - - -  - - - - - - - - -  - - - - - - - - -  - - - - - - - - -  - - - - - - - - -  - - - - - - - - - 

#### Dataframe creation based on the movie content extracted

In [None]:
# STEP 1

with open('D:\\GitHub-Thesis\\movie_content_url\\data_eleven_14012020.pkl', 'rb') as f:
    
    content_eleven = pickle.load(f)

print("Number of URLs: {}".format(len(content_eleven)))

with open('D:\\GitHub-Thesis\\reviews_url\\review_eleven_16012020.pkl', 'rb') as f:
    
    review_eleven = pickle.load(f)

print("Number of URLs: {}".format(len(review_eleven)))

In [None]:
# STEP 2

with open('D:\\GitHub-Thesis\\58,000 movies\\movie_eleven\\pre-indexed files\\plot_eleven_18012020.pkl', 'rb') as f:
    
    plot = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_eleven\\pre-indexed files\\rating_eleven_18012020.pkl', 'rb') as f:
    
    rating = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_eleven\\pre-indexed files\\actors_eleven_18012020.pkl', 'rb') as f:
    
    actors = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_eleven\\pre-indexed files\\director_eleven_18012020.pkl', 'rb') as f:
    
    director = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_eleven\\synopsis_eleven_18012020.pkl', 'rb') as f:
    
    synopsis = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_eleven\\reviews_eleven_18012020.pkl', 'rb') as f:
    
    reviews = pickle.load(f)
    
print(len(plot))
print(len(rating))
print(len(actors))
print(len(director))
print(len(synopsis))
print(len(reviews))

In [None]:
# STEP 3

matching_add_plot = [s for s, x in enumerate(plot) if "Add a Plot" in x]

print(matching_add_plot)

matching_add_synopsis = [s for s, x in enumerate(synopsis) if 'It looks like we don\'t have a Synopsis for this title yet. Be the first to contribute! Just click the "Edit page" button at the bottom of the page or learn more in the Synopsis submission guide.' in x]

print(matching_add_synopsis)

print(len(matching_add_synopsis))

In [None]:
# STEP 4

index_remove = [4, 5, 15, 20, 24, 37, 105, 121, 128, 195, 236, 271, 321, 322, 323, 324, 361, 366, 375, 376, 386, 414, 
                415, 416, 455, 504, 528, 532, 612, 631, 632, 633, 634, 635, 724, 752, 874, 963, 982, 1031, 1073, 1119, 1145, 1164, 1178, 
                1226, 1227, 1228, 1229, 1230, 1273, 1282, 1295, 1296, 1330, 1348, 1363, 1464, 1470, 1473, 1837, 1838, 1841, 1928, 2068, 2073, 
                2074, 2076, 2077, 2091, 2092, 2094, 2107, 2123, 2149, 2180, 2194, 2243, 2268, 2277, 2281, 2283, 2312, 2315, 2321, 2327, 2501, 
                2534, 2593, 2636, 2698, 2738, 2759, 2761, 2864, 2875, 3008, 3021, 3022, 3027, 3031, 3034, 3046, 3047, 3051, 3056, 3058, 3061, 
                3071, 3072, 3073, 3074, 3075, 3076, 3077, 3080, 3081, 3084, 3087, 3089, 3090, 3091, 3092, 3093, 3094, 3095, 3096, 3119, 3121, 
                3134, 3145, 3190, 3226, 3227, 3228, 3229, 3230, 3231, 3233, 3285, 3316, 3332, 3458, 3519, 3520, 3521, 3527, 3586, 3621, 3666, 
                3739, 3759, 3770, 3787, 3809, 3832, 3842, 3938, 3942, 3960, 4010, 4039, 4048, 4081, 4083, 4120, 4129, 4190, 4222, 4223, 4227, 
                4284, 4306, 4317, 4335, 4429, 4454, 4464, 4522, 4531, 4596, 4629, 4638, 4642, 4671, 4697, 4698, 4737, 4757, 4788, 4795, 4920, 
                4921, 4926, 4927, 4928, 4933, 4963, 4964, 4976, 4977, 4995,
                874, 1031, 1079, 1273, 1932, 2177, 2269, 2283, 2336, 2419, 3455, 3581, 3740, 4585, 4834,
                612, 874, 1031, 2281, 2698, 3008,
                14, 18, 22, 44, 66, 84, 146, 149, 166, 171, 207, 244, 312, 322, 323, 324, 353, 412, 433, 455, 456, 537, 586, 592, 593, 624, 635, 705, 735, 747, 843, 932, 937, 993, 1012, 1031, 1056, 1072, 1078, 1144, 1199, 1229, 1256, 1331, 1378, 1484, 1511, 1512, 1513, 1514, 1515, 1516, 1517, 1518, 1689, 1739, 1877, 1912, 1931, 1936, 1987, 2039, 2099, 2104, 2149, 2154, 2160, 2172, 2274, 2285, 2311, 2335, 2341, 2345, 2398, 2404, 2418, 2446, 2452, 2544, 2547, 2599, 2613, 2615, 2620, 2622, 2623, 2631, 2640, 2650, 2668, 2670, 2671, 2672, 2676, 2680, 2682, 2718, 2746, 2774, 2794, 3121, 3167, 3174, 3220, 3221, 3296, 3320, 3322, 3345, 3389, 3425, 3457, 3470, 3485, 3486, 3501, 3518, 3549, 3556, 3570, 3580, 3589, 3592, 3613, 3659, 3689, 3730, 3739, 3758, 3759, 3772, 3872, 3874, 3927, 3947, 3953, 4008, 4018, 4036, 4038, 4120, 4127, 4148, 4285, 4297, 4339, 4404, 4418, 4419, 4420, 4447, 4452, 4472, 4473, 4476, 4539, 4549, 4554, 4646, 4655, 4715, 4762, 4763, 4771, 4786, 4917, 4926, 4958, 4963, 4966, 4988]

index_remove = list(dict.fromkeys(index_remove))

print(len(index_remove))

content_index_eleven = [i for j, i in enumerate(content_eleven) if j not in index_remove]

review_index_eleven = [i for j, i in enumerate(review_eleven) if j not in index_remove]

print(len(content_index_eleven))

print(len(review_index_eleven))

# The length of this content_one (4629) should now be extracted from souplist!

In [None]:
# STEP 5

# Pickle the synopsis file for further use!

with open('D:\\GitHub-Thesis\\58,000 movies\\movie_eleven\\content_index_eleven_20012020.pkl', 'wb') as f:
    
    pickle.dump(content_index_eleven, f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_eleven\\review_index_eleven_20012020.pkl', 'wb') as f:
    
    pickle.dump(review_index_eleven, f)

In [None]:
# STEP 6

with open('D:\\GitHub-Thesis\\58,000 movies\\movie_eleven\\content_index_eleven_20012020.pkl', 'rb') as f:
    
    content_index_eleven = pickle.load(f)

print("Number of URLs: {}".format(len(content_index_eleven)))

content_souplist_eleven = []

for i in tqdm_notebook(content_index_eleven):
    
    content_souplist_eleven.append(BeautifulSoup(i.text))

print("Number of souplists: {}".format(len(content_souplist_eleven)))

In [None]:
# STEP 7

# Field 1: Extract plot summary

# souplist_three = souplist_three[0:113] 

myfield_plot = []
plot_summary = []
index_to_remove_no_plot = []

[myfield_plot.append(i.find_all('div', {'class':'plot_summary'})) for i in tqdm_notebook(content_souplist_eleven)]

[[[plot_summary.append(y.text) for y in x.find_all('div', {'class':'summary_text'})] for x in i] if len(i) !=0 else index_to_remove_no_plot.append(myfield_plot.index(i)) for i in myfield_plot]
      
print("Length of Plot Summary list: {}".format(len(plot_summary)))
print("Length of the list with Movies that don't have plot summary: {}".format(len(index_to_remove_no_plot)))
if len(index_to_remove_no_plot) == 0:
    print("None of the movie miss plot")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_plot))

#------------------------------------------------------------------------------------------------

# Field 2: Extract IMDB rating

# myfield_rating = []
# ratings = []
# index_to_remove_no_rating = []

# [myfield_rating.append(i.find_all('div', {'class':'ratingValue'})) for i in tqdm_notebook(content_souplist_three)]

# [[[ratings.append(y.text) for y in x.find_all('span', {'itemprop':'ratingValue'})] for x in i] for i in myfield_rating]

# index_to_remove_no_rating = [i for i,x in enumerate(myfield_rating) if not x]

# print("Length of Ratings list: {}".format(len(ratings)))
# print("Length of the list with Movies that are not rated: {}".format(len(index_to_remove_no_rating)))
# if len(index_to_remove_no_rating) == 0:
#     print("None of the movie miss ratings")
# else:
#     print("Indexes to remove: {}".format(index_to_remove_no_rating))

#------------------------------------------------------------------------------------------------

# Field 3: Extract Actors

myfield_cast = []
ratings = []
phase_two = []
phase_three = []
actors_list = []
index_to_remove_no_actors = []

[myfield_cast.append(i.find_all('table', {'class':'cast_list'})) for i in tqdm_notebook(content_souplist_eleven)]
    
r_one = re.compile(".*name")

[[phase_two.append(j.find_all('a', {'href':r_one})) for j in i] for i in myfield_cast]

[phase_three.append(phase_two[i][1::2]) for i in range(len(phase_two))]

[actors_list.append(list(map(lambda x: x.text.strip(' ').replace('\n', ''), actors))) for actors in phase_three]            
    
index_to_remove_no_actors = [i for i,x in enumerate(myfield_cast) if not x]
        
print("Length of Actors list: {}".format(len(actors_list)))
print("Length of the list with Movies that don't have actors: {}".format(len(index_to_remove_no_actors)))
if len(index_to_remove_no_actors) == 0:
    print("None of the movie miss actors")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_actors))

#------------------------------------------------------------------------------------------------

# Field 4: Extract Director Name

myfield_director = []
director_name = []
index_to_remove_no_directors = []

[myfield_director.append(i.find_all('div', {'class':'plot_summary'})) for i in tqdm_notebook(content_souplist_eleven)]

r_name = re.compile(".*name")

[[director_name.append(j.find_all('a', {'href':r_name})) for j in i] for i in myfield_director]
    
director_names = [item[0].text for item in director_name]

index_to_remove_no_directors = [i for i,x in enumerate(myfield_director) if not x]

print("Length of Directors list: {}".format(len(director_names)))
print("Length of the list with Movies that don't have directors: {}".format(len(index_to_remove_no_directors)))
if len(index_to_remove_no_directors) == 0:
    print("None of the movie miss directors")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_directors))

In [None]:
# STEP 8

#------------------------------------------------------------------------------------------------

myfield_rating = []
ratings = []
index_to_remove_no_rating = []

[myfield_rating.append(i.find_all('div', {'class':'ratingValue'})) for i in tqdm_notebook(content_souplist_eleven)]

[[[ratings.append(y.text) for y in x.find_all('span', {'itemprop':'ratingValue'})] for x in i] for i in myfield_rating]

index_to_remove_no_rating = [i for i,x in enumerate(myfield_rating) if not x]

print("Length of Ratings list: {}".format(len(ratings)))
print("Length of the list with Movies that are not rated: {}".format(len(index_to_remove_no_rating)))
if len(index_to_remove_no_rating) == 0:
    print("None of the movie miss ratings")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_rating))

In [None]:
# STEP 9

# Pickle the requests file for further use!

with open('D:\\GitHub-Thesis\\58,000 movies\\movie_eleven\\plot_eleven_20012020.pkl', 'wb') as f:
    
    pickle.dump(plot_summary, f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_eleven\\actors_eleven_20012020.pkl', 'wb') as f:
    
    pickle.dump(actors_list, f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_eleven\\director_eleven_20012020.pkl', 'wb') as f:
    
    pickle.dump(director_names, f)

In [None]:
# STEP 10

with open('D:\\GitHub-Thesis\\58,000 movies\\movie_eleven\\rating_eleven_20012020.pkl', 'wb') as f:
    
    pickle.dump(ratings, f)

In [None]:
# STEP 11

with open('D:\\GitHub-Thesis\\58,000 movies\\movie_eleven\\review_index_eleven_20012020.pkl', 'rb') as f:
    
    review_eleven = pickle.load(f)
    
print("Number of URLs: {}".format(len(review_eleven)))

review_souplist_eleven = []

for i in tqdm_notebook(review_eleven):
    
    review_souplist_eleven.append(BeautifulSoup(i.text))

print("Number of review tags: {}".format(len(review_souplist_eleven)))

In [None]:
# STEP 12

# Field 6: Extract Movie Reviews

myfield_review_step_one = []
myfield_review_step_two = []
myfield_review_step_three = []

[myfield_review_step_one.append(i.find_all('div', {'class':'lister-list'})) for i in tqdm_notebook(review_souplist_eleven)]

[[myfield_review_step_two.append(j.find_all('div', {'class':'text show-more__control'})) for j in i] for i in myfield_review_step_one]

[myfield_review_step_three.append(list(map(lambda x: x.text, reviews))) for reviews in myfield_review_step_two]

index_to_remove_no_review = [i for i,x in enumerate(myfield_review_step_one) if not x]

print("Length of Reviews list: {}".format(len(myfield_review_step_three)))
print("Length of the list with Movies that don't have reviews: {}".format(len(index_to_remove_no_review)))
if len(index_to_remove_no_review) == 0:
    print("None of the movies miss reviews")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_review))

print(len([i for i,x in enumerate(myfield_review_step_one) if not x]))

print([i for i,x in enumerate(myfield_review_step_two) if not x])

print([i for i,x in enumerate(myfield_review_step_three) if not x])

In [None]:
data_eleven = pd.read_pickle("D:\\GitHub-Thesis\\58,000 movies\\movie_eleven\\dataset_eleven_final_20012020.pkl")

print(data_eleven.shape)

print(len(myfield_review_step_three))

print(len([i for i,x in enumerate(myfield_review_step_three) if not x]))

In [None]:
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_eleven\\reviews_eleven_24012020.pkl', 'wb') as f:
    
    pickle.dump(myfield_review_step_three, f)

In [None]:
index_remove = [2162, 2170, 2222, 2243, 2253, 2264, 2309, 2316, 2330, 2332, 2374, 2381, 2390, 2398, 2457, 2462, 2473, 2481, 2655, 2678, 2719, 2776, 2926, 2937, 2969, 2970, 2974, 2979, 2993, 3019, 3028, 3072, 3080, 3126, 3201, 3296, 3318, 3334, 3345, 3360, 3361, 3398, 3431, 3450, 3483, 3509, 3512, 3527, 3568, 3572, 3577, 3590, 3609, 3682, 3693, 3695, 3702, 3719, 3726, 3736, 3764, 3765, 3768, 3791, 3802, 3829, 3845, 3895, 3912, 3932, 3939, 3951, 3978, 3983, 3991, 4009, 4047, 4091, 4114, 4125, 4146, 4151, 4156, 4247, 4253, 4255, 4274, 4278, 4304, 4306, 4313, 4315, 4320, 4372, 4381, 4387, 4393, 4412, 4431, 4441, 4449, 4486, 4503, 4570, 4579, 4587, 4597, 4613, 4621, 4638]

myfield_review_step_three = [i for j, i in enumerate(myfield_review_step_three) if j not in index_remove]

print(len(myfield_review_step_three))

data_four.reviews = myfield_review_step_three

print(data_four.head())

In [None]:
# STEP 13

index_remove = [4, 5, 15, 20, 24, 37, 105, 121, 128, 195, 236, 271, 321, 322, 323, 324, 361, 366, 375, 376, 386, 414, 
                415, 416, 455, 504, 528, 532, 612, 631, 632, 633, 634, 635, 724, 752, 874, 963, 982, 1031, 1073, 1119, 1145, 1164, 1178, 
                1226, 1227, 1228, 1229, 1230, 1273, 1282, 1295, 1296, 1330, 1348, 1363, 1464, 1470, 1473, 1837, 1838, 1841, 1928, 2068, 2073, 
                2074, 2076, 2077, 2091, 2092, 2094, 2107, 2123, 2149, 2180, 2194, 2243, 2268, 2277, 2281, 2283, 2312, 2315, 2321, 2327, 2501, 
                2534, 2593, 2636, 2698, 2738, 2759, 2761, 2864, 2875, 3008, 3021, 3022, 3027, 3031, 3034, 3046, 3047, 3051, 3056, 3058, 3061, 
                3071, 3072, 3073, 3074, 3075, 3076, 3077, 3080, 3081, 3084, 3087, 3089, 3090, 3091, 3092, 3093, 3094, 3095, 3096, 3119, 3121, 
                3134, 3145, 3190, 3226, 3227, 3228, 3229, 3230, 3231, 3233, 3285, 3316, 3332, 3458, 3519, 3520, 3521, 3527, 3586, 3621, 3666, 
                3739, 3759, 3770, 3787, 3809, 3832, 3842, 3938, 3942, 3960, 4010, 4039, 4048, 4081, 4083, 4120, 4129, 4190, 4222, 4223, 4227, 
                4284, 4306, 4317, 4335, 4429, 4454, 4464, 4522, 4531, 4596, 4629, 4638, 4642, 4671, 4697, 4698, 4737, 4757, 4788, 4795, 4920, 
                4921, 4926, 4927, 4928, 4933, 4963, 4964, 4976, 4977, 4995,
                874, 1031, 1079, 1273, 1932, 2177, 2269, 2283, 2336, 2419, 3455, 3581, 3740, 4585, 4834,
                612, 874, 1031, 2281, 2698, 3008,
                14, 18, 22, 44, 66, 84, 146, 149, 166, 171, 207, 244, 312, 322, 323, 324, 353, 412, 433, 455, 456, 537, 586, 592, 593, 624, 635, 705, 735, 747, 843, 932, 937, 993, 1012, 1031, 1056, 1072, 1078, 1144, 1199, 1229, 1256, 1331, 1378, 1484, 1511, 1512, 1513, 1514, 1515, 1516, 1517, 1518, 1689, 1739, 1877, 1912, 1931, 1936, 1987, 2039, 2099, 2104, 2149, 2154, 2160, 2172, 2274, 2285, 2311, 2335, 2341, 2345, 2398, 2404, 2418, 2446, 2452, 2544, 2547, 2599, 2613, 2615, 2620, 2622, 2623, 2631, 2640, 2650, 2668, 2670, 2671, 2672, 2676, 2680, 2682, 2718, 2746, 2774, 2794, 3121, 3167, 3174, 3220, 3221, 3296, 3320, 3322, 3345, 3389, 3425, 3457, 3470, 3485, 3486, 3501, 3518, 3549, 3556, 3570, 3580, 3589, 3592, 3613, 3659, 3689, 3730, 3739, 3758, 3759, 3772, 3872, 3874, 3927, 3947, 3953, 4008, 4018, 4036, 4038, 4120, 4127, 4148, 4285, 4297, 4339, 4404, 4418, 4419, 4420, 4447, 4452, 4472, 4473, 4476, 4539, 4549, 4554, 4646, 4655, 4715, 4762, 4763, 4771, 4786, 4917, 4926, 4958, 4963, 4966, 4988]

index_remove = list(dict.fromkeys(index_remove))

dataset = pd.read_pickle("D:\\GitHub-Thesis\\dataset_58,000_14012020_latest_version.pkl")

dataset_eleven = dataset.iloc[50000:55000].reset_index(drop=True)

dataset_eleven = dataset_eleven[~dataset_eleven.index.isin(index_remove)]

dataset_eleven.shape

In [39]:
# STEP 14

with open('D:\\GitHub-Thesis\\58,000 movies\\movie_eleven\\plot_eleven_20012020.pkl', 'rb') as f:
    
    plot = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_eleven\\rating_eleven_20012020.pkl', 'rb') as f:
    
    rating = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_eleven\\actors_eleven_20012020.pkl', 'rb') as f:
    
    actors = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_eleven\\director_eleven_20012020.pkl', 'rb') as f:
    
    director = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_eleven\\reviews_eleven_24012020.pkl', 'rb') as f:
    
    reviews = pickle.load(f)
    
assert len(plot) == len(rating) == len(actors) == len(director) == len(reviews)

In [40]:
[i for i,x in enumerate(rating) if not x]

[]

In [None]:
# STEP 15

dataset_eleven['actors'] = actors

dataset_eleven['plot'] = plot

dataset_eleven['imdb_rating'] = rating

dataset_eleven['director'] = director

dataset_eleven['reviews'] = reviews

dataset_eleven = dataset_eleven.drop(["movieId", "imdbId", "synopsis_url"], axis=1)

dataset_eleven.iloc[2005]

In [None]:
dataset_eleven.reviews.iloc[2005]

In [None]:
# STEP 16

index_remove = [5, 6, 14, 19, 39, 53, 77, 87, 91, 96, 109, 113, 115, 126, 132, 147, 172, 174, 179, 188, 195, 196, 199, 200, 212, 215, 219, 220, 224, 226, 237, 268, 280, 287, 289, 307, 311, 312, 313, 315, 319, 320, 324, 325, 328, 330, 334, 348, 362, 375, 384, 386, 392, 396, 399, 411, 422, 434, 445, 478, 483, 489, 507, 518, 528, 539, 546, 573, 582, 615, 624, 643, 648, 691, 701, 706, 723, 732, 738, 744, 746, 767, 785, 805, 817, 829, 833, 848, 850, 851, 884, 891, 895, 899, 902, 906, 909, 911, 912, 915, 920, 926, 930, 944, 948, 960, 962, 963, 965, 968, 978, 979, 985, 986, 998, 1000, 1002, 1009, 1010, 1032, 1034, 1037, 1041, 1055, 1084, 1092, 1096, 1097, 1119, 1145, 1149, 1152, 1153, 1157, 1170, 1171, 1172, 1173, 1182, 1195, 1197, 1199, 1201, 1211, 1239, 1252, 1261, 1264, 1274, 1282, 1293, 1299, 1325, 1330, 1338, 1339, 1344, 1358, 1360, 1367, 1385, 1387, 1393, 1395, 1407, 1411, 1414, 1418, 1423, 1432, 1450, 1455, 1456, 1461, 1503, 1507, 1531, 1536, 1546, 1548, 1552, 1601, 1605, 1606, 1617, 1630, 1636, 1637, 1639, 1651, 1656, 1667, 1668, 1686, 1705, 1731, 1760, 1761, 1762, 1763, 1781, 1795, 1796, 1797, 1798, 1799, 1800, 1801, 1802, 1803, 1805, 1806, 1807, 1814, 1818, 1819, 1841, 1856, 1857, 1859, 1863, 1877, 1899, 1902, 1934, 1951, 1952, 1956, 1959, 1960, 1962, 1963, 1966, 1969, 1993, 1994, 2011, 2014, 2020, 2026, 2029, 2030, 2032, 2036, 2041, 2045, 2046, 2047, 2058, 2059, 2069, 2077, 2100, 2101, 2110, 2113, 2120, 2130, 2131, 2132, 2137, 2159, 2160, 2169, 2177, 2201, 2221, 2254, 2287, 2324, 2325, 2330, 2365, 2369, 2374, 2382, 2386, 2388, 2408, 2439, 2449, 2451, 2452, 2458, 2459, 2460, 2462, 2463, 2464, 2467, 2469, 2470, 2473, 2477, 2489, 2490, 2491, 2492, 2494, 2498, 2499, 2500, 2509, 2519, 2520, 2526, 2528, 2535, 2537, 2542, 2544, 2546, 2547, 2551, 2558, 2576, 2577, 2582, 2583, 2584, 2589, 2592, 2593, 2602, 2606, 2610, 2611, 2615, 2618, 2620, 2629, 2765, 2894, 2897, 2900, 2902, 2904, 2910, 2920, 2929, 2933, 2937, 2942, 2943, 2944, 2946, 2955, 2956, 2959, 2960, 2964, 2977, 2987, 2990, 3000, 3069, 3075, 3082, 3084, 3095, 3097, 3098, 3099, 3101, 3103, 3106, 3121, 3122, 3138, 3140, 3163, 3166, 3173, 3175, 3179, 3183, 3187, 3194, 3197, 3211, 3212, 3229, 3230, 3244, 3262, 3266, 3267, 3271, 3286, 3293, 3294, 3302, 3305, 3308, 3318, 3319, 3320, 3322, 3328, 3335, 3340, 3342, 3367, 3386, 3389, 3391, 3395, 3404, 3405, 3409, 3414, 3416, 3421, 3424, 3427, 3437, 3441, 3449, 3451, 3454, 3472, 3479, 3487, 3496, 3502, 3532, 3546, 3585, 3586, 3587, 3590, 3594, 3597, 3606, 3608, 3609, 3613, 3621, 3633, 3644, 3647, 3655, 3659, 3660, 3671, 3678, 3688, 3718, 3725, 3726, 3728, 3738, 3751, 3774, 3781, 3792, 3816, 3820, 3833, 3836, 3860, 3870, 3892, 3897, 3904, 3921, 3939, 3944, 3945, 3963, 3972, 3976, 3977, 4003, 4013, 4023, 4024, 4040, 4042, 4047, 4079, 4080, 4085, 4093, 4095, 4096, 4098, 4099, 4100, 4104, 4105, 4106, 4109, 4114, 4116, 4119, 4122, 4123, 4128, 4142, 4144, 4145, 4146, 4147, 4149, 4172, 4181, 4199, 4203, 4206, 4209, 4212, 4213, 4215, 4217, 4237, 4238, 4249, 4251, 4252, 4258, 4275, 4279, 4283, 4286, 4287, 4296, 4302, 4305, 4306, 4310, 4314, 4324, 4326, 4332, 4333, 4339, 4347, 4349, 4383, 4385, 4386, 4390, 4403, 4413, 4425, 4429, 4430, 4431, 4461, 4470, 4487, 4491, 4508, 4516, 4529, 4545, 4558, 4560, 4561, 4563, 4565, 4571, 4578, 4588, 4596, 4597, 4618, 4620, 4621, 4625]

dataset_eleven = dataset_eleven[~dataset_eleven.index.isin(index_remove)]

In [None]:
dataset_eleven = dataset_eleven[dataset_eleven.astype(str)['reviews'] != '[]']

In [None]:
# dataset_eleven.to_pickle("D:\\GitHub-Thesis\\58,000 movies\\movie_eleven\\dataset_eleven_final_20012020.pkl") old

# dataset_eleven.to_pickle("D:\\GitHub-Thesis\\58,000 movies\\movie_eleven\\dataset_eleven_final_24012020.pkl") old

dataset_eleven.to_pickle("D:\\GitHub-Thesis\\58,000 movies\\movie_eleven\\dataset_eleven_final_25012020.pkl")

In [None]:
# STEP 17

dataset_eleven.shape

In [None]:
dataset_eleven.shape

#### 12) 58098 movies

In [None]:
content_twelve = []

with open('D:\\GitHub-Thesis\\movie_content_url\\data_twelve_14012020.pkl', 'rb') as f:
    
    content_twelve = pickle.load(f)

print("Number of URLs: {}".format(len(content_twelve)))

content_souplist_twelve = []

for i in tqdm_notebook(content_twelve):
    
    content_souplist_twelve.append(BeautifulSoup(i.text))

print("Number of souplists: {}".format(len(content_souplist_twelve)))

In [None]:
# Field 1: Extract plot summary

# souplist_three = souplist_three[0:113] 

myfield_plot = []
plot_summary = []
index_to_remove_no_plot = []

[myfield_plot.append(i.find_all('div', {'class':'plot_summary'})) for i in tqdm_notebook(content_souplist_twelve)]

[[[plot_summary.append(y.text) for y in x.find_all('div', {'class':'summary_text'})] for x in i] if len(i) !=0 else index_to_remove_no_plot.append(myfield_plot.index(i)) for i in myfield_plot]
      
print("Length of Plot Summary list: {}".format(len(plot_summary)))
print("Length of the list with Movies that don't have plot summary: {}".format(len(index_to_remove_no_plot)))
if len(index_to_remove_no_plot) == 0:
    print("None of the movie miss plot")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_plot))

#------------------------------------------------------------------------------------------------

# Field 2: Extract IMDB rating

myfield_rating = []
ratings = []
index_to_remove_no_rating = []

[myfield_rating.append(i.find_all('div', {'class':'ratingValue'})) for i in tqdm_notebook(content_souplist_twelve)]

[[[ratings.append(y.text) for y in x.find_all('span', {'itemprop':'ratingValue'})] for x in i] for i in myfield_rating]

index_to_remove_no_rating = [i for i,x in enumerate(myfield_rating) if not x]

print("Length of Ratings list: {}".format(len(ratings)))
print("Length of the list with Movies that are not rated: {}".format(len(index_to_remove_no_rating)))
if len(index_to_remove_no_rating) == 0:
    print("None of the movie miss ratings")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_rating))

#------------------------------------------------------------------------------------------------

# Field 3: Extract Actors

myfield_cast = []
ratings = []
phase_two = []
phase_three = []
actors_list = []
index_to_remove_no_actors = []

[myfield_cast.append(i.find_all('table', {'class':'cast_list'})) for i in tqdm_notebook(content_souplist_twelve)]
    
r_one = re.compile(".*name")

[[phase_two.append(j.find_all('a', {'href':r_one})) for j in i] for i in myfield_cast]

[phase_three.append(phase_two[i][1::2]) for i in range(len(phase_two))]

[actors_list.append(list(map(lambda x: x.text.strip(' ').replace('\n', ''), actors))) for actors in phase_three]            
    
index_to_remove_no_actors = [i for i,x in enumerate(myfield_cast) if not x]
        
print("Length of Actors list: {}".format(len(actors_list)))
print("Length of the list with Movies that don't have actors: {}".format(len(index_to_remove_no_actors)))
if len(index_to_remove_no_actors) == 0:
    print("None of the movie miss actors")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_actors))

#------------------------------------------------------------------------------------------------

# Field 4: Extract Director Name

myfield_director = []
director_name = []
index_to_remove_no_directors = []

[myfield_director.append(i.find_all('div', {'class':'plot_summary'})) for i in tqdm_notebook(content_souplist_twelve)]

r_name = re.compile(".*name")

[[director_name.append(j.find_all('a', {'href':r_name})) for j in i] for i in myfield_director]
    
director_names = [item[0].text for item in director_name if len(item)!=0]

index_to_remove_no_directors = [i for i,x in enumerate(myfield_director) if not x]

print("Length of Directors list: {}".format(len(director_names)))
print("Length of the list with Movies that don't have directors: {}".format(len(index_to_remove_no_directors)))
if len(index_to_remove_no_directors) == 0:
    print("None of the movie miss directors")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_directors))

In [None]:
for i,j in enumerate(index_to_remove_no_actors):
    
    print(j, plot_summary[j])

In [None]:
plot_summary[1988]

In [None]:
myfield_plot[1988]

In [None]:
dataset = pd.read_pickle("D:\\GitHub-Thesis\\dataset_58,000_14012020_latest_version.pkl")

In [None]:
dataset_two = dataset.iloc[55000:].reset_index()

dataset_two.iloc[1988]

In [None]:
dataset_two.shape

In [None]:
plot_two = []

[[[plot_two.append(y.text) for y in x.find_all('div', {'class':'credit_summary_item'})] for x in i] if len(i) !=0 else index_to_remove_no_plot.append(myfield_plot.index(i)) for i in myfield_plot]

In [None]:
len(plot_two)

In [None]:
[i for i,x in enumerate(director_name) if not x]

In [None]:
plot_summary[451]

In [None]:
#------------------------------------------------------------------------------------------------

# Pickle the requests file for further use!

with open('plot_twelve_18012020.pkl', 'wb') as f:
    
    pickle.dump(plot_summary, f)
    
with open('rating_twelve_18012020.pkl', 'wb') as f:
    
    pickle.dump(ratings, f)
    
with open('actors_twelve_18012020.pkl', 'wb') as f:
    
    pickle.dump(actors_list, f)
    
with open('director_twelve_18012020.pkl', 'wb') as f:
    
    pickle.dump(director_names, f)
    
# Indexes to remove:  [185, 189, 196, 200, 201, 235, 292, 293, 370, 398, 402, 425, 431, 436, 437, 440, 445, 451, 457, 461, 462, 
# 466, 469, 473, 476, 477, 488, 493, 535, 664, 695, 763, 783, 829, 840, 849, 858, 868, 883, 938, 939, 951, 995, 1003, 1034, 
# 1040, 1043, 1070, 1088, 1120, 1150, 1151, 1152, 1163, 1170, 1171, 1176, 1178, 1259, 1274, 1352, 1359, 1367, 1368, 1397, 1406,
# 1472, 1474, 1478, 1479, 1480, 1482, 1483, 1487, 1526, 1560, 1570, 1586, 1597, 1606, 1640, 1646, 1648, 1660, 1684, 1701, 1717,
# 1760, 1779, 1817, 1853, 1858, 1917, 1923, 1938, 1988, 2114, 2118, 2132, 2138, 2144, 2150, 2158, 2194, 2277, 2303, 2341, 2342, 
# 2347, 2349, 2352, 2366, 2467, 2500, 2550, 2607, 2644, 2656, 2728, 2729, 2768, 2793, 2801, 2825, 2858, 2894, 2903, 2940, 2950, 
# 2992, 2995, 3014, 3018] (no actors)

# Indexes to remove: [265, 999, 1064, 1120, 1170, 1347, 1478, 1693, 1788, 1988, 2051, 2342, 2347, 2349, 2350, 2352, 2353, 2861, 3024, 3062] (not rated)

# Indexes to remove: [189, 476, 535, 664, 1170, 1472, 1760, 1988, 2656] (no directors)

# Indexes to remove: [1988] -> it has no text ("Add a Plot")

In [None]:
synopsis_twelve = []

with open('D:\\GitHub-Thesis\\synopsis_url\\synopsis_twelve_14012020.pkl', 'rb') as f:
    
    synopsis_twelve = pickle.load(f)

print("Number of URLs: {}".format(len(synopsis_twelve)))

synopsis_souplist_twelve = []

for i in tqdm_notebook(synopsis_twelve):
    
    synopsis_souplist_twelve.append(BeautifulSoup(i.text))

print("Number of souplists: {}".format(len(synopsis_souplist_twelve)))

In [None]:
#------------------------------------------------------------------------------------------------

# Field 5: Extract Plot Synopsis

synopsis_step_one = []
synopsis_step_two = []
synopsis_step_three = []

[synopsis_step_one.append(i.find_all('ul', {'class':'ipl-zebra-list', 'id':'plot-synopsis-content'})) for i in tqdm_notebook(synopsis_souplist_twelve)]

[[synopsis_step_two.append(j.find_all('li', {'class':'ipl-zebra-list__item'})) for j in i] for i in synopsis_step_one]

[[synopsis_step_three.append(j.text.strip(' ').replace('\n', '').replace('\\', '')) for j in i] for i in synopsis_step_two]

index_to_remove_no_synopsis = [i for i,x in enumerate(synopsis_step_one) if not x]

print("Length of Synopsis list: {}".format(len(synopsis_step_three)))
print("Length of the list with Movies that don't have synopsis: {}".format(len(index_to_remove_no_synopsis)))
if len(index_to_remove_no_synopsis) == 0:
    print("None of the movies miss a synopsis")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_synopsis))

In [None]:
[i for i,x in enumerate(synopsis_step_one) if not x]

In [None]:
# Pickle the synopsis file for further use!

with open('synopsis_twelve_18012020.pkl', 'wb') as f:
    
    pickle.dump(synopsis_step_three, f)
    
# Indexes to remove: 0

In [None]:
review_twelve = []

with open('D:\\GitHub-Thesis\\reviews_url\\review_twelve_16012020.pkl', 'rb') as f:
    
    review_twelve = pickle.load(f)

print("Number of URLs: {}".format(len(review_twelve)))

review_souplist_twelve = []

for i in tqdm_notebook(review_twelve):
    
    review_souplist_twelve.append(BeautifulSoup(i.text))

print("Number of review tags: {}".format(len(review_souplist_twelve)))

In [None]:
#------------------------------------------------------------------------------------------------

# Field 6: Extract Movie Reviews

myfield_review_step_one = []
myfield_review_step_two = []
myfield_review_step_three = []

[myfield_review_step_one.append(i.find_all('div', {'class':'lister-list'})) for i in tqdm_notebook(review_souplist_twelve)]

[[myfield_review_step_two.append(j.find_all('div', {'class':'text show-more__control'})) for j in i] for i in myfield_review_step_one]

[myfield_review_step_three.append(list(map(lambda x: x.text, reviews))) for reviews in myfield_review_step_two]

index_to_remove_no_review = [i for i,x in enumerate(myfield_review_step_one) if not x]

print("Length of Reviews list: {}".format(len(myfield_review_step_three)))
print("Length of the list with Movies that don't have reviews: {}".format(len(index_to_remove_no_review)))
if len(index_to_remove_no_review) == 0:
    print("None of the movies miss reviews")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_review))

In [None]:
[i for i,x in enumerate(myfield_review_step_three) if not x]

In [None]:
# Pickle the reviews file for further use!

with open('reviews_twelve_18012020.pkl', 'wb') as f:
    
    pickle.dump(myfield_review_step_three, f)
    
# Indexes to remove: 

# [2,
#  18,
#  45,
#  47,
#  56,
#  69,
#  85,
#  88,
#  94,
#  107,
#  110,
#  113,
#  161,
#  172,
#  173,
#  176,
#  181,
#  182,
#  183,
#  189,
#  201,
#  217,
#  220,
#  224,
#  230,
#  232,
#  234,
#  239,
#  240,
#  247,
#  254,
#  262,
#  265,
#  290,
#  292,
#  293,
#  294,
#  297,
#  312,
#  316,
#  329,
#  363,
#  375,
#  385,
#  396,
#  402,
#  413,
#  434,
#  476,
#  480,
#  511,
#  513,
#  514,
#  521,
#  524,
#  547,
#  557,
#  566,
#  571,
#  572,
#  574,
#  577,
#  584,
#  599,
#  600,
#  602,
#  605,
#  608,
#  610,
#  616,
#  617,
#  618,
#  633,
#  636,
#  643,
#  644,
#  645,
#  652,
#  654,
#  655,
#  656,
#  657,
#  672,
#  673,
#  674,
#  676,
#  677,
#  678,
#  679,
#  680,
#  681,
#  683,
#  688,
#  699,
#  705,
#  706,
#  707,
#  717,
#  718,
#  723,
#  726,
#  736,
#  742,
#  745,
#  751,
#  752,
#  753,
#  754,
#  756,
#  758,
#  759,
#  760,
#  761,
#  762,
#  765,
#  767,
#  771,
#  773,
#  774,
#  775,
#  776,
#  778,
#  779,
#  782,
#  783,
#  784,
#  785,
#  787,
#  788,
#  790,
#  792,
#  793,
#  794,
#  797,
#  798,
#  800,
#  801,
#  803,
#  804,
#  806,
#  807,
#  808,
#  809,
#  812,
#  816,
#  817,
#  821,
#  822,
#  824,
#  825,
#  826,
#  829,
#  830,
#  831,
#  833,
#  839,
#  849,
#  862,
#  873,
#  876,
#  878,
#  879,
#  881,
#  882,
#  888,
#  890,
#  891,
#  894,
#  896,
#  897,
#  903,
#  904,
#  912,
#  917,
#  918,
#  919,
#  920,
#  931,
#  932,
#  933,
#  935,
#  936,
#  937,
#  938,
#  939,
#  940,
#  951,
#  953,
#  956,
#  958,
#  963,
#  967,
#  972,
#  974,
#  981,
#  988,
#  999,
#  1000,
#  1001,
#  1003,
#  1006,
#  1020,
#  1028,
#  1034,
#  1041,
#  1042,
#  1043,
#  1046,
#  1047,
#  1057,
#  1060,
#  1061,
#  1064,
#  1072,
#  1075,
#  1077,
#  1083,
#  1088,
#  1109,
#  1113,
#  1120,
#  1121,
#  1132,
#  1151,
#  1152,
#  1161,
#  1162,
#  1163,
#  1168,
#  1170,
#  1171,
#  1172,
#  1183,
#  1188,
#  1193,
#  1194,
#  1195,
#  1196,
#  1201,
#  1211,
#  1212,
#  1213,
#  1226,
#  1228,
#  1230,
#  1237,
#  1240,
#  1244,
#  1268,
#  1270,
#  1271,
#  1272,
#  1273,
#  1274,
#  1276,
#  1277,
#  1308,
#  1309,
#  1310,
#  1313,
#  1320,
#  1330,
#  1343,
#  1347,
#  1351,
#  1352,
#  1354,
#  1367,
#  1368,
#  1375,
#  1376,
#  1377,
#  1380,
#  1390,
#  1396,
#  1397,
#  1402,
#  1404,
#  1412,
#  1416,
#  1420,
#  1421,
#  1423,
#  1431,
#  1441,
#  1445,
#  1455,
#  1458,
#  1459,
#  1465,
#  1466,
#  1478,
#  1479,
#  1480,
#  1481,
#  1486,
#  1502,
#  1503,
#  1506,
#  1514,
#  1518,
#  1520,
#  1531,
#  1533,
#  1534,
#  1536,
#  1545,
#  1547,
#  1548,
#  1560,
#  1569,
#  1597,
#  1600,
#  1601,
#  1602,
#  1603,
#  1604,
#  1620,
#  1625,
#  1626,
#  1641,
#  1644,
#  1647,
#  1656,
#  1675,
#  1687,
#  1693,
#  1697,
#  1702,
#  1713,
#  1719,
#  1721,
#  1723,
#  1729,
#  1740,
#  1741,
#  1742,
#  1743,
#  1751,
#  1762,
#  1763,
#  1769,
#  1772,
#  1778,
#  1782,
#  1788,
#  1794,
#  1795,
#  1800,
#  1804,
#  1805,
#  1813,
#  1820,
#  1821,
#  1822,
#  1823,
#  1834,
#  1836,
#  1838,
#  1843,
#  1845,
#  1848,
#  1850,
#  1885,
#  1886,
#  1887,
#  1911,
#  1913,
#  1918,
#  1923,
#  1933,
#  1934,
#  1938,
#  1939,
#  1950,
#  1955,
#  1960,
#  1963,
#  1964,
#  1966,
#  1968,
#  1988,
#  1990,
#  1991,
#  1992,
#  1993,
#  1995,
#  1999,
#  2003,
#  2008,
#  2009,
#  2014,
#  2041,
#  2051,
#  2055,
#  2072,
#  2075,
#  2083,
#  2086,
#  2118,
#  2121,
#  2137,
#  2138,
#  2140,
#  2141,
#  2143,
#  2144,
#  2146,
#  2153,
#  2154,
#  2156,
#  2171,
#  2192,
#  2193,
#  2202,
#  2203,
#  2206,
#  2227,
#  2233,
#  2258,
#  2262,
#  2265,
#  2268,
#  2276,
#  2285,
#  2293,
#  2298,
#  2309,
#  2328,
#  2330,
#  2336,
#  2338,
#  2342,
#  2347,
#  2348,
#  2349,
#  2350,
#  2352,
#  2353,
#  2354,
#  2356,
#  2357,
#  2363,
#  2368,
#  2371,
#  2376,
#  2379,
#  2390,
#  2393,
#  2405,
#  2411,
#  2417,
#  2420,
#  2422,
#  2425,
#  2426,
#  2428,
#  2430,
#  2435,
#  2442,
#  2445,
#  2446,
#  2447,
#  2453,
#  2455,
#  2474,
#  2490,
#  2493,
#  2508,
#  2516,
#  2535,
#  2550,
#  2553,
#  2557,
#  2560,
#  2561,
#  2572,
#  2605,
#  2623,
#  2638,
#  2656,
#  2661,
#  2666,
#  2669,
#  2681,
#  2684,
#  2685,
#  2687,
#  2689,
#  2690,
#  2692,
#  2694,
#  2702,
#  2714,
#  2716,
#  2730,
#  2731,
#  2735,
#  2745,
#  2751,
#  2758,
#  2759,
#  2761,
#  2770,
#  2773,
#  2774,
#  2776,
#  2797,
#  2808,
#  2810,
#  2811,
#  2822,
#  2824,
#  2829,
#  2839,
#  2840,
#  2841,
#  2852,
#  2856,
#  2858,
#  2859,
#  2861,
#  2863,
#  2867,
#  2874,
#  2875,
#  2878,
#  2883,
#  2884,
#  2891,
#  2892,
#  2896,
#  2897,
#  2900,
#  2903,
#  2908,
#  2909,
#  2918,
#  2920,
#  2922,
#  2925,
#  2926,
#  2935,
#  2939,
#  2940,
#  2941,
#  2943,
#  2948,
#  2950,
#  2952,
#  2956,
#  2957,
#  2958,
#  2959,
#  2962,
#  2986,
#  2987,
#  2991,
#  2992,
#  2993,
#  2995,
#  2999,
#  3006,
#  3008,
#  3014,
#  3016,
#  3018,
#  3021,
#  3024,
#  3025,
#  3028,
#  3034,
#  3045,
#  3050,
#  3051,
#  3059,
#  3070,
#  3079,
#  3091,
#  3093, 
#  3095] #581

### - - - - - - - - -  - - - - - - - - -  - - - - - - - - -  - - - - - - - - -  - - - - - - - - -  - - - - - - - - - 

#### Dataframe creation based on the movie content extracted

In [None]:
# STEP 1

with open('D:\\GitHub-Thesis\\movie_content_url\\data_twelve_14012020.pkl', 'rb') as f:
    
    content_twelve = pickle.load(f)

print("Number of URLs: {}".format(len(content_twelve)))

with open('D:\\GitHub-Thesis\\reviews_url\\review_twelve_16012020.pkl', 'rb') as f:
    
    review_twelve = pickle.load(f)

print("Number of URLs: {}".format(len(review_twelve)))

In [None]:
# STEP 2

with open('D:\\GitHub-Thesis\\58,000 movies\\movie_twelve\\pre-indexed files\\plot_twelve_18012020.pkl', 'rb') as f:
    
    plot = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_twelve\\pre-indexed files\\rating_twelve_18012020.pkl', 'rb') as f:
    
    rating = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_twelve\\pre-indexed files\\actors_twelve_18012020.pkl', 'rb') as f:
    
    actors = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_twelve\\pre-indexed files\\director_twelve_18012020.pkl', 'rb') as f:
    
    director = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_twelve\\synopsis_twelve_18012020.pkl', 'rb') as f:
    
    synopsis = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_twelve\\reviews_twelve_18012020.pkl', 'rb') as f:
    
    reviews = pickle.load(f)
    
print(len(plot))
print(len(rating))
print(len(actors))
print(len(director))
print(len(synopsis))
print(len(reviews))

In [None]:
# STEP 3

matching_add_plot = [s for s, x in enumerate(plot) if "Add a Plot" in x]

print(matching_add_plot)

matching_add_synopsis = [s for s, x in enumerate(synopsis) if 'It looks like we don\'t have a Synopsis for this title yet. Be the first to contribute! Just click the "Edit page" button at the bottom of the page or learn more in the Synopsis submission guide.' in x]

print(matching_add_synopsis)

print(len(matching_add_synopsis))

In [None]:
# STEP 4

index_remove = [185, 189, 196, 200, 201, 235, 292, 293, 370, 398, 402, 425, 431, 436, 437, 440, 445, 451, 457, 461, 462, 
                466, 469, 473, 476, 477, 488, 493, 535, 664, 695, 763, 783, 829, 840, 849, 858, 868, 883, 938, 939, 951, 995, 1003, 1034, 
                1040, 1043, 1070, 1088, 1120, 1150, 1151, 1152, 1163, 1170, 1171, 1176, 1178, 1259, 1274, 1352, 1359, 1367, 1368, 1397, 1406,
                1472, 1474, 1478, 1479, 1480, 1482, 1483, 1487, 1526, 1560, 1570, 1586, 1597, 1606, 1640, 1646, 1648, 1660, 1684, 1701, 1717,
                1760, 1779, 1817, 1853, 1858, 1917, 1923, 1938, 1988, 2114, 2118, 2132, 2138, 2144, 2150, 2158, 2194, 2277, 2303, 2341, 2342, 
                2347, 2349, 2352, 2366, 2467, 2500, 2550, 2607, 2644, 2656, 2728, 2729, 2768, 2793, 2801, 2825, 2858, 2894, 2903, 2940, 2950, 
                2992, 2995, 3014, 3018, 265, 999, 1064, 1120, 1170, 1347, 1478, 1693, 1788, 1988, 2051, 2342, 2347, 2349, 2350, 2352, 2353, 2861, 3024, 3062,
                189, 476, 535, 664, 1170, 1472, 1760, 1988, 2656, 19882, 18,
                52, 54, 110, 172, 176, 179, 208, 230, 254, 272, 283, 287, 363, 402, 451, 476, 513, 567, 571, 577, 599, 605, 613, 614, 617, 618, 619, 644, 647, 683, 753, 776, 777, 778, 780, 788, 789, 793, 806, 808, 809, 817, 822, 834, 912, 935, 936, 937, 974, 981, 1030, 1050, 1083, 1120, 1151, 1152, 1161, 1170, 1172, 1200, 1266, 1268, 1270, 1271, 1274, 1275, 1276, 1277, 1308, 1309, 1341, 1351, 1368, 1375, 1380, 1402, 1412, 1446, 1450, 1454, 1458, 1476, 1518, 1526, 1527, 1566, 1569, 1626, 1641, 1693, 1702, 1724, 1729, 1754, 1769, 1788, 1800, 1804, 1813, 1816, 1823, 1916, 1918, 1938, 1959, 1960, 1965, 1998, 2029, 2050, 2120, 2137, 2152, 2174, 2286, 2297, 2312, 2341, 2347, 2351, 2352, 2378, 2421, 2454, 2460, 2473, 2507, 2527, 2533, 2556, 2571, 2655, 2660, 2669, 2683, 2760, 2858, 2860, 2898, 2899, 2934, 2939, 2989, 3018, 3049, 3050, 3051]

index_remove = list(dict.fromkeys(index_remove))

print(len(index_remove))

content_index_twelve = [i for j, i in enumerate(content_twelve) if j not in index_remove]

review_index_twelve = [i for j, i in enumerate(review_twelve) if j not in index_remove]

print(len(content_index_twelve))

print(len(review_index_twelve))

# The length of this content_one (2823) should now be extracted from souplist!

In [None]:
# STEP 5

# Pickle the synopsis file for further use!

with open('D:\\GitHub-Thesis\\58,000 movies\\movie_twelve\\content_index_twelve_20012020.pkl', 'wb') as f:
    
    pickle.dump(content_index_twelve, f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_twelve\\review_index_twelve_20012020.pkl', 'wb') as f:
    
    pickle.dump(review_index_twelve, f)

In [None]:
# STEP 6

with open('D:\\GitHub-Thesis\\58,000 movies\\movie_twelve\\content_index_twelve_20012020.pkl', 'rb') as f:
    
    content_index_twelve = pickle.load(f)

print("Number of URLs: {}".format(len(content_index_twelve)))

content_souplist_twelve = []

for i in tqdm_notebook(content_index_twelve):
    
    content_souplist_twelve.append(BeautifulSoup(i.text))

print("Number of souplists: {}".format(len(content_souplist_twelve)))

In [None]:
# STEP 7

# Field 1: Extract plot summary

# souplist_three = souplist_three[0:113] 

myfield_plot = []
plot_summary = []
index_to_remove_no_plot = []

[myfield_plot.append(i.find_all('div', {'class':'plot_summary'})) for i in tqdm_notebook(content_souplist_twelve)]

[[[plot_summary.append(y.text) for y in x.find_all('div', {'class':'summary_text'})] for x in i] if len(i) !=0 else index_to_remove_no_plot.append(myfield_plot.index(i)) for i in myfield_plot]
      
print("Length of Plot Summary list: {}".format(len(plot_summary)))
print("Length of the list with Movies that don't have plot summary: {}".format(len(index_to_remove_no_plot)))
if len(index_to_remove_no_plot) == 0:
    print("None of the movie miss plot")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_plot))

#------------------------------------------------------------------------------------------------

# Field 2: Extract IMDB rating

# myfield_rating = []
# ratings = []
# index_to_remove_no_rating = []

# [myfield_rating.append(i.find_all('div', {'class':'ratingValue'})) for i in tqdm_notebook(content_souplist_three)]

# [[[ratings.append(y.text) for y in x.find_all('span', {'itemprop':'ratingValue'})] for x in i] for i in myfield_rating]

# index_to_remove_no_rating = [i for i,x in enumerate(myfield_rating) if not x]

# print("Length of Ratings list: {}".format(len(ratings)))
# print("Length of the list with Movies that are not rated: {}".format(len(index_to_remove_no_rating)))
# if len(index_to_remove_no_rating) == 0:
#     print("None of the movie miss ratings")
# else:
#     print("Indexes to remove: {}".format(index_to_remove_no_rating))

#------------------------------------------------------------------------------------------------

# Field 3: Extract Actors

myfield_cast = []
ratings = []
phase_two = []
phase_three = []
actors_list = []
index_to_remove_no_actors = []

[myfield_cast.append(i.find_all('table', {'class':'cast_list'})) for i in tqdm_notebook(content_souplist_twelve)]
    
r_one = re.compile(".*name")

[[phase_two.append(j.find_all('a', {'href':r_one})) for j in i] for i in myfield_cast]

[phase_three.append(phase_two[i][1::2]) for i in range(len(phase_two))]

[actors_list.append(list(map(lambda x: x.text.strip(' ').replace('\n', ''), actors))) for actors in phase_three]            
    
index_to_remove_no_actors = [i for i,x in enumerate(myfield_cast) if not x]
        
print("Length of Actors list: {}".format(len(actors_list)))
print("Length of the list with Movies that don't have actors: {}".format(len(index_to_remove_no_actors)))
if len(index_to_remove_no_actors) == 0:
    print("None of the movie miss actors")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_actors))

#------------------------------------------------------------------------------------------------

# Field 4: Extract Director Name

myfield_director = []
director_name = []
index_to_remove_no_directors = []

[myfield_director.append(i.find_all('div', {'class':'plot_summary'})) for i in tqdm_notebook(content_souplist_twelve)]

r_name = re.compile(".*name")

[[director_name.append(j.find_all('a', {'href':r_name})) for j in i] for i in myfield_director]
    
director_names = [item[0].text for item in director_name]

index_to_remove_no_directors = [i for i,x in enumerate(myfield_director) if not x]

print("Length of Directors list: {}".format(len(director_names)))
print("Length of the list with Movies that don't have directors: {}".format(len(index_to_remove_no_directors)))
if len(index_to_remove_no_directors) == 0:
    print("None of the movie miss directors")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_directors))

In [None]:
# STEP 8

#------------------------------------------------------------------------------------------------

myfield_rating = []
ratings = []
index_to_remove_no_rating = []

[myfield_rating.append(i.find_all('div', {'class':'ratingValue'})) for i in tqdm_notebook(content_souplist_twelve)]

[[[ratings.append(y.text) for y in x.find_all('span', {'itemprop':'ratingValue'})] for x in i] for i in myfield_rating]

index_to_remove_no_rating = [i for i,x in enumerate(myfield_rating) if not x]

print("Length of Ratings list: {}".format(len(ratings)))
print("Length of the list with Movies that are not rated: {}".format(len(index_to_remove_no_rating)))
if len(index_to_remove_no_rating) == 0:
    print("None of the movie miss ratings")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_rating))

In [None]:
# STEP 9

# Pickle the requests file for further use!

with open('D:\\GitHub-Thesis\\58,000 movies\\movie_twelve\\plot_twelve_20012020.pkl', 'wb') as f:
    
    pickle.dump(plot_summary, f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_twelve\\actors_twelve_20012020.pkl', 'wb') as f:
    
    pickle.dump(actors_list, f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_twelve\\director_twelve_20012020.pkl', 'wb') as f:
    
    pickle.dump(director_names, f)

In [None]:
# STEP 10

with open('D:\\GitHub-Thesis\\58,000 movies\\movie_twelve\\rating_twelve_20012020.pkl', 'wb') as f:
    
    pickle.dump(ratings, f)

In [None]:
# STEP 11

with open('D:\\GitHub-Thesis\\58,000 movies\\movie_twelve\\review_index_twelve_20012020.pkl', 'rb') as f:
    
    review_twelve = pickle.load(f)
    
print("Number of URLs: {}".format(len(review_twelve)))

review_souplist_twelve = []

for i in tqdm_notebook(review_twelve):
    
    review_souplist_twelve.append(BeautifulSoup(i.text))

print("Number of review tags: {}".format(len(review_souplist_twelve)))

In [None]:
# STEP 12

# Field 6: Extract Movie Reviews

myfield_review_step_one = []
myfield_review_step_two = []
myfield_review_step_three = []

[myfield_review_step_one.append(i.find_all('div', {'class':'lister-list'})) for i in tqdm_notebook(review_souplist_twelve)]

[[myfield_review_step_two.append(j.find_all('div', {'class':'text show-more__control'})) for j in i] for i in myfield_review_step_one]

[myfield_review_step_three.append(list(map(lambda x: x.text, reviews))) for reviews in myfield_review_step_two]

index_to_remove_no_review = [i for i,x in enumerate(myfield_review_step_one) if not x]

print("Length of Reviews list: {}".format(len(myfield_review_step_three)))
print("Length of the list with Movies that don't have reviews: {}".format(len(index_to_remove_no_review)))
if len(index_to_remove_no_review) == 0:
    print("None of the movies miss reviews")
else:
    print("Indexes to remove: {}".format(index_to_remove_no_review))

print(len([i for i,x in enumerate(myfield_review_step_one) if not x]))

print([i for i,x in enumerate(myfield_review_step_two) if not x])

print([i for i,x in enumerate(myfield_review_step_three) if not x])

In [None]:
data_twelve = pd.read_pickle("D:\\GitHub-Thesis\\58,000 movies\\movie_twelve\\dataset_twelve_final_20012020.pkl")

print(data_twelve.shape)

print(len(myfield_review_step_three))

print(len([i for i,x in enumerate(myfield_review_step_three) if not x]))

In [None]:
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_twelve\\reviews_twelve_24012020.pkl', 'wb') as f:
    
    pickle.dump(myfield_review_step_three, f)

In [None]:
index_remove = [2162, 2170, 2222, 2243, 2253, 2264, 2309, 2316, 2330, 2332, 2374, 2381, 2390, 2398, 2457, 2462, 2473, 2481, 2655, 2678, 2719, 2776, 2926, 2937, 2969, 2970, 2974, 2979, 2993, 3019, 3028, 3072, 3080, 3126, 3201, 3296, 3318, 3334, 3345, 3360, 3361, 3398, 3431, 3450, 3483, 3509, 3512, 3527, 3568, 3572, 3577, 3590, 3609, 3682, 3693, 3695, 3702, 3719, 3726, 3736, 3764, 3765, 3768, 3791, 3802, 3829, 3845, 3895, 3912, 3932, 3939, 3951, 3978, 3983, 3991, 4009, 4047, 4091, 4114, 4125, 4146, 4151, 4156, 4247, 4253, 4255, 4274, 4278, 4304, 4306, 4313, 4315, 4320, 4372, 4381, 4387, 4393, 4412, 4431, 4441, 4449, 4486, 4503, 4570, 4579, 4587, 4597, 4613, 4621, 4638]

myfield_review_step_three = [i for j, i in enumerate(myfield_review_step_three) if j not in index_remove]

print(len(myfield_review_step_three))

data_four.reviews = myfield_review_step_three

print(data_four.head())

In [None]:
# STEP 13

index_remove = [185, 189, 196, 200, 201, 235, 292, 293, 370, 398, 402, 425, 431, 436, 437, 440, 445, 451, 457, 461, 462, 
                466, 469, 473, 476, 477, 488, 493, 535, 664, 695, 763, 783, 829, 840, 849, 858, 868, 883, 938, 939, 951, 995, 1003, 1034, 
                1040, 1043, 1070, 1088, 1120, 1150, 1151, 1152, 1163, 1170, 1171, 1176, 1178, 1259, 1274, 1352, 1359, 1367, 1368, 1397, 1406,
                1472, 1474, 1478, 1479, 1480, 1482, 1483, 1487, 1526, 1560, 1570, 1586, 1597, 1606, 1640, 1646, 1648, 1660, 1684, 1701, 1717,
                1760, 1779, 1817, 1853, 1858, 1917, 1923, 1938, 1988, 2114, 2118, 2132, 2138, 2144, 2150, 2158, 2194, 2277, 2303, 2341, 2342, 
                2347, 2349, 2352, 2366, 2467, 2500, 2550, 2607, 2644, 2656, 2728, 2729, 2768, 2793, 2801, 2825, 2858, 2894, 2903, 2940, 2950, 
                2992, 2995, 3014, 3018, 265, 999, 1064, 1120, 1170, 1347, 1478, 1693, 1788, 1988, 2051, 2342, 2347, 2349, 2350, 2352, 2353, 2861, 3024, 3062,
                189, 476, 535, 664, 1170, 1472, 1760, 1988, 2656, 19882, 18,
                52, 54, 110, 172, 176, 179, 208, 230, 254, 272, 283, 287, 363, 402, 451, 476, 513, 567, 571, 577, 599, 605, 613, 614, 617, 618, 619, 644, 647, 683, 753, 776, 777, 778, 780, 788, 789, 793, 806, 808, 809, 817, 822, 834, 912, 935, 936, 937, 974, 981, 1030, 1050, 1083, 1120, 1151, 1152, 1161, 1170, 1172, 1200, 1266, 1268, 1270, 1271, 1274, 1275, 1276, 1277, 1308, 1309, 1341, 1351, 1368, 1375, 1380, 1402, 1412, 1446, 1450, 1454, 1458, 1476, 1518, 1526, 1527, 1566, 1569, 1626, 1641, 1693, 1702, 1724, 1729, 1754, 1769, 1788, 1800, 1804, 1813, 1816, 1823, 1916, 1918, 1938, 1959, 1960, 1965, 1998, 2029, 2050, 2120, 2137, 2152, 2174, 2286, 2297, 2312, 2341, 2347, 2351, 2352, 2378, 2421, 2454, 2460, 2473, 2507, 2527, 2533, 2556, 2571, 2655, 2660, 2669, 2683, 2760, 2858, 2860, 2898, 2899, 2934, 2939, 2989, 3018, 3049, 3050, 3051]

index_remove = list(dict.fromkeys(index_remove))

dataset = pd.read_pickle("D:\\GitHub-Thesis\\dataset_58,000_14012020_latest_version.pkl")

dataset_twelve = dataset.iloc[55000:].reset_index(drop=True)

dataset_twelve = dataset_twelve[~dataset_twelve.index.isin(index_remove)]

dataset_twelve.shape

In [41]:
# STEP 14

with open('D:\\GitHub-Thesis\\58,000 movies\\movie_twelve\\plot_twelve_20012020.pkl', 'rb') as f:
    
    plot = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_twelve\\rating_twelve_20012020.pkl', 'rb') as f:
    
    rating = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_twelve\\actors_twelve_20012020.pkl', 'rb') as f:
    
    actors = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_twelve\\director_twelve_20012020.pkl', 'rb') as f:
    
    director = pickle.load(f)
    
with open('D:\\GitHub-Thesis\\58,000 movies\\movie_twelve\\reviews_twelve_24012020.pkl', 'rb') as f:
    
    reviews = pickle.load(f)
    
assert len(plot) == len(rating) == len(actors) == len(director) == len(reviews)

In [42]:
[i for i,x in enumerate(rating) if not x]

[]

In [None]:
# STEP 15

dataset_twelve['actors'] = actors

dataset_twelve['plot'] = plot

dataset_twelve['imdb_rating'] = rating

dataset_twelve['director'] = director

dataset_twelve['reviews'] = reviews

dataset_twelve = dataset_twelve.drop(["movieId", "imdbId", "synopsis_url"], axis=1)

dataset_twelve.iloc[2005]

In [None]:
# STEP 16

index_remove = [2, 44, 46, 53, 66, 82, 85, 91, 104, 109, 157, 168, 174, 175, 176, 204, 207, 211, 218, 220, 224, 225, 232, 246, 270, 272, 275, 290, 294, 307, 351, 361, 372, 387, 406, 439, 468, 470, 477, 480, 502, 512, 521, 525, 527, 536, 551, 553, 558, 560, 564, 578, 581, 588, 589, 595, 597, 598, 599, 600, 614, 615, 616, 618, 619, 620, 621, 622, 623, 629, 639, 645, 646, 647, 657, 658, 663, 666, 676, 682, 685, 691, 692, 693, 695, 697, 698, 699, 700, 701, 703, 705, 709, 711, 712, 713, 714, 716, 717, 718, 720, 721, 723, 724, 727, 728, 730, 731, 733, 734, 736, 739, 743, 747, 749, 750, 751, 754, 755, 757, 762, 782, 792, 795, 797, 798, 800, 801, 806, 808, 809, 812, 814, 815, 821, 822, 834, 835, 836, 837, 848, 849, 850, 852, 864, 867, 869, 874, 878, 883, 897, 907, 908, 912, 926, 934, 944, 945, 948, 949, 958, 961, 962, 971, 974, 976, 1006, 1010, 1017, 1028, 1054, 1059, 1069, 1074, 1079, 1080, 1081, 1082, 1086, 1096, 1097, 1098, 1111, 1113, 1115, 1122, 1125, 1129, 1152, 1153, 1184, 1187, 1194, 1204, 1216, 1224, 1242, 1243, 1255, 1261, 1267, 1277, 1281, 1282, 1284, 1292, 1302, 1306, 1313, 1316, 1322, 1323, 1332, 1335, 1350, 1351, 1354, 1362, 1367, 1376, 1378, 1379, 1381, 1390, 1392, 1393, 1439, 1440, 1441, 1442, 1443, 1458, 1463, 1479, 1481, 1489, 1507, 1518, 1527, 1541, 1546, 1548, 1550, 1565, 1566, 1567, 1568, 1576, 1585, 1586, 1594, 1600, 1603, 1614, 1615, 1623, 1635, 1636, 1637, 1648, 1650, 1652, 1657, 1659, 1662, 1664, 1697, 1698, 1699, 1723, 1725, 1741, 1742, 1746, 1757, 1762, 1768, 1769, 1770, 1772, 1793, 1794, 1795, 1796, 1798, 1801, 1805, 1810, 1811, 1816, 1842, 1854, 1871, 1874, 1882, 1885, 1917, 1933, 1934, 1936, 1938, 1943, 1944, 1946, 1960, 1980, 1981, 1989, 1990, 1993, 2014, 2020, 2045, 2049, 2052, 2055, 2063, 2071, 2078, 2082, 2092, 2110, 2112, 2118, 2120, 2127, 2128, 2130, 2131, 2137, 2141, 2144, 2149, 2151, 2162, 2165, 2177, 2183, 2189, 2192, 2193, 2196, 2197, 2199, 2201, 2206, 2213, 2216, 2217, 2218, 2224, 2225, 2241, 2257, 2260, 2273, 2281, 2298, 2315, 2318, 2321, 2322, 2332, 2365, 2382, 2397, 2416, 2421, 2435, 2437, 2438, 2440, 2442, 2443, 2445, 2447, 2455, 2467, 2469, 2481, 2482, 2486, 2496, 2502, 2509, 2510, 2511, 2519, 2522, 2523, 2525, 2545, 2555, 2557, 2558, 2569, 2571, 2575, 2585, 2586, 2587, 2598, 2602, 2604, 2606, 2610, 2617, 2618, 2621, 2626, 2627, 2634, 2635, 2638, 2639, 2640, 2647, 2648, 2657, 2659, 2661, 2664, 2665, 2673, 2677, 2679, 2684, 2687, 2691, 2692, 2693, 2694, 2697, 2721, 2722, 2725, 2726, 2731, 2738, 2740, 2747, 2751, 2754, 2757, 2763, 2774, 2785, 2795, 2804, 2816, 2818, 2820]

dataset_twelve = dataset_twelve[~dataset_twelve.index.isin(index_remove)]

In [None]:
dataset_twelve = dataset_twelve[dataset_twelve.astype(str)['reviews'] != '[]']

In [None]:
# dataset_twelve.to_pickle("D:\\GitHub-Thesis\\58,000 movies\\movie_twelve\\dataset_twelve_final_20012020.pkl") old

# dataset_twelve.to_pickle("D:\\GitHub-Thesis\\58,000 movies\\movie_twelve\\dataset_twelve_final_24012020.pkl") old

dataset_twelve.to_pickle("D:\\GitHub-Thesis\\58,000 movies\\movie_twelve\\dataset_twelve_final_25012020.pkl")

In [None]:
# STEP 17

dataset_twelve.shape

### - - - - - - - - -  - - - - - - - - -  - - - - - - - - -  - - - - - - - - -  - - - - - - - - -  - - - - - - - - - 

#### FINAL DATASET (23.01.2020)

#### 58,000 movies (newest dataset)

In [106]:
data_ones = pd.read_pickle("D:\\GitHub-Thesis\\58,000 movies\\movies_one\\dataset_one_final_24012020.pkl")
data_one = pd.read_pickle("D:\\GitHub-Thesis\\58,000 movies\\movies_one\\dataset_one_final_25012020.pkl")
#assert len(data_ones) == len(data_one)

data_twos = pd.read_pickle("D:\\GitHub-Thesis\\58,000 movies\\movie_two\\dataset_two_final_24012020.pkl")
data_two = pd.read_pickle("D:\\GitHub-Thesis\\58,000 movies\\movie_two\\dataset_two_final_25012020.pkl")
assert len(data_twos) == len(data_two)

data_threes = pd.read_pickle("D:\\GitHub-Thesis\\58,000 movies\\movie_three\\dataset_three_final_24012020.pkl")
data_three = pd.read_pickle("D:\\GitHub-Thesis\\58,000 movies\\movie_three\\dataset_three_final_25012020.pkl")
assert len(data_threes) == len(data_three)

data_fours = pd.read_pickle("D:\\GitHub-Thesis\\58,000 movies\\movie_four\\dataset_four_final_24012020.pkl")
data_four = pd.read_pickle("D:\\GitHub-Thesis\\58,000 movies\\movie_four\\dataset_four_final_25012020.pkl")
assert len(data_fours) == len(data_four)

data_fives = pd.read_pickle("D:\\GitHub-Thesis\\58,000 movies\\movie_five\\dataset_five_final_24012020.pkl")
data_five = pd.read_pickle("D:\\GitHub-Thesis\\58,000 movies\\movie_five\\dataset_five_final_25012020.pkl")
assert len(data_fives) == len(data_five)

data_sixs = pd.read_pickle("D:\\GitHub-Thesis\\58,000 movies\\movie_six\\dataset_six_final_24012020.pkl")
data_six = pd.read_pickle("D:\\GitHub-Thesis\\58,000 movies\\movie_six\\dataset_six_final_25012020.pkl")
assert len(data_sixs) == len(data_six)


data_sevens = pd.read_pickle("D:\\GitHub-Thesis\\58,000 movies\\movie_seven\\dataset_seven_final_24012020.pkl")
data_seven = pd.read_pickle("D:\\GitHub-Thesis\\58,000 movies\\movie_seven\\dataset_seven_final_25012020.pkl")
#assert len(data_sevens) == len(data_seven)


data_eights = pd.read_pickle("D:\\GitHub-Thesis\\58,000 movies\\movie_eight\\dataset_eight_final_24012020.pkl")
data_eight = pd.read_pickle("D:\\GitHub-Thesis\\58,000 movies\\movie_eight\\dataset_eight_final_25012020.pkl")
assert len(data_eights) == len(data_eight)


data_nines = pd.read_pickle("D:\\GitHub-Thesis\\58,000 movies\\movie_nine\\dataset_nine_final_24012020.pkl")
data_nine = pd.read_pickle("D:\\GitHub-Thesis\\58,000 movies\\movie_nine\\dataset_nine_final_25012020.pkl")
assert len(data_nines) == len(data_nine)


data_tens = pd.read_pickle("D:\\GitHub-Thesis\\58,000 movies\\movie_ten\\dataset_ten_final_24012020.pkl")
data_ten = pd.read_pickle("D:\\GitHub-Thesis\\58,000 movies\\movie_ten\\dataset_ten_final_25012020.pkl")
assert len(data_tens) == len(data_ten)


data_elevens = pd.read_pickle("D:\\GitHub-Thesis\\58,000 movies\\movie_eleven\\dataset_eleven_final_24012020.pkl")
data_eleven = pd.read_pickle("D:\\GitHub-Thesis\\58,000 movies\\movie_eleven\\dataset_eleven_final_25012020.pkl")
assert len(data_elevens) == len(data_eleven)


data_twelves = pd.read_pickle("D:\\GitHub-Thesis\\58,000 movies\\movie_twelve\\dataset_twelve_final_24012020.pkl")
data_twelve = pd.read_pickle("D:\\GitHub-Thesis\\58,000 movies\\movie_twelve\\dataset_twelve_final_25012020.pkl")
assert len(data_twelves) == len(data_twelve)

In [107]:
data_eight[data_eight.title == 'The 25th Hour (1967)']

Unnamed: 0,title,genres,rating,imdb_url,reviews_url,actors,plot,imdb_rating,director,reviews
0,The 25th Hour (1967),"[Drama, War]",3.5,http://www.imdb.com/title/tt0062445/,http://www.imdb.com/title/tt0062445/reviews?sp...,"[Anthony Quinn, Virna Lisi, Grégoire Aslan, Mi...","\n In World War II, a Roman...",7.4,Henri Verneuil,[Unlike the majority of movies from the 1960's...


In [108]:
data_twelve[data_twelve.isna().any(axis=1)]

Unnamed: 0,title,genres,rating,imdb_url,reviews_url,actors,plot,imdb_rating,director,reviews


In [None]:
title	genres	rating	imdb_url	reviews_url	actors	plot	imdb_rating	director	reviews

In [109]:
data_sevens.shape

(4427, 9)

In [110]:
data_seven.shape

(4103, 10)

In [111]:
final_dataset = pd.concat([data_one, data_two, data_three, data_four, data_five, data_six, 
                           data_seven, data_eight, data_nine, data_ten, data_eleven, data_twelve], ignore_index=True, sort=False)

final_dataset.shape

(49503, 10)

In [94]:
final_dataset.iloc[32326]

title                                       The 25th Hour (1967)
genres                                              [Drama, War]
rating                                                       7.4
imdb_url                    http://www.imdb.com/title/tt0062445/
reviews_url    http://www.imdb.com/title/tt0062445/reviews?sp...
actors         [Anthony Quinn, Virna Lisi, Grégoire Aslan, Mi...
plot           \n                    In World War II, a Roman...
imdb_rating                                                  NaN
director                                          Henri Verneuil
reviews        [Unlike the majority of movies from the 1960's...
Name: 32326, dtype: object

In [95]:
data_seven[data_seven.isna().any(axis=1)]

Unnamed: 0,title,genres,rating,imdb_url,reviews_url,actors,plot,imdb_rating,director,reviews


In [None]:
final_dataset.tail()

In [113]:
# Data with empty reviews

final_dataset = final_dataset[final_dataset.astype(str)['reviews'] != '[]']

final_dataset.shape

(49399, 10)

In [2]:
final_dataset = pd.read_pickle("D:\\GitHub-Thesis\\58,000 movies\\final_dataset_49399_movies_25012020.pkl")

In [114]:
# Data with no genres

final_dataset.genres.value_counts()

[Drama]                          7307
[Comedy]                         4292
[Documentary]                    2981
[(no genres listed)]             2849
[Comedy, Drama]                  1945
                                 ... 
[Fantasy, Musical, Mystery]         1
[Action, Children, Romance]         1
[Crime, War]                        1
[Action, Film-Noir, Thriller]       1
[Action, Animation, Western]        1
Name: genres, Length: 607, dtype: int64

In [4]:
final_dataset[final_dataset.astype(str)['genres'] == "['(no genres listed)']"].shape

# final_dataset[final_dataset.astype(str)['genres'] == "['(no genres listed)']"]

(2849, 10)

In [None]:
final_dataset[final_dataset.astype(str)['genres'] == "['(no genres listed)']"]

In [115]:
movies_no_genres = final_dataset[final_dataset.astype(str)['genres'] == "['(no genres listed)']"].reset_index(drop=True)

movies_no_genres.shape

(2849, 10)

In [6]:
genres_links = movies_no_genres.imdb_url

genres_url_list = []

for i in tqdm_notebook(genres_links):
    
    genres_url_list.append(requests.get(i))

HBox(children=(IntProgress(value=0, max=2849), HTML(value='')))




In [7]:
with open('D:\\GitHub-Thesis\\58,000 movies\\movies_with_no_genres_25012020.pkl', 'wb') as f:
    
    pickle.dump(genres_url_list, f)

In [117]:
with open('D:\\GitHub-Thesis\\58,000 movies\\movies_with_no_genres_25012020.pkl', 'rb') as f:
    
    genres_url_list = pickle.load(f)

In [118]:
souplist = []

for i in tqdm_notebook(genres_url_list):
    souplist.append(BeautifulSoup(i.text))
    
len(souplist)

HBox(children=(IntProgress(value=0, max=2849), HTML(value='')))




2849

In [119]:
myfield_genres = []
myfield_genres_final = []
genres = []

for i in tqdm_notebook(souplist):
    myfield_genres.append(i.find_all('div', {'class':'see-more inline canwrap'}))

myfield_genres_final = []

for item in myfield_genres:
    if len(item) == 2:
        myfield_genres_final.append([item[1]])
    elif len(item) == 1:
        myfield_genres_final.append([item[0]])

r_genres = re.compile("(?=genres)(.*)")

for i in tqdm_notebook(myfield_genres_final):
    for j in i:
        genres.append(j.find_all('a', {'href':r_genres}))

genres_final = []
for i in genres:
    genres_final.append(list(map(lambda x: x.text.strip(' ').replace('\n', ''), i)))

HBox(children=(IntProgress(value=0, max=2849), HTML(value='')))




HBox(children=(IntProgress(value=0, max=2843), HTML(value='')))




In [120]:
len(genres_final)

2843

In [121]:
index_to_remove_no_genres = [i for i,x in enumerate(myfield_genres) if not x]
index_to_remove_no_genres

[888, 1551, 2018, 2137, 2151, 2180]

In [122]:
movies_no_genres = movies_no_genres[~movies_no_genres.index.isin(index_to_remove_no_genres)]

movies_no_genres.shape

(2843, 10)

In [123]:
movies_no_genres.genres = genres_final

movies_no_genres.genres.head()

0                                   [Drama]
1                            [Short, Music]
2    [Drama, Fantasy, History, Sci-Fi, War]
3                                   [Drama]
4                                   [Drama]
Name: genres, dtype: object

In [124]:
for i in tqdm_notebook(range(len(movies_no_genres['genres']))):
    if len(movies_no_genres['genres'].iloc[i]) > 3:
        movies_no_genres['genres'].iloc[i] = movies_no_genres['genres'].iloc[i][0:3]
    elif len(movies_no_genres['genres'].iloc[i]) <= 3:
        movies_no_genres['genres'].iloc[i] = movies_no_genres['genres'].iloc[i]

HBox(children=(IntProgress(value=0, max=2843), HTML(value='')))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)





In [None]:
#### Now I have to append final_dataset with the movie_no_genres dataset!

In [125]:
movies_no_genres.head()

Unnamed: 0,title,genres,rating,imdb_url,reviews_url,actors,plot,imdb_rating,director,reviews
0,Away with Words (San tiao ren) (1999),[Drama],1.75,http://www.imdb.com/title/tt0213469/,http://www.imdb.com/title/tt0213469/reviews?sp...,"[Tadanobu Asano, Georgina Hobson, Christa Hugh...",\n The protagonist is Asano...,6.3,Christopher Doyle,[Here's the logic you might use to seek this o...
1,Scorpio Rising (1964),"[Short, Music]",3.61,http://www.imdb.com/title/tt0058555/,http://www.imdb.com/title/tt0058555/reviews?sp...,"[Ernie Allo, Bruce Byron, Frank Carifi, Steve ...",\n A gang of Nazi bikers pr...,6.9,Kenneth Anger,"[Kenneth Anger's ""Scorpio Rising"", set to the ..."
2,"Age of the Earth, The (A Idade da Terra) (1980)","[Drama, Fantasy, History]",2.0,http://www.imdb.com/title/tt0080910/,http://www.imdb.com/title/tt0080910/reviews?sp...,"[Maurício do Valle, Jece Valadão, Antonio Pita...",\n Four Third-World Christs...,6.8,Glauber Rocha,"[""This film is a portrait of Brazil and of mys..."
3,Milky Way (Tejút) (2007),[Drama],3.5,http://www.imdb.com/title/tt1092285/,http://www.imdb.com/title/tt1092285/reviews?sp...,"[Barbara Balogh, Sándor Balogh, Péter Balázs, ...",\n This work on a joint at ...,6.8,Benedek Fliegauf,"[As the note on the cover says, it is an ""ambi..."
4,Warsaw Bridge (Pont de Varsòvia) (1990),[Drama],3.2,http://www.imdb.com/title/tt0098139/,http://www.imdb.com/title/tt0098139/reviews?sp...,"[Carme Elias, Francisco Guijar, Jordi Dauder, ...","\n A female professor, a wr...",6.7,Pere Portabella,[Time to get excited about film again! Warsaw ...


In [126]:
movies_no_genres[movies_no_genres.astype(str)['genres'] == "['(no genres listed)']"].shape

(0, 10)

In [None]:
# 1st dataset 

final_dataset.head()

In [None]:
final_dataset.title[final_dataset.director == 'Phil Vischer']

In [None]:
# 2nd dataset 

movies_no_genres.head()

In [None]:
# 3rd dataset

data_older = pd.read_pickle("C:\\Users\\spano\\Desktop\\GitHub-Thesis\\pickled data\\dataset_part_2_07112019.pkl")

data_older.tail()

#### Comment: Since most of my older movies are part of the 50,000 movies I will no further use the older dataset of 10,139 movies

In [21]:
final_dataset[final_dataset.astype(str)['genres'] == "['(no genres listed)']"].shape

# final_dataset[final_dataset.astype(str)['genres'] == "['(no genres listed)']"]

(2849, 10)

In [127]:
final_dataset_test = pd.concat([final_dataset, movies_no_genres], ignore_index=True, sort=False)

final_dataset_test.shape

(52242, 10)

In [128]:
final_dataset_test = final_dataset_test[final_dataset_test.astype(str)['genres'] != "['(no genres listed)']"]

In [129]:
final_dataset_test[final_dataset_test.astype(str)['genres'] == "['(no genres listed)']"].shape

(0, 10)

In [130]:
final_dataset_test.shape

(49393, 10)

In [131]:
# final_dataset.to_pickle("D:\\GitHub-Thesis\\58,000 movies\\final_dataset_49399_movies_25012020.pkl") old 49399 movies

final_dataset_test.to_pickle("D:\\GitHub-Thesis\\58,000 movies\\final_dataset_49393_movies_25012020.pkl") #49393 movies

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

#### End of Part 1 - Update, clean & transfrom the dataset of movies