### Part 1 - Concatenate data from different sources and Download URLs

#### Import the libraries

In [None]:
# For cleaning and preparing the dataset
# -> dataframe manipulation
# -> text manipulation
# -> Web Scraping

import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import re
import os

# Module to serialize the content produced from the execution of the code

import pickle

# Module to monitor the progress of a python for loop

from tqdm import tqdm_notebook
# Example of Use: tqdm_notebook(examples, desc="Converting examples to features")

#### Create the dataset that joins the information found in the links and movie-title CSV files of the GroupLens research database

In [None]:
"""
Each of the four datasets has a different number of rows. As a result, they cannot be concatenated at once. 
In order, to concatenate them, we should bring them to a number of rows equal to the number of movies in the dataset. 
The only dataframe with those number of rows is the "movies.csv" containing the title and the genre(s) of each movie.
"""
dataset_links = pd.read_csv(os.path.join(os.getcwd(), "ml-latest//links.csv"))
dataset_movie_names = pd.read_csv(os.path.join(os.getcwd(), "ml-latest//movies.csv"))
dataset_movie_ranks = pd.read_csv(os.path.join(os.getcwd(), "ml-latest//ratings.csv"))
dataset_movie_tags = pd.read_csv(os.path.join(os.getcwd(), "ml-latest//tags.csv"))

In [None]:
# Preview the first rows of the links table (head() shows five rows by default).
dataset_links.head()

In [None]:
# Preview the first rows of the movies table (head() shows five rows by default).
dataset_movie_names.head()

In [None]:
# Preview the first rows of the ratings table (head() shows five rows by default).
dataset_movie_ranks.head()

In [None]:
# Preview the first rows of the tags table (head() shows five rows by default).
dataset_movie_tags.head()

In [None]:
"""
Below we can see the ratings of a movie given by different users. Thus, in order to have a single user rating for a movie-
we should find the average value per movie.
"""
dataset_movie_ranks[dataset_movie_ranks.movieId == 4]

In [None]:
"""
This is the average rating per movie. We get 53889 ratings although we have 58098 movies. This means that 4209 movies-
did not recieve any user rating. Thus, those movie's rating will be 0.
"""
list_sums = dataset_movie_ranks.groupby(['movieId'])['rating'].sum().tolist()
list_count = dataset_movie_ranks.groupby(['movieId'])['rating'].count().tolist()

list_final_rating = [round((x*1.0)/y,2) for x, y in zip(list_sums, list_count)]
len(list_final_rating)

In [None]:
"""
Left join the datasets containing movie titles and user ratings. THe dataset_rating includes information about movie title, genre and user rating for a single movie
"""
dataset_rating = pd.merge(dataset_movie_names, round(dataset_movie_ranks.groupby(['movieId'])['rating'].sum()/dataset_movie_ranks.groupby(['movieId'])['rating'].count(),2), on='movieId', how='left')

# As already mentioned, the rating of non-rated movies will be zero.
dataset_rating['rating'].fillna(0, inplace=True)

dataset_rating.head(15)

print('\n', dataset_rating.shape)

In [None]:
"""
Last but not least, join the above dataset with the dataset containing the movie links so as to generate the urls that will help us -
to download the content, the synopsis and the reviews for a single movie.
"""
dataset = pd.concat([dataset_links, dataset_rating], axis=1, join='outer')

dataset = dataset.loc[:,~dataset.columns.duplicated()]

dataset = dataset.drop(['tmdbId'], axis=1)

print(dataset.columns, '\n')
print(dataset.shape)

In [None]:
"""
Finally, since our problem is a multilabel classification one, we should bring the y_label, in this case the genres, to a list of multiple genres.
To do so, we replaced the symbol "|", with the comma punctuation and then splitted the string.
"""
dataset['genres'] = dataset['genres'].apply(lambda x: x.replace('|', ','))

dataset['genres'] = dataset['genres'].apply(lambda x: x.split(","))

for i in tqdm_notebook(range(len(dataset['genres']))):
    if len(dataset['genres'].iloc[i]) > 3:
        dataset['genres'].iloc[i] = dataset['genres'].iloc[i][0:3]
    elif len(dataset['genres'].iloc[i]) <= 3:
        dataset['genres'].iloc[i] = dataset['genres'].iloc[i]

In [None]:
"""
As already informed by the Grouplens website, the number of total movies is equal to 58098.
"""
dataset.shape

In [None]:
"""
No null values in the ratings column.
"""
dataset.rating.isnull().sum()

#### Links of movie features

In [None]:
"""
In order to create a valid hyper reference link that we could later use for data extraction, we should apply to the column "imdbId",
the following string:

"http://www.imdb.com/title/tt" plus (+) a number of 0's depending on the imbdId of the movie.

Prior deciding to create the online links, we observed that every imdb page of a movie had the above http protocol reference incommon.
"""

imdb_url = []

imdb_ids = dataset['imdbId'].values.tolist()

for i in tqdm_notebook(range(len(imdb_ids))):

    if len(str(dataset['imdbId'].iloc[i])) == 7:
        
        imdb_url.append("http://www.imdb.com/title/tt" + str(dataset['imdbId'].iloc[i]) + "/")

    elif len(str(dataset['imdbId'].iloc[i])) == 6:

        imdb_url.append("http://www.imdb.com/title/tt0" + str(dataset['imdbId'].iloc[i]) + "/")
        
    elif len(str(dataset['imdbId'].iloc[i])) == 5:
        
        imdb_url.append("http://www.imdb.com/title/tt00" + str(dataset['imdbId'].iloc[i]) + "/")
        
    elif len(str(dataset['imdbId'].iloc[i])) == 4:
        
        imdb_url.append("http://www.imdb.com/title/tt000" + str(dataset['imdbId'].iloc[i]) + "/")
        
    elif len(str(dataset['imdbId'].iloc[i])) == 3:
        
        imdb_url.append("http://www.imdb.com/title/tt0000" + str(dataset['imdbId'].iloc[i]) + "/")
    
    elif len(str(dataset['imdbId'].iloc[i])) == 2:
        
        imdb_url.append("http://www.imdb.com/title/tt00000" + str(dataset['imdbId'].iloc[i]) + "/")
        
    elif len(str(dataset['imdbId'].iloc[i])) == 1:
        
        imdb_url.append("http://www.imdb.com/title/tt000000" + str(dataset['imdbId'].iloc[i]) + "/")

In [None]:
"""
Pandas module can create a new dataframe column using the content of a python list. Thus, we created the column "imdb_url",
based on the list created on the previous cell.
"""
dataset['imdb_url'] = imdb_url

dataset.head()

In [None]:
"""
Having the main movie page ready to download, we then created the custom urls to access the synopsis page for each movie,
since we observed that a common pattern was followed like in the urls of the main movie page.

Pattern:"plotsummary?ref_=tt_stry_pl#synopsis" after the imdb_url
"""
dataset["synopsis_url"] = dataset["imdb_url"].apply(lambda x: x + "plotsummary?ref_=tt_stry_pl#synopsis")

dataset.head()

In [None]:
"""
The reviews url for a movie createed with the same procedure as the synopsis url.

Pattern: "reviews?spoiler=hide&sort=helpfulnessScore&dir=desc&ratingFilter=0"
"""
dataset["reviews_url"] = dataset["imdb_url"].apply(lambda x: x + "reviews?spoiler=hide&sort=helpfulnessScore&dir=desc&ratingFilter=0")

dataset.head()

In [None]:
"""
Save the final dataset containing the:

1) movieId,
2) imdbId,
3) title,
4) genres.
5) rating (user rating)
6) imdb_url (main page of the movie on IMDB website)
7) synopsis_url (web-page containing plot summary & plot synopsis of a movie)
8) reviews_url (web-page containing the top n reviews of a movie). We did not include all the reviews, only those contained in the first HTML page
"""
#dataset.to_pickle('dataset_58,000_14012020_latest_version.pkl')

In [None]:
"""
No null values accross the 8 columns and 58098 rows.
"""
dataset = pd.read_pickle(os.path.join(os.getcwd(), "pickled_data_per_part\\dataset_58,000_14012020_latest_version.pkl"))
print(dataset.isnull().sum(), '\n')
print(dataset.shape)

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 

### Extract the online HTML docs for each movie. The online web-pages will contain content of:

    1) Movie content,
    2) Plot synopsis content,
    3) User reviews content.

#### Download the first five thousand movies (58098-5000 = 53098)

In [None]:
"""
Extract the movie content (column: imdb_url)
"""
dataset_one = dataset.imdb_url.iloc[:5000]

list_one = []

for i in tqdm_notebook(dataset_one, desc = "Extract the content of the first 5000 movies"):
    
    list_one.append(requests.get(i))

In [None]:
"""
Extract the movie synopsis (column: synopsis_url)
"""
dataset_synopsis_one = dataset['synopsis_url'].iloc[:5000]

synopsis_one = []

for i in tqdm_notebook(dataset_synopsis_one, desc = "Extract the plot synopsis of the first 5000 movies"):
    
    synopsis_one.append(requests.get(i))

In [None]:
"""
Extract the user reviews (column: reviews_url)
"""
dataset_reviews_one = dataset['reviews_url'].iloc[:5000]

reviews_one = []

for i in tqdm_notebook(dataset_reviews_one, desc = "Extract the user reviews of the first 5000 movies"):
    
    reviews_one.append(requests.get(i))

#### Download the second five thousand movies (53098-5000 = 48098)

In [None]:
"""
Extract the movie content (column: imdb_url)
"""
dataset_two = dataset.imdb_url.iloc[5000:10000]

list_two = []

for i in tqdm_notebook(dataset_two, desc = "Extract the content of the second 5000 movies (10,000)"):
    
    list_two.append(requests.get(i))

In [None]:
"""
Extract the movie synopsis (column: synopsis_url)
"""
dataset_synopsis_two = dataset['synopsis_url'].iloc[5000:10000]

synopsis_two = []

for i in tqdm_notebook(dataset_synopsis_two, desc = "Extract the plot synopsis of the second 5000 movies (10,000)"):
    
    synopsis_two.append(requests.get(i))

In [None]:
"""
Extract the user reviews (column: reviews_url)
"""
dataset_reviews_two = dataset['reviews_url'].iloc[5000:10000]

reviews_two = []

for i in tqdm_notebook(dataset_reviews_two, desc = "Extract the user reviews of the second 5000 movies (10,000)"):
    
    reviews_two.append(requests.get(i))

#### Download the third five thousand movies (48098-5000 = 43098)

In [None]:
"""
Extract the movie content (column: imdb_url)
"""
dataset_three = dataset.imdb_url.iloc[10000:15000]

list_three = []

for i in tqdm_notebook(dataset_three, desc = "Extract the content of the third 5000 movies (15,000)"):
    
    list_three.append(requests.get(i))

In [None]:
"""
Extract the movie synopsis (column: synopsis_url)
"""
dataset_synopsis_three = dataset['synopsis_url'].iloc[10000:15000]

synopsis_three = []

for i in tqdm_notebook(dataset_synopsis_three, desc = "Extract the plot synopsis of the third 5000 movies (15,000)"):
    
    synopsis_three.append(requests.get(i))

In [None]:
"""
Extract the user reviews (column: reviews_url)
"""
dataset_reviews_three = dataset['reviews_url'].iloc[10000:15000]

reviews_three = []

for i in tqdm_notebook(dataset_reviews_three, desc = "Extract the user reviews of the third 5000 movies (15,000)"):
    
    reviews_three.append(requests.get(i))

#### Download the fourth five thousand movies (43098-5000 = 38098)

In [None]:
"""
Extract the movie content (column: imdb_url)
"""
dataset_four = dataset.imdb_url.iloc[15000:20000]

list_four = []

for i in tqdm_notebook(dataset_four, desc = "Extract the content of the fourth 5000 movies (20,000)"):
    
    list_four.append(requests.get(i))

In [None]:
"""
Extract the movie synopsis (column: synopsis_url)
"""
dataset_synopsis_four = dataset['synopsis_url'].iloc[15000:20000]

synopsis_four = []

for i in tqdm_notebook(dataset_synopsis_four, desc = "Extract the plot synopsis of the fourth 5000 movies (20,000)"):
    
    synopsis_four.append(requests.get(i))

In [None]:
"""
Extract the user reviews (column: reviews_url)
"""
dataset_reviews_four = dataset['reviews_url'].iloc[15000:20000]

reviews_four = []

for i in tqdm_notebook(dataset_reviews_four, desc = "Extract the user reviews of the fourth 5000 movies (20,000)"):
    
    reviews_four.append(requests.get(i))

#### Download the fifth five thousand movies (38098-5000 = 33098)

In [None]:
"""
Extract the movie content (column: imdb_url)
"""
dataset_five = dataset.imdb_url.iloc[20000:25000]

list_five = []

for i in tqdm_notebook(dataset_five, desc = "Extract the content of the fifth 5000 movies (25,000)"):
    
    list_five.append(requests.get(i))

In [None]:
"""
Extract the movie synopsis (column: synopsis_url)
"""
dataset_synopsis_five = dataset['synopsis_url'].iloc[20000:25000]

synopsis_five = []

for i in tqdm_notebook(dataset_synopsis_five, desc = "Extract the plot synopsis of the fifth 5000 movies (25,000)"):
    
    synopsis_five.append(requests.get(i))

In [None]:
"""
Extract the user reviews (column: reviews_url)
"""
dataset_reviews_five = dataset['reviews_url'].iloc[20000:25000]

reviews_five = []

for i in tqdm_notebook(dataset_reviews_five, desc = "Extract the user reviews of the fifth 5000 movies (25,000)"):
    
    reviews_five.append(requests.get(i))

#### Download the sixth five thousand movies (33098-5000 = 28098)

In [None]:
"""
Extract the movie content (column: imdb_url)
"""
dataset_six = dataset.imdb_url.iloc[25000:30000]

list_six = []

for i in tqdm_notebook(dataset_six, desc = "Extract the content of the sixth 5000 movies (30,000)"):
    
    list_six.append(requests.get(i))

In [None]:
"""
Extract the movie synopsis (column: synopsis_url)
"""
dataset_synopsis_six = dataset['synopsis_url'].iloc[25000:30000]

synopsis_six = []

for i in tqdm_notebook(dataset_synopsis_six, desc = "Extract the plot synopsis of the sixth 5000 movies (30,000)"):
    
    synopsis_six.append(requests.get(i))

In [None]:
"""
Extract the user reviews (column: reviews_url)
"""
dataset_reviews_six = dataset['reviews_url'].iloc[25000:30000]

reviews_six = []

for i in tqdm_notebook(dataset_reviews_six, desc = "Extract the user reviews of the sixth 5000 movies (30,000)"):
    
    reviews_six.append(requests.get(i))

#### Download the seventh five thousand movies (28098-5000 = 23098)

In [None]:
"""
Extract the movie content (column: imdb_url)
"""
dataset_seven = dataset.imdb_url.iloc[30000:35000]

list_seven = []

for i in tqdm_notebook(dataset_seven, desc = "Extract the content of the seventh 5000 movies (35,000)"):
    
    list_seven.append(requests.get(i))

In [None]:
"""
Extract the movie synopsis (column: synopsis_url)
"""
dataset_synopsis_seven = dataset['synopsis_url'].iloc[30000:35000]

synopsis_seven = []

for i in tqdm_notebook(dataset_synopsis_seven, desc = "Extract the plot synopsis of the seventh 5000 movies (35,000)"):
    
    synopsis_seven.append(requests.get(i))

In [None]:
"""
Extract the user reviews (column: reviews_url)
"""
dataset_reviews_seven = dataset['reviews_url'].iloc[30000:35000]

reviews_seven = []

for i in tqdm_notebook(dataset_reviews_seven, desc = "Extract the user reviews of the seventh 5000 movies (35,000)"):
    
    reviews_seven.append(requests.get(i))

#### Download the eighth five thousand movies (23098-5000 = 18098)

In [None]:
"""
Extract the movie content (column: imdb_url)
"""
dataset_eight = dataset.imdb_url.iloc[35000:40000]

list_eight = []

for i in tqdm_notebook(dataset_eight, desc = "Extract the content of the eighth 5000 movies (40,000)"):
    
    list_eight.append(requests.get(i))

In [None]:
"""
Extract the movie synopsis (column: synopsis_url)
"""
dataset_synopsis_eight = dataset['synopsis_url'].iloc[35000:40000]

synopsis_eight = []

for i in tqdm_notebook(dataset_synopsis_eight, desc = "Extract the plot synopsis of the eighth 5000 movies (40,000)"):
    
    synopsis_eight.append(requests.get(i))

In [None]:
"""
Extract the user reviews (column: reviews_url)
"""
dataset_reviews_eight = dataset['reviews_url'].iloc[35000:40000]

reviews_eight = []

for i in tqdm_notebook(dataset_reviews_eight, desc = "Extract the user reviews of the eighth 5000 movies (40,000)"):
    
    reviews_eight.append(requests.get(i))

#### Download the ninth five thousand movies (18098-5000 = 13098)

In [None]:
"""
Extract the movie content (column: imdb_url)
"""
dataset_nine = dataset.imdb_url.iloc[40000:45000]

list_nine = []

for i in tqdm_notebook(dataset_nine, desc = "Extract the content of the ninth 5000 movies (45,000)"):
    
    list_nine.append(requests.get(i))

In [None]:
"""
Extract the movie synopsis (column: synopsis_url)
"""
dataset_synopsis_nine = dataset['synopsis_url'].iloc[40000:45000]

synopsis_nine = []

for i in tqdm_notebook(dataset_synopsis_nine, desc = "Extract the plot synopsis of the ninth 5000 movies (45,000)"):
    
    synopsis_nine.append(requests.get(i))

In [None]:
"""
Extract the user reviews (column: reviews_url)
"""
dataset_reviews_nine = dataset['reviews_url'].iloc[40000:45000]

reviews_nine = []

for i in tqdm_notebook(dataset_reviews_nine, desc = "Extract the user reviews of the ninth 5000 movies (45,000)"):
    
    reviews_nine.append(requests.get(i))

#### Download the tenth five thousand movies (13098-5000 = 8098)

In [None]:
"""
Extract the movie content (column: imdb_url)
"""
dataset_ten = dataset.imdb_url.iloc[45000:50000]

list_ten = []

for i in tqdm_notebook(dataset_ten, desc = "Extract the content of the tenth 5000 movies (50,000)"):
    
    list_ten.append(requests.get(i))

In [None]:
"""
Extract the movie synopsis (column: synopsis_url)
"""
dataset_synopsis_ten = dataset['synopsis_url'].iloc[45000:50000]

synopsis_ten = []

for i in tqdm_notebook(dataset_synopsis_ten, desc = "Extract the plot synopsis of the tenth 5000 movies (50,000)"):
    
    synopsis_ten.append(requests.get(i))

In [None]:
"""
Extract the user reviews (column: reviews_url)
"""
dataset_reviews_ten = dataset['reviews_url'].iloc[45000:50000]

reviews_ten = []

for i in tqdm_notebook(dataset_reviews_ten, desc = "Extract the user reviews of the tenth 5000 movies (50,000)"):
    
    reviews_ten.append(requests.get(i))

#### Download the eleventh five thousand movies (8098-5000 = 3098)

In [None]:
"""
Extract the movie content (column: imdb_url)
"""
dataset_eleven = dataset.imdb_url.iloc[50000:55000]

list_eleven = []

for i in tqdm_notebook(dataset_eleven, desc = "Extract the content of the eleventh 5000 movies (55,000)"):
    
    list_eleven.append(requests.get(i))

In [None]:
"""
Extract the movie synopsis (column: synopsis_url)
"""
dataset_synopsis_eleven = dataset['synopsis_url'].iloc[50000:55000]

synopsis_eleven = []

for i in tqdm_notebook(dataset_synopsis_eleven, desc = "Extract the plot synopsis of the eleventh 5000 movies (55,000)"):
    
    synopsis_eleven.append(requests.get(i))

In [None]:
"""
Extract the user reviews (column: reviews_url)
"""
dataset_reviews_eleven = dataset['reviews_url'].iloc[50000:55000]

reviews_eleven = []

for i in tqdm_notebook(dataset_reviews_eleven, desc = "Extract the user reviews of the eleventh 5000 movies (55,000)"):
    
    reviews_eleven.append(requests.get(i))

#### Download the twelfth batch: the remaining 3098 movies (3098-3098 = 0)

In [None]:
"""
Extract the movie content (column: imdb_url)
"""
dataset_twelve = dataset.imdb_url.iloc[55000:]

list_twelve = []

for i in tqdm_notebook(dataset_twelve, desc = "Extract the content of the remainig 3098 movies (58,098)"):
    
    list_twelve.append(requests.get(i))

In [None]:
"""
Extract the movie synopsis (column: synopsis_url)
"""
dataset_synopsis_twelve = dataset['synopsis_url'].iloc[55000:]

synopsis_twelve = []

for i in tqdm_notebook(dataset_synopsis_twelve, desc = "Extract the plot synopsis of the remainig 3098 movies (58,098)"):
    
    synopsis_twelve.append(requests.get(i))

In [None]:
"""
Extract the user reviews (column: reviews_url)
"""
dataset_reviews_twelve = dataset['reviews_url'].iloc[55000:]

reviews_twelve = []

for i in tqdm_notebook(dataset_reviews_twelve, desc = "Extract the user reviews of the remainig 3098 movies (58,098)"):
    
    reviews_twelve.append(requests.get(i))

### - - - - - - - - -  - - - - - - - - -  - - - - - - - - -  - - - - - - - - -  - - - - - - - - -  - - - - - - - - - 

#### Now that we have extracted the online HTML docs for each movie, we will proceed into the information extraction.

The code shown next has been split into 12 different Jupyter Notebooks. Thus, we advise having a look at the Notebooks numbered 1.1 to 1.12 in order to check the information extraction.

We extracted the following 6 fields:

* field 1: Plot Summary (a short summary of the movies scenario)
* field 2: Actors (15 or less number of actors/actresses in the cast)
* field 3: Directors (Name of the director(s))
* field 4: IMDB rating (The IMDB rate of the movie given by the users)
* field 5: Plot Synopsis (The whole synopsis text of the movie)
* field 6: Reviews (The first user reviews)

To extract those 6 fields, we first transformed the list of downloaded pages into Beautiful Soup objects. This process consumed a significant amount of time and RAM. Thus, we propose checking each notebook independently.

After the transformation to a beautiful soup object each field is extracted with some for loops and the .find_all() method, which is an appropriate search tool of HTML tags (i.e /href string).

### - - - - - - - - -  - - - - - - - - -  - - - - - - - - -  - - - - - - - - -  - - - - - - - - -  - - - - - - - - - 

#### Having extracted the movie information we continue into the construction of the final dataset.

#### Create the FINAL DATASET

#### Firstly, import the twelve datasets created as the output of each of the 12 Jupyter Notebooks.

In [None]:
def _load_batch(folder, file_name):
    """Read one per-batch result table from the "58,000 movies" directory under the working directory.

    folder: sub-directory of the batch; file_name: name of the pickled DataFrame inside it.
    Returns whatever object pd.read_pickle restores from that file.
    """
    # Components are joined by os.path.join so the path is valid on every OS, not only where
    # "\\" is the directory separator.
    # NOTE: unpickling can execute code stored in the file; only load pickles created by this project.
    return pd.read_pickle(os.path.join(os.getcwd(), "58,000 movies", folder, file_name))


# The folder of the first batch is spelled "movies_one" (plural), unlike the other eleven.
data_one = _load_batch("movies_one", "dataset_one_final_25012020.pkl")
data_two = _load_batch("movie_two", "dataset_two_final_25012020.pkl")
data_three = _load_batch("movie_three", "dataset_three_final_25012020.pkl")
data_four = _load_batch("movie_four", "dataset_four_final_25012020.pkl")
data_five = _load_batch("movie_five", "dataset_five_final_25012020.pkl")
data_six = _load_batch("movie_six", "dataset_six_final_25012020.pkl")
data_seven = _load_batch("movie_seven", "dataset_seven_final_25012020.pkl")
data_eight = _load_batch("movie_eight", "dataset_eight_final_25012020.pkl")
data_nine = _load_batch("movie_nine", "dataset_nine_final_25012020.pkl")
data_ten = _load_batch("movie_ten", "dataset_ten_final_25012020.pkl")
data_eleven = _load_batch("movie_eleven", "dataset_eleven_final_25012020.pkl")
data_twelve = _load_batch("movie_twelve", "dataset_twelve_final_25012020.pkl")

In [None]:
"""
Check for NAs
"""
movie_tables_list=[data_one, data_two, data_three, data_four, data_five, data_six, data_seven, data_eight, data_nine, data_ten, data_eleven, data_twelve]
for i,j in enumerate(movie_tables_list):
    print("{0}/{1}".format(i+1, len(movie_tables_list)))
    print(j[j.isna().any(axis=1)])
    print("\n")

In [None]:
"""
Concatenate the 12-batch datasets together to form the final extracted movies table
"""
final_dataset = pd.concat([data_one, data_two, data_three, data_four, data_five, data_six, 
                           data_seven, data_eight, data_nine, data_ten, data_eleven, data_twelve], 
                          ignore_index=True, sort=False)
final_dataset.shape

In [None]:
"""
Remove the data with empty list of reviews
"""
final_dataset = final_dataset[final_dataset.astype(str)['reviews'] != '[]']
final_dataset.shape

In [None]:
"""
Data with genre tags like "(no genres listed)" should be removed. Although, since those genres were not included by Grouplens, 
we decied to download them.
"""
final_dataset.genres.value_counts()

In [None]:
"""
The final dataset has 2849 movies with the following tag on the column genre "(no genres listed)".
Thus, we isolated those 2849 movies and we downloaded their genres
"""
final_dataset[final_dataset.astype(str)['genres'] == "['(no genres listed)']"].shape

In [None]:
# Movies whose genres are the placeholder "(no genres listed)". Only the "genres" column is
# converted to text for the comparison, not the whole DataFrame.
# reset_index(drop=True) renumbers the rows 0..n-1 so that row labels equal row positions.
movies_no_genres = final_dataset[
    final_dataset['genres'].astype(str) == "['(no genres listed)']"
].reset_index(drop=True)
movies_no_genres.shape

### -------------------------------------- --------------------------------------
Download the movies with no listed genres

In [None]:
"""
Download the movie genres of those 2849 movies.
"""
genres_links = movies_no_genres.imdb_url
genres_url_list = []
for i in tqdm_notebook(genres_links):
    
    genres_url_list.append(requests.get(i))

In [None]:
"""
Pickle the downloaded list of HTML documents, because this task takes time to complete at every notebook execution
"""
# with open(os.path.join(os.getcwd(),'58,000 movies\\movies_with_no_genres_25012020.pkl'), 'wb') as f:
#     pickle.dump(genres_url_list, f)

In [None]:
"""
Read the above pickled file
"""
with open(os.path.join(os.getcwd(),'58,000 movies\\movies_with_no_genres_25012020.pkl'), 'rb') as f:
    genres_url_list = pickle.load(f)

In [None]:
"""
Extract the genres using Beautiful Soup
"""
souplist = []

for i in tqdm_notebook(genres_url_list):
    souplist.append(BeautifulSoup(i.text))
print(len(souplist))

In [None]:
# Every <div class="see-more inline canwrap"> block of each parsed page (one result list per page).
myfield_genres = [
    soup.find_all('div', {'class':'see-more inline canwrap'})
    for soup in tqdm_notebook(souplist)
]

# Keep one block per page: the second block when two were found, the only block when one was found.
# Pages with zero or more than two blocks contribute nothing, so this list can be shorter than
# myfield_genres.
myfield_genres_final = []
for blocks in myfield_genres:
    if len(blocks) in (1, 2):
        myfield_genres_final.append([blocks[-1]])

# Matches href values that contain the text "genres".
r_genres = re.compile("(?=genres)(.*)")

# Anchor tags of every kept block whose href matches r_genres.
genres = []
for kept_blocks in tqdm_notebook(myfield_genres_final):
    for block in kept_blocks:
        genres.append(block.find_all('a', {'href':r_genres}))

# Anchor texts with surrounding spaces stripped and newlines removed: one list per kept page.
genres_final = [
    [anchor.text.strip(' ').replace('\n', '') for anchor in anchors]
    for anchors in genres
]

In [None]:
"""
6 genre lists less, probably those 6 links never had a written genre.
"""
len(genres_final)

In [None]:
"""
The 6 index movies with no genres should be removed.
"""
index_to_remove_no_genres = [i for i,x in enumerate(myfield_genres) if not x]
index_to_remove_no_genres

In [None]:
# Drop the movies for which no genre list could be extracted.
keep_row = ~movies_no_genres.index.isin(index_to_remove_no_genres)

# .copy() makes the result an independent DataFrame: its "genres" column is overwritten in the
# following cells, and assigning into a filtered view triggers SettingWithCopyWarning.
movies_no_genres = movies_no_genres[keep_row].copy()
movies_no_genres.shape

In [None]:
# Replace the placeholder genres with the downloaded ones (one list per remaining row, by position).
movies_no_genres['genres'] = genres_final
movies_no_genres['genres'].head()

In [None]:
"""
Check if any of the genres download have an empty list.
"""
movies_no_genres[movies_no_genres.astype(str)['genres'] == "['(no genres listed)']"].shape

In [None]:
"""
This step trims the movies with more than three genres from the "movies_no_genres" table.
This is important to remember because the whole classification takes into account up to 3 movie genres for a single movie.
Even though a particular movie could belong to 4, 5, 6 and  more genres.
"""
for i in tqdm_notebook(range(len(movies_no_genres['genres']))):
    if len(movies_no_genres['genres'].iloc[i]) > 3:
        movies_no_genres['genres'].iloc[i] = movies_no_genres['genres'].iloc[i][0:3]
    elif len(movies_no_genres['genres'].iloc[i]) <= 3:
        movies_no_genres['genres'].iloc[i] = movies_no_genres['genres'].iloc[i]

### -------------------------------------- --------------------------------------

#### Now we have to append the movies_no_genres dataset to final_dataset!

In [None]:
"""
1st movie's table - The one created earlier out of the 12 mini tables
"""
final_dataset.head()

In [None]:
"""
2nd movie's table - The one that downloaded the sequences of genres for movies that had the tag "(no genres listed)"
"""
movies_no_genres.head()

In [None]:
"""
Concatenate the final_dataset with the movies_no_genres dataset.In the latter, we include the genres of the movies with no genre listed, belonging to the former dataset.
"""
final_dataset_test = pd.concat([final_dataset, movies_no_genres], ignore_index=True, sort=False)
final_dataset_test.shape

In [None]:
"""
Remove movies with untagged genres
"""
final_dataset_test = final_dataset_test[final_dataset_test.astype(str)['genres'] != "['(no genres listed)']"]

In [None]:
"""
The final dataset contain no movie untagged of genres.
"""
final_dataset_test[final_dataset_test.astype(str)['genres'] == "['(no genres listed)']"].shape

In [None]:
"""
The shape of the final dataset
"""
final_dataset_test.shape

In [None]:
# Persist the final table. The to_pickle(...) call is now properly closed (the missing ")" made
# this cell a SyntaxError), and the path components are joined by os.path.join so the path is
# valid on every OS, not only where "\\" is the directory separator.
final_dataset_test.to_pickle(
    os.path.join(os.getcwd(), "58,000 movies", "final_dataset_49393_movies_25012020.pkl")
)

#### End of Part 1 - Update, clean & transform the dataset of movies