#### Import the libraries

In [1]:
# For cleaning and preparing the dataset
# -> dataframe manipulation
# -> text manipulation
# -> Web Scrapping

import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import re


# Module to serialize the content produced from the execution of the code

import pickle


# Module to monitor the progress of a python for loop

from tqdm import tqdm


# Module to manipulate text in python - NLTK package

import nltk
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords


# Module to compute word vectorizers and compute the cosine distance

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_distances

In [None]:
# !pip3 install pyodbc

#### Import the two datasets (07.11.2019)

#### The first dataset of 4774 movies already made used of them!

In [3]:
dataset = pd.read_pickle('dataset_one_07112019.pkl')

#### The second dataset of movies download on 08.10.2019

In [71]:
dataset_two = pd.read_csv('second_dataset.csv', error_bad_lines=False)

b'Skipping line 66: expected 44 fields, saw 46\nSkipping line 111: expected 44 fields, saw 45\nSkipping line 198: expected 44 fields, saw 45\nSkipping line 222: expected 44 fields, saw 46\nSkipping line 278: expected 44 fields, saw 45\nSkipping line 396: expected 44 fields, saw 45\nSkipping line 403: expected 44 fields, saw 45\nSkipping line 421: expected 44 fields, saw 45\nSkipping line 437: expected 44 fields, saw 45\nSkipping line 462: expected 44 fields, saw 46\nSkipping line 491: expected 44 fields, saw 45\nSkipping line 515: expected 44 fields, saw 45\nSkipping line 529: expected 44 fields, saw 45\nSkipping line 530: expected 44 fields, saw 45\nSkipping line 558: expected 44 fields, saw 45\nSkipping line 623: expected 44 fields, saw 45\nSkipping line 646: expected 44 fields, saw 45\nSkipping line 663: expected 44 fields, saw 46\nSkipping line 713: expected 44 fields, saw 45\nSkipping line 730: expected 44 fields, saw 47\nSkipping line 791: expected 44 fields, saw 45\nSkipping lin




In [5]:
dataset_two.head()

Unnamed: 0,fn,tid,title,wordsInTitle,url,imdbRating,ratingCount,duration,year,type,...,News,RealityTV,Romance,SciFi,Short,Sport,TalkShow,Thriller,War,Western
0,titles01/tt0012349,tt0012349,Der Vagabund und das Kind (1921),der vagabund und das kind,http://www.imdb.com/title/tt0012349/,8.4,40550.0,3240.0,1921.0,video.movie,...,0,0,0,0,0,0,0,0,0,0
1,titles01/tt0015864,tt0015864,Goldrausch (1925),goldrausch,http://www.imdb.com/title/tt0015864/,8.3,45319.0,5700.0,1925.0,video.movie,...,0,0,0,0,0,0,0,0,0,0
2,titles01/tt0017136,tt0017136,Metropolis (1927),metropolis,http://www.imdb.com/title/tt0017136/,8.4,81007.0,9180.0,1927.0,video.movie,...,0,0,0,1,0,0,0,0,0,0
3,titles01/tt0017925,tt0017925,Der General (1926),der general,http://www.imdb.com/title/tt0017925/,8.3,37521.0,6420.0,1926.0,video.movie,...,0,0,0,0,0,0,0,0,0,0
4,titles01/tt0021749,tt0021749,Lichter der Großstadt (1931),lichter der gro stadt,http://www.imdb.com/title/tt0021749/,8.7,70057.0,5220.0,1931.0,video.movie,...,0,0,1,0,0,0,0,0,0,0


#### Extract and append the links of the second dataset to the first one.

In [72]:
class color:
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'
    BOLD = "\033[1m"
    UNDERLINE = '\033[4m'
    END = "\033[0m"

second_links = dataset_two.url

print("\nThe first five rows of the new links:\n{}".format(second_links.head(5)))

print("\nThe length of the url column of the second dataset: {}".format(len(second_links)))

print("\nThus I have 14332 rows to manilpulate and add additional content in my initial dataset!")
print("\nHowever, it is important before using all the {} links to understand what my data is about!".format(len(second_links)))

first_links = dataset.movie_imdb_link
print("\nContrary to the second dataset, the first dataset I already made use of, has {} active imdb links".format(len(first_links)))

# -------------------------------------------------------------------------------------------------------

print("\n---------------------------------------------------------------------------------")
print("\nHaving imported the two datasets it is now important to check if the two datasets has duplicate links, \nsince I don't want to extract the same movie second time.")

my_series = [first_links, second_links]

my_total_links = pd.concat(my_series)

print("\nThe number of total movies to manipulate is: {}".format(len(my_total_links)))

duplicates = my_total_links.value_counts().tolist()

empty_l = []
for i in duplicates:
    if i > 1:
        empty_l.append(i)
        
print("\nThe number of duplicate urls in the two dataset is: {}".format(len(empty_l)))

if len(empty_l) > 0:
    print("\nDuplicates have been spotted!")
    
else:
    print("\nNo duplicates have been spotted!")
    print("\nEven though no duplicate links have been spotted, the same movie but in different language may exist. \nAs we will see later this is a real case!")

# -------------------------------------------------------------------------------------------------------

print("\n---------------------------------------------------------------------------------")
print(color.BOLD + "\nCleaning the TV series of any type from the dataset" + color.END)
print("\nHavning checked for duplicate rows now I will move on removing the rows that are not Movies, but TV episodes/series/\nshort movies/gaming videos, etc.")

print("\n1) The first indication of a TV episode is given in the column 'RealityTV' where the value 1 indicates a reality tv show, \nwhereas the value 0 indicates anything else but reality show.")

dataset_two[dataset_two.RealityTV==1].head(5)

print("\nThe shape of the second dataset that has reality series is: {}".format(dataset_two[dataset_two.RealityTV==1].shape))
print("\nThose 125 rows should be deleted!")

dataset_two = dataset_two[dataset_two.RealityTV != 1]

print("\nThe shape of the second dataset, with reality series removed, is: {}".format(dataset_two.shape))

print("\n---------------------------------------------------------------------------------")
print("\n2) Having deleted the rows that had the value 1 in the 'RealityTV' column, now I will delete the rows that in the column\n'title' have the word 'Episode'.")

empty_episodes = []

for i in dataset_two['title']:
    if 'Episode' in i: 
        empty_episodes.append(i)
    else:
        pass

print("\nThe number of rows that contain the word 'Episode' in their title is: {}".format(len(empty_episodes)))
print("\nSome of those {} rows are: {}".format(len(empty_episodes), empty_episodes[0:5]))

dataset_two_cleaned = dataset_two[~dataset_two.title.isin(empty_episodes)]

print("\nThe shape of the new dataset is {}, {} rows less".format(dataset_two_cleaned.shape, len(empty_episodes)))
print("\n---------------------------------------------------------------------------------")
print("\n3) Having deleted the 'Episodes' from the dataset now I should remove the 'TV-Series'.")

empty_tvseries = []

for i in dataset_two['title']:
    if 'TV Series' in i: 
        empty_tvseries.append(i)
    else:
        pass

print("\nThe number of rows that contain the word 'TV-Series' in their title is: {}".format(len(empty_tvseries)))
print("\nSome of those {} rows are: {}".format(len(empty_tvseries), empty_tvseries[0:5]))

dataset_two_cleaned = dataset_two_cleaned[~dataset_two_cleaned.title.isin(empty_tvseries)]
print("\nThe shape of the new dataset is {}, {} rows less".format(dataset_two_cleaned.shape, len(empty_tvseries)))
print("\n---------------------------------------------------------------------------------")
print("\n4) Having deleted the 'TV-Series' from the dataset, now I should remove the rows containing the word 'Video Game'.")

empty_videogame = []

for i in dataset_two['title']:
    if 'Video Game' in i: 
        empty_videogame.append(i)
    else:
        pass

print("\nThe number of rows that contain the word 'Video Game' in their title is: {}".format(len(empty_videogame)))
print("\nSome of those {} rows are: {}".format(len(empty_videogame), empty_videogame[0:5]))

dataset_two_cleaned = dataset_two_cleaned[~dataset_two_cleaned.title.isin(empty_videogame)]
print("\nThe shape of the new dataset is {}, {} rows less".format(dataset_two_cleaned.shape, len(empty_videogame)))

# -------------------------------------------------------------------------------------------------------

print("\n---------------------------------------------------------------------------------")
print("\nHaving cleaned the movie titles of my dataset, it is now time to proceed with checking if any of the title in the second dataset already exist in the first dataset.")

first_titles = dataset.movie_title
print("\nThe number of movie titles in the first dataset is: {}".format(len(first_titles)))

second_titles = dataset_two_cleaned.title
print("\nThe number of movie titles in the second dataset is: {}".format(len(second_titles)))

my_series = [first_titles, second_titles]

my_total_titles = pd.concat(my_series)

duplicate_movie_titles = my_total_titles.value_counts().tolist()

empty_duplicate_movie_titles = []
for i in duplicate_movie_titles:
    if i > 1:
        empty_duplicate_movie_titles.append(i)
        
print("\nThe number of duplicate movie titles is: {}".format(len(empty_duplicate_movie_titles)))
print("\nThe duplicate movie titles are: \n{}".format(my_total_titles.value_counts()[0:7]))

print("\nThe movies 'The Host' and 'Out of the blue', are two movies with the same title altough different content. \nThus they won't be deleted.")

dataset_two_cleaned = dataset_two_cleaned[dataset_two_cleaned.year > 1960]

dataset_two_cleaned.shape

print("\nThe dataset finally has {} rows. It is time now to extract the information I need from the data I have.".format(len(dataset_two_cleaned)))
print("\nJust to pinpoint that I have spotted two rows in the dataset, of which their url yielded a 404 error. Those rows correspond to the index 973 and 2951. Thus, I remove them from the final dataset beforehand.")

dataset_two_cleaned = dataset_two_cleaned.drop([973, 2951])

print("\nThe shape of the final dataset is: {}".format(dataset_two_cleaned.shape))


The first five rows of the new links:
0    http://www.imdb.com/title/tt0012349/
1    http://www.imdb.com/title/tt0015864/
2    http://www.imdb.com/title/tt0017136/
3    http://www.imdb.com/title/tt0017925/
4    http://www.imdb.com/title/tt0021749/
Name: url, dtype: object

The length of the url column of the second dataset: 14332

Thus I have 14332 rows to manilpulate and add additional content in my initial dataset!

However, it is important before using all the 14332 links to understand what my data is about!

Contrary to the second dataset, the first dataset I already made use of, has 4774 active imdb links

---------------------------------------------------------------------------------

Having imported the two datasets it is now important to check if the two datasets has duplicate links, 
since I don't want to extract the same movie second time.

The number of total movies to manipulate is: 19106

The number of duplicate urls in the two dataset is: 0

No duplicates have been spo

In [68]:
dataset.to_pickle("dataset_one_07112019.pkl")

In [74]:
dataset_two_cleaned.to_pickle("dataset_two_07112019.pkl")

# - - -  - - - - - - - - - - - - - - - - - -  - - - - - - - - - - - - - - - - - -  - - - - - - - - - - - - - - - - - -  

#### FIrst 5000 movies (14.10.2019)

##### Step 1 - 14.10.2019
Extract the first five thousands movies to request their HTML doc.

*Note: I didn't requested all the 9145 urls at once, since it would result to memory error.*

In [75]:
#The **code** below will take approximately 1 hour to finish.

# mock_dataset_one = dataset_two_cleaned.url.iloc[0:5000]

# mylist_one = []

# for i in tqdm(mock_dataset_one):
#     mylist_one.append(requests.get(i))

In [76]:
# Pickle the requests file for further use!

# Save the file

# with open('requests_one_second_try_13102019.pkl', 'wb') as f:
#     pickle.dump(mylist_one, f)

In [132]:
with open('requests_one_second_try_13102019.pkl', 'rb') as f:
    mylist_one = pickle.load(f)
    
len(mylist_one)

##### Step 2 - 14.10.2019

##### Remove falsy indices (comment on 18.10.2019)

Delete the movies of the below indexes!

* 3 movies (between 1960-2017) with no plot summary!
* 77 movies (between 1960-2017) they don't have a recorded IMDB Rating!
* 22 movies (between 1960-2017) they don't have actors!
* 185 movies (between 1960-2017) they dont' have a genre!

In [135]:
# no plot_summary

remove_indices = [761, 2511, 4326]

mylist_one = [i for j, i in enumerate(mylist_one) if j not in remove_indices]

In [136]:
# no IMDB Rating

remove_indices = [276,  1057, 1216, 1378, 3219, 3331, 3384, 3521, 3676, 3726, 3743, 3758, 3771, 3802, 3927, 3953, 3958, 3970, 
                  3980, 3981, 4010, 4022, 4042, 4060, 4070, 4091, 4094, 4101, 4114, 4115, 4121, 4128, 4141, 4152, 4154, 4186, 
                  4194, 4235, 4243, 4245, 4248, 4274, 4277, 4282, 4285, 4291, 4299, 4300, 4304, 4305, 4306, 4309, 4311, 4313, 
                  4315, 4318, 4322, 4324, 4329, 4330, 4340, 4426, 4630, 4665, 4676, 4683, 4696, 4700, 4731, 4738, 4755, 4778, 
                  4785, 4806, 4811, 4829, 4958]

mylist_one = [i for j, i in enumerate(mylist_one) if j not in remove_indices]

In [137]:
# no actors

remove_indices = [202, 454, 800, 1213, 1301,1911, 2567, 2568, 2569, 2570, 3017, 3140, 3457, 3760, 4014, 4092, 4333, 4410, 
                  4452, 4455, 4552, 4708]

mylist_one = [i for j, i in enumerate(mylist_one) if j not in remove_indices]

In [138]:
# no genres

remove_indices = [273,  419,  712,   882,  883,  955,  1012, 1372, 1377, 1676, 1920, 2108, 2219, 2467, 2517, 2642, 2646, 2696, 
                  2719, 2764, 2767, 2769,  2793, 2797, 2826, 2887, 2904, 2936, 3001, 3022, 3024, 3027, 3079, 3116, 3133, 3134, 
                  3140, 3158, 3162, 3176, 3193, 3208, 3210,  3217, 3221, 3231, 3242, 3245, 3249, 3267, 3278, 3288, 3291, 3292, 
                  3300, 3319, 3327, 3330, 3342, 3359, 3411, 3417, 3426, 3432,  3434, 3445, 3453, 3457, 3468, 3474, 3498, 3502, 
                  3508, 3516, 3533, 3551, 3562, 3578, 3605, 3606, 3616, 3621, 3625, 3635, 3637,  3659, 3665, 3668, 3686, 3692, 
                  3698, 3701, 3723, 3770, 3775, 3778, 3796, 3797, 3798, 3800, 3811, 3812, 3823, 3896, 3915, 3931,  3932, 3944, 
                  4001, 4005, 4006, 4008, 4028, 4033, 4037, 4049, 4062, 4075, 4085, 4086, 4102, 4106, 4107, 4124,  4140, 4142, 
                  4158, 4161, 4164, 4166, 4170, 4177, 4179, 4184, 4201, 4202, 4207, 4208, 4212, 4213, 4229, 4232, 4233,  4235, 
                  4238, 4241, 4246, 4247, 4252, 4255, 4257, 4260, 4261, 4262, 4263, 4265, 4334, 4336, 4345, 4392, 4417,  4457, 
                  4500, 4540, 4551, 4557, 4575, 4604, 4612, 4626, 4627, 4643, 4655, 4684, 4705, 4717, 4763, 4776, 4779,  4791, 
                  4846, 4855, 4864, 4866, 4897]

mylist_one = [i for j, i in enumerate(mylist_one) if j not in remove_indices]

In [139]:
len(mylist_one)

4713

##### Step 3 - Read the souplist at once before start extracting data from the HTML docs!! (14.10.2019)

In [None]:
# Extract the souplist which is the same for every column extracted

souplist = []

for i in tqdm(mylist_one):
    souplist.append(BeautifulSoup(i.text))

##### Step 4 - Extract the infomration for all the columns needed

#### Extract column 1: Plot Summary

In [None]:
myfield = []
plot_summary = []

for i in tqdm(souplist):
    myfield.append(i.find_all('div', {'class':'plot_summary'}))

for i in tqdm(myfield):
    for x in tqdm(i):
        for y in tqdm(x.find_all('div', {'class':'summary_text'})):
            plot_summary.append(y.text)

In [None]:
with open('plot_summary_one_18102019.pkl', 'wb') as f:
    pickle.dump(plot_summary, f)

#### Extract column 2: IMDB Rating

In [None]:
myfield_rating = []
ratings = []

for i in tqdm(souplist):
    myfield_rating.append(i.find_all('div', {'class':'ratingValue'}))

for i in tqdm(myfield_rating):
    for x in tqdm(i):
        for y in tqdm(x.find_all('span', {'itemprop':'ratingValue'})):
            ratings.append(y.text)

In [None]:
with open('ratings_one_18102019.pkl', 'wb') as f:
    pickle.dump(ratings, f)

#### Extract column 3: Actors

In [None]:
# Phase 1

myfield_cast = []

for i in tqdm(souplist):
    myfield_cast.append(i.find_all('table', {'class':'cast_list'}))
    
#------------------------------------------------------------------------

# Phase 2

phase_two = []

import re
r_one = re.compile(".*name")

cast_list = myfield_cast

for i in tqdm(cast_list):
    for j in tqdm(i):
        phase_two.append(j.find_all('a', {'href':r_one}))

# # ------------------------------------------------------------------------

# # Phase 3

phase_three = []

for i in tqdm(range(len(phase_two))):
    if len(phase_two[i]) != 0:
        phase_three.append(phase_two[i][1::2])

# ------------------------------------------------------------------------

# Phase 4

actor_list = []
for tags in phase_three:
    actor_list.append(list(map(lambda x: x.text.strip(' ').replace('\n', ''), tags)))

In [None]:
with open('actors_list_one_18102019.pkl', 'wb') as f:
    pickle.dump(actor_list, f)

#### Extract column 4: Director Name

Note: find the class director and locate i + 1 which is the next element in the list!

In [None]:
myfield_director = []
director_name = []

for i in tqdm(souplist):
    myfield_director.append(i.find_all('div', {'class':'plot_summary'}))

import re
r_one = re.compile(".*name")

for i in tqdm(myfield_director):
    for j in tqdm(i):
        director_name.append(j.find_all('a', {'href':r_one}))
        
director_names = [item[0].text for item in director_name]

In [None]:
with open('director_names_one_18102019.pkl', 'wb') as f:
    pickle.dump(director_names, f)

#### Extract column 5: Plot Keywords

In [None]:
myfield_keywords = []
keywords = []

for i in tqdm(souplist):
    myfield_keywords.append(i.find_all('div', {'class':'see-more inline canwrap'}))

myfield_keywords_final = [[item[0]] for item in myfield_keywords]

for i in tqdm(myfield_keywords_final):
    for j in tqdm(i):
        keywords.append(j.find_all('span', {'class':"itemprop"}))

keywords_final = []
for i in keywords:
    keywords_final.append(list(map(lambda x: x.text.strip(' ').replace('\n', ''), i)))

In [None]:
with open('keywords_final_one_18102019.pkl', 'wb') as f:
    pickle.dump(keywords_final, f)

#### Extract column 6: Genres

In [None]:
myfield_genres = []
myfield_genres_final = []
genres = []

for i in tqdm(souplist):
    myfield_genres.append(i.find_all('div', {'class':'see-more inline canwrap'}))

myfield_genres_final = [[item[1]] for item in myfield_genres]

import re
r_genres = re.compile("(?=genres)(.*)")

for i in tqdm(myfield_genres_final):
    for j in tqdm(i):
        genres.append(j.find_all('a', {'href':r_genres}))

genres_final = []
for i in genres:
    genres_final.append(list(map(lambda x: x.text.strip(' ').replace('\n', ''), i)))

In [None]:
with open('genres_final_one_18102019.pkl', 'wb') as f:
    pickle.dump(genres_final, f)

#### Second 5000 movies

In [129]:
with open('requests_two_second_try_13102019.pkl', 'rb') as f:
    mylist_two = pickle.load(f)

In [130]:
len(mylist_two)

4145

##### Step 1: Clean the incorrect indices

(incorrect are those indices that don't contain inforamtion for at most one of the columns needed)

In [None]:
# Remove the incorrect indices

remove_indices_no_plot = [3397]

remove_indices_no_rating =  [79,  94,  137, 212, 226, 243, 261, 273, 297, 298, 332, 358, 365, 379, 382, 391, 398, 404, 409, 481, 
                             515, 518, 520, 532, 545, 561, 565, 582, 605, 608, 614, 624,626, 632, 637, 642, 644, 646, 647, 
                             648, 649, 652, 656, 658, 2353, 2805, 2911, 2968, 2990, 3117, 3187, 3259, 3424, 3439, 3618, 3771, 
                             3858, 3866, 3907, 4129]

remove_indices_no_actors = [231, 498, 511, 563, 1857, 2005, 2030, 2345, 2690, 2862, 2891, 3112, 3214, 3534, 3641, 3766, 3901, 
                            3918, 4070]

remove_indices_no_keywords = [23, 2899, 3512]

remove_indices_total = remove_indices_no_plot + remove_indices_no_rating + remove_indices_no_actors + remove_indices_no_keywords

mylist_two = [i for j, i in enumerate(mylist_two) if j not in remove_indices_total]

In [None]:
remove_indices_second_round = [63, 67, 103, 104, 125, 148, 173, 185, 187, 188, 195, 199, 217, 222, 224, 225, 238, 240, 245, 251, 
                           270, 271, 308, 310, 338, 340, 345, 347, 352, 354, 379, 384, 390, 393, 399, 418, 438, 441, 445, 449, 
                           450, 454, 464, 468, 469, 477, 478, 484, 493, 501, 520, 532, 537, 539, 544, 552, 554, 560, 576, 587, 
                           588, 595, 607, 608, 611, 838, 980, 1148, 1151, 1188, 1400, 1433, 1441, 1526, 1826, 1834, 1980, 2061, 
                           2095, 2121, 2142, 2496, 2504, 2577, 2659, 2757, 2801, 2921, 2934, 3012, 3023, 3049, 3063,  3067, 
                           3093, 3098, 3105, 3109, 3138, 3155, 3172, 3189, 3204, 3221, 3226, 3228, 3260, 3263, 3267, 3269, 3276, 
                           3285, 3305, 3317, 3325,3326, 3327, 3329, 3353, 3359, 3387, 3421, 3454, 3478, 3495, 3500, 3511,3543, 
                           3551, 3571, 3576, 3584, 3631, 3658, 3665, 3668, 3674, 3690, 3699, 3716, 3719, 3727, 3742, 3749, 3751,
                           3770, 3793, 3800, 3809, 3817, 3822, 3826, 3832, 3837, 3853, 3856,3874, 3882, 3883, 3906, 3931, 3941, 
                           3947, 3974, 3978, 3979, 3987, 4002, 4018, 4021, 4039, 4041, 4050]

mylist_two = [i for j, i in enumerate(mylist_two) if j not in remove_indices_second_round]

In [None]:
remove_indices_third_round = [346, 441, 503, 505, 507, 510, 514, 2845, 2989, 3104, 3302, 3701]

mylist_two = [i for j, i in enumerate(mylist_two) if j not in remove_indices_third_round]

In [None]:
len(mylist_two)

#### Step 2: Souplist

In [None]:
souplist_two = []

for i in tqdm(mylist_two):
    souplist_two.append(BeautifulSoup(i.text))

In [None]:
len(souplist_two)

#### Extract columns 1: Plot Summary

In [None]:
myfield = []
plot_summary = []

for i in tqdm(souplist_two):
    myfield.append(i.find_all('div', {'class':'plot_summary'}))

for i in tqdm(myfield):
    for x in tqdm(i):
        for y in tqdm(x.find_all('div', {'class':'summary_text'})):
            plot_summary.append(y.text)

In [None]:
with open('plot_summary_final_two_19102019.pkl', 'wb') as f:
    pickle.dump(plot_summary, f)

#### Extract column 2: IMDB Rating

In [None]:
myfield_rating = []
ratings = []

for i in tqdm(souplist_two):
    myfield_rating.append(i.find_all('div', {'class':'ratingValue'}))

for i in tqdm(myfield_rating):
    for x in tqdm(i):
        for y in tqdm(x.find_all('span', {'itemprop':'ratingValue'})):
            ratings.append(y.text)

In [None]:
with open('rating_final_two_19102019.pkl', 'wb') as f:
    pickle.dump(ratings, f)

#### Extract column 3: Actors List

In [None]:
# Phase 1

myfield_cast = []

for i in tqdm(souplist_two):
    myfield_cast.append(i.find_all('table', {'class':'cast_list'}))
    
#------------------------------------------------------------------------

# Phase 2

phase_two = []

import re
r_one = re.compile(".*name")

cast_list = myfield_cast

for i in tqdm(cast_list):
    for j in tqdm(i):
        phase_two.append(j.find_all('a', {'href':r_one}))

# # ------------------------------------------------------------------------

# # Phase 3

phase_three = []

for i in tqdm(range(len(phase_two))):
    if len(phase_two[i]) != 0:
        phase_three.append(phase_two[i][1::2])

# ------------------------------------------------------------------------

# Phase 4

actor_list = []
for tags in phase_three:
    actor_list.append(list(map(lambda x: x.text.strip(' ').replace('\n', ''), tags)))

In [None]:
with open('actors_list_final_two_19102019.pkl', 'wb') as f:
    pickle.dump(actor_list, f)

#### Extract column 4: Director Name

In [None]:
myfield_director = []
director_name = []

for i in tqdm(souplist_two):
    myfield_director.append(i.find_all('div', {'class':'plot_summary'}))

import re
r_one = re.compile(".*name")

for i in tqdm(myfield_director):
    for j in tqdm(i):
        director_name.append(j.find_all('a', {'href':r_one}))
        
director_names = [item[0].text for item in director_name]

In [None]:
with open('director_names_final_two_19102019.pkl', 'wb') as f:
    pickle.dump(director_names, f)

#### Exctract column 5: Plot Keywords

In [None]:
myfield_keywords = []
keywords = []

for i in tqdm(souplist_two):
    myfield_keywords.append(i.find_all('div', {'class':'see-more inline canwrap'}))

myfield_keywords_final = [[item[0]] for item in myfield_keywords]

for i in tqdm(myfield_keywords_final):
    for j in tqdm(i):
        keywords.append(j.find_all('span', {'class':"itemprop"}))

keywords_final = []
for i in keywords:
    keywords_final.append(list(map(lambda x: x.text.strip(' ').replace('\n', ''), i)))

In [None]:
with open('keywords_final_final_two_19102019.pkl', 'wb') as f:
    pickle.dump(keywords_final, f)

#### Extract column 6: Genres

In [None]:
myfield_genres = []
genres = []

for i in tqdm(souplist_two):
    myfield_genres.append(i.find_all('div', {'class':'see-more inline canwrap'}))

myfield_genres_final = [[item[1]] for item in myfield_genres]

import re
r_genres = re.compile("(?=genres)(.*)")

for i in tqdm(myfield_genres_final):
    for j in tqdm(i):
        genres.append(j.find_all('a', {'href':r_genres}))

genres_final = []
for i in genres:
    genres_final.append(list(map(lambda x: x.text.strip(' ').replace('\n', ''), i)))

In [None]:
with open('genres_final_final_two_19102019.pkl', 'wb') as f:
    pickle.dump(genres_final, f)

#### FINAL DATASET (latest modification on 07.11.2019)

##### First dataset with 4774 movies

In [96]:
dataset_one = pd.read_pickle('dataset_one_07112019.pkl')

print("Shape: {}".format(dataset_one.shape))

print("\nColumn names: \n{}".format(dataset_one.columns))

dataset_one = dataset_one[['movie_title', 'movie_imdb_link', 'updated_rating', 'full_cast', 'director_name', 'plot_summary', 
                           'plot_keywords', 'genre_0', 'genre_1', 'genre_2']]

print("\nThe new Column names of dataset_one: \n{}".format(dataset_one.columns))
print("\nThe new shape of dataset_one: {}".format(dataset_one.shape))

Shape: (4774, 21)

Column names: 
Index(['movie_imdb_link', 'movie_title', 'director_name', 'plot_keywords',
       'genre_0', 'genre_1', 'genre_2', 'updated_rating', 'plot_summary',
       'combined_features', 'full_cast', 'full_cast_embeddings',
       'minimum_cast_vectors', 'maximum_cast_vectors', 'average_cast_vectors',
       'minimum_plot_vectors', 'maximum_plot_vectors', 'average_plot_vectors',
       'minimum_combined_features', 'maximum_combined_features',
       'average_combined_features'],
      dtype='object')

The new Column names of dataset_one: 
Index(['movie_title', 'movie_imdb_link', 'updated_rating', 'full_cast',
       'director_name', 'plot_summary', 'plot_keywords', 'genre_0', 'genre_1',
       'genre_2'],
      dtype='object')

The new shape of dataset_one: (4774, 10)


##### Second dataset with 9145 movies

In [101]:
dataset_two = pd.read_pickle('dataset_two_07112019.pkl')

dataset_two.shape

dataset_two_cleaned_reset_indexed = dataset_two_cleaned.reset_index()

dataset_two_cleaned_reset_indexed.head()

Unnamed: 0,index,fn,tid,title,wordsInTitle,url,imdbRating,ratingCount,duration,year,...,News,RealityTV,Romance,SciFi,Short,Sport,TalkShow,Thriller,War,Western
0,58,titles01/tt0054997,tt0054997,Haie der Großstadt (1961),haie der gro stadt,http://www.imdb.com/title/tt0054997/,8.1,47138.0,8040.0,1961.0,...,0,0,0,0,0,1,0,0,0,0
1,59,titles01/tt0055031,tt0055031,Das Urteil von Nürnberg (1961),das urteil von n rnberg,http://www.imdb.com/title/tt0055031/,8.3,28790.0,11160.0,1961.0,...,0,0,0,0,0,0,0,0,1,0
2,60,titles01/tt0055630,tt0055630,Die Leibwache (1961),die leibwache,http://www.imdb.com/title/tt0055630/,8.4,53475.0,6600.0,1961.0,...,0,0,0,0,0,0,0,0,0,0
3,61,titles01/tt0056172,tt0056172,Lawrence von Arabien (1962),lawrence von arabien,http://www.imdb.com/title/tt0056172/,8.4,140727.0,12960.0,1962.0,...,0,0,0,0,0,0,0,0,0,0
4,62,titles01/tt0056592,tt0056592,Wer die Nachtigall stört (1962),wer die nachtigall st rt,http://www.imdb.com/title/tt0056592/,8.4,154978.0,7740.0,1962.0,...,0,0,0,0,0,0,0,0,0,0


##### First 5000 of the second dataset

In [103]:
first_five = dataset_two_cleaned_reset_indexed.iloc[0:5000]

In [104]:
first_five_one = first_five.drop([761, 2511, 4326])

In [105]:
first_five_one.shape

(4997, 45)

In [106]:
first_five_one_reset_indexed = first_five_one.reset_index(drop=True)

In [107]:
first_five_two = first_five_one_reset_indexed.drop([276,  1057, 1216, 1378, 3219, 3331, 3384, 3521, 3676, 3726, 3743, 3758, 3771, 3802, 3927, 3953, 3958, 3970, 
                  3980, 3981, 4010, 4022, 4042, 4060, 4070, 4091, 4094, 4101, 4114, 4115, 4121, 4128, 4141, 4152, 4154, 4186, 
                  4194, 4235, 4243, 4245, 4248, 4274, 4277, 4282, 4285, 4291, 4299, 4300, 4304, 4305, 4306, 4309, 4311, 4313, 
                  4315, 4318, 4322, 4324, 4329, 4330, 4340, 4426, 4630, 4665, 4676, 4683, 4696, 4700, 4731, 4738, 4755, 4778, 
                  4785, 4806, 4811, 4829, 4958])

In [108]:
first_five_two.shape

(4920, 45)

In [109]:
first_five_two_reset_indexed = first_five_two.reset_index(drop=True)

In [110]:
first_five_three = first_five_two_reset_indexed.drop([202, 454, 800, 1213, 1301,1911, 2567, 2568, 2569, 2570, 3017, 3140, 3457,
                                                      3760, 4014, 4092, 4333, 4410, 4452, 4455, 4552, 4708])

In [111]:
first_five_three.shape

(4898, 45)

In [112]:
first_five_three_reset_indexed = first_five_three.reset_index(drop=True)

In [113]:
first_five_four = first_five_three_reset_indexed.drop(
    [273,  419,  712,   882,  883,  955,  1012, 1372, 1377, 1676, 1920, 2108, 2219, 2467, 2517, 2642, 2646, 2696, 
    2719, 2764, 2767, 2769,  2793, 2797, 2826, 2887, 2904, 2936, 3001, 3022, 3024, 3027, 3079, 3116, 3133, 3134, 
    3140, 3158, 3162, 3176, 3193, 3208, 3210,  3217, 3221, 3231, 3242, 3245, 3249, 3267, 3278, 3288, 3291, 3292, 
    3300, 3319, 3327, 3330, 3342, 3359, 3411, 3417, 3426, 3432,  3434, 3445, 3453, 3457, 3468, 3474, 3498, 3502, 
    3508, 3516, 3533, 3551, 3562, 3578, 3605, 3606, 3616, 3621, 3625, 3635, 3637,  3659, 3665, 3668, 3686, 3692, 
    3698, 3701, 3723, 3770, 3775, 3778, 3796, 3797, 3798, 3800, 3811, 3812, 3823, 3896, 3915, 3931,  3932, 3944, 
    4001, 4005, 4006, 4008, 4028, 4033, 4037, 4049, 4062, 4075, 4085, 4086, 4102, 4106, 4107, 4124,  4140, 4142, 
    4158, 4161, 4164, 4166, 4170, 4177, 4179, 4184, 4201, 4202, 4207, 4208, 4212, 4213, 4229, 4232, 4233,  4235, 
    4238, 4241, 4246, 4247, 4252, 4255, 4257, 4260, 4261, 4262, 4263, 4265, 4334, 4336, 4345, 4392, 4417,  4457, 
    4500, 4540, 4551, 4557, 4575, 4604, 4612, 4626, 4627, 4643, 4655, 4684, 4705, 4717, 4763, 4776, 4779,  4791, 
    4846, 4855, 4864, 4866, 4897])

In [114]:
first_five_four.shape

(4713, 45)

##### Remember to pickle the first_five_four dataset

##### Second 5000 of the second dataset

In [115]:
second_five = dataset_two_cleaned_reset_indexed.iloc[5000:]

In [116]:
len(second_five)

4145

In [117]:
second_five_reset_indexed = second_five.reset_index(drop=True)

In [118]:
second_five_one = second_five_reset_indexed.drop(
    [3397, 79,  94,  137, 212, 226, 243, 261, 273, 297, 298, 332, 358, 365, 379, 382, 391, 398, 404, 409, 481, 
     515, 518, 520, 532, 545, 561, 565, 582, 605, 608, 614, 624,626, 632, 637, 642, 644, 646, 647, 
     648, 649, 652, 656, 658, 2353, 2805, 2911, 2968, 2990, 3117, 3187, 3259, 3424, 3439, 3618, 3771, 
     3858, 3866, 3907, 4129, 231, 498, 511, 563, 1857, 2005, 2030, 2345, 2690, 2862, 2891, 3112, 3214, 3534, 3641, 3766, 3901, 
     3918, 4070, 23, 2899, 3512])

In [119]:
second_five_one_reset_indexed = second_five_one.reset_index(drop=True)

In [120]:
second_five_two = second_five_one_reset_indexed.drop(
    [63, 67, 103, 104, 125, 148, 173, 185, 187, 188, 195, 199, 217, 222, 224, 225, 238, 240, 245, 251, 
     270, 271, 308, 310, 338, 340, 345, 347, 352, 354, 379, 384, 390, 393, 399, 418, 438, 441, 445, 449, 
     450, 454, 464, 468, 469, 477, 478, 484, 493, 501, 520, 532, 537, 539, 544, 552, 554, 560, 576, 587, 
     588, 595, 607, 608, 611, 838, 980, 1148, 1151, 1188, 1400, 1433, 1441, 1526, 1826, 1834, 1980, 2061, 
     2095, 2121, 2142, 2496, 2504, 2577, 2659, 2757, 2801, 2921, 2934, 3012, 3023, 3049, 3063,  3067, 
     3093, 3098, 3105, 3109, 3138, 3155, 3172, 3189, 3204, 3221, 3226, 3228, 3260, 3263, 3267, 3269, 3276, 
     3285, 3305, 3317, 3325,3326, 3327, 3329, 3353, 3359, 3387, 3421, 3454, 3478, 3495, 3500, 3511,3543, 
     3551, 3571, 3576, 3584, 3631, 3658, 3665, 3668, 3674, 3690, 3699, 3716, 3719, 3727, 3742, 3749, 3751,
     3770, 3793, 3800, 3809, 3817, 3822, 3826, 3832, 3837, 3853, 3856,3874, 3882, 3883, 3906, 3931, 3941, 
     3947, 3974, 3978, 3979, 3987, 4002, 4018, 4021, 4039, 4041, 4050])

In [121]:
second_five_two_reset_indexed = second_five_two.reset_index(drop=True)

In [122]:
second_five_three = second_five_two_reset_indexed.drop([346, 441, 503, 505, 507, 510, 514, 2845, 2989, 3104, 3302, 3701])

In [123]:
second_five_three.shape

(3877, 45)

##### remember to pickle the "second_five_three" dataset

#### First Column: Movie Title

In [124]:
list_one_title = dataset_one.movie_title.tolist()

print(list_one_title[0:5])

print("\n",len(list_one_title))

['Avatar', "Pirates of the Caribbean: At World's End", 'Spectre', 'The Dark Knight Rises', 'John Carter']

 4774


In [125]:
list_two_title = first_five_four.title.tolist()

print(list_two_title[0:5])

print("\n",len(list_two_title))

['Haie der Großstadt (1961)', 'Das Urteil von Nürnberg (1961)', 'Die Leibwache (1961)', 'Lawrence von Arabien (1962)', 'Wer die Nachtigall stört (1962)']

 4713


In [126]:
list_three_title = second_five_three.title.tolist()
    
print(list_three_title[0:5])

print("\n",len(list_three_title))

['Oldboy (2013)', 'Carjacked - Jeder hat seine Grenzen (2011)', 'Carlos - Der Schakal (TV Mini-Series 2010)', 'Im August in Osage County (2013)', 'Verrückt nach Dir (2010)']

 3877


In [127]:
final_list_title = list_one_title + list_two_title + list_three_title

print(final_list_title[0:5])

print("\n",len(final_list_title))

['Avatar', "Pirates of the Caribbean: At World's End", 'Spectre', 'The Dark Knight Rises', 'John Carter']

 13364


#### Second Column: IMDB link

In [139]:
list_one_url = dataset_one.movie_imdb_link.tolist()

print(list_one_url[0:5])

print("\n",len(list_one_url))

['http://www.imdb.com/title/tt0499549/?ref_=fn_tt_tt_1', 'http://www.imdb.com/title/tt0449088/?ref_=fn_tt_tt_1', 'http://www.imdb.com/title/tt2379713/?ref_=fn_tt_tt_1', 'http://www.imdb.com/title/tt1345836/?ref_=fn_tt_tt_1', 'http://www.imdb.com/title/tt0401729/?ref_=fn_tt_tt_1']

 4774


In [140]:
list_two_url = first_five_four.url.tolist()

print(list_two_url[0:5])

print("\n",len(list_two_url))

['http://www.imdb.com/title/tt0054997/', 'http://www.imdb.com/title/tt0055031/', 'http://www.imdb.com/title/tt0055630/', 'http://www.imdb.com/title/tt0056172/', 'http://www.imdb.com/title/tt0056592/']

 4713


In [141]:
list_three_url = second_five_three.url.tolist()
    
print(list_three_url[0:5])

print("\n",len(list_three_url))

['http://www.imdb.com/title/tt1321511/', 'http://www.imdb.com/title/tt1321861/', 'http://www.imdb.com/title/tt1321865/', 'http://www.imdb.com/title/tt1322269/', 'http://www.imdb.com/title/tt1322312/']

 3877


In [142]:
final_list_url = list_one_url + list_two_url + list_three_url

print(final_list_url[0:5])

print("\n",len(final_list_url))

['http://www.imdb.com/title/tt0499549/?ref_=fn_tt_tt_1', 'http://www.imdb.com/title/tt0449088/?ref_=fn_tt_tt_1', 'http://www.imdb.com/title/tt2379713/?ref_=fn_tt_tt_1', 'http://www.imdb.com/title/tt1345836/?ref_=fn_tt_tt_1', 'http://www.imdb.com/title/tt0401729/?ref_=fn_tt_tt_1']

 13364


#### Third column: IMDB Rating

In [143]:
list_one_rating = dataset_one.updated_rating.tolist()

print(list_one_rating[0:5])

print("\n",len(list_one_rating))

['7.8', '7.1', '6.8', '8.4', '6.6']

 4774


In [144]:
with open('ratings_one_18102019.pkl', 'rb') as f:
    list_two_rating = pickle.load(f)
    
print(list_two_rating[0:5])

print("\n",len(list_two_rating))

['8.0', '8.2', '8.2', '8.3', '8.3']

 4713


In [145]:
with open('rating_final_two_19102019.pkl', 'rb') as f:
    list_three_rating = pickle.load(f)
    
print(list_three_rating[0:5])

print("\n",len(list_three_rating))

['5.7', '5.0', '7.6', '7.2', '6.3']

 3877


In [146]:
final_list_rating = list_one_rating + list_two_rating + list_three_rating

print(final_list_rating[0:5])

print("\n",len(final_list_rating))

['7.8', '7.1', '6.8', '8.4', '6.6']

 13364


#### Fourth column: Actors

In [147]:
list_one_actors = dataset_one.full_cast.tolist()

print(list_one_actors[0:5])

print("\n",len(list_one_actors))

[['Sam-Worthington', 'Zoe-Saldana', 'Sigourney-Weaver', 'Stephen-Lang', 'Michelle-Rodriguez', 'Giovanni-Ribisi', 'Joel-David-Moore', 'CCH-Pounder', 'Wes-Studi', 'Laz-Alonso', 'Dileep-Rao', 'Matt-Gerald', 'Sean-Anthony-Moran', 'Jason-Whyte', 'Scott-Lawrence'], ['Johnny-Depp', 'Geoffrey-Rush', 'Orlando-Bloom', 'Keira-Knightley', 'Jack-Davenport', 'Bill-Nighy', 'Jonathan-Pryce', 'Lee-Arenberg', 'Mackenzie-Crook', 'Kevin-McNally', 'David-Bailie', 'Stellan-Skarsgård', 'Tom-Hollander', 'Naomie-Harris', 'Martin-Klebba'], ['Daniel-Craig', 'Christoph-Waltz', 'Léa-Seydoux', 'Ralph-Fiennes', 'Monica-Bellucci', 'Ben-Whishaw', 'Naomie-Harris', 'Dave-Bautista', 'Andrew-Scott', 'Rory-Kinnear', 'Jesper-Christensen', 'Alessandro-Cremona', 'Stephanie-Sigman', 'Tenoch-Huerta', 'Adriana-Paz'], ['Christian-Bale', 'Gary-Oldman', 'Tom-Hardy', 'Joseph-Gordon-Levitt', 'Anne-Hathaway', 'Marion-Cotillard', 'Morgan-Freeman', 'Michael-Caine', 'Matthew-Modine', 'Alon-Aboutboul', 'Ben-Mendelsohn', 'Burn-Gorman', 'Da

In [148]:
with open('actors_list_one_18102019.pkl', 'rb') as f:
    list_two_actors = pickle.load(f)
    
print(list_two_actors[0:5])

print("\n",len(list_two_actors))

[['Paul Newman', 'Jackie Gleason', 'Piper Laurie', 'George C. Scott', 'Myron McCormick', 'Murray Hamilton', 'Michael Constantine', 'Stefan Gierasch', 'Clifford A. Pellow', 'Jake LaMotta', 'Gordon B. Clarke', 'Alexander Rose', 'Carolyn Coates', 'Carl York', 'Vincent Gardenia'], ['Spencer Tracy', 'Burt Lancaster', 'Richard Widmark', 'Marlene Dietrich', 'Maximilian Schell', 'Judy Garland', 'Montgomery Clift', 'William Shatner', 'Werner Klemperer', 'Kenneth MacKenna', 'Torben Meyer', 'Joseph Bernard', 'Alan Baxter', 'Edward Binns', 'Virginia Christine'], ['Toshirô Mifune', 'Tatsuya Nakadai', 'Yôko Tsukasa', 'Isuzu Yamada', 'Daisuke Katô', 'Seizaburô Kawazu', 'Takashi Shimura', 'Hiroshi Tachikawa', 'Yôsuke Natsuki', 'Eijirô Tôno', 'Kamatari Fujiwara', 'Ikio Sawamura', 'Atsushi Watanabe', 'Susumu Fujita', 'Kyû Sazanka'], ["Peter O'Toole", 'Alec Guinness', 'Anthony Quinn', 'Jack Hawkins', 'Omar Sharif', 'José Ferrer', 'Anthony Quayle', 'Claude Rains', 'Arthur Kennedy', 'Donald Wolfit', 'I.S. 

In [149]:
with open('actors_list_final_two_19102019.pkl', 'rb') as f:
    list_three_actors = pickle.load(f)
    
print(list_three_actors[0:5])

print("\n",len(list_three_actors))

[['Josh Brolin', 'Elizabeth Olsen', 'Sharlto Copley', 'Samuel L. Jackson', 'Michael Imperioli', 'Pom Klementieff', 'James Ransone', 'Max Casella', 'Linda Emond', 'Elvis Nolasco', 'Rami Malek', 'Lance Reddick', 'Hannah Ware', 'Richard Portnow', 'Hannah Simone'], ['Maria Bello', 'Stephen Dorff', 'Connor Hill', 'Robert Peters', 'Cynthia Rube', 'Michael Arata', 'Gary Grubbs', 'Josh Gates', 'Tim Griffin', 'Catherine Dent', 'Kristen Kerr', 'Joanna Cassidy', 'Angelle Brooks', 'Jeff Joslin', 'Lenore Banks'], ['Edgar Ramírez', 'Alexander Scheer', 'Fadi Abi Samra', 'Lamia Ahmed', 'Karam Ghossein', 'Liane Sellerer', 'Philippe Tran', 'Ahmad Kaabour', 'Talal Jurdi', 'Juana Acosta', 'Nora von Waldstätten', 'Christoph Bach', 'Rodney El Haddad', 'Julia Hummer', 'Antoine Balabane', 'Rami Farah', 'Aljoscha Stadelmann', 'Zeid Hamdan', 'Fadi Yanni Turk', 'Katharina Schüttler', 'Badih Abou Chakra', 'Basim Kahar', 'Cem Sultan Ungan'], ['Meryl Streep', 'Julia Roberts', 'Chris Cooper', 'Ewan McGregor', 'Margo

In [136]:
final_list_actors = list_one_actors + list_two_actors + list_three_actors

print(final_list_actors[0:5])

print("\n",len(final_list_actors))

[['Sam-Worthington', 'Zoe-Saldana', 'Sigourney-Weaver', 'Stephen-Lang', 'Michelle-Rodriguez', 'Giovanni-Ribisi', 'Joel-David-Moore', 'CCH-Pounder', 'Wes-Studi', 'Laz-Alonso', 'Dileep-Rao', 'Matt-Gerald', 'Sean-Anthony-Moran', 'Jason-Whyte', 'Scott-Lawrence'], ['Johnny-Depp', 'Geoffrey-Rush', 'Orlando-Bloom', 'Keira-Knightley', 'Jack-Davenport', 'Bill-Nighy', 'Jonathan-Pryce', 'Lee-Arenberg', 'Mackenzie-Crook', 'Kevin-McNally', 'David-Bailie', 'Stellan-Skarsgård', 'Tom-Hollander', 'Naomie-Harris', 'Martin-Klebba'], ['Daniel-Craig', 'Christoph-Waltz', 'Léa-Seydoux', 'Ralph-Fiennes', 'Monica-Bellucci', 'Ben-Whishaw', 'Naomie-Harris', 'Dave-Bautista', 'Andrew-Scott', 'Rory-Kinnear', 'Jesper-Christensen', 'Alessandro-Cremona', 'Stephanie-Sigman', 'Tenoch-Huerta', 'Adriana-Paz'], ['Christian-Bale', 'Gary-Oldman', 'Tom-Hardy', 'Joseph-Gordon-Levitt', 'Anne-Hathaway', 'Marion-Cotillard', 'Morgan-Freeman', 'Michael-Caine', 'Matthew-Modine', 'Alon-Aboutboul', 'Ben-Mendelsohn', 'Burn-Gorman', 'Da

#### Fifth column: Director Name

In [150]:
list_one_directors = dataset_one.director_name.tolist()

print(list_one_directors[0:5])

print("\n",len(list_one_directors))

['James-Cameron', 'Gore-Verbinski', 'Sam-Mendes', 'Christopher-Nolan', 'Andrew-Stanton']

 4774


In [154]:
with open('director_names_one_18102019.pkl', 'rb') as f:
    list_two_directors = pickle.load(f)
    
print(list_two_directors[0:5])

print("\n",len(list_two_directors))

['Robert Rossen', 'Stanley Kramer', 'Akira Kurosawa', 'T.E. Lawrence', 'Robert Mulligan']

 4713


In [152]:
with open('director_names_final_two_19102019.pkl', 'rb') as f:
    list_three_directors = pickle.load(f)
    
print(list_three_directors[0:5])

print("\n",len(list_three_directors))

['Spike Lee', 'John Bonito', 'Edgar Ramírez', 'John Wells', 'Nanette Burstein']

 3877


In [155]:
final_list_directors = list_one_directors + list_two_directors + list_three_directors

print(final_list_directors[0:5])

print("\n",len(final_list_directors))

['James-Cameron', 'Gore-Verbinski', 'Sam-Mendes', 'Christopher-Nolan', 'Andrew-Stanton']

 13364


#### Sixth column: Plot Summary

In [156]:
list_one_plot = dataset_one.plot_summary.tolist()

print(list_one_plot[0:5])

print("\n",len(list_one_plot))

['A paraplegic Marine dispatched to the moon Pandora on a unique mission becomes torn between following his orders and protecting the world he feels is his home', 'Captain Barbossa Will Turner and Elizabeth Swann must sail off the edge of the map navigate treachery and betrayal find Jack Sparrow and make their final alliances for one last decisive battle', "A cryptic message from 007's past sends him pitted against a mysterious terrorist organization called Spectre and learns of its involvement in previous events of his most dangerous missions", "Eight years after the Joker's reign of anarchy Batman with the help of the enigmatic Catwoman is forced from his exile to save Gotham City now on the edge of total annihilation from the brutal guerrilla terrorist Bane", 'Transported to Barsoom a Civil War vet discovers a barren planet seemingly inhabited by 12-foot tall barbarians Finding himself prisoner of these creatures he escapes only to encounter Woola and a princess in desperate need of

In [157]:
with open('plot_summary_one_18102019.pkl', 'rb') as f:
    list_two_plot = pickle.load(f)
    
print(list_two_plot[0:5])

print("\n",len(list_two_plot))

['\n                    An up-and-coming pool player plays a long-time champion in a single high-stakes match.\n            ', '\n                    In 1948, an American court in occupied Germany tries four Nazis judged for war crimes.\n            ', '\n                    A crafty ronin comes to a town divided by two criminal gangs and decides to play them against each other to free the town.\n            ', '\n                    The story of T.E. Lawrence, the English officer who successfully united and led the diverse, often warring, Arab tribes during World War I in order to fight the Turks.\n            ', '\n                    Atticus Finch, a lawyer in the Depression-era South, defends a black man against an undeserved rape charge, and his children against prejudice.\n            ']

 4713


In [158]:
with open('plot_summary_final_two_19102019.pkl', 'rb') as f:
    list_three_plot = pickle.load(f)
    
print(list_three_plot[0:5])

print("\n",len(list_three_plot))

['\n                    Obsessed with vengeance, a man sets out to find out why he was kidnapped and locked into solitary confinement for twenty years without reason.\n            ', '\n                    A single mom and her child are carjacked by a bank robber.\n            ', '\n                    The story of Venezuelan revolutionary Ilich Ramírez Sánchez, who founded a worldwide terrorist organization and raided the 1975 OPEC meeting.\n            ', '\n                    A look at the lives of the strong-willed women of the Weston family, whose paths have diverged until a family crisis brings them back to the Oklahoma house they grew up in, and to the dysfunctional woman who raised them.\n            ', '\n                    A romantic comedy centered on a guy and a gal who try to keep their love alive as they shuttle back and forth between New York and San Francisco to see one another.\n            ']

 3877


In [159]:
final_list_plot = list_one_plot + list_two_plot + list_three_plot

print(final_list_plot[0:5])

print("\n",len(final_list_plot))

['A paraplegic Marine dispatched to the moon Pandora on a unique mission becomes torn between following his orders and protecting the world he feels is his home', 'Captain Barbossa Will Turner and Elizabeth Swann must sail off the edge of the map navigate treachery and betrayal find Jack Sparrow and make their final alliances for one last decisive battle', "A cryptic message from 007's past sends him pitted against a mysterious terrorist organization called Spectre and learns of its involvement in previous events of his most dangerous missions", "Eight years after the Joker's reign of anarchy Batman with the help of the enigmatic Catwoman is forced from his exile to save Gotham City now on the edge of total annihilation from the brutal guerrilla terrorist Bane", 'Transported to Barsoom a Civil War vet discovers a barren planet seemingly inhabited by 12-foot tall barbarians Finding himself prisoner of these creatures he escapes only to encounter Woola and a princess in desperate need of

#### Seventh column: Plot keywords

In [160]:
list_one_keywords = dataset.plot_keywords.tolist()

print(list_one_keywords[0:5])

print("\n",len(list_one_keywords))

[['avatar', 'future', 'marine', 'native', 'paraplegic'], ['goddess', 'marriage', 'ceremony', 'marriage', 'proposal', 'pirate', 'singapore'], ['bomb', 'espionage', 'sequel', 'spy', 'terrorist'], ['deception', 'imprisonment', 'lawlessness', 'police', 'officer', 'terrorist', 'plot'], ['alien', 'american', 'civil', 'war', 'male', 'nipple', 'mars', 'princess']]

 4774


In [161]:
with open('keywords_final_one_18102019.pkl', 'rb') as f:
    list_two_keywords = pickle.load(f)
    
print(list_two_keywords[0:5])

print("\n",len(list_two_keywords))

[['playing pool', 'arrogance', 'pool hall', 'pool shark', 'pool hustler'], ['nuremberg trials', 'judge', 'nazi', 'war crime', 'courtroom drama'], ['samurai', 'ronin', 'bodyguard', 'one against many', 'man with no name'], ['arab', 'desert', 'bedouin', 'ottoman empire', 'british military'], ['trial', 'lawyer', 'false accusation', 'based on novel', 'small town']]

 4713


In [162]:
with open('keywords_final_final_two_19102019.pkl', 'rb') as f:
    list_three_keywords = pickle.load(f)
    
print(list_three_keywords[0:5])

print("\n",len(list_three_keywords))

[['father daughter incest', 'incest', 'male nudity', 'female nudity', 'solitary confinement'], ['blonde', 'nipples visible through clothing', 'shotgun', 'kidnapping', 'money bag'], ['war story', 'male frontal nudity', 'damascus', 'syria', 'beirut'], ['incestuous relationship', 'pills', 'cancer', 'death of husband', 'family secret'], ['long distance relationship', 'job', 'bar', 'male pubic hair', 'pubic hair']]

 3877


In [163]:
final_list_keywords = list_one_keywords + list_two_keywords + list_three_keywords

print(final_list_keywords[0:5])

print("\n",len(final_list_keywords))

[['avatar', 'future', 'marine', 'native', 'paraplegic'], ['goddess', 'marriage', 'ceremony', 'marriage', 'proposal', 'pirate', 'singapore'], ['bomb', 'espionage', 'sequel', 'spy', 'terrorist'], ['deception', 'imprisonment', 'lawlessness', 'police', 'officer', 'terrorist', 'plot'], ['alien', 'american', 'civil', 'war', 'male', 'nipple', 'mars', 'princess']]

 13364


#### Eigth column: Genres

In [165]:
dataset_one['genres'] = dataset_one[['genre_0', 'genre_1', 'genre_2']].apply(lambda x: ','.join(x[x.notnull()]).split(','), axis = 1)

In [166]:
list_one_genres = dataset_one.genres.tolist()

print(list_one_genres[0:5])

print("\n",len(list_one_genres))

[['Action', 'Adventure', 'Fantasy'], ['Action', 'Adventure', 'Fantasy'], ['Action', 'Adventure', 'Thriller'], ['Action', 'Thriller', '0'], ['Action', 'Adventure', 'Sci-Fi']]

 4774


In [167]:
with open('genres_final_one_18102019.pkl', 'rb') as f:
    list_two_genres = pickle.load(f)
    
print(list_two_genres[0:5])

print("\n",len(list_two_genres))

[['Drama', 'Sport'], ['Drama', 'War'], ['Action', 'Comedy', 'Crime', 'Drama', 'Thriller'], ['Adventure', 'Biography', 'Drama', 'History', 'War'], ['Crime', 'Drama']]

 4713


In [168]:
with open('genres_final_final_two_19102019.pkl', 'rb') as f:
    list_three_genres = pickle.load(f)
    
print(list_three_genres[0:5])

print("\n",len(list_three_genres))

[['Action', 'Drama', 'Mystery', 'Thriller'], ['Crime', 'Thriller'], ['Biography', 'Crime', 'Drama', 'Thriller'], ['Comedy', 'Drama'], ['Comedy', 'Romance']]

 3877


In [169]:
final_list_genres = list_one_genres + list_two_genres + list_three_genres

print(final_list_genres[0:5])

print("\n",len(final_list_genres))

[['Action', 'Adventure', 'Fantasy'], ['Action', 'Adventure', 'Fantasy'], ['Action', 'Adventure', 'Thriller'], ['Action', 'Thriller', '0'], ['Action', 'Adventure', 'Sci-Fi']]

 13364


#### THE FINAL DATASET OF 13364 MOVIES (21.10.2019)

In [170]:
final_dataset = pd.DataFrame(list(zip(final_list_title, final_list_url, final_list_rating, final_list_actors, 
                                      final_list_directors, final_list_plot, final_list_keywords, final_list_genres)), 
                  columns =['Movie Title', 'IMDB Url', 'IMDB Rating', 'Actors', 'Director', 'Plot Summary', 'Plot Keywords', 'Genres'])

In [171]:
final_dataset.shape

(13364, 8)

In [172]:
final_dataset.head()

Unnamed: 0,Movie Title,IMDB Url,IMDB Rating,Actors,Director,Plot Summary,Plot Keywords,Genres
0,Avatar,http://www.imdb.com/title/tt0499549/?ref_=fn_t...,7.8,"[Sam-Worthington, Zoe-Saldana, Sigourney-Weave...",James-Cameron,A paraplegic Marine dispatched to the moon Pan...,"[avatar, future, marine, native, paraplegic]","[Action, Adventure, Fantasy]"
1,Pirates of the Caribbean: At World's End,http://www.imdb.com/title/tt0449088/?ref_=fn_t...,7.1,"[Johnny-Depp, Geoffrey-Rush, Orlando-Bloom, Ke...",Gore-Verbinski,Captain Barbossa Will Turner and Elizabeth Swa...,"[goddess, marriage, ceremony, marriage, propos...","[Action, Adventure, Fantasy]"
2,Spectre,http://www.imdb.com/title/tt2379713/?ref_=fn_t...,6.8,"[Daniel-Craig, Christoph-Waltz, Léa-Seydoux, R...",Sam-Mendes,A cryptic message from 007's past sends him pi...,"[bomb, espionage, sequel, spy, terrorist]","[Action, Adventure, Thriller]"
3,The Dark Knight Rises,http://www.imdb.com/title/tt1345836/?ref_=fn_t...,8.4,"[Christian-Bale, Gary-Oldman, Tom-Hardy, Josep...",Christopher-Nolan,Eight years after the Joker's reign of anarchy...,"[deception, imprisonment, lawlessness, police,...","[Action, Thriller, 0]"
4,John Carter,http://www.imdb.com/title/tt0401729/?ref_=fn_t...,6.6,"[Taylor-Kitsch, Lynn-Collins, Samantha-Morton,...",Andrew-Stanton,Transported to Barsoom a Civil War vet discove...,"[alien, american, civil, war, male, nipple, ma...","[Action, Adventure, Sci-Fi]"


#### Pickle the final dataset

In [174]:
# final_dataset.to_pickle('dataset_part_1_07112019.pkl')

#### TO SQL SERVER

In [2]:
final_dataset = pd.read_pickle('final_dataset_07112019.pkl')

In [26]:
import pyodbc

connStr = pyodbc.connect("DRIVER={SQL Server Native Client 11.0};"
                         "SERVER=GR2211336W2;"
                         "DATABASE=Movies_Dataset;"
                         "Trusted_Connection=yes")

cursor = connStr.cursor()
 
delete_table =  """           
                        IF dbo.TableExists('data_table_one') = 1
                             DELETE FROM data_table_one      
                """

insert_values = """ 
                        EXEC [dbo].[store_movies] @Movie_Title = ?, @IMDB_Rating = ?, @Director = ?, @Plot_Summary = ?;
                """   

cursor.execute(delete_table)

for index, row in final_dataset.iterrows():

    params = (row['Movie Title'], row['IMDB Rating'], row['Director'], row['Plot Summary'])
    cursor.execute(insert_values, params)

connStr.commit()

cursor.close()

connStr.close()

In [29]:
column_actors = final_dataset.loc[:, ['Movie Title', 'Actors']]

In [30]:
column_actors.shape

(13364, 2)

In [32]:
s = column_actors.apply(lambda x: pd.Series(x['Actors']), axis=1).stack().reset_index(level=1, drop=True)

s.name = 'Actors'

actor_names = column_actors.drop('Actors', axis=1).join(s)

actor_names['Actors'] = pd.Series(actor_names['Actors'], dtype=object)

actor_names.head(15)

In [33]:
import pyodbc

connStr = pyodbc.connect("DRIVER={SQL Server Native Client 11.0};"
                         "SERVER=GR2211336W2;"
                         "DATABASE=Movies_Dataset;"
                         "Trusted_Connection=yes")

cursor = connStr.cursor()
 
delete_table =  """           
                        IF dbo.TableExists('data_table_two') = 1
                             DELETE FROM data_table_two      
                """

insert_values = """ 
                        EXEC [dbo].[store_actors] @Movie_Title = ?, @Actors = ?;
                """   

cursor.execute(delete_table)

for index, row in actor_names.iterrows():

    params = (row['Movie Title'], row['Actors'])
    cursor.execute(insert_values, params)

connStr.commit()

cursor.close()

connStr.close()

In [34]:
column_plotkeywords = final_dataset.loc[:, ['Movie Title', 'Plot Keywords']]

In [35]:
s = column_plotkeywords.apply(lambda x: pd.Series(x['Plot Keywords']), axis=1).stack().reset_index(level=1, drop=True)

s.name = 'Plot Keywords'

plot_keywords = column_plotkeywords.drop('Plot Keywords', axis=1).join(s)

plot_keywords['Plot Keywords'] = pd.Series(plot_keywords['Plot Keywords'], dtype=object)

plot_keywords.head(15)

Unnamed: 0,Movie Title,Plot Keywords
0,Avatar,avatar
0,Avatar,future
0,Avatar,marine
0,Avatar,native
0,Avatar,paraplegic
1,Pirates of the Caribbean: At World's End,goddess
1,Pirates of the Caribbean: At World's End,marriage
1,Pirates of the Caribbean: At World's End,ceremony
1,Pirates of the Caribbean: At World's End,marriage
1,Pirates of the Caribbean: At World's End,proposal


In [37]:
import pyodbc

connStr = pyodbc.connect("DRIVER={SQL Server Native Client 11.0};"
                         "SERVER=GR2211336W2;"
                         "DATABASE=Movies_Dataset;"
                         "Trusted_Connection=yes")

cursor = connStr.cursor()
 
delete_table =  """           
                        IF dbo.TableExists('data_table_three') = 1
                             DELETE FROM data_table_three      
                """

insert_values = """ 
                        EXEC [dbo].[store_keywords] @Movie_Title = ?, @Plot_Keywords = ?;
                """   

cursor.execute(delete_table)

for index, row in plot_keywords.iterrows():

    params = (row['Movie Title'], row['Plot Keywords'])
    cursor.execute(insert_values, params)

connStr.commit()

cursor.close()

connStr.close()

In [36]:
column_genres = final_dataset.loc[:, ['Movie Title', 'Genres']]

In [43]:
s = column_genres.apply(lambda x: pd.Series(x['Genres']), axis=1).stack().reset_index(level=1, drop=True)

s.name = 'Genres'

genres = column_genres.drop('Genres', axis=1).join(s)

genres['Genres'] = pd.Series(genres['Genres'], dtype=object)

genres.head(15)

Unnamed: 0,Movie Title,Genres
0,Avatar,Action
0,Avatar,Adventure
0,Avatar,Fantasy
1,Pirates of the Caribbean: At World's End,Action
1,Pirates of the Caribbean: At World's End,Adventure
1,Pirates of the Caribbean: At World's End,Fantasy
2,Spectre,Action
2,Spectre,Adventure
2,Spectre,Thriller
3,The Dark Knight Rises,Action


In [44]:
import pyodbc

connStr = pyodbc.connect("DRIVER={SQL Server Native Client 11.0};"
                         "SERVER=GR2211336W2;"
                         "DATABASE=Movies_Dataset;"
                         "Trusted_Connection=yes")

cursor = connStr.cursor()
 
delete_table =  """           
                        IF dbo.TableExists('data_table_four') = 1
                             DELETE FROM data_table_four      
                """

insert_values = """ 
                        EXEC [dbo].[store_genres] @Movie_Title = ?, @Genres = ?;
                """   

cursor.execute(delete_table)

for index, row in genres.iterrows():

    params = (row['Movie Title'], row['Genres'])
    cursor.execute(insert_values, params)

connStr.commit()

cursor.close()

connStr.close()

#### End of Part 1 - Update, clean & transfrom the dataset of movies