#### Import the libraries

In [2]:
# For cleaning and preparing the dataset
# -> dataframe manipulation
# -> text manipulation
# -> Web Scrapping

import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import re


# Module to serialize the content produced from the execution of the code

import pickle


# Module to monitor the progress of a python for loop

from tqdm import tqdm


# Module to manipulate text in python - NLTK package

import nltk
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords


# Module to compute word vectorizers and compute the cosine distance

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_distances

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

#### Import the dataset

In [3]:
dataset = pd.read_pickle('final_dataset_07112019.pkl') #previous version "final_dataset_22102019.pkl"

In [4]:
dataset = dataset.join(pd.DataFrame(dataset.Genres.values.tolist(), dataset.index).add_prefix('Genres_').fillna(0))

In [5]:
dataset = dataset.drop(['Genres_3', 'Genres_4', 'Genres_5', 'Genres_6', 'Genres_7', 'Genres_8'], axis=1)

In [6]:
dataset.tail(15)

Unnamed: 0,Movie Title,IMDB Url,IMDB Rating,Actors,Director,Plot Summary,Plot Keywords,Genres,Genres_0,Genres_1,Genres_2
13349,Jackass Nummer zwei (2006),http://www.imdb.com/title/tt0493430/,7.0,"[Johnny Knoxville, Bam Margera, Steve-O, Chris...",Jeff Tremaine,"\n Chris Pontius, Johnny Kn...","[pubic hair, male rear nudity, male frontal nu...","[Documentary, Action, Comedy]",Documentary,Action,Comedy
13350,Gangster (2006),http://www.imdb.com/title/tt0495032/,7.1,"[Kangana Ranaut, Shiney Ahuja, Emraan Hashmi, ...",Anurag Basu,"\n In Seoul, a gangster's a...","[underworld don, unrequited love, singer in re...","[Action, Crime, Drama, Romance, Thriller]",Action,Crime,Drama
13351,Budd Boetticher: An American Original (Video 2...,http://www.imdb.com/title/tt0495813/,6.8,"[Fred Bailey, Budd Boetticher, Peter Bogdanovi...",Bruce Ricker,\n Documentary on the under...,[character name in title],[Documentary],Documentary,0,0
13352,Die Gräfin (2009),http://www.imdb.com/title/tt0496634/,6.2,"[Julie Delpy, Daniel Brühl, William Hurt, Anam...",Julie Delpy,\n A 17th century Hungarian...,"[fear of aging, female serial killer, blood, i...","[Biography, Drama, History, Horror, Thriller]",Biography,Drama,History
13353,Eine unbequeme Wahrheit (2006),http://www.imdb.com/title/tt0497116/,7.4,"[Al Gore, Billy West]",Davis Guggenheim,\n Filmmaker Davis Guggenhe...,"[climate, global warming, global climate chang...","[Documentary, News]",Documentary,News,0
13354,Adrift in Manhattan (2007),http://www.imdb.com/title/tt0497316/,5.5,"[Heather Graham, Victor Rasuk, Dominic Chianes...",Alfredo Rodriguez de Villa,\n The lives of three lonel...,"[deep cleavage, older woman younger man sex, s...",[Drama],Drama,0,0
13355,Anamorph - Die Kunst zu töten (2007),http://www.imdb.com/title/tt0497323/,5.5,"[Willem Dafoe, Scott Speedman, Don Harvey, Jam...",Henry Miller,\n A psychological thriller...,"[eyesight, murder, detective, anamorphosis, po...","[Crime, Horror, Thriller]",Crime,Horror,Thriller
13356,Working with Animals: 'The Scorpion King' (Vid...,http://www.imdb.com/title/tt0497667/,6.6,"[Chuck Russell, Richard Luke Rothschild, Sherr...",Chuck Russell,\n A behind the scenes look...,"[behind the scenes, filmmaking]","[Documentary, Short]",Documentary,Short,0
13357,Gwai wik (2006),http://www.imdb.com/title/tt0498311/,6.1,"[Angelica Lee, Soi Cheang, Ekin Cheng, Lawrenc...",Danny Pang,\n After writing three best...,"[downward spiral, mind bending, supernatural p...","[Fantasy, Horror, Mystery, Thriller]",Fantasy,Horror,Mystery
13358,Hostel 2 (2007),http://www.imdb.com/title/tt0498353/,5.5,"[Lauren German, Roger Bart, Heather Matarazzo,...",Eli Roth,\n Three American college s...,"[breasts, torture, extreme violence, tied feet...",[Horror],Horror,0,0


# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

#### Cleaning the columns

##### Movie Title (25.10.2019)

##### Step 1: Clean te year (i.e 2001) from each title - 25.10.2019

In [12]:
exp = r'\(\d\d\d\d.'

dataset['Movie Title'] = dataset['Movie Title'].apply(lambda x: re.sub(exp,"",x).strip())

In [13]:
dataset['Plot Summary'] = dataset['Plot Summary'].apply(lambda x: x.replace('\\', ''))

In [14]:
dataset['Movie Title'].iloc[-1]

"VeggieTales: Where's God When I'm S-Scared? (Video 1993)"

##### Step 2: Clean the similar movies (based on the similar actors they have) - 29.10.2019

In [15]:
dataset['Actors'] = dataset['Actors'].apply(lambda x: ', '.join(x))

In [16]:
dataset['Actors'] = dataset['Actors'].apply(lambda x: x.replace('-', ' '))

In [17]:
dataset[dataset['Movie Title'].str.contains('Avatar')]

Unnamed: 0,Movie Title,IMDB Url,IMDB Rating,Actors,Director,Plot Summary,Plot Keywords,Genres,Genres_0,Genres_1,Genres_2
0,Avatar,http://www.imdb.com/title/tt0499549/?ref_=fn_t...,7.8,"Sam Worthington, Zoe Saldana, Sigourney Weaver...",James-Cameron,A paraplegic Marine dispatched to the moon Pan...,"[avatar, future, marine, native, paraplegic]","[Action, Adventure, Fantasy]",Action,Adventure,Fantasy
5468,Avatar - Aufbruch nach Pandora,http://www.imdb.com/title/tt0499549/,7.8,"Sam Worthington, Zoe Saldana, Sigourney Weaver...",James Cameron,\n A paraplegic Marine disp...,"[spiritualism, paraplegic, marine, future, for...","[Action, Adventure, Fantasy, Sci-Fi]",Action,Adventure,Fantasy
8765,Aliens vs. Avatars,http://www.imdb.com/title/tt1854506/,1.5,"Jason Lockhart, Kim Argetsinger, Cassie Fliege...",Lewis Schoenbrun,\n Six college friends find...,"[arm ripped off, decapitation, camping, toples...","[Horror, Sci-Fi]",Horror,Sci-Fi,0


In [None]:
# dataset_duplicated = dataset

# dataset_duplicated[dataset_duplicated.duplicated(['Actors'], keep='last')]

In [18]:
dataset = dataset.drop_duplicates('Actors', keep='first')

In [19]:
dataset.shape

(10602, 11)

In [20]:
dataset[dataset['Movie Title'].str.contains("Avatar")]

Unnamed: 0,Movie Title,IMDB Url,IMDB Rating,Actors,Director,Plot Summary,Plot Keywords,Genres,Genres_0,Genres_1,Genres_2
0,Avatar,http://www.imdb.com/title/tt0499549/?ref_=fn_t...,7.8,"Sam Worthington, Zoe Saldana, Sigourney Weaver...",James-Cameron,A paraplegic Marine dispatched to the moon Pan...,"[avatar, future, marine, native, paraplegic]","[Action, Adventure, Fantasy]",Action,Adventure,Fantasy
8765,Aliens vs. Avatars,http://www.imdb.com/title/tt1854506/,1.5,"Jason Lockhart, Kim Argetsinger, Cassie Fliege...",Lewis Schoenbrun,\n Six college friends find...,"[arm ripped off, decapitation, camping, toples...","[Horror, Sci-Fi]",Horror,Sci-Fi,0


##### Actors (29.10.2019)

In [21]:
dataset['Actors'] = dataset['Actors'].apply(lambda x: list(x.split(", ")))

##### Director (29.10.2019)

In [22]:
dataset['Director'] = dataset['Director'].apply(lambda x: x.replace('-', ' '))

##### Plot Summary (29.10.2019)

In [23]:
dataset['Plot Summary'].iloc[4774]

'\n                    Two bounty hunters with the same intentions team up to track down a Western outlaw.\n            '

In [24]:
dataset['Plot Summary'] = dataset['Plot Summary'].apply(lambda x: x.strip())

dataset['Plot Summary'] = dataset['Plot Summary'].apply(lambda x: x.replace(',', ''))

dataset['Plot Summary'] = dataset['Plot Summary'].apply(lambda x: x.replace('.', ''))

dataset['Plot Summary'] = dataset['Plot Summary'].apply(lambda x: x.replace('?', ''))

dataset['Plot Summary'] = dataset['Plot Summary'].apply(lambda x: x.replace('!', ''))

dataset['Plot Summary'] = dataset['Plot Summary'].apply(lambda x: x.replace('\n                    See full summary\xa0»', ''))

dataset['Plot Summary'] = dataset['Plot Summary'].apply(lambda x: x.rstrip())

In [25]:
dataset['Plot Summary'].iloc[-1]

'"Tales from the Crisper": Junior Asparagus after watching a scary Frankencelery movie is afraid to go to sleep; he gets help from Bob the Tomato and Larry the Cucumber who teach him that'

##### Genres (30.10.2019)

In [26]:
dataset['Genres'] = dataset['Genres'].apply(lambda x: list(filter(lambda a: a != '0', x)))

In [27]:
dataset.Genres.iloc[19]

['Adventure', 'Fantasy']

In [28]:
dataset = dataset.drop(['Genres_0', 'Genres_1', 'Genres_2'], axis=1)

In [29]:
print(dataset.head())

print(dataset.shape)

                                Movie Title  \
0                                    Avatar   
1  Pirates of the Caribbean: At World's End   
2                                   Spectre   
3                     The Dark Knight Rises   
4                               John Carter   

                                            IMDB Url IMDB Rating  \
0  http://www.imdb.com/title/tt0499549/?ref_=fn_t...         7.8   
1  http://www.imdb.com/title/tt0449088/?ref_=fn_t...         7.1   
2  http://www.imdb.com/title/tt2379713/?ref_=fn_t...         6.8   
3  http://www.imdb.com/title/tt1345836/?ref_=fn_t...         8.4   
4  http://www.imdb.com/title/tt0401729/?ref_=fn_t...         6.6   

                                              Actors           Director  \
0  [Sam Worthington, Zoe Saldana, Sigourney Weave...      James Cameron   
1  [Johnny Depp, Geoffrey Rush, Orlando Bloom, Ke...     Gore Verbinski   
2  [Daniel Craig, Christoph Waltz, Léa Seydoux, R...         Sam Mendes   
3  [Chri

##### Create the column Combined Features (29.10.2019)

In [30]:
# The columns that will be used to create the column "Combined Features"

def combine_features(row):
    return row['Movie Title'].lower() + " " + ' '.join(map(str, row['Actors'])).lower() + " " + row['Director'].lower() + " " + row['Plot Summary'].lower() + " " + ' '.join(row['Plot Keywords']).lower() + " " + ' '.join(row['Genres']).lower()

def combine_actors(row):
    return ', '.join(row['Actors'])

dataset["Combined_Features"] = dataset.apply(combine_features, axis=1)

dataset['Actors_Embeddings'] = dataset.apply(combine_actors, axis=1)

## Note: Instead of creating the Actor Embeddings column, vectorize the different name of each actor

In [31]:
dataset["Combined_Features"].iloc[0]

'avatar sam worthington zoe saldana sigourney weaver stephen lang michelle rodriguez giovanni ribisi joel david moore cch pounder wes studi laz alonso dileep rao matt gerald sean anthony moran jason whyte scott lawrence james cameron a paraplegic marine dispatched to the moon pandora on a unique mission becomes torn between following his orders and protecting the world he feels is his home avatar future marine native paraplegic action adventure fantasy'

In [32]:
dataset['Actors_Embeddings'].iloc[0]

'Sam Worthington, Zoe Saldana, Sigourney Weaver, Stephen Lang, Michelle Rodriguez, Giovanni Ribisi, Joel David Moore, CCH Pounder, Wes Studi, Laz Alonso, Dileep Rao, Matt Gerald, Sean Anthony Moran, Jason Whyte, Scott Lawrence'

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

#### Pickle the dataset (30.10.2019)

In [36]:
dataset.to_pickle('dataset_part_2_07112019.pkl') #previous version "dataset_30102019.pkl"

In [35]:
dataset.head()

10602

#### End of part 2 (Transforming the columns of the final dataset)