In [1]:
import pandas as pd
import numpy as np

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from rake_nltk import Rake

from SPARQLWrapper import SPARQLWrapper, JSON

##### We need to concat most of the separate files into one. Or at least the relevant ones

In [2]:
movie_metadata = pd.read_csv("source/movies_metadata.csv")
links_df = pd.read_csv("source/links.csv")
keywords_df = pd.read_csv("source/keywords.csv")
credits_df = pd.read_csv("source/credits.csv")#

  movie_metadata = pd.read_csv("source/movies_metadata.csv")


In [3]:
movie_metadata['id'] = pd.to_numeric(movie_metadata['id'], errors='coerce')
movie_metadata = movie_metadata[movie_metadata['id'].notna()].astype({'id': int})

#### Clean up all the irrelevant columns

In [4]:
links_df.rename(columns={'movieId':'id'},inplace=True)

In [5]:
full_df = movie_metadata.merge(keywords_df, on='id')
full_df = full_df.merge(credits_df[['cast', 'crew', 'id']], on='id')
full_df = full_df.merge(links_df, on='id')

In [6]:
full_df.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,tagline,title,video,vote_average,vote_count,keywords,cast,crew,imdbId,tmdbId
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,,Toy Story,False,7.7,5415.0,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,...","[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",116985,88224.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,"[{'id': 10090, 'name': 'board game'}, {'id': 1...","[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",78763,42164.0
2,False,,60000000,"[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...",,949,tt0113277,en,Heat,"Obsessive master thief, Neil McCauley leads a ...",...,A Los Angeles Crime Saga,Heat,False,7.7,1886.0,"[{'id': 642, 'name': 'robbery'}, {'id': 703, '...","[{'cast_id': 25, 'character': 'Lt. Vincent Han...","[{'credit_id': '52fe4292c3a36847f802916d', 'de...",48028,220.0
3,False,"{'id': 645, 'name': 'James Bond Collection', '...",58000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...",http://www.mgm.com/view/movie/757/Goldeneye/,710,tt0113189,en,GoldenEye,James Bond must unmask the mysterious head of ...,...,No limits. No fears. No substitutes.,GoldenEye,False,6.6,1194.0,"[{'id': 701, 'name': 'cuba'}, {'id': 769, 'nam...","[{'cast_id': 1, 'character': 'James Bond', 'cr...","[{'credit_id': '52fe426ec3a36847f801e14b', 'de...",115851,23449.0
4,False,,98000000,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",,1408,tt0112760,en,Cutthroat Island,"Morgan Adams and her slave, William Shaw, are ...",...,The Course Has Been Set. There Is No Turning B...,Cutthroat Island,False,5.7,137.0,"[{'id': 911, 'name': 'exotic island'}, {'id': ...","[{'cast_id': 1, 'character': 'Morgan Adams', '...","[{'credit_id': '52fe42f4c3a36847f802f69f', 'de...",104691,9361.0


In [7]:
def convert_dict_list_to_list(column):
    
    if isinstance(column, str) and isinstance(eval(column), list):
        evaluated_list = eval(column)
    else:
        return np.nan #Will be removed

    column_list_item = []
    for dict_item in evaluated_list:
        column_list_item.append(dict_item['name'])
    return column_list_item

In [8]:
#Tagline has NAN
#Won't be prompting original_language
full_df.drop(['adult', 'belongs_to_collection', 'homepage', 'tagline', 'video', 'poster_path'], axis=1, inplace=True)

In [9]:
full_df['genres'] = full_df['genres'].apply(lambda x : convert_dict_list_to_list(x))
full_df['production_countries'] = full_df['production_countries'].apply(lambda x : convert_dict_list_to_list(x))
full_df['production_companies'] = full_df['production_companies'].apply(lambda x : convert_dict_list_to_list(x))
full_df['spoken_languages'] = full_df['spoken_languages'].apply(lambda x : convert_dict_list_to_list(x))
full_df['keywords'] = full_df['keywords'].apply(lambda x : convert_dict_list_to_list(x))
full_df['cast'] = full_df['cast'].apply(lambda x : convert_dict_list_to_list(x))
full_df['crew'] = full_df['crew'].apply(lambda x : convert_dict_list_to_list(x))

#### Remove rows that are not in English, we focus solely on English movies

In [10]:
full_df = full_df[full_df['original_language'] == 'en']

#### Get important keywords from overview and merge with keywords column and remove duplicates

In [11]:
def get_keywords_from_overview(overview):
    if pd.isnull(overview) == False:
        r = Rake()
        r.extract_keywords_from_text(overview)
        key_words_dict_scores = r.get_word_degrees()

        return list(key_words_dict_scores.keys())
    else:
        return []

In [12]:
full_df['overview'] = full_df['overview'].astype("string")

In [13]:
full_df['overview'].isna

<bound method Series.isna of 0       Led by Woody, Andy's toys live happily in his ...
1       When siblings Judy and Peter discover an encha...
2       Obsessive master thief, Neil McCauley leads a ...
3       James Bond must unmask the mysterious head of ...
4       Morgan Adams and her slave, William Shaw, are ...
                              ...                        
7754    A deformed tormented girl drowns herself after...
7755    A group of fashion models disturb the tomb of ...
7756    A less-than-qualified and far-from-perfect pri...
7758          An abstract animation from Walter Ruttmann.
7764    A stranger named Silas flees from a devastatin...
Name: overview, Length: 5533, dtype: string>

In [14]:
full_df.dtypes

budget                   object
genres                   object
id                        int32
imdb_id                  object
original_language        object
original_title           object
overview                 string
popularity               object
production_companies     object
production_countries     object
release_date             object
revenue                 float64
runtime                 float64
spoken_languages         object
status                   object
title                    object
vote_average            float64
vote_count              float64
keywords                 object
cast                     object
crew                     object
imdbId                    int64
tmdbId                  float64
dtype: object

In [15]:
full_df.head()

Unnamed: 0,budget,genres,id,imdb_id,original_language,original_title,overview,popularity,production_companies,production_countries,...,spoken_languages,status,title,vote_average,vote_count,keywords,cast,crew,imdbId,tmdbId
0,30000000,"[Animation, Comedy, Family]",862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,[Pixar Animation Studios],[United States of America],...,[English],Released,Toy Story,7.7,5415.0,"[jealousy, toy, boy, friendship, friends, riva...","[Tom Hanks, Tim Allen, Don Rickles, Jim Varney...","[John Lasseter, Joss Whedon, Andrew Stanton, J...",116985,88224.0
1,65000000,"[Adventure, Fantasy, Family]",8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,"[TriStar Pictures, Teitler Film, Interscope Co...",[United States of America],...,"[English, Français]",Released,Jumanji,6.9,2413.0,"[board game, disappearance, based on children'...","[Robin Williams, Jonathan Hyde, Kirsten Dunst,...","[Larry J. Franco, Jonathan Hensleigh, James Ho...",78763,42164.0
2,60000000,"[Action, Crime, Drama, Thriller]",949,tt0113277,en,Heat,"Obsessive master thief, Neil McCauley leads a ...",17.924927,"[Regency Enterprises, Forward Pass, Warner Bros.]",[United States of America],...,"[English, Español]",Released,Heat,7.7,1886.0,"[robbery, detective, bank, obsession, chase, s...","[Al Pacino, Robert De Niro, Val Kilmer, Jon Vo...","[Michael Mann, Michael Mann, Art Linson, Micha...",48028,220.0
3,58000000,"[Adventure, Action, Thriller]",710,tt0113189,en,GoldenEye,James Bond must unmask the mysterious head of ...,14.686036,"[United Artists, Eon Productions]","[United Kingdom, United States of America]",...,"[English, Pусский, Español]",Released,GoldenEye,6.6,1194.0,"[cuba, falsely accused, secret identity, compu...","[Pierce Brosnan, Sean Bean, Izabella Scorupco,...","[Martin Campbell, Ian Fleming, Jeffrey Caine, ...",115851,23449.0
4,98000000,"[Action, Adventure]",1408,tt0112760,en,Cutthroat Island,"Morgan Adams and her slave, William Shaw, are ...",7.284477,"[Le Studio Canal+, Laurence Mark Productions, ...","[France, Germany, Italy, United States of Amer...",...,"[English, Latin]",Released,Cutthroat Island,5.7,137.0,"[exotic island, treasure, map, ship, scalp, pi...","[Geena Davis, Matthew Modine, Frank Langella, ...","[Peter Levy, Maggie Gray, Norman Garwood, Mari...",104691,9361.0


In [16]:
full_df['overview'] = full_df['overview'].apply(lambda x : get_keywords_from_overview(x))

#### Actual Processing of files

In [17]:
def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    list_of_text = text

    if (len(list_of_text) == 0):
        return []

    filtered_sentence = []

    for w in list_of_text:
        lower_text = w.lower()
        if lower_text not in stop_words:
            filtered_sentence.append(lower_text)

    return filtered_sentence

In [18]:
full_df['overview'] = full_df['overview'].apply(lambda x : remove_stopwords(x))
full_df['genres'] = full_df['genres'].astype("string")
full_df['overview'] = full_df['overview'].astype("string")
full_df['production_companies'] = full_df['production_companies'].astype("string")
full_df['production_countries'] = full_df['production_countries'].astype("string")
full_df['keywords'] = full_df['keywords'].astype("string")
full_df['cast'] = full_df['cast'].astype("string")
full_df['crew'] = full_df['crew'].astype("string")

In [19]:
def combine_all_into_bow(row):
    # genre_row = row['genres']
    # overview_row = row['overview']
    # production_companies_row = row['production_companies']
    # production_countries_row = row['production_countries']
    # keywords_row = row['keywords']
    # cast_row = row['cast']
    # crew_row = row['crew']
    # title_row = row['title']

    genre_row = eval(row['genres'])
    overview_row = eval(row['overview'])
    production_companies_row = eval(row['production_companies'])
    production_countries_row = eval(row['production_countries'])
    keywords_row = eval(row['keywords'])
    cast_row = eval(row['cast'])
    crew_row = eval(row['crew'])
    title_row = row['title']

    #Note: Overview keywords will be merged with keywords row to ensure no duplicates.
    overview_keyword_merge = []
    overview_keyword_merge.extend(overview_row)
    overview_keyword_merge.extend(keywords_row)
    
    keywords_no_dup = list(set(overview_keyword_merge))

    #Lowercase all rows that has them
    l_genre_row = map(lambda x : x.lower(), genre_row)
    l_production_companies_row = map(lambda x : x.lower(), production_companies_row)
    l_production_countries_row = map(lambda x : x.lower(), production_countries_row)
    l_cast_row = map(lambda x : x.lower(), cast_row)
    l_crew_row = map(lambda x : x.lower(), crew_row)

    bow_merge = []

    bow_merge.extend(keywords_no_dup)
    bow_merge.extend(l_genre_row)
    bow_merge.extend(l_production_companies_row)
    bow_merge.extend(l_production_countries_row)
    bow_merge.extend(l_cast_row)
    bow_merge.extend(l_crew_row)
    bow_merge.append(title_row.lower())

    return ' '.join(bow_merge)

In [20]:
full_df['bag_of_words'] = full_df.apply(lambda x : combine_all_into_bow(x), axis=1)

In [21]:
final_df = full_df[["title", "vote_average", "release_date", "popularity" ,"bag_of_words", "imdb_id", "tmdbId"]]

In [22]:
final_df

Unnamed: 0,title,vote_average,release_date,popularity,bag_of_words,imdb_id,tmdbId
0,Toy Story,7.7,1995-10-30,21.946943,friends woody learns lightyear place live scen...,tt0114709,88224.0
1,Jumanji,6.9,1995-12-15,17.015539,recluse living -- adult monkeys evil rhinocero...,tt0113497,42164.0
2,Heat,7.7,1995-12-15,17.924927,aware shooting hanna thief cat without neil ba...,tt0113277,220.0
3,GoldenEye,6.6,1995-11-16,14.686036,mysterious goldeneye prevent syndicate secret ...,tt0113189,23449.0
4,Cutthroat Island,5.7,1995-12-22,7.284477,map leadership pirate raids shaw adams murdero...,tt0112760,9361.0
...,...,...,...,...,...,...,...
7754,Frankenstein Created Woman,5.9,1967-03-15,2.302582,responsible guillotined tormented violent girl...,tt0061683,9013.0
7755,Dawn of the Mummy,3.6,1981-12-11,0.283443,mummy curse fashion disturb gore rising desert...,tt0082237,28667.0
7756,The Pope Must Die,4.2,1991-06-21,9.015632,vatican old qualified less named must perfect ...,tt0102691,15952.0
7758,Opus II,6.7,1921-12-31,0.177238,abstract ruttmann animation walter animation g...,tt0403386,415892.0


In [23]:
## Remove duplicated titles if any
final_df = final_df.drop_duplicates(subset="title")

In [24]:
sparql = SPARQLWrapper("https://query.wikidata.org/sparql")

sparql.setQuery("""SELECT ?item ?IMDb_ID ?sitelink WHERE {

              {
                {
                ?item wdt:P31 /wdt:P279* wd:Q11424 .
              ?item wdt:P345 ?IMDb_ID .
              ?sitelink schema:about ?item ; schema:isPartOf <https://en.wikipedia.org/> .
              }
              }
            UNION {
              ?item wdt:P31 /wdt:P279* wd:Q1259759 .
              ?item wdt:P345 ?IMDb_ID .
              ?sitelink schema:about ?item ; schema:isPartOf <https://en.wikipedia.org/> .
            }
                  

            }""")
sparql.setReturnFormat(JSON)
results = sparql.query().convert()
results_df = pd.json_normalize(results['results']['bindings'])
results_df = results_df[['IMDb_ID.value','sitelink.value']]
results_df.rename(columns={'IMDb_ID.value':'imdbId', 'sitelink.value':"wikipediaId"}, inplace=True)
results_df.drop_duplicates(inplace=True)

In [25]:
final_df2 = final_df.copy()

In [26]:
final_df = final_df2.copy()

In [27]:
final_df

Unnamed: 0,title,vote_average,release_date,popularity,bag_of_words,imdb_id,tmdbId
0,Toy Story,7.7,1995-10-30,21.946943,friends woody learns lightyear place live scen...,tt0114709,88224.0
1,Jumanji,6.9,1995-12-15,17.015539,recluse living -- adult monkeys evil rhinocero...,tt0113497,42164.0
2,Heat,7.7,1995-12-15,17.924927,aware shooting hanna thief cat without neil ba...,tt0113277,220.0
3,GoldenEye,6.6,1995-11-16,14.686036,mysterious goldeneye prevent syndicate secret ...,tt0113189,23449.0
4,Cutthroat Island,5.7,1995-12-22,7.284477,map leadership pirate raids shaw adams murdero...,tt0112760,9361.0
...,...,...,...,...,...,...,...
7754,Frankenstein Created Woman,5.9,1967-03-15,2.302582,responsible guillotined tormented violent girl...,tt0061683,9013.0
7755,Dawn of the Mummy,3.6,1981-12-11,0.283443,mummy curse fashion disturb gore rising desert...,tt0082237,28667.0
7756,The Pope Must Die,4.2,1991-06-21,9.015632,vatican old qualified less named must perfect ...,tt0102691,15952.0
7758,Opus II,6.7,1921-12-31,0.177238,abstract ruttmann animation walter animation g...,tt0403386,415892.0


In [None]:
#final_df['imdbId'] = np.where("tt" in final_df['imdbId'].astype(str), final_df['imdbId'].astype(str), "tt"+final_df['imdbId'].astype(str))
final_df = final_df.merge(results_df, on="imdb_id")


In [None]:
final_df.to_csv("source/final_comparison_df.csv")

In [None]:
final_df

In [None]:
final_df.iloc[0]['bag_of_words']