# Labsheet 8 task - 18001981

### utils

In [1]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# We improve our feature matrix builder with 3 additional optional parameters
# This allows us to extract not only word features, but also n-gram features
# We can also set the minimum and maximum frequencies to be considered as valid
# NB: All these are simply passed on to sklearn's Vectorizer classes
def build_feature_matrix(documents, feature_type='frequency',
                         ngram_range=(1, 1), min_df=0.0, max_df=1.0):
    """Fit a bag-of-words style vectorizer on `documents` and return it with the features.

    Parameters
    ----------
    documents : iterable of str
        Documents passed to the vectorizer's `fit_transform`.
    feature_type : str
        One of 'binary', 'frequency' or 'tfidf'; compared case-insensitively
        after stripping surrounding whitespace.
    ngram_range, min_df, max_df
        Forwarded unchanged to the sklearn vectorizer.

    Returns
    -------
    tuple
        `(vectorizer, feature_matrix)` where `feature_matrix` is the fitted
        output cast to float.

    Raises
    ------
    ValueError
        If `feature_type` is not one of the supported values. `ValueError`
        is a subclass of `Exception`, so callers catching `Exception` still work.
    """
    feature_type = feature_type.lower().strip()

    # Validate first so a bad argument fails before any vectorizer is built.
    if feature_type not in ('binary', 'frequency', 'tfidf'):
        raise ValueError("Wrong feature type entered. Possible values: 'binary', 'frequency', 'tfidf'")

    # Options shared by every vectorizer flavour.
    shared_options = dict(min_df=min_df, max_df=max_df, ngram_range=ngram_range)

    if feature_type == 'tfidf':
        vectorizer = TfidfVectorizer(**shared_options)
    else:
        # 'binary' and 'frequency' differ only in whether counts are clipped to 0/1.
        vectorizer = CountVectorizer(binary=(feature_type == 'binary'),
                                     **shared_options)

    feature_matrix = vectorizer.fit_transform(documents).astype(float)

    return vectorizer, feature_matrix



### contractions

In [2]:
# Lookup table from English contractions to their expanded forms.
# Both "I..." and "i..." spellings are listed so exact-case lookups succeed.
CONTRACTION_MAP = {
"ain't": "is not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he'll've": "he will have",
"he's": "he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how is",
"I'd": "I would",
"I'd've": "I would have",
"I'll": "I will",
"I'll've": "I will have",
"I'm": "I am",
"I've": "I have",
"i'd": "i would",
"i'd've": "i would have",
"i'll": "i will",
"i'll've": "i will have",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'd've": "it would have",
"it'll": "it will",
"it'll've": "it will have",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she would",
"she'd've": "she would have",
"she'll": "she will",
"she'll've": "she will have",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so as",
"that'd": "that would",
"that'd've": "that would have",
"that's": "that is",
"there'd": "there would",
"there'd've": "there would have",
"there's": "there is",
"they'd": "they would",
"they'd've": "they would have",
"they'll": "they will",
"they'll've": "they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
"we'd": "we would",
"we'd've": "we would have",
"we'll": "we will",
"we'll've": "we will have",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what'll've": "what will have",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"when's": "when is",
"when've": "when have",
"where'd": "where did",
"where's": "where is",
"where've": "where have",
"who'll": "who will",
"who'll've": "who will have",
"who's": "who is",
"who've": "who have",
"why's": "why is",
"why've": "why have",
"will've": "will have",
"won't": "will not",
"won't've": "will not have",
"would've": "would have",
"wouldn't": "would not",
"wouldn't've": "would not have",
"y'all": "you all",
"y'all'd": "you all would",
"y'all'd've": "you all would have",
"y'all're": "you all are",
"y'all've": "you all have",
"you'd": "you would",
"you'd've": "you would have",
"you'll": "you will",
"you'll've": "you will have",
"you're": "you are",
"you've": "you have"
}

### normalization

In [3]:
import re
import nltk
import string
from nltk.stem import WordNetLemmatizer
from html.parser import HTMLParser
import unicodedata
import html

# NLTK's English stopwords, extended with filler words specific to this
# domain (titles, number words and very common verbs). 'tell' appears twice
# in the extension; the duplicate is harmless for membership tests.
stopword_list = nltk.corpus.stopwords.words('english') + [
    'mr', 'mrs', 'come', 'go', 'get', 'tell', 'listen',
    'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight',
    'nine', 'zero',
    'join', 'find', 'make', 'say', 'ask', 'tell', 'see', 'try', 'back',
    'also',
]

# Single lemmatizer instance shared by the normalization helpers.
wnl = WordNetLemmatizer()

def tokenize_text(text):
    """Split `text` with `nltk.word_tokenize` and strip whitespace from each token."""
    return [raw_token.strip() for raw_token in nltk.word_tokenize(text)]

def expand_contractions(text, contraction_mapping):
    """Replace every contraction found in `text` by its expansion.

    Parameters
    ----------
    text : str
        Input text.
    contraction_mapping : dict
        Maps a contraction (e.g. "can't") to its expansion (e.g. "cannot").
        Matching is case-insensitive; an exact-case key is preferred when present.

    Returns
    -------
    str
        The text with contractions expanded and every remaining apostrophe
        removed.

    Notes
    -----
    * Longer contractions are tried first, so "can't've" is expanded as a
      whole instead of being split into "can't" + "'ve".
    * Only the *case* of the first letter of the match is carried over to the
      expansion. Copying the character itself turns "ain't" -> "is not" into
      "as not".
    """
    # Longest-first alternation: regex alternation picks the first alternative
    # that matches, not the longest one. Keys are escaped so they are always
    # treated literally; empty keys are skipped because they match everywhere.
    alternatives = sorted((key for key in contraction_mapping if key),
                          key=len, reverse=True)
    if not alternatives:
        # Nothing to expand; still apply the apostrophe clean-up below.
        return re.sub("'", "", text)

    # Case-insensitive fallback lookup for matches such as "Can't" or "I'M".
    lowercase_lookup = {key.lower(): value
                        for key, value in contraction_mapping.items()}

    contractions_pattern = re.compile(
        '({})'.format('|'.join(re.escape(key) for key in alternatives)),
        flags=re.IGNORECASE|re.DOTALL)

    def expand_match(contraction):
        match = contraction.group(0)
        expanded_contraction = contraction_mapping.get(match)
        if not expanded_contraction:
            expanded_contraction = lowercase_lookup.get(match.lower())
        if not expanded_contraction:
            # No usable expansion (missing or empty): leave the text as it was.
            return match
        # Carry over capitalisation from the first letter of the match
        # (skipping a leading apostrophe as in "'Cause").
        first_letter = next((char for char in match if char.isalpha()), '')
        if first_letter.isupper():
            expanded_contraction = (expanded_contraction[0].upper()
                                    + expanded_contraction[1:])
        return expanded_contraction

    expanded_text = contractions_pattern.sub(expand_match, text)
    # Drop apostrophes that were not part of a known contraction.
    expanded_text = re.sub("'", "", expanded_text)
    return expanded_text
    
    
# from pattern.en import tag (if using python 2.x only)
from nltk.corpus import wordnet as wn

# Annotate text tokens with POS tags
def pos_tag_text(text):
    """Tokenize `text` and return `(lowercased_word, wordnet_pos)` pairs.

    The Penn Treebank tag from `nltk.pos_tag` is mapped to a WordNet
    part-of-speech constant based on its first letter (J, V, N, R); any other
    tag maps to `None`.
    """
    # First letter of a Penn tag -> WordNet POS constant.
    first_letter_to_wn = {
        'J': wn.ADJ,
        'V': wn.VERB,
        'N': wn.NOUN,
        'R': wn.ADV,
    }

    def penn_to_wn_tags(pos_tag):
        # Slicing (not indexing) keeps an empty tag from raising.
        return first_letter_to_wn.get(pos_tag[:1])

    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(text))

    tagged_lower_text = []
    for word, penn_tag in tagged_tokens:
        tagged_lower_text.append((word.lower(), penn_to_wn_tags(penn_tag)))
    return tagged_lower_text
    
# lemmatize text based on POS tags    
def lemmatize_text(text):
    """Return `text` lowercased and lemmatized, with tokens joined by single spaces.

    Tokens whose POS tag maps to a WordNet category are lemmatized with that
    category; all other tokens are kept as their lowercased form.
    """
    lemmas = []
    for word, wordnet_pos in pos_tag_text(text):
        if wordnet_pos:
            lemmas.append(wnl.lemmatize(word, wordnet_pos))
        else:
            lemmas.append(word)
    return ' '.join(lemmas)
    

def remove_special_characters(text):
    """Replace every `string.punctuation` character in each token with a space.

    Tokens are re-joined with single spaces. Only tokens that end up empty are
    dropped; a token made purely of punctuation becomes spaces and is kept.
    """
    punctuation_re = re.compile('[{}]'.format(re.escape(string.punctuation)))
    substituted = [punctuation_re.sub(' ', token) for token in tokenize_text(text)]
    return ' '.join(piece for piece in substituted if piece)
    
    
def remove_stopwords(text):
    """Drop every token of `text` that appears in `stopword_list`.

    The comparison is an exact, case-sensitive membership test; surviving
    tokens are joined with single spaces.
    """
    surviving_tokens = []
    for token in tokenize_text(text):
        if token in stopword_list:
            continue
        surviving_tokens.append(token)
    return ' '.join(surviving_tokens)

def keep_text_characters(text):
    """Keep only the tokens that contain at least one ASCII letter.

    Purely numeric or symbolic tokens are discarded; the kept tokens are
    joined with single spaces.
    """
    has_letter = re.compile('[a-zA-Z]')
    return ' '.join(token for token in tokenize_text(text)
                    if has_letter.search(token))

# We generalize our normalize function to be able to focus on text tokens alone - 3rd parameter
def normalize_corpus(corpus, lemmatize=True, only_text_chars=False, tokenize=False):
    """Normalize every document in `corpus` and return the results as a list.

    Steps applied to each document, in order: HTML-entity unescaping,
    contraction expansion, lemmatization (or plain lowercasing when
    `lemmatize` is false), punctuation removal and stopword removal.

    Parameters
    ----------
    corpus : iterable of str
        Raw documents.
    lemmatize : bool
        Lemmatize (which also lowercases) instead of only lowercasing.
    only_text_chars : bool
        When true, additionally drop tokens containing no ASCII letter.
    tokenize : bool
        When true, each result is a list of tokens instead of a string.
    """
    def _normalize_document(document):
        document = html.unescape(document)
        document = expand_contractions(document, CONTRACTION_MAP)
        document = lemmatize_text(document) if lemmatize else document.lower()
        document = remove_stopwords(remove_special_characters(document))
        # Optionally keep only tokens with letters (e.g. ignore pure numbers).
        if only_text_chars:
            document = keep_text_characters(document)
        return tokenize_text(document) if tokenize else document

    return [_normalize_document(document) for document in corpus]
   

In [4]:
import numpy as np

def compute_cosine_similarity(doc_features, corpus_features,
                              top_n=3):
    """Rank corpus documents by dot product with a query document.

    The score is a plain dot product, so it equals the cosine similarity only
    when both inputs hold L2-normalized rows.
    NOTE(review): presumably the inputs are TF-IDF rows that are already
    unit-length; confirm against the vectorizer settings.

    Parameters
    ----------
    doc_features : 2-D sparse or dense matrix
        Query features; only the first row is used.
    corpus_features : 2-D sparse or dense matrix
        One row per corpus document, same number of columns as the query.
    top_n : int
        Number of best matches to return.

    Returns
    -------
    list of (index, score)
        Corpus row indices with the highest scores, best first, scores
        rounded to 3 decimals.
    """
    # Only the first row of the query is compared.
    doc_vector = doc_features[0]
    # `@` is a true matrix product for both sparse and dense inputs, whereas
    # np.dot is not sparse-aware.
    similarity = doc_vector @ corpus_features.T
    if hasattr(similarity, 'toarray'):
        similarity = similarity.toarray()
    similarity = np.asarray(similarity).ravel()
    # Get docs with highest similarity scores
    top_docs = similarity.argsort()[::-1][:top_n]
    top_docs_with_score = [(index, round(similarity[index], 3))
                            for index in top_docs]
    return top_docs_with_score

In [5]:
def getMovieTitle(movie_data,synopsis):
    """Return the "Title" of the first row in which any cell equals `synopsis`.

    Parameters
    ----------
    movie_data : pandas.DataFrame
        Must contain a "Title" column.
    synopsis : str
        Value to look for; compared for exact equality against every cell.

    Raises
    ------
    IndexError
        If no row contains `synopsis`.
    """
    # `axis` must be passed by keyword: pandas >= 2.0 rejects it positionally.
    matching_rows = movie_data[movie_data.eq(synopsis).any(axis=1)]
    if matching_rows.empty:
        raise IndexError("No row of movie_data contains the given synopsis")
    return matching_rows.iloc[0]["Title"]

### Finding top 3 closest movie titles for each movie synopsis

In [6]:
import pandas as pd

# Load the movie dataset; later cells read its "Title" and "Synopsis" columns.
# NOTE(review): the relative path assumes movie_data.csv is in the working directory.
movie_data = pd.read_csv('movie_data.csv')
# Preview the first rows as a quick sanity check of the load.
print(movie_data.head())

                      Title                                           Synopsis
0             The Godfather  In late summer 1945, guests are gathered for t...
1  The Shawshank Redemption  In 1947, Andy Dufresne (Tim Robbins), a banker...
2          Schindler's List  The relocation of Polish Jews from surrounding...
3               Raging Bull  The film opens in 1964, where an older and fat...
4                Casablanca  In the early years of World War II, December 1...


In [7]:
# Normalize every synopsis into a single cleaned, lemmatized string per movie.
corpus = normalize_corpus(movie_data["Synopsis"], lemmatize=True)

# Fit a unigram TF-IDF model on the normalized corpus; min_df/max_df are
# left at their fully permissive values.
tfidf_vectorizer, tfidf_features = build_feature_matrix(corpus,
                                                        feature_type='tfidf',
                                                        ngram_range=(1, 1), 
                                                        min_df=0.0, max_df=1.0)

# Project the same corpus through the fitted vectorizer so each movie can be
# used as a query document in the similarity cells below.
movie_tfidfs = tfidf_vectorizer.transform(corpus)

### Similarity Analysis Using Cosine Similarity

In [8]:
# For every movie, print the 3 corpus documents with the highest TF-IDF
# dot-product score against it.
print('Similarity analysis using Cosine similarity')
print('='*60)
for index, synopsis in enumerate(movie_data["Synopsis"]):

    # Row `index` of the transformed corpus is the query document.
    tfidf = movie_tfidfs[index]
    top_similar_titles = compute_cosine_similarity(tfidf,
                                               tfidf_features,
                                               top_n=3)

    # Show the first 100 characters of the raw synopsis for context.
    print('Synopsis',index+1 ,':', synopsis[:100],"...")
    # Look the title up by matching the synopsis text in the frame.
    original_title = getMovieTitle(movie_data,synopsis)
    print('Original title: ', original_title,'\n')
    print('Top', len(top_similar_titles), 'similar titles:')
    print('-'*40)
    for movie_index, sim_score in top_similar_titles:
        sim_title = movie_data.iloc[movie_index]["Title"]
        # Movie numbers are printed 1-based to match the 'Synopsis N' labels.
        print('Movie num: {} Similarity Score: {}\nTitle: {}'.format(movie_index+1,
                                                                 sim_score,
                                                                 sim_title))
        print('-'*40)    
    # Blank lines separate consecutive movies in the output.
    print()
    print()
    print()

Similarity analysis using Cosine similarity
Synopsis 1 : In late summer 1945, guests are gathered for the wedding reception of Don Vito Corleone's daughter C ...
Original title:  The Godfather 

Top 3 similar titles:
----------------------------------------
Movie num: 1 Similarity Score: 1.0
Title: The Godfather
----------------------------------------
Movie num: 12 Similarity Score: 0.577
Title: The Godfather: Part II
----------------------------------------
Movie num: 76 Similarity Score: 0.288
Title: Tootsie
----------------------------------------



Synopsis 2 : In 1947, Andy Dufresne (Tim Robbins), a banker in Maine, is convicted of murdering his wife and her  ...
Original title:  The Shawshank Redemption 

Top 3 similar titles:
----------------------------------------
Movie num: 2 Similarity Score: 1.0
Title: The Shawshank Redemption
----------------------------------------
Movie num: 36 Similarity Score: 0.081
Title: From Here to Eternity
---------------------------------------

Synopsis 45 : At the end of World War II, Fred Derry (Dana Andrews), Al Stevenson (Fredric March) and Homer Parris ...
Original title:  The Best Years of Our Lives 

Top 3 similar titles:
----------------------------------------
Movie num: 45 Similarity Score: 1.0
Title: The Best Years of Our Lives
----------------------------------------
Movie num: 53 Similarity Score: 0.066
Title: Butch Cassidy and the Sundance Kid
----------------------------------------
Movie num: 18 Similarity Score: 0.053
Title: The Sound of Music
----------------------------------------



Synopsis 46 : Henry Higgins (Rex Harrison), an arrogant, irascible professor of phonetics, boasts to a new acquain ...
Original title:  My Fair Lady 

Top 3 similar titles:
----------------------------------------
Movie num: 46 Similarity Score: 1.0
Title: My Fair Lady
----------------------------------------
Movie num: 63 Similarity Score: 0.04
Title: All Quiet on the Western Front
----------------------------------------
Mov

Synopsis 93 : Travis Bickle (Robert De Niro) goes to a New York City taxi depot where he applies for a job as a dr ...
Original title:  Taxi Driver 

Top 3 similar titles:
----------------------------------------
Movie num: 93 Similarity Score: 1.0
Title: Taxi Driver
----------------------------------------
Movie num: 70 Similarity Score: 0.033
Title: Mr. Smith Goes to Washington
----------------------------------------
Movie num: 63 Similarity Score: 0.026
Title: All Quiet on the Western Front
----------------------------------------



Synopsis 94 : A servant in the house of Wuthering Heights tells a traveler the unfortunate tale of lovers Cathy an ...
Original title:  Wuthering Heights 

Top 3 similar titles:
----------------------------------------
Movie num: 94 Similarity Score: 1.0
Title: Wuthering Heights
----------------------------------------
Movie num: 26 Similarity Score: 0.032
Title: Singin' in the Rain
----------------------------------------
Movie num: 45 Similarity Scor

### Similarity Analysis Using Hellinger-Bhattacharya Distance

In [9]:
def compute_hellinger_bhattacharya_distance(doc_features, corpus_features,
                                            top_n=3):
    """Rank corpus documents by Hellinger-Bhattacharya distance to a query document.

    For each corpus row `c` and the query row `d` the distance is
    `sqrt(0.5 * sum((sqrt(d) - sqrt(c)) ** 2))`.
    The formula takes element-wise square roots, so negative feature values
    produce NaN.

    Parameters
    ----------
    doc_features : 2-D sparse or dense matrix
        Query features; only the first row is used.
    corpus_features : 2-D sparse or dense matrix
        One row per corpus document, same number of columns as the query.
    top_n : int
        Number of closest documents to return.

    Returns
    -------
    list of (index, distance)
        Corpus row indices with the smallest distances, closest first,
        distances rounded to 3 decimals. Empty for an empty corpus.
    """
    def _to_dense(matrix):
        # Accept scipy sparse matrices as well as array-likes.
        return matrix.toarray() if hasattr(matrix, 'toarray') else np.asarray(matrix)

    # Get document vectors
    doc_vector = _to_dense(doc_features)[0]
    corpus_matrix = _to_dense(corpus_features)
    # Compute HB distances: one value per corpus row (the row-wise sum is
    # already 1-D, so no stacking is needed).
    root_difference = np.sqrt(doc_vector) - np.sqrt(corpus_matrix)
    distance = np.sqrt(0.5 * np.sum(np.square(root_difference), axis=1))
    # Get docs with lowest distance scores
    top_docs = distance.argsort()[:top_n]
    top_docs_with_score = [(index, round(distance[index], 3))
                            for index in top_docs]
    return top_docs_with_score

# For every movie, print the 3 corpus documents with the smallest
# Hellinger-Bhattacharya distance to it (lower means closer).
print('Similarity Analysis using Hellinger-Bhattacharya distance')
print('='*60)
for index, synopsis in enumerate(movie_data["Synopsis"]):
    
    # Row `index` of the transformed corpus is the query document.
    tfidf = movie_tfidfs[index]
    top_similar_titles = compute_hellinger_bhattacharya_distance(tfidf,
                                             tfidf_features,
                                             top_n=3)
    # Show the first 100 characters of the raw synopsis for context.
    print('Synopsis',index+1 ,':', synopsis[:100],"...")
    # Look the title up by matching the synopsis text in the frame.
    original_title = getMovieTitle(movie_data,synopsis)
    print('Original title: ', original_title,'\n')
    print('Top', len(top_similar_titles), 'similar titles:')
    print('-'*40)
    for movie_index, sim_score in top_similar_titles:
        sim_title = movie_data.iloc[movie_index]["Title"]
        # `sim_score` holds a distance here, despite the variable name.
        print('Movie num: {} Distance Score: {}\nTitle: {}'.format(movie_index+1,
                                                                 sim_score,
                                                                 sim_title))
        print('-'*40)
    print()                 


Similarity Analysis using Hellinger-Bhattacharya distance
Synopsis 1 : In late summer 1945, guests are gathered for the wedding reception of Don Vito Corleone's daughter C ...
Original title:  The Godfather 

Top 3 similar titles:
----------------------------------------
Movie num: 1 Distance Score: 0.0
Title: The Godfather
----------------------------------------
Movie num: 12 Distance Score: 2.815
Title: The Godfather: Part II
----------------------------------------
Movie num: 40 Distance Score: 3.029
Title: Rocky
----------------------------------------

Synopsis 2 : In 1947, Andy Dufresne (Tim Robbins), a banker in Maine, is convicted of murdering his wife and her  ...
Original title:  The Shawshank Redemption 

Top 3 similar titles:
----------------------------------------
Movie num: 2 Distance Score: 0.0
Title: The Shawshank Redemption
----------------------------------------
Movie num: 40 Distance Score: 2.661
Title: Rocky
----------------------------------------
Movie num: 91 

Synopsis 21 : In a forested area overlooking a sprawling suburban neighborhood, an alien spacecraft has landed. Th ...
Original title:  E.T. the Extra-Terrestrial 

Top 3 similar titles:
----------------------------------------
Movie num: 21 Distance Score: 0.0
Title: E.T. the Extra-Terrestrial
----------------------------------------
Movie num: 40 Distance Score: 2.424
Title: Rocky
----------------------------------------
Movie num: 91 Distance Score: 2.464
Title: The Maltese Falcon
----------------------------------------

Synopsis 22 : To Richard Strauss' tone poem "Thus Spake Zarathustra," the title sequence shows the sun rising behi ...
Original title:  2001: A Space Odyssey 

Top 3 similar titles:
----------------------------------------
Movie num: 22 Distance Score: 0.0
Title: 2001: A Space Odyssey
----------------------------------------
Movie num: 40 Distance Score: 3.007
Title: Rocky
----------------------------------------
Movie num: 91 Distance Score: 3.038
Title: The Malte

Synopsis 37 : An American flag back-lighted by the afternoon sun gently flaps in the breeze. The camera pulls back ...
Original title:  Saving Private Ryan 

Top 3 similar titles:
----------------------------------------
Movie num: 37 Distance Score: 0.0
Title: Saving Private Ryan
----------------------------------------
Movie num: 40 Distance Score: 2.758
Title: Rocky
----------------------------------------
Movie num: 98 Distance Score: 2.809
Title: The Third Man
----------------------------------------

Synopsis 38 : William Munny (Clint Eastwood) is a widower with two young children. He was once a very vicious gunf ...
Original title:  Unforgiven 

Top 3 similar titles:
----------------------------------------
Movie num: 38 Distance Score: 0.0
Title: Unforgiven
----------------------------------------
Movie num: 40 Distance Score: 2.62
Title: Rocky
----------------------------------------
Movie num: 91 Distance Score: 2.644
Title: The Maltese Falcon
--------------------------------

Synopsis 54 : Fred Dobbs (Humphrey Bogart) and Bob Curtin (Tim Holt) are down on their luck in Mexico and are both ...
Original title:  The Treasure of the Sierra Madre 

Top 3 similar titles:
----------------------------------------
Movie num: 54 Distance Score: 0.0
Title: The Treasure of the Sierra Madre
----------------------------------------
Movie num: 91 Distance Score: 2.553
Title: The Maltese Falcon
----------------------------------------
Movie num: 40 Distance Score: 2.562
Title: Rocky
----------------------------------------

Synopsis 55 : C. C. Baxter (Jack Lemmon) is a lonely office drone for an insurance company in New York City. Four  ...
Original title:  The Apartment 

Top 3 similar titles:
----------------------------------------
Movie num: 55 Distance Score: 0.0
Title: The Apartment
----------------------------------------
Movie num: 40 Distance Score: 2.436
Title: Rocky
----------------------------------------
Movie num: 91 Distance Score: 2.479
Title: The Maltese F

Synopsis 72 : Annie Hall is a film about a comedian, Alvy Singer (Woody Allen), who falls in love with Annie Hall  ...
Original title:  Annie Hall 

Top 3 similar titles:
----------------------------------------
Movie num: 72 Distance Score: 0.0
Title: Annie Hall
----------------------------------------
Movie num: 40 Distance Score: 2.517
Title: Rocky
----------------------------------------
Movie num: 44 Distance Score: 2.524
Title: An American in Paris
----------------------------------------

Synopsis 73 : [Out Of Africa]A well-heeled Danish lady goes to an English colony in Africa and buys 1000 acres. He ...
Original title:  Out of Africa 

Top 3 similar titles:
----------------------------------------
Movie num: 73 Distance Score: 0.0
Title: Out of Africa
----------------------------------------
Movie num: 40 Distance Score: 2.885
Title: Rocky
----------------------------------------
Movie num: 91 Distance Score: 2.895
Title: The Maltese Falcon
------------------------------------

Top 3 similar titles:
----------------------------------------
Movie num: 89 Distance Score: 0.0
Title: Stagecoach
----------------------------------------
Movie num: 40 Distance Score: 2.735
Title: Rocky
----------------------------------------
Movie num: 91 Distance Score: 2.736
Title: The Maltese Falcon
----------------------------------------

Synopsis 90 : In 1787, British ship Bounty leaves Portsmouth to bring a cargo of bread-fruit from Tahiti but the s ...
Original title:  Mutiny on the Bounty 

Top 3 similar titles:
----------------------------------------
Movie num: 90 Distance Score: 0.0
Title: Mutiny on the Bounty
----------------------------------------
Movie num: 40 Distance Score: 2.619
Title: Rocky
----------------------------------------
Movie num: 91 Distance Score: 2.633
Title: The Maltese Falcon
----------------------------------------

Synopsis 91 : Private eye Sam Spade and his partner Miles Archer are approached by Miss Wonderly to follow a man,  ...
Original tit

### Similarity Analysis Using BM25

In [10]:
import scipy.sparse as sp 

def compute_corpus_term_idfs(corpus_features, norm_corpus):
    """Compute a smoothed inverse document frequency for every feature column.

    The document frequency of a column is taken as its number of stored
    entries in CSC form. Both it and the document count are incremented by
    one before computing `1 + log(total_docs / df)`.

    Parameters
    ----------
    corpus_features : matrix
        Document-term matrix, one column per term.
    norm_corpus : sized collection
        The corpus; only its length is used.

    Returns
    -------
    numpy.ndarray
        One idf value per column of `corpus_features`.
    """
    # In CSC layout, consecutive indptr differences are per-column entry counts.
    column_major = sp.csc_matrix(corpus_features, copy=True)
    smoothed_dfs = np.diff(column_major.indptr) + 1
    smoothed_doc_count = len(norm_corpus) + 1
    return np.log(float(smoothed_doc_count) / smoothed_dfs) + 1.0


def compute_bm25_similarity(doc_features, corpus_features,
                            corpus_doc_lengths, avg_doc_length,
                            term_idfs, k1=1.5, b=0.75, top_n=3):
    """Rank corpus documents against a query document with the BM25 score.

    For each corpus document the score is the sum over query terms of
    `idf * tf * (k1 + 1) / (tf + k1 * (1 - b + b * dl / avgdl))`, where `tf`
    is the term's value in the corpus document and `dl` its length.

    Parameters
    ----------
    doc_features : 2-D sparse or dense matrix
        Query features; only the first row is used. It is reduced to a
        presence indicator: every value >= 1 becomes 1.
    corpus_features : 2-D sparse or dense matrix
        Term features, one row per corpus document.
    corpus_doc_lengths : sequence of numbers
        Length of each corpus document, in row order. Plain lists are accepted.
    avg_doc_length : float
        Average document length of the corpus.
    term_idfs : array-like
        One idf value per feature column.
    k1, b : float
        BM25 term-saturation and length-normalization parameters.
    top_n : int
        Number of best matches to return.

    Returns
    -------
    list of (index, score)
        Corpus row indices with the highest scores, best first, scores
        rounded to 3 decimals.
    """
    def _to_dense(matrix):
        # Accept scipy sparse matrices as well as array-likes.
        return matrix.toarray() if hasattr(matrix, 'toarray') else np.asarray(matrix)

    # Get corpus bag of words features
    corpus_tf = _to_dense(corpus_features)
    # Mark which terms occur in the query: values >= 1 are clipped to 1.
    # np.minimum returns a new array, so the caller's data is never mutated.
    query_presence = np.minimum(_to_dense(doc_features)[0], 1)

    # Compute the document idf scores for present terms
    doc_idfs = query_presence * term_idfs
    # compute numerator expression in BM25 equation
    numerator = np.multiply(doc_idfs, corpus_tf * (k1 + 1))
    # Compute denominator expression in BM25 equation. Lengths are coerced to
    # an array so a plain list divided by a Python float does not raise.
    length_ratio = np.asarray(corpus_doc_lengths, dtype=float) / avg_doc_length
    denominator_coeff = k1 * (1 - b + (b * length_ratio))
    # Column vector: one length-normalization term per corpus document.
    denominator = corpus_tf + denominator_coeff.reshape(-1, 1)
    # Compute the BM25 score combining the above equations
    bm25_scores = np.sum(np.divide(numerator, denominator), axis=1)
    # Get top n relevant docs with highest BM25 score
    top_docs = bm25_scores.argsort()[::-1][:top_n]
    top_docs_with_score = [(index, round(bm25_scores[index], 3))
                            for index in top_docs]
    return top_docs_with_score

# Build bag of words based features first
# Build raw term-frequency (bag of words) features for the corpus first
vectorizer, corpus_features = build_feature_matrix(corpus,
                                                   feature_type='frequency')
# Reuse the fitted vectorizer so query documents share the corpus vocabulary;
# here every corpus document is also used as a query.
query_docs_features = vectorizer.transform(corpus)

# Document lengths are whitespace-token counts of the normalized text, and
# avg_dl is the corpus average document length (avgdl).
# NOTE(review): these counts may differ slightly from the vectorizer's own
# tokenization - confirm if exact BM25 length normalization matters.
doc_lengths = [len(doc.split()) for doc in corpus]   
avg_dl = np.average(doc_lengths) 

# Get the smoothed idf value of every vocabulary term
corpus_term_idfs = compute_corpus_term_idfs(corpus_features,
                                            corpus)

# Analyze document similarity using BM25 framework    
# For every movie, print the 3 corpus documents with the highest BM25 score
# when that movie's own term features are used as the query.
print('Similarity Analysis using BM25')
print('='*60)
for index, synopsis in enumerate(movie_data["Synopsis"]):
    
    # Row `index` of the term-frequency matrix is the query document.
    doc_features = query_docs_features[index]
    top_similar_titles = compute_bm25_similarity(doc_features,
                                               corpus_features,
                                               doc_lengths,
                                               avg_dl,
                                               corpus_term_idfs,
                                               k1=1.5, b=0.75,
                                               top_n=3)
    # Show the first 100 characters of the raw synopsis for context.
    print('Synopsis',index+1 ,':', synopsis[:100],"...")
    # Look the title up by matching the synopsis text in the frame.
    original_title = getMovieTitle(movie_data,synopsis)
    print('Original title: ', original_title,'\n')
    print('Top', len(top_similar_titles), 'similar titles:')
    print('-'*40)
    for movie_index, sim_score in top_similar_titles:
        sim_title = movie_data.iloc[movie_index]["Title"]
        # Movie numbers are printed 1-based to match the 'Synopsis N' labels.
        print('Movie num: {} BM25 Score: {}\nTitle: {}'.format(movie_index+1,
                                                                 sim_score,
                                                                 sim_title)) 
        print('-'*40)
    print()
    


Similarity Analysis using BM25
Synopsis 1 : In late summer 1945, guests are gathered for the wedding reception of Don Vito Corleone's daughter C ...
Original title:  The Godfather 

Top 3 similar titles:
----------------------------------------
Movie num: 1 BM25 Score: 2445.764
Title: The Godfather
----------------------------------------
Movie num: 12 BM25 Score: 792.182
Title: The Godfather: Part II
----------------------------------------
Movie num: 77 BM25 Score: 523.034
Title: Fargo
----------------------------------------

Synopsis 2 : In 1947, Andy Dufresne (Tim Robbins), a banker in Maine, is convicted of murdering his wife and her  ...
Original title:  The Shawshank Redemption 

Top 3 similar titles:
----------------------------------------
Movie num: 2 BM25 Score: 2008.344
Title: The Shawshank Redemption
----------------------------------------
Movie num: 77 BM25 Score: 450.535
Title: Fargo
----------------------------------------
Movie num: 16 BM25 Score: 444.955
Title: On t

Synopsis 23 : Promising FBI Academy student Clarice Starling is pulled from her training at the FBI Training Facil ...
Original title:  The Silence of the Lambs 

Top 3 similar titles:
----------------------------------------
Movie num: 23 BM25 Score: 1421.007
Title: The Silence of the Lambs
----------------------------------------
Movie num: 99 BM25 Score: 247.234
Title: North by Northwest
----------------------------------------
Movie num: 20 BM25 Score: 213.077
Title: Star Wars
----------------------------------------

Synopsis 24 : Set in 1937 Los Angeles, a private investigator named Jake "J.J." Gittes (Nicholson) is hired to spy ...
Original title:  Chinatown 

Top 3 similar titles:
----------------------------------------
Movie num: 24 BM25 Score: 1749.957
Title: Chinatown
----------------------------------------
Movie num: 99 BM25 Score: 417.372
Title: North by Northwest
----------------------------------------
Movie num: 77 BM25 Score: 410.513
Title: Fargo
--------------------

Synopsis 39 : In the spring of 1936 an exploration party penetrates thick jungle on the South American continent.  ...
Original title:  Raiders of the Lost Ark 

Top 3 similar titles:
----------------------------------------
Movie num: 39 BM25 Score: 2004.933
Title: Raiders of the Lost Ark
----------------------------------------
Movie num: 20 BM25 Score: 418.306
Title: Star Wars
----------------------------------------
Movie num: 37 BM25 Score: 374.287
Title: Saving Private Ryan
----------------------------------------

Synopsis 40 : Philadelphia Pennsylvania, home to the number one underdog fighter, Rocky Balboa (Sylvester Stalone) ...
Original title:  Rocky 

Top 3 similar titles:
----------------------------------------
Movie num: 40 BM25 Score: 1860.821
Title: Rocky
----------------------------------------
Movie num: 77 BM25 Score: 541.731
Title: Fargo
----------------------------------------
Movie num: 16 BM25 Score: 489.157
Title: On the Waterfront
------------------------------

Synopsis 60 : The film opens with three men driving in their car late at night on a highway. In the car are Henry  ...
Original title:  Goodfellas 

Top 3 similar titles:
----------------------------------------
Movie num: 60 BM25 Score: 1835.518
Title: Goodfellas
----------------------------------------
Movie num: 77 BM25 Score: 343.347
Title: Fargo
----------------------------------------
Movie num: 12 BM25 Score: 332.819
Title: The Godfather: Part II
----------------------------------------

Synopsis 61 : Father Lancaster Merrin (Max Von Sydow) is an elderly Catholic priest on an archeological dig in Ira ...
Original title:  The Exorcist 

Top 3 similar titles:
----------------------------------------
Movie num: 61 BM25 Score: 1956.002
Title: The Exorcist
----------------------------------------
Movie num: 99 BM25 Score: 380.046
Title: North by Northwest
----------------------------------------
Movie num: 92 BM25 Score: 377.965
Title: A Clockwork Orange
-----------------------------

Synopsis 81 : The movie begins with an old man named Paul Edgecomb (Dabbs Greer) in a retirement center. He takes  ...
Original title:  The Green Mile 

Top 3 similar titles:
----------------------------------------
Movie num: 81 BM25 Score: 1777.464
Title: The Green Mile
----------------------------------------
Movie num: 98 BM25 Score: 388.364
Title: The Third Man
----------------------------------------
Movie num: 2 BM25 Score: 385.653
Title: The Shawshank Redemption
----------------------------------------

Synopsis 82 : In what appers to be the Sonoran Desert; in or near Mexico, a cartographer named David Laughlin (Bob ...
Original title:  Close Encounters of the Third Kind 

Top 3 similar titles:
----------------------------------------
Movie num: 82 BM25 Score: 2109.857
Title: Close Encounters of the Third Kind
----------------------------------------
Movie num: 86 BM25 Score: 458.036
Title: American Graffiti
----------------------------------------
Movie num: 99 BM25 Score: 455