In [1]:
import nltk
from nltk import word_tokenize

from collections import Counter

In [2]:
import pandas as pd
data = pd.read_csv("D:/datasets/movies.csv")
data.head()

Unnamed: 0,rank,title,genre,wiki_plot,imdb_plot
0,0,The Godfather,"[u' Crime', u' Drama']","On the day of his only daughter's wedding, Vit...","In late summer 1945, guests are gathered for t..."
1,1,The Shawshank Redemption,"[u' Crime', u' Drama']","In 1947, banker Andy Dufresne is convicted of ...","In 1947, Andy Dufresne (Tim Robbins), a banker..."
2,2,Schindler's List,"[u' Biography', u' Drama', u' History']","In 1939, the Germans move Polish Jews into the...",The relocation of Polish Jews from surrounding...
3,3,Raging Bull,"[u' Biography', u' Drama', u' Sport']","In a brief scene in 1964, an aging, overweight...","The film opens in 1964, where an older and fat..."
4,4,Casablanca,"[u' Drama', u' Romance', u' War']",It is early December 1941. American expatriate...,"In the early years of World War II, December 1..."


In [5]:
# count the number of occurrences of each token in each text
text_lowercase = data['imdb_plot'].astype(str).str.lower()
text_tokenized = text_lowercase.apply(word_tokenize)
token_counters = text_tokenized.apply(Counter).values.tolist()

# show the tokens found in the first article with at least 10 occurrences
print({token: n_occ for token, n_occ in token_counters[0].items() if n_occ >= 10}) 

{'in': 59, ',': 180, 'for': 34, 'the': 176, 'of': 40, 'don': 40, 'vito': 13, 'corleone': 23, "'s": 63, 'connie': 19, '(': 44, ')': 44, 'and': 96, 'carlo': 24, '.': 107, 'family': 26, 'is': 59, 'to': 98, 'as': 25, '``': 19, "''": 15, 'he': 58, 'hagen': 13, 'a': 69, 'on': 13, 'his': 59, 'who': 17, 'by': 26, 'with': 33, 'business': 10, 'from': 16, 'michael': 53, 'at': 12, 'about': 12, 'father': 10, 'that': 40, 'sonny': 28, 'but': 11, 'not': 10, 'when': 10, 'him': 23, 'has': 10, 'will': 11, 'an': 11, 'sollozzo': 15, 'meeting': 11, 'clemenza': 11}


In [6]:
#Tokenize the query
query = "Police work"
query_tokenized = word_tokenize(query)

In [7]:
# Compute a matching score for each text with respect to the query. The score is
# the number of times each token in the query can be found in a specific text.

def get_scores(query_tokenized, token_counters):
    scores=[]
    for token_counter in token_counters:
        matches=[token_counter[query_token] for query_token in query_tokenized]
        total_scores = sum (matches)
        scores.append(total_scores)
    return scores

scores = get_scores(query_tokenized, token_counters)

In [8]:
# retrieve the top_n articles with the highest scores and show them
import numpy as np
def show_best_results (data, scores, top_n=10):
    best_indexes = np.argsort(scores)[::-1]
    for position, idx in enumerate(best_indexes[:top_n]):
        row = data.iloc[idx]
        title = row['title']
        score = scores[idx]
        print(f"{position + 1} [score = {score}]: {title}")
        
show_best_results(data, scores)

1 [score = 11]: On the Waterfront
2 [score = 5]: Sunset Blvd.
3 [score = 5]: The Shawshank Redemption
4 [score = 5]: The Bridge on the River Kwai
5 [score = 4]: North by Northwest
6 [score = 3]: Some Like It Hot
7 [score = 3]: The Grapes of Wrath
8 [score = 2]: Jaws
9 [score = 2]: Amadeus
10 [score = 2]: The Pianist


In [9]:
#DIFFERENT query
query = "war"
query_tokenized = word_tokenize(query)
scores = get_scores(query_tokenized, token_counters)
show_best_results(data, scores)

1 [score = 15]: Gone with the Wind
2 [score = 7]: Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb
3 [score = 6]: Braveheart
4 [score = 5]: The Pianist
5 [score = 5]: Giant
6 [score = 5]: The Godfather
7 [score = 4]: The Best Years of Our Lives
8 [score = 4]: Saving Private Ryan
9 [score = 4]: A Streetcar Named Desire
10 [score = 4]: Platoon


# Removing Stopwords

In [10]:
from nltk.corpus import stopwords
import string

english_stopwords = stopwords.words('english')

In [11]:
print(string.punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [13]:
#count the number of occurences of each token in each text

text_lower = data['imdb_plot'].astype(str).str.lower()
text_tokenized = text_lower.apply(word_tokenize)
text_tokenized_no_swrds = text_tokenized.apply(lambda token_list: [token for token in token_list
                                                                  if token not in english_stopwords 
                                                                   and token not in string.punctuation])
token_counters = text_tokenized_no_swrds.apply(Counter).values.tolist()
# show the tokens found in the first article with at least 6 occurrences
print({token: n_occ for token, n_occ in token_counters[0].items() if n_occ >= 6})

{'wedding': 6, 'vito': 13, 'corleone': 23, "'s": 63, 'connie': 19, 'carlo': 24, 'family': 26, '``': 19, "''": 15, 'tom': 8, 'hagen': 13, 'one': 9, 'men': 9, 'business': 10, 'michael': 53, 'tells': 8, 'kay': 8, 'father': 10, 'fredo': 7, 'sonny': 28, 'woltz': 6, 'brasi': 6, 'sollozzo': 15, 'tattaglia': 9, 'meeting': 11, 'later': 7, 'assassination': 6, 'paulie': 7, 'clemenza': 11, 'corleones': 7, 'rocco': 6, 'car': 6, 'killed': 9, 'away': 6, 'mccluskey': 8, 'barzini': 9, 'tessio': 8}


In [32]:
query = 'struck gold'
query_tokens = word_tokenize(query)
query_tokens_no_sw = [token for token in query_tokens
                      if token not in english_stopwords and
                      token not in string.punctuation]
print(f"Tokenized query without stopwords: {query_tokens_no_sw}")
print()

# show best results
scores = get_scores(query_tokens, token_counters)
show_best_results(data, scores)

Tokenized query without stopwords: ['struck', 'gold']

1 [score = 12]: The Treasure of the Sierra Madre
2 [score = 3]: Star Wars
3 [score = 3]: Pulp Fiction
4 [score = 2]: The Good, the Bad and the Ugly
5 [score = 1]: Good Will Hunting
6 [score = 1]: Gone with the Wind
7 [score = 1]: Gandhi
8 [score = 1]: Saving Private Ryan
9 [score = 1]: Titanic
10 [score = 1]: The Godfather: Part II
