In [1]:
from fastNLP.embeddings import StaticEmbedding
from fastNLP import Vocabulary, BucketSampler, DataSetIter, SequentialSampler
from fastNLP import DataSet
from fastNLP.io.data_bundle import DataBundle
from nltk.tokenize import word_tokenize
from nltk.translate.bleu_score import sentence_bleu
from tqdm import tqdm

In [2]:
def load_data(path, keys, encoding=None, errors=None):
    """Parse a ``+++$+++``-delimited metadata file into a dict of records.

    Every non-blank line is split on the literal separator ``+++$+++`` and each
    resulting field is stripped of surrounding whitespace. The first field is
    used as the record's key; the remaining fields are paired positionally
    with ``keys``.

    Args:
        path: Path of the text file to read.
        keys: Field names for the columns that follow the first column.
        encoding: Optional text encoding passed to ``open``. ``None`` (the
            default) keeps the platform default encoding.
        errors: Optional decoding error policy passed to ``open``
            (e.g. ``'ignore'``). ``None`` keeps the default strict behaviour.

    Returns:
        dict: Maps the first field of each line to a ``dict`` built from
        ``zip(keys, remaining_fields)``. Because ``zip`` truncates, surplus
        fields or surplus keys are silently dropped. If the same first field
        occurs on several lines, the last occurrence wins.
    """
    metadata = {}
    with open(path, 'r', encoding=encoding, errors=errors) as f:
        for line in f:
            # Blank lines (e.g. a trailing newline at end of file) would
            # otherwise register a bogus '' key mapped to an empty record.
            if not line.strip():
                continue
            fields = [x.strip() for x in line.split('+++$+++')]
            metadata[fields[0]] = dict(zip(keys, fields[1:]))
    return metadata

In [3]:
# Load the three corpus metadata tables. Each call yields a dict keyed by the
# first column of its file; the `keys` list names the columns that follow.
movie_titles = load_data(
    path='./data/cornell movie-dialogs corpus/movie_titles_metadata.txt',
    keys=['movie title', 'movie year', 'IMDB rating', 'no. IMDB votes', 'genres'],
)
movie_characters = load_data(
    path='./data/cornell movie-dialogs corpus/movie_characters_metadata.txt',
    keys=['character name', 'movieID', 'movie title', 'gender', 'position in credits'],
)
movie_lines = load_data(
    path='./data/cornell movie-dialogs corpus/movie_lines.txt',
    keys=['characterID', 'movieID', 'character name', 'utterance'],
)

In [4]:
# Show the record stored under character ID 'u5100' (per the recorded output:
# LUKE in 'star wars: the empire strikes back').
movie_characters['u5100']

{'character name': 'LUKE',
 'movieID': 'm337',
 'movie title': 'star wars: the empire strikes back',
 'gender': 'm',
 'position in credits': '?'}

In [5]:
# Show the record stored under character ID 'u7249' (per the recorded output:
# LUKE in 'star wars: episode vi - return of the jedi').
movie_characters['u7249']

{'character name': 'LUKE',
 'movieID': 'm489',
 'movie title': 'star wars: episode vi - return of the jedi',
 'gender': 'm',
 'position in credits': '1'}

In [6]:
# Show the record stored under character ID 'u7824' (per the recorded output:
# LUKE in 'star wars').
movie_characters['u7824']

{'character name': 'LUKE',
 'movieID': 'm529',
 'movie title': 'star wars',
 'gender': 'm',
 'position in credits': '1'}

In [7]:
# Character IDs inspected in the cells above; all three resolve to LUKE,
# one ID per film.
luke_ids = ('u5100', 'u7249', 'u7824')

# Collect (and echo) every utterance spoken under one of those IDs,
# in the order the lines were loaded.
utt = []
for record in movie_lines.values():
    if record['characterID'] in luke_ids:
        spoken = record['utterance']
        print(spoken)
        utt.append(spoken)

You won't.
And that is why I must go.
I feel The Force.
I can help them!
But Han and Leia will surely die.
Ben ... Ben ...
I'm cold ... so cold ...
I can't ...
How could you know my father? You don't even know who I am.  Can't we get on with this already?
Because of my father, I guess.
Will it take long to get there? How far away is he?
Look, it smells good.  I'm sure it's delicious   But I don't know why we can't see Yoda now.
... I told you, I'm not hungry.
You know the Jedi Master?
I'm not looking for a friend. I'm looking for a Jedi Master.
Okay, Artoo, let him have it. Now get out of here, little fellow, we've got things to do.
I don't want your help.  I want my lamp back.  I'll need it in this slimy mudhole.
Give me that!
Listen, friend, we didn't mean to land here, and if I could get my fighter out of this puddle I would, but I can't.  So ...
I'm looking for a great warrior.
Yeah ...
I'm looking for someone.
... Like we're being watched.
I hope you make your peace with Jabba.
Ab

In [9]:
# Dump the collected utterances to Luke.txt, one per line.
# The context manager guarantees the file is flushed and closed even if a
# write raises part-way through (the manual open/close pair did not).
with open("Luke.txt", "w") as fout:
    for utterance in utt:
        fout.write(utterance + "\n")

In [10]:
# Replace each raw utterance string with its NLTK token list.
# The isinstance guard keeps this cell safe to re-run: entries that were
# already tokenized (lists) are passed through instead of being fed back
# into word_tokenize, which expects a string.
utt = [word_tokenize(x) if isinstance(x, str) else x for x in utt]

In [64]:
# Query sentence to compare against the collected utterances; `sen` holds
# its token list, matching the tokenized form of the utterances.
query_text = "He is in the cabinet ."
sen = word_tokenize(query_text)
print(sen)

['He', 'is', 'in', 'the', 'cabinet', '.']


In [65]:
# Score the query tokens against every utterance, using each utterance as the
# single BLEU reference. The weights split evenly between the first two
# n-gram orders and give zero weight to the third and fourth.
ngram_weights = (0.5, 0.5, 0, 0)
score = []
for reference in utt:
    bleu = sentence_bleu([reference], sen, weights=ngram_weights)
    score.append((bleu, reference))

# Ascending sort on (score, tokens); the last ten entries are the best matches.
print(sorted(score)[-10:])

[(6.17087516772532e-155, ['I', 'am', 'the', 'son', 'of', 'Annikin', 'Starkiller', '.']), (6.17087516772532e-155, ['I', 'thought', 'I', 'was', 'in', 'good', 'shape', '.']), (7.2900276358863456e-155, ['He', 'ca', "n't", 'find', 'Organa', 'Major', '.']), (7.2900276358863456e-155, ['There', "'s", 'still', 'good', 'in', 'him', '.']), (8.612150057732663e-155, ['He', "'ll", 'survive', '.']), (8.612150057732663e-155, ['He', "'s", 'my', 'father', '.']), (0.004150087420293821, ['Tell', 'me', 'where', 'you', "'ve", 'been', '.', 'I', 'am', 'going', 'to', 'become', 'a', 'startrooper', 'and', 'fight', 'in', 'the', 'wars', '.', 'My', 'father', 'was', 'a', 'Jedi', '!', 'Were', 'you', 'in', 'many', 'battles', '?']), (0.08040396667843576, ['You', "'ve", 'been', 'in', 'the', 'Counter', 'Wars', '?', '...', 'Against', 'the', 'Empire', '?']), (0.18500758885923071, ['You', 'do', "n't", 'believe', 'in', 'the', 'Force', '?']), (0.2265870956238665, ['I', 'see', 'a', 'city', 'in', 'the', 'clouds', '.'])]
