In [75]:
import json
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.translate.bleu_score import sentence_bleu
from progress.bar import Bar
import re
import argparse

In [76]:
# Relative path to the JSON file that the next cell opens and parses with json.load.
path_to_data = './noun-modifications/noun-modifications-test-5-new-lines.json'

In [77]:
# Parse the whole dataset into memory. JSON interchange text is UTF-8, so the
# encoding is passed explicitly instead of relying on the platform's locale default.
with open(path_to_data, 'r', encoding='utf-8') as json_in:
    content = json.load(json_in)

In [78]:
def remove_html_tags(text):
    """Return `text` with every `<...>` span deleted.

    The match is non-greedy and `.` does not cross newlines, so a tag that is
    split over several lines is left untouched.
    """
    tag_pattern = re.compile('<.*?>')
    return tag_pattern.sub('', text)

In [79]:
def sentence_splitter(document):
    """
      Sentence splitter to deal with bullet items in texts.

      Each item of `document` is sentence-tokenized on its own (items are not
      joined first, which keeps e.g. markdown headers separated) and the results
      are flattened into one list. When the tokenizer splits a list enumerator
      such as "12." off as its own sentence, it is glued back onto the sentence
      that follows it.

      :param document: iterable of strings (e.g. the lines of a paragraph).
      :return: list of sentence strings; empty if `document` yields no sentences.
    """
    # A "sentence" consisting only of digits and a trailing period is a bullet number.
    enumerator_pattern = re.compile(r"^[0-9]+\.$")

    # Lazy, flattened stream of sentences across all items of the document.
    sentences = (sentence
                 for sent_item in document
                 for sentence in sent_tokenize(sent_item))

    merged_item_sents = []
    for sentence in sentences:
        if enumerator_pattern.match(sentence):
            # Consume the following sentence from the same stream so it is not
            # emitted a second time on the next loop iteration.
            following = next(sentences, None)
            if following is None:
                # Enumerator was the very last sentence: keep it as-is rather
                # than failing with StopIteration.
                merged_item_sents.append(sentence)
            else:
                merged_item_sents.append(f"{sentence} {following}")
        else:
            merged_item_sents.append(sentence)
    return merged_item_sents

In [80]:
# Show the raw Source_Context entries of one example, one per line with its position.
for index, sent in enumerate(content[1272]['Source_Context']):
    print(index, '\t', sent)

0 	 3. Open the upper portion of the slicer.<br><br>
1 	 4. Place it on the opposite side of the main portion.<br><br>
2 	 5. Place the holder properly.<br><br>
3 	 6. Pick an egg with the help of the tongs or use fingers.<br><br>
4 	 7. Place it in the holder of the slicer carefully.<br><br><br><br>
5 	 8. Place the upper portion of the slicer on the egg.<br><br><br><br>
6 	 9. Press it with the fingers.<br><br>
7 	 10. Press harder once the slits are made on the egg white.<br><br><br><br><br><br>
8 	 11. Pull the upper portion lever till the end of the slicer as this would ensure that the egg has been sliced properly.<br><br><br><br><br><br>
9 	 12. Remove the upper portion of the slicer.
10 	 13. Keep the round slices of the egg on a plate.<br><br><br><br><br><br>


In [81]:
# Run the splitter over the example's Source_Context and list what it returns.
tokenized = sentence_splitter(content[1272]['Source_Context'])
for index, sent in enumerate(tokenized):
    print(f"{index} \t {sent}")

0 	 3. Open the upper portion of the slicer.<br><br>
1 	 4. Place it on the opposite side of the main portion.<br><br>
2 	 5. Place the holder properly.<br><br>
3 	 6. Pick an egg with the help of the tongs or use fingers.<br><br>
4 	 7. Place it in the holder of the slicer carefully.<br><br><br><br>
5 	 8. Place the upper portion of the slicer on the egg.<br><br><br><br>
6 	 9. Press it with the fingers.<br><br>
7 	 10. Press harder once the slits are made on the egg white.<br><br><br><br><br><br>
8 	 11. Pull the upper portion lever till the end of the slicer as this would ensure that the egg has been sliced properly.<br><br><br><br><br><br>
9 	 12. Remove the upper portion of the slicer.
10 	 13. Keep the round slices of the egg on a plate.<br><br><br><br><br><br>


In [106]:
def get_matching_sent_context(context, sent, windows=(1, 2, 3, 4, 5), use_sent_from_context=True):
    """
        Use this function to get closest match to a source_line or target_line in a paragraph.

        The paragraph is split into sentences, HTML tags are stripped, and each
        sentence is scored against `sent` with sentence-level BLEU. The
        best-scoring sentence (first one on ties) is the match; the sentences
        at the given offsets before and after it are returned around it.

        :param context: iterable of strings making up the paragraph.
        :param sent: the line to look for, as a list of tokens.
        :param windows: offsets (in sentences) to include on each side of the
            match; expected in ascending order so the result reads in document order.
        :param use_sent_from_context: if True the matched sentence from the
            paragraph is placed in the middle; otherwise `sent` itself, joined
            with spaces.
        :return: list of sentence strings: previous sentences, the centre
            sentence, next sentences. For an empty paragraph this is `[]`, or
            just the joined `sent` when `use_sent_from_context` is False.
    """
    sents = [remove_html_tags(elem) for elem in sentence_splitter(context)]
    if not sents:
        # Nothing to match against: avoid max() on an empty list.
        return [] if use_sent_from_context else [' '.join(sent)]

    bleu_scores = [sentence_bleu([word_tokenize(elem)], sent) for elem in sents]
    index_of_max_bleu = bleu_scores.index(max(bleu_scores))
    matched_sent = sents[index_of_max_bleu]
    print(matched_sent)

    # Collect the neighbours at each requested offset, skipping offsets that
    # fall outside the paragraph (no wrap-around for negative positions).
    previous_sentences = []
    next_sentences = []
    for window in windows:
        next_sent_pos = index_of_max_bleu + window
        if 0 <= next_sent_pos < len(sents):
            next_sentences.append(sents[next_sent_pos])
        previous_sent_pos = index_of_max_bleu - window
        if 0 <= previous_sent_pos < len(sents):
            previous_sentences.append(sents[previous_sent_pos])

    # Previous sentences were gathered nearest-first; flip to document order.
    previous_sentences.reverse()
    centre = matched_sent if use_sent_from_context else ' '.join(sent)
    return previous_sentences + [centre] + next_sentences

In [107]:
# Word-tokenize the example's Source_Line; the matcher expects a token list.
source_tokenized = word_tokenize(content[1272]['Source_Line'])

# Find the context sentence with the highest BLEU score against the source line
# and keep it together with its neighbouring sentences.
source_context_filtered = get_matching_sent_context(
    content[1272]['Source_Context'], source_tokenized)

8. Place the upper portion of the slicer on the egg.


In [108]:
# List the sentences returned by the matcher, one per line with its position.
for index, sent in enumerate(source_context_filtered):
    print(f"{index} \t {sent}")

0 	 3. Open the upper portion of the slicer.
1 	 4. Place it on the opposite side of the main portion.
2 	 5. Place the holder properly.
3 	 6. Pick an egg with the help of the tongs or use fingers.
4 	 7. Place it in the holder of the slicer carefully.
5 	 8. Place the upper portion of the slicer on the egg.
6 	 9. Press it with the fingers.
7 	 10. Press harder once the slits are made on the egg white.
8 	 11. Pull the upper portion lever till the end of the slicer as this would ensure that the egg has been sliced properly.
9 	 12. Remove the upper portion of the slicer.
10 	 13. Keep the round slices of the egg on a plate.
