In [199]:
import json
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.translate.bleu_score import sentence_bleu
from progress.bar import Bar
import re
import argparse

In [52]:
with open('./noun-modifications/noun-modifications-test-with-match.json', 'r') as json_in: 
     content = json.load(json_in)

In [201]:
def remove_html_tags(text):
    """Return ``text`` with every ``<...>`` span removed.

    A span runs from a ``<`` to the nearest following ``>`` (non-greedy match);
    because ``.`` does not match a newline here, a span never crosses a line
    break. Text outside such spans is returned unchanged.
    """
    import re
    return re.sub('<.*?>', '', text)

In [195]:
def sentence_splitter(document):
    """
      Sentence splitter to deal with bullet items in texts.

      Every item of `document` is sentence-tokenized on its own (the items are
      not joined first, so markdown headers stay separate sentences). A
      sentence that consists only of a list number such as "3." is glued to
      the sentence that follows it, separated by a single space.

      document: iterable of strings.
      Returns: list of sentence strings in document order. The list is empty
      when `document` yields no sentences. A list number that is the very last
      sentence has nothing to be glued to and is kept as it is.
    """
    pattern = re.compile(r"^[0-9]+\.$")
    # Tokenize per item and flatten lazily; the generator is shared between the
    # for-loop and the explicit next() below so a merged sentence is consumed
    # only once.
    sentences = (sentence
                 for sent_item in document
                 for sentence in sent_tokenize(sent_item))
    merged_item_sents = []

    for sentence in sentences:
        if pattern.match(sentence):
            # Default of None avoids a StopIteration escaping the function when
            # the list number is the final sentence.
            next_sentence = next(sentences, None)
            if next_sentence is None:
                merged_item_sents.append(sentence)
            else:
                merged_item_sents.append(f"{sentence} {next_sentence}")
        else:
            merged_item_sents.append(sentence)
    return merged_item_sents


In [210]:
def get_matching_sent_context(context, sent, windows=[1, 2, 3, 4, 5], use_sent_from_context=False, tokenized=True):
    """
        Use this function to get the closest match to a source_line or target_line in a paragraph.

        The context is split with sentence_splitter and filtered with
        get_processed_context; html tags are removed from every remaining
        sentence and each one is scored against `sent` with sentence_bleu.
        The highest scoring sentence is the match (the first one on a tie) and
        is printed.

        context: the paragraph, in the form sentence_splitter accepts.
        sent: the line to look up.
        windows: offsets, relative to the matched sentence, of the neighbours to
            include on both sides. Offsets that fall outside the context are
            skipped. The default list is only read, never modified.
        use_sent_from_context: if true, the matched sentence from the context is
            placed in the result; otherwise `sent` itself is.
        tokenized: if true, `sent` is word-tokenized before scoring (necessary
            when `sent` is a string); if false, `sent` is scored as given
            (presumably an already tokenized list - confirm with callers).

        Returns: list made of the preceding neighbours, the matched sentence
            (or `sent`), and the following neighbours.
        Raises: ValueError when the processed context holds no sentences.
    """
    sentence_tokenized_document = sentence_splitter(context)
    sentence_tokenized_document = get_processed_context(sentence_tokenized_document)
    tokenized_sent = word_tokenize(sent) if tokenized else sent

    sents = [remove_html_tags(elem) for elem in sentence_tokenized_document]
    if not sents:
        # Without this guard max() below fails with an unhelpful message.
        raise ValueError("context contains no sentences to match against")

    bleu_scores = [sentence_bleu([word_tokenize(elem)], tokenized_sent)
                   for elem in sents]
    index_of_max_bleu = bleu_scores.index(max(bleu_scores))
    matched_sent = sents[index_of_max_bleu]
    print(matched_sent)

    # make context here
    previous_sentences = []
    next_sentences = []
    number_of_sents = len(sents)
    for window in windows:
        next_sent_pos = index_of_max_bleu + window
        if 0 <= next_sent_pos < number_of_sents:
            next_sentences.append(sents[next_sent_pos])
        # repeat for previous_sentences
        previous_sent_pos = index_of_max_bleu - window
        if 0 <= previous_sent_pos < number_of_sents:
            previous_sentences.append(sents[previous_sent_pos])

    # Collected nearest-first; reverse to restore document order.
    previous_sentences.reverse()
    if use_sent_from_context:
        return previous_sentences + [matched_sent] + next_sentences
    # be aware that we use the non-tokenized version of sent here and that we do
    # not join the results.
    return previous_sentences + [sent] + next_sentences

In [170]:
def remove_timestamps(list_with_indexes, source_context, pivot=5):
    """
        Cut `source_context` down to the stretch that lies between timestamp lines.

        list_with_indexes: positions in `source_context` of the timestamp lines.
        source_context: list of sentences.
        pivot: position used to decide on which side a timestamp lies; indexes
            below it count as "before", indexes at or above it as "after".
            (presumably the position of the line of interest inside the
            context window - confirm with the data preparation code.)

        Returns: a new list.
            - no indexes: a copy of the whole context;
            - exactly two indexes: everything strictly between them;
            - otherwise: everything after the last "before" timestamp and
              before the first "after" timestamp. A side without timestamps is
              not cut.
    """
    if not list_with_indexes:
        return source_context[:]

    if len(list_with_indexes) == 2:
        first_index = list_with_indexes[0]
        second_index = list_with_indexes[-1]
        return source_context[first_index+1:second_index]

    before_pivot = [index for index in list_with_indexes if index < pivot]
    # >= keeps an index equal to the pivot from being ignored by both sides,
    # which would leave that timestamp line in the result.
    from_pivot = [index for index in list_with_indexes if index >= pivot]

    start = max(before_pivot) + 1 if before_pivot else 0
    # None as slice end means "up to the end of the list".
    stop = min(from_pivot) if from_pivot else None
    return source_context[start:stop]

In [237]:
def get_processed_context(source_context):
    """
        Trim `source_context` around its '## Timestamp' lines.

        The positions of all sentences containing '## Timestamp' are collected
        first; only then is remove_timestamps called, once, on the untouched
        list, so the positions still refer to the list they were taken from.

        source_context: list of sentence strings.
        Returns: `source_context` itself when it holds no timestamp line,
            otherwise the result of remove_timestamps. The collected positions
            are printed when there are any.
    """
    timestamp_indexes = [index
                         for index, sent in enumerate(source_context)
                         if '## Timestamp' in sent]
    if not timestamp_indexes:
        return source_context
    print(timestamp_indexes)
    return remove_timestamps(timestamp_indexes, source_context)

In [84]:
with open('../classification-scripts/noun-modifications/noun-modifications-train-without-match-pr.json', 'r') as json_in: 
     new_data = json.load(json_in)

In [139]:
# Collect the position of every entry of `x` that contains a '## Timestamp' marker.
# NOTE(review): `x` is not assigned in any cell above this one, so the cell
# presumably relies on leftover kernel state - confirm it runs after a restart.
timestamp_indexes = []
for index, sent in enumerate(x): 
    if '## Timestamp' in sent: 
        timestamp_indexes.append(index)

In [175]:
with open('../classification-scripts/noun-modifications/noun-modifications-train-without-match-pr.json', 'r') as json_in: 
    data = json.load(json_in)

In [212]:
x = data[33734]

In [213]:
source_first = x['Source_Context']
print(source_first)

['# Make Wobbly Wonders', '## Timestamp::::2009-05-03T22:22:09Z', 'Wobbly Wonders make an easy budget friendly snack for children. This recipe makes about 20 pieces of this gelatin based snack.', '## Steps', '## Section::::Steps.', '## Timestamp::::2009-05-03T22:39:01Z', 'Wobbly Wonders make an easy budget friendly treat for children. This recipe makes about 20 pieces of this gelatin based snack.', '## Ingredients']


In [214]:
get_matching_sent_context(source_first, x['Source_Line'], windows=[1, 2, 3, 4, 5], use_sent_from_context=False, tokenized=True)

[1, 6]
Wobbly Wonders make an easy budget friendly snack for children.


['Wobbly Wonders make an easy budget friendly snack for children.',
 'This recipe makes about 20 pieces of this gelatin based snack.',
 '## Steps',
 '## Section::::Steps.']

In [242]:
# Load noun-modifications-test-5-new-lines.json into `data`; record 692 is the
# one inspected in the cells that follow.
# NOTE(review): this rebinds `data`, which other cells load from a different file.
with open('../classification-scripts/noun-modifications/noun-modifications-test-5-new-lines.json', 'r') as json_in: 
    data = json.load(json_in)

In [251]:
#data[693]['Source_Context']
from pprint import pprint
pprint(data[692])

{'All_Versions': ['" If you don\'t know where the book is, you can go to '
                  'beginning of the Bible and look at the Contents page and '
                  'find out where.',
                  '" If you don\'t know where the book is, go to beginning of '
                  'the Bible and look at the contents page to locate the '
                  'needed book.',
                  '" If you don\'t know where the book is, go to the beginning '
                  'of the Bible and look at the contents page to locate the '
                  'needed book.',
                  '" If you don\'t know where the book is, go to the beginning '
                  'of the Bible and look at the Table of Contents to locate '
                  'the book.'],
 'Differences': [[['page', 'NN'], ['Contents', 'NNS']]],
 'Entailment_Rel': {'page#contents1': 'OtherRelated'},
 'Filename': 'Look_up_a_Bible_Verse.txt',
 'Key': 'Look_up_a_Bible_Verse9',
 'Loc_in_splits': 'TEST',
 'PPDB_Matches': [[['pa

In [245]:
# NOTE(review): the context is taken from record 692 but the line to match from
# record 693 - confirm this mix of records is intentional.
get_matching_sent_context(data[692]['Source_Context'], data[693]['Source_Line'], windows=[1, 2, 3, 4, 5], use_sent_from_context=False, tokenized=True)

Once you find it, you have your verse right next to it.


['Most of the time, on the top of each page of the Bible, it says what book and what chapter(s) are located on that specific page.',
 'Some Bibles even have the exact verses.',
 '5. "16" refers to the verse in chapter 3 where the Scripture (of Bible) verse (or reference) is located.',
 'The number after the colon will always refer to the verse in that chapter.',
 'So in the chapter of the verse, look for a smaller number within the words.',
 'With these instruction, you will be shooting in no time!',
 '## Tips',
 '* The Bible is made up of 66 books in total.',
 'Some Bibles may contain sections that are not considered "canonical" (or accepted, or received - but thats another topic).',
 'For practical purposes, it is assumed that this refers to a basic Bible, void of any particular "helps" "tools" or "aid."',
 '* The Bible is divided into two major divisions.']

In [246]:
data[692]['Source_Context']

['## Section::::Related wikiHows.',
 'You\'ve seen John 3:16 perhaps written on a poster board, or printed on a shirt. You may even be familiar with some of it, "For God so loved the world..." But how do you look up a Bible verse?',
 '## Steps',
 '1. Look at the passage (or Bible verse) that you want to find.',
 '2. In the example above it would be "John 3:16."',
 '3. "John" refers to the book of John, also known as "The Gospel According to St. John." If you don\'t know where the book is, you can go to beginning of the Bible and look at the Contents page and find out where.',
 '4. "3" refers to the chapter in the book of John where it is located. The number before the colon will always refer to the chapter of the book. In the Bible, the chapter numbers usually stand out because of their size and boldness. In the book of the scripture, look for the chapter number. Some chapters are long and some are short, so be aware of that. Most of the time, on the top of each page of the Bible, it s

In [234]:
sentence_tokenized_document = sentence_splitter(data[692]['Source_Context'])
sentence_tokenized_document

['## Section::::Related wikiHows.',
 "You've seen John 3:16 perhaps written on a poster board, or printed on a shirt.",
 'You may even be familiar with some of it, "For God so loved the world..." But how do you look up a Bible verse?',
 '## Steps',
 '1. Look at the passage (or Bible verse) that you want to find.',
 '2. In the example above it would be "John 3:16."',
 '3. "John" refers to the book of John, also known as "The Gospel According to St.',
 'John."',
 "If you don't know where the book is, you can go to beginning of the Bible and look at the Contents page and find out where.",
 '4. "3" refers to the chapter in the book of John where it is located.',
 'The number before the colon will always refer to the chapter of the book.',
 'In the Bible, the chapter numbers usually stand out because of their size and boldness.',
 'In the book of the scripture, look for the chapter number.',
 'Some chapters are long and some are short, so be aware of that.',
 'Most of the time, on the top o

In [238]:
res = get_processed_context(sentence_tokenized_document)

In [233]:
'## Timestamp' in sentence_tokenized_document

False

In [239]:
res

['## Section::::Related wikiHows.',
 "You've seen John 3:16 perhaps written on a poster board, or printed on a shirt.",
 'You may even be familiar with some of it, "For God so loved the world..." But how do you look up a Bible verse?',
 '## Steps',
 '1. Look at the passage (or Bible verse) that you want to find.',
 '2. In the example above it would be "John 3:16."',
 '3. "John" refers to the book of John, also known as "The Gospel According to St.',
 'John."',
 "If you don't know where the book is, you can go to beginning of the Bible and look at the Contents page and find out where.",
 '4. "3" refers to the chapter in the book of John where it is located.',
 'The number before the colon will always refer to the chapter of the book.',
 'In the Bible, the chapter numbers usually stand out because of their size and boldness.',
 'In the book of the scripture, look for the chapter number.',
 'Some chapters are long and some are short, so be aware of that.',
 'Most of the time, on the top o

In [252]:
indexes = [0,1,3,9]

In [253]:
d = filter(lambda x: x > 5, indexes)

In [254]:
d

<filter at 0x1199f7b90>

In [255]:
not d

False

In [256]:
list(d)

[9]

In [257]:
min([])

ValueError: min() arg is an empty sequence

In [258]:
min([] or [0])

0

In [259]:
with open('../classification-scripts/noun-modifications/noun-modifications-train-5-new-lines.json', 'r') as json_in: 
    data = json.load(json_in)

In [274]:
for index, sent in enumerate(data[8098]['Source_Context']): 
    print(index, '\t', sent)

0 	 ## Section::::Tips.
1 	 This is a guide to playing forgoten hope mod for battlefield 1942 <a href="http%3A//forgottenhope.bf1942files.com">click to go to homepage</a>. this is not for people who want a guide to play by. this is just sugestions. i do not plan to use the format
2 	 ## Steps
3 	 1. Their are 4 kinds of assults on most maps.
4 	 2. A. air raid, B. air deplyment, C. naval (not always avalable) D. rapid armor, infantry, light transport deploy.
5 	 3. For the air raid, i will take the Battle of Britan as an example. As playing as German Luftwafta, you have the selection of fighters bombers and divebombers (if you want to be ainel, there are a list and discription of units at the end.) Messerschmitt BF 109E, Messerschmitt BF 110G, Heinkel He 111H, Junkers Ju 87B-2 "Stuka", Junkers Ju 88A. use your messerschmitts as escorts. stay in formation and the british plains can not get at the bomber's blindspots; between tail and ball turet gunners cant hit. if you keep your speed l

In [268]:
for index, sent in enumerate(sentence_splitter(data[8097]['Source_Context'])): 
    print(index, '\t', sent)

0 	 ## Section::::Sources and Citations.
1 	 English is a difficult language to learn and even more so to speak with native fluency.
2 	 By immersing yourself in the language, using media to practice, and enhancing your appearance as a native speaker, you can perfect your ability to speak English in no time.
3 	 ## Steps
4 	 ### Immersing Yourself
5 	 1. Find a native conversation partner in person or online.
6 	 If your experience with English has largely been in textbooks, it can be helpful to practice your conversation skills with a native English speaker.
7 	 Conversing regularly with a skilled English speaker will enable you to speak more fluidly and develop a natural speaking cadence.
8 	 You can seek out a native speaker to partner with for conversation through Meetup.com or a local university.
9 	 * If you don’t have time to seek out a partner in person, many online service such as Italki.com can connect you with a native speaker over Skype for one-on-one conversation sessions.

In [264]:
d = ['## Steps', '1. Example: A Grass/Ground deer covered with dirt and grass.', "Gather some words for it's first type:Verdant, Green, Forest, Leaf, Jungle, Meadow, Grass, Flower (Any kind of flower)", "2. Next, gather some words for it's second type: Dirt, Mine, Terra(Spanish for Earth), Ground, Sand, Mineral, Mud, Cliff, Mountain", '3. For the third group of words, get words for what it is based on, or its color.', 'My example is a deer: Antler, Animal, Hoof, Brown, Green, Deer, Reindeer', '4. Finally, mash some words together (Or put some in different languages and then mash) We get: Terrantler', '5. By the way, you might also want some words that go with its features and body.']

for index, sent in enumerate(d):
    print(f"{index}\t{sent}")

0	## Steps
1	1. Example: A Grass/Ground deer covered with dirt and grass.
2	Gather some words for it's first type:Verdant, Green, Forest, Leaf, Jungle, Meadow, Grass, Flower (Any kind of flower)
3	2. Next, gather some words for it's second type: Dirt, Mine, Terra(Spanish for Earth), Ground, Sand, Mineral, Mud, Cliff, Mountain
4	3. For the third group of words, get words for what it is based on, or its color.
5	My example is a deer: Antler, Animal, Hoof, Brown, Green, Deer, Reindeer
6	4. Finally, mash some words together (Or put some in different languages and then mash) We get: Terrantler
7	5. By the way, you might also want some words that go with its features and body.


In [265]:
l = [1,2,3,4,5]

In [266]:
l[:]

[1, 2, 3, 4, 5]

In [267]:
l[:None]

[1, 2, 3, 4, 5]

In [279]:
d = ['Seems like too Impossible but it Really is possible!', '## Steps', '1. Hi, May you have a good day, look, i want to talk to you, I can\'t end everything unless I could say "THANKS TO YOU."', 'This would make him/her curious and will push them to know what goodness have she/he done for you to thank him.', '* Tell him/her a thing that will give a push to their thoughts to want to know more.', '* Make him/her be interested what you are going to say ( in DETAILED).', '* Use a handled source that will enlighten you.', 'such as being able to say words to him/her that would:', '2. Refresh yourself, take a warm bath to make you relax.', 'Be sure you are in a mood of doing what you are planning to do.', "Don't men his/her mistakes, take it all as your responsibility, for example, Hi Mich, please talk to me, I know that it's my fault but you have some mistakes too...Never say that!"]

In [280]:
for index, sent in enumerate(d):
    print(f"{index}\t{sent}")

0	Seems like too Impossible but it Really is possible!
1	## Steps
2	1. Hi, May you have a good day, look, i want to talk to you, I can't end everything unless I could say "THANKS TO YOU."
3	This would make him/her curious and will push them to know what goodness have she/he done for you to thank him.
4	* Tell him/her a thing that will give a push to their thoughts to want to know more.
5	* Make him/her be interested what you are going to say ( in DETAILED).
6	* Use a handled source that will enlighten you.
7	such as being able to say words to him/her that would:
8	2. Refresh yourself, take a warm bath to make you relax.
9	Be sure you are in a mood of doing what you are planning to do.
10	Don't men his/her mistakes, take it all as your responsibility, for example, Hi Mich, please talk to me, I know that it's my fault but you have some mistakes too...Never say that!


In [283]:
data[33734]['Source_Context']

['# Make Wobbly Wonders',
 '## Timestamp::::2009-05-03T22:22:09Z',
 'Wobbly Wonders make an easy budget friendly snack for children. This recipe makes about 20 pieces of this gelatin based snack.',
 '## Steps',
 '## Section::::Steps.',
 '## Timestamp::::2009-05-03T22:39:01Z',
 'Wobbly Wonders make an easy budget friendly treat for children. This recipe makes about 20 pieces of this gelatin based snack.',
 '## Ingredients']

In [285]:
data[33734].keys()

dict_keys(['Filename', 'All_Versions', 'Revision_Length', 'Base_Sentence', 'Revisions', 'Key', 'Source_Tokenized', 'Target_Tokenized', 'Correction', 'Source_Line_Tagged', 'Target_Line_Tagged', 'Differences', 'Entailment_Rel', 'PPDB_Matches', 'Loc_in_splits', 'Target_Line_Nr', 'Source_Line_Nr', 'Source_Context', 'Target_Context', 'Source_Context_5', 'Target_Context_5', 'Source_Context_5_Processed', 'Target_Context_5_Processed', 'Source_Line', 'Target_Line'])

In [288]:
data[33734]

{'Filename': 'Make_Wobbly_Wonders.txt',
 'All_Versions': ['Wobbly Wonders make an easy budget friendly snack for children.',
  'Wobbly Wonders make an easy budget friendly treat for children.'],
 'Revision_Length': 1,
 'Base_Sentence': 'Wobbly Wonders make an easy budget friendly snack for children.',
 'Revisions': ['Wobbly Wonders make an easy budget friendly treat for children.'],
 'Key': 'Make_Wobbly_Wonders0',
 'Source_Tokenized': ['Wobbly',
  'Wonders',
  'make',
  'an',
  'easy',
  'budget',
  'friendly',
  'snack',
  'for',
  'children',
  '.'],
 'Target_Tokenized': ['Wobbly',
  'Wonders',
  'make',
  'an',
  'easy',
  'budget',
  'friendly',
  'treat',
  'for',
  'children',
  '.'],
 'Correction': True,
 'Source_Line_Tagged': [['Wobbly', 'JJ'],
  ['Wonders', 'NNP'],
  ['make', 'VBP'],
  ['an', 'DT'],
  ['easy', 'JJ'],
  ['budget', 'NN'],
  ['friendly', 'JJ'],
  ['snack', 'NN'],
  ['for', 'IN'],
  ['children', 'NNS'],
  ['.', '.']],
 'Target_Line_Tagged': [['Wobbly', 'JJ'],
  ['

In [289]:
raise ValueError("Hello", "World")

ValueError: ('Hello', 'World')

In [292]:
# Probe: can ValueError be constructed with a keyword argument?
# It cannot - the constructor call itself fails with TypeError (see the recorded
# output), so no ValueError is ever raised and the except branch below does not run.
try:
    raise ValueError("Hello", message="lol")
except ValueError as error:
    print(vars(error))

TypeError: ValueError() takes no keyword arguments