In [1]:
import nltk.corpus
from sematch.semantic.similarity import WordNetSimilarity
from nltk.tokenize.treebank import TreebankWordDetokenizer
from nltk.corpus import wordnet as wn

In [2]:
sentences = [list(map(lambda x:x.lower(), sentence)) for sentence in nltk.corpus.brown.sents()]

In [3]:
# Example sentence.
sentences[150] 

['opponents',
 'generally',
 'argued',
 'that',
 'the',
 'ballot',
 "couldn't",
 'give',
 'enough',
 'information',
 'about',
 'tax',
 'proposals',
 'for',
 'the',
 'voters',
 'to',
 'make',
 'an',
 'intelligent',
 'choice',
 '.']

In [4]:
# Lowercases all words currently for simplicity.
def watermark_sentence(orig_sentence):
    # Can add
    sentence = [word.lower() for word in orig_sentence.copy()]
    
    # Get the best synonym starting with the word at the end of the sentence.
    result = get_best_synonym(sentence)
    
    if result is None:
        return None
    
    best_synonym, best_synonym_index = result
    
    # Replace the target word with the synonym.
    sentence[best_synonym_index] = best_synonym
    
    return sentence

In [5]:
def get_all_synonyms(word):
    word = word.lower()
    synonyms = []
    scores = []
    for ss in wn.synsets(word):
        synonyms.extend([lemma.lower() for lemma in ss.lemma_names()])
        for sim in ss.similar_tos():
            synonyms_batch = sim.lemma_names()
            synonyms.extend(synonyms_batch)
    synonyms = set(synonyms)
    if word in synonyms:
        synonyms.remove(word)
    synonyms = [synonym.replace('_',' ') for synonym in synonyms]
    return synonyms

In [6]:
# Gets the index and synonym that is the highest scored synonym.
def get_best_synonym(sentence):
    
    score_list = []
    word_list = []
    
    for i in range(len(sentence)):
        word = sentence[i]
        all_synonyms = get_all_synonyms(word)
        
        # Ignore current word if there are no synonyms.
        if len(all_synonyms) == 0: 
            # append 0 score.
            score_list.append(0)
            word_list.append("none")
            continue
        
        wns = WordNetSimilarity()
        similarity_scores = [wns.word_similarity(word, syn) for syn in all_synonyms]

        # Uncomment the following to see process.
        #print(word)
        #print(all_synonyms)
        
        max_score = max(similarity_scores)
        best_synonym_for_current_idx = all_synonyms[similarity_scores.index(max_score)]
        
        word_list.append(best_synonym_for_current_idx)
        score_list.append(max_score)
        
    best_score_overall = max(score_list)
    
    if best_score_overall == 0:
        print("No available synonyms")
        return None
    
    best_word_idx = score_list.index(best_score_overall)
    best_word = word_list[best_word_idx]
    
    return best_word, best_word_idx

# Traditional Examples

In [7]:
start_num_hundreds = 0
num_hundreds = 1
for i in range(start_num_hundreds + 1, num_hundreds + start_num_hundreds + 1):
    start_range = (i - 1) * 100
    end_range = (i) * 100
    
    print(start_range, end_range)
    
    current_sentences = sentences[start_range:end_range]

    watermarked_sents = []
    unmarked_sents = []
    
    for i in range(len(current_sentences)):
        sentence = current_sentences[i].copy()
        
        if len(sentence) < 5:
            continue
        
        result = watermark_sentence(sentence)
        
        if result is not None:
            watermarked_sents.append(result)
            unmarked_sents.append([word.lower() for word in current_sentences[i]])

0 100


In [8]:
detokenizer = TreebankWordDetokenizer()

In [9]:
print(detokenizer.detokenize(watermarked_sents[0]))

the fulton county thousand jury said friday an investigation of atlanta's recent primary election produced "no evidence" that any irregularities took place.


In [10]:
print(detokenizer.detokenize(watermarked_sents[2]))

the september-october condition jury had been charged by fulton superior court judge durwood pye to investigate reports of possible "irregularities" in the hard-fought primary which was won by mayor-nominate ivan allen jr. .


# DeepTextMarkoriginal

In [11]:
import sys

In [12]:
# Add previous folder to thDeepTextMarker variable temporarily so the python modules can be accessed.
sys.path.append('../')

In [13]:
from DeepTextMarker import DeepTextMarker

In [14]:
marker = DeepTextMarker()

In [15]:
print(detokenizer.detokenize(sentences[0]))

the fulton county grand jury said friday an investigation of atlanta's recent primary election produced "no evidence" that any irregularities took place.


In [16]:
print(detokenizer.detokenize(marker.watermark_single_sentence(sentences[0])))

the fulton county grand jury said friday an investigation of atlanta's recent primary elections produced "no evidence" that any irregularities took place.


In [17]:
print(detokenizer.detokenize(sentences[2]))

the september-october term jury had been charged by fulton superior court judge durwood pye to investigate reports of possible "irregularities" in the hard-fought primary which was won by mayor-nominate ivan allen jr. .


In [18]:
print(detokenizer.detokenize(marker.watermark_single_sentence(sentences[2])))

the september-october term jury had been charged by fulton superior courts judge durwood pye to investigate reports of possible "irregularities" in the hard-fought primary which was won by mayor-nominate ivan allen jr. .


# Both

In [24]:
trimmed_sents = [sent for sent in sentences[0:100] if len(sent) >= 5]

In [30]:
for i in range(0, 100):
    original = trimmed_sents[i]
    
    traditional = watermarked_sents[i]
    deepTextMark = marker.watermark_single_sentence(original)
    
    # Printed in latex format.
    print("\\begin{enumerate}")
    print("\\item " + detokenizer.detokenize(original).capitalize())
    print("\\item " + detokenizer.detokenize(traditional).capitalize())
    print("\\item " + detokenizer.detokenize(deepTextMark).capitalize())
    print("\\end{enumerate}")
    print()

\begin{enumerate}
\item The fulton county grand jury said friday an investigation of atlanta's recent primary election produced "no evidence" that any irregularities took place.
\item The fulton county thousand jury said friday an investigation of atlanta's recent primary election produced "no evidence" that any irregularities took place.
\item The fulton county grand jury said friday an investigation of atlanta's recent primary elections produced "no evidence" that any irregularities took place.
\end{enumerate}

\begin{enumerate}
\item The jury further said in term-end presentments that the city executive committee, which had over-all charge of the election, "deserves the praise and thanks of the city of atlanta" for the manner in which the election was conducted.
\item The panel further said in term-end presentments that the city executive committee, which had over-all charge of the election, "deserves the praise and thanks of the city of atlanta" for the manner in which the election

IndexError: list index out of range