In [1]:
import nltk.corpus
from sematch.semantic.similarity import WordNetSimilarity
from nltk.tokenize.treebank import TreebankWordDetokenizer
from nltk.corpus import wordnet as wn

In [2]:
sentences = nltk.corpus.brown.sents()

In [3]:
# Example sentence.
sentences[150]

['Opponents',
 'generally',
 'argued',
 'that',
 'the',
 'ballot',
 "couldn't",
 'give',
 'enough',
 'information',
 'about',
 'tax',
 'proposals',
 'for',
 'the',
 'voters',
 'to',
 'make',
 'an',
 'intelligent',
 'choice',
 '.']

In [4]:
# Lowercases all words currently for simplicity.
def watermark_sentence(orig_sentence):
    # Can add
    sentence = [word.lower() for word in orig_sentence.copy()]
    
    # Get the best synonym starting with the word at the end of the sentence.
    result = get_best_synonym(sentence)
    
    if result is None:
        return None
    
    best_synonym, best_synonym_index = result
    
    # Replace the target word with the synonym.
    sentence[best_synonym_index] = best_synonym
    
    return sentence

In [5]:
def get_all_synonyms(word):
    word = word.lower()
    synonyms = []
    scores = []
    for ss in wn.synsets(word):
        synonyms.extend([lemma.lower() for lemma in ss.lemma_names()])
        for sim in ss.similar_tos():
            synonyms_batch = sim.lemma_names()
            synonyms.extend(synonyms_batch)
    synonyms = set(synonyms)
    if word in synonyms:
        synonyms.remove(word)
    synonyms = [synonym.replace('_',' ') for synonym in synonyms]
    return synonyms

In [6]:
# Gets the index and synonym that is the highest scored synonym.
def get_best_synonym(sentence):
    
    score_list = []
    word_list = []
    
    for i in range(len(sentence)):
        word = sentence[i]
        all_synonyms = get_all_synonyms(word)
        
        # Ignore current word if there are no synonyms.
        if len(all_synonyms) == 0: 
            # append 0 score.
            score_list.append(0)
            word_list.append("none")
            continue
        
        wns = WordNetSimilarity()
        similarity_scores = [wns.word_similarity(word, syn) for syn in all_synonyms]

        # Uncomment the following to see process.
        #print(word)
        #print(all_synonyms)
        
        max_score = max(similarity_scores)
        best_synonym_for_current_idx = all_synonyms[similarity_scores.index(max_score)]
        
        word_list.append(best_synonym_for_current_idx)
        score_list.append(max_score)
        
    best_score_overall = max(score_list)
    
    if best_score_overall == 0:
        print("No available synonyms")
        return None
    
    best_word_idx = score_list.index(best_score_overall)
    best_word = word_list[best_word_idx]
    
    return best_word, best_word_idx

## Create Dataset

In [7]:
import random
import pickle
import pandas as pd
import os

In [8]:
marked_data_folder = 'AllDirectWatermarkedV2/'
unmarked_data_folder = 'AllDirectUnmarkedV2/'

In [9]:
start_num_hundreds = 14
num_hundreds = 5
for i in range(start_num_hundreds + 1, num_hundreds + start_num_hundreds + 1):
    start_range = (i - 1) * 100
    end_range = (i) * 100
    
    print(start_range, end_range)
    
    current_sentences = sentences[start_range:end_range]

    watermarked_sents = []
    unmarked_sents = []
    
    for i in range(len(current_sentences)):
        sentence = current_sentences[i].copy()
        
        if len(sentence) < 5:
            continue
        
        result = watermark_sentence(sentence)
        
        if result is not None:
            watermarked_sents.append(result)
            unmarked_sents.append([word.lower() for word in current_sentences[i]])
            
    with open(unmarked_data_folder + 'unmarked' + str(start_range) + 'To' + str(end_range) + '.pkl', 'wb') as file:
        pickle.dump(unmarked_sents, file)
        
    with open(marked_data_folder + 'watermarked' + str(start_range) + 'To' + str(end_range) + '.pkl', 'wb') as file:
        pickle.dump(watermarked_sents, file)

1400 1500
No available synonyms
1500 1600
1600 1700
1700 1800
No available synonyms
1800 1900
No available synonyms


In [31]:
sentence

['The',
 'Baltimore',
 'and',
 'Ohio',
 'Railroad',
 'announced',
 'yesterday',
 'it',
 'would',
 'reduce',
 'the',
 'total',
 'amount',
 'of',
 'its',
 'payroll',
 'by',
 '10',
 'per',
 'cent',
 'through',
 'salary',
 'cuts',
 'and',
 'lay-offs',
 'effective',
 'at',
 '12:01',
 'A.M.',
 'next',
 'Saturday',
 '.']

In [32]:
result

['the',
 'baltimore',
 'and',
 'oh',
 'railroad',
 'announced',
 'yesterday',
 'it',
 'would',
 'reduce',
 'the',
 'total',
 'amount',
 'of',
 'its',
 'payroll',
 'by',
 '10',
 'per',
 'cent',
 'through',
 'salary',
 'cuts',
 'and',
 'lay-offs',
 'effective',
 'at',
 '12:01',
 'a.m.',
 'next',
 'saturday',
 '.']

In [55]:
full_watermarked = []
full_unmarked = []

In [56]:
for file in sorted(os.listdir(marked_data_folder)):
    fpath = os.path.join(marked_data_folder, file)
    
    if os.path.isfile(fpath) and fpath.lower().endswith(".pkl"):
        with open(fpath, 'rb') as file:
            data = pickle.load(file)
        
        full_watermarked += data

In [57]:
for file in sorted(os.listdir(unmarked_data_folder)):
    fpath = os.path.join(unmarked_data_folder, file)
    
    if os.path.isfile(fpath) and fpath.lower().endswith(".pkl"):
        with open(fpath, 'rb') as file:
            data = pickle.load(file)
        
        full_unmarked += data

In [80]:
len(full_unmarked)

1738

In [79]:
len(full_watermarked)

1738

In [60]:
full_watermarked[0]

['the',
 'fulton',
 'county',
 'thou',
 'jury',
 'said',
 'friday',
 'an',
 'investigation',
 'of',
 "atlanta's",
 'recent',
 'primary',
 'election',
 'produced',
 '``',
 'no',
 'evidence',
 "''",
 'that',
 'any',
 'irregularities',
 'took',
 'place',
 '.']

In [61]:
full_unmarked[0]

['the',
 'fulton',
 'county',
 'grand',
 'jury',
 'said',
 'friday',
 'an',
 'investigation',
 'of',
 "atlanta's",
 'recent',
 'primary',
 'election',
 'produced',
 '``',
 'no',
 'evidence',
 "''",
 'that',
 'any',
 'irregularities',
 'took',
 'place',
 '.']

In [76]:
# full_dictionary = {"watermarked": [TreebankWordDetokenizer().tokenize(sentence).lower() for sentence in full_watermarked], 
#                    "unmarked": [TreebankWordDetokenizer().tokenize(sentence).lower() for sentence in full_unmarked]}
full_dictionary = {"watermarked": full_watermarked, 
                   "unmarked": full_unmarked}
full_df = pd.DataFrame.from_dict(full_dictionary)
full_df.to_csv("DirectWatermarkedV3.csv", index=None)

In [77]:
full_df['watermarked'][0]

['the',
 'fulton',
 'county',
 'thou',
 'jury',
 'said',
 'friday',
 'an',
 'investigation',
 'of',
 "atlanta's",
 'recent',
 'primary',
 'election',
 'produced',
 '``',
 'no',
 'evidence',
 "''",
 'that',
 'any',
 'irregularities',
 'took',
 'place',
 '.']

In [78]:
full_df['unmarked'][0]

['the',
 'fulton',
 'county',
 'grand',
 'jury',
 'said',
 'friday',
 'an',
 'investigation',
 'of',
 "atlanta's",
 'recent',
 'primary',
 'election',
 'produced',
 '``',
 'no',
 'evidence',
 "''",
 'that',
 'any',
 'irregularities',
 'took',
 'place',
 '.']