In [1]:
import pandas as pd
import nltk
import copy
from multiset_distances import MultisetDistances

### Load synthetic data

In [2]:
# Load the raw output of every generator. All three loads must be active:
# `araml_raw` and `relgan` are read by later cells, so leaving them commented
# out makes the notebook fail on a fresh kernel.
araml_raw = pd.read_csv('araml_batch_499.txt', sep='~', header=None)
ivae_raw = pd.read_csv('ivae/gen_sentsivae_results_4000.txt', sep='~', header=None)
relgan = pd.read_csv('relgan_batch_1980.txt', sep='~', header=None)

In [3]:
araml_raw.head()

Unnamed: 0,0
0,3118 3892 864 615 2876 2325 82 2091 3887 2786 ...
1,1979 2781 1593 1252 1643 4301 4301 4301 4301 4...
2,3887 2786 2088 477 3892 237 2325 776 2098 4301...
3,3887 2786 4081 2612 4081 2442 406 126 1643 164...
4,198 2383 2239 549 2946 4133 3639 3507 2247 539...


In [3]:
ivae_raw.head()

Unnamed: 0,0
0,<s> Famke Louise verwijdert # ikdoenietmeermee...
1,<s> Ik heb nooit meegedaan en ben je wel . </s>
2,<s> Deze misdaad stopt de pot op de # avondklo...
3,<s> En nu is het nog steeds geloven wij dat de...
4,<s> Dat is geweldig . . . daar is het nog zo u...


In [5]:
relgan.head()

Unnamed: 0,0
0,ik ook niet # ikdoenietmeermee
1,stop de # lockdown
2,famke louise # ikdoenietmeermee wat zou gommer...
3,waarom is dit trending ? ! ! !
4,zou toch in een democratie leven waar een rege...


### Postprocessing

ARAML postprocessing

In [6]:
# Vocabulary used to map ARAML token ids back to words: convert_araml looks a
# token up as vocab[0][id], so row i must hold the token with id i.
# - `on_bad_lines='skip'` replaces `error_bad_lines=False`, which pandas
#   deprecated in 1.3 and removed in 2.0.
# - `keep_default_na=False` keeps tokens such as "null" or "NA" as strings
#   instead of NaN, which would break the ' '.join in convert_araml.
# NOTE(review): any skipped line shifts the ids of all following tokens - confirm
# that no vocabulary line is actually dropped.
vocab = pd.read_csv(
    '../data/covid_gen_train_vocab.txt',
    sep='~~',
    header=None,
    engine='python',
    on_bad_lines='skip',
    keep_default_na=False,
)
vocab.head() #last id, the space, is lost here

Unnamed: 0,0
0,faalt
1,rood
2,correct
3,komende
4,landgenoten


In [7]:
def convert_araml(text, pad_id=4301, id_to_token=None):
    """Turn a whitespace-separated string of token ids into a sentence.

    Parameters
    ----------
    text : str
        Token ids separated by whitespace, e.g. ``"3118 3892 864"``.
    pad_id : int, optional
        Id that is dropped from the output instead of being looked up.
        Defaults to 4301, the value that used to be hard-coded here.
    id_to_token : indexable, optional
        Any object supporting ``id_to_token[int_id]`` (list, dict, Series).
        When omitted, the notebook-level ``vocab[0]`` column is used.

    Returns
    -------
    str
        The looked-up tokens joined by single spaces; an empty string when
        ``text`` holds no ids other than ``pad_id``.

    Raises
    ------
    ValueError
        If a piece of ``text`` is not an integer literal.
    """
    # Resolve the lookup table lazily so the global is only touched by default.
    lookup = vocab[0] if id_to_token is None else id_to_token
    token_ids = (int(raw_id) for raw_id in text.split())
    return ' '.join(lookup[token_id] for token_id in token_ids if token_id != pad_id)

In [8]:
# Decode every id string in column 0 of the raw ARAML output; the result is a
# one-column frame (column 0) of readable sentences.
araml = araml_raw[0].map(convert_araml).to_frame()

In [9]:
araml.head()

Unnamed: 0,0
0,Het is allemaal goed bedoeld voor de kwetsbare...
1,Dit op 1 dag .
2,# ikdoenietmeermee jongens wat is dit voor ach...
3,# ikdoenietmeermee er doen er steeds minder me...
4,Een beetje weldenkend mens die in zijn omgevin...


iVAE postprocessing

In [4]:
def _strip_sentence_markers(sentence, start='<s>', end='</s>'):
    """Drop a leading ``start`` / trailing ``end`` marker when present, then trim whitespace."""
    # A fixed slice ([4:-5]) would cut real characters from any row that lacks
    # one of the markers, so each marker is removed only if it is really there.
    if sentence.startswith(start):
        sentence = sentence[len(start):]
    if sentence.endswith(end):
        sentence = sentence[:-len(end)]
    return sentence.strip()

# One-column frame (column 0) holding the iVAE sentences without <s> ... </s>.
ivae = ivae_raw[0].map(_strip_sentence_markers).to_frame()

In [5]:
ivae.head()

Unnamed: 0,0
0,Famke Louise verwijdert # ikdoenietmeermee : '...
1,Ik heb nooit meegedaan en ben je wel .
2,Deze misdaad stopt de pot op de # avondklok .
3,En nu is het nog steeds geloven wij dat de ove...
4,Dat is geweldig . . . daar is het nog zo uit !


### General result inspection

Check duplicates within generated texts

In [12]:
# (generated ARAML sentences, distinct ARAML sentences); a gap means repeated samples.
len(araml[0]), len(set(araml[0]))

(4000, 2435)

In [6]:
# (generated iVAE sentences, distinct iVAE sentences); a gap means repeated samples.
len(ivae[0]), len(set(ivae[0]))

(4000, 3412)

In [14]:
# (generated RelGAN sentences, distinct RelGAN sentences); a gap means repeated samples.
len(relgan[0]), len(set(relgan[0]))

(4032, 1287)

Check duplicates with original texts

In [7]:
# Training text that the generated sentences are compared against below.
# NOTE(review): sep='~~' presumably never occurs in the text, so every line
# ends up whole in column 0 - confirm against the data file.
train_data = pd.read_csv('../data/covid_gen_train_text.txt', sep='~~', header=None, engine='python')

In [8]:
train_data.head()

Unnamed: 0,0
0,"Zeg jongens , # ikdoenietmeermee kunnen we nie..."
1,Naar Femke Louise luisteren : # ikdoenietmeerm...
2,Vergeet niet de # ikdoenietmeermee aanhangers
3,"Dat moet je niet het kabinet kwalijk nemen , m..."
4,Je moet jezelf eens zien met je ; # IkDoeNietM...


In [9]:
# (training sentences, distinct training sentences).
len(train_data[0]), len(set(train_data[0]))

(3280, 3280)

In [18]:
# Number of distinct ARAML sentences that also occur verbatim in the training data.
len(set(araml[0]).intersection(set(train_data[0])))

1985

In [10]:
# Number of distinct iVAE sentences that also occur verbatim in the training data.
len(set(ivae[0]).intersection(set(train_data[0])))

91

In [20]:
# Number of distinct RelGAN sentences that also occur verbatim in the training data.
len(set(relgan[0]).intersection(set(train_data[0])))

0

### BLEU

In [11]:
# Held-out test text; its tokenised sentences serve as the references for BLEU
# and MS-Jaccard further down.
test_data = pd.read_csv('../data/covid_gen_test_text.txt', sep='~~', header=None, engine='python')

In [12]:
# Tokenise every corpus on whitespace. All generators are tokenised here
# because the BLEU, Self-BLEU and MS-Jaccard cells below read `araml_split`
# and `relgan_split` as well; leaving them commented out breaks a fresh run.
araml_split = [sentence.split() for sentence in araml[0]]
ivae_split = [sentence.split() for sentence in ivae[0]]
relgan_split = [sentence.split() for sentence in relgan[0]]
test_split = [sentence.split() for sentence in test_data[0]]

In [23]:
len(araml_split)

4000

In [24]:
#ARAML
# Every hypothesis is scored against the full test set. The references are
# only read by corpus_bleu, so all entries can point at the same `test_split`
# list instead of holding one deep copy of the test set per hypothesis.
araml_list_of_references = [test_split] * len(araml_split) # list of references for all sentences in corpus.
araml_list_of_hypotheses = list(araml_split) # list of hypotheses that corresponds to list of references.
len(araml_list_of_references), len(araml_list_of_hypotheses)

(4000, 4000)

In [25]:
# Corpus-level BLEU-2 .. BLEU-5 of the ARAML samples against the test set,
# with uniform weights (1/n for each n-gram order up to n).
araml_bleu_2, araml_bleu_3, araml_bleu_4, araml_bleu_5 = (
    nltk.translate.bleu_score.corpus_bleu(
        araml_list_of_references,
        araml_list_of_hypotheses,
        weights=(1 / order,) * order,
    )
    for order in (2, 3, 4, 5)
)

In [26]:
araml_bleu_2, araml_bleu_3, araml_bleu_4, araml_bleu_5

(0.5355666678734008,
 0.32865348765130153,
 0.2170772701918256,
 0.1575209750099976)

In [13]:
#iVAE
# Every hypothesis is scored against the full test set. The references are
# only read by corpus_bleu, so all entries can point at the same `test_split`
# list instead of holding one deep copy of the test set per hypothesis.
ivae_list_of_references = [test_split] * len(ivae_split) # list of references for all sentences in corpus.
ivae_list_of_hypotheses = list(ivae_split) # list of hypotheses that corresponds to list of references.

In [14]:
# Corpus-level BLEU-2 .. BLEU-5 of the iVAE samples against the test set,
# with uniform weights (1/n for each n-gram order up to n).
ivae_bleu_2, ivae_bleu_3, ivae_bleu_4, ivae_bleu_5 = (
    nltk.translate.bleu_score.corpus_bleu(
        ivae_list_of_references,
        ivae_list_of_hypotheses,
        weights=(1 / order,) * order,
    )
    for order in (2, 3, 4, 5)
)

In [15]:
ivae_bleu_2, ivae_bleu_3, ivae_bleu_4, ivae_bleu_5

(0.7439766652743535,
 0.5647617970438186,
 0.42822903019934944,
 0.3360663924195802)

In [30]:
#RelGAN
# Every hypothesis is scored against the full test set. The references are
# only read by corpus_bleu, so all entries can point at the same `test_split`
# list instead of holding one deep copy of the test set per hypothesis.
relgan_list_of_references = [test_split] * len(relgan_split) # list of references for all sentences in corpus.
relgan_list_of_hypotheses = list(relgan_split) # list of hypotheses that corresponds to list of references.

In [31]:
# Corpus-level BLEU-2 .. BLEU-5 of the RelGAN samples against the test set,
# with uniform weights (1/n for each n-gram order up to n).
relgan_bleu_2, relgan_bleu_3, relgan_bleu_4, relgan_bleu_5 = (
    nltk.translate.bleu_score.corpus_bleu(
        relgan_list_of_references,
        relgan_list_of_hypotheses,
        weights=(1 / order,) * order,
    )
    for order in (2, 3, 4, 5)
)

In [32]:
relgan_bleu_2, relgan_bleu_3, relgan_bleu_4, relgan_bleu_5

(0.6110493081079732,
 0.4148611239355362,
 0.281139293593719,
 0.17673371985545647)

### Self-BLEU

In [33]:
#ARAML
# Self-BLEU: sentence i is scored against every other generated sentence.
# Slicing builds each leave-one-out reference list directly, avoiding a deep
# copy of the whole corpus per sentence; the token lists themselves are shared.
araml_list_of_references_self = [
    araml_split[:idx] + araml_split[idx + 1:] for idx in range(len(araml_split))
]
araml_list_of_hypotheses_self = list(araml_split) # list of hypotheses that corresponds to list of references.

In [34]:
# Self-BLEU-2 .. Self-BLEU-5 for ARAML, with uniform weights (1/n per order).
araml_self_bleu_2, araml_self_bleu_3, araml_self_bleu_4, araml_self_bleu_5 = (
    nltk.translate.bleu_score.corpus_bleu(
        araml_list_of_references_self,
        araml_list_of_hypotheses_self,
        weights=(1 / order,) * order,
    )
    for order in (2, 3, 4, 5)
)

In [35]:
araml_self_bleu_2, araml_self_bleu_3, araml_self_bleu_4, araml_self_bleu_5

(0.9223604274244183,
 0.8655422403147902,
 0.8265847764862504,
 0.7990205025548809)

In [18]:
#iVAE
# Self-BLEU: sentence i is scored against every other generated sentence.
# Slicing builds each leave-one-out reference list directly, avoiding a deep
# copy of the whole corpus per sentence; the token lists themselves are shared.
ivae_list_of_references_self = [
    ivae_split[:idx] + ivae_split[idx + 1:] for idx in range(len(ivae_split))
]
ivae_list_of_hypotheses_self = list(ivae_split) # list of hypotheses that corresponds to list of references.

In [19]:
# Self-BLEU-2 .. Self-BLEU-5 for iVAE, with uniform weights (1/n per order).
ivae_self_bleu_2, ivae_self_bleu_3, ivae_self_bleu_4, ivae_self_bleu_5 = (
    nltk.translate.bleu_score.corpus_bleu(
        ivae_list_of_references_self,
        ivae_list_of_hypotheses_self,
        weights=(1 / order,) * order,
    )
    for order in (2, 3, 4, 5)
)

In [20]:
ivae_self_bleu_2, ivae_self_bleu_3, ivae_self_bleu_4, ivae_self_bleu_5

(0.9727184346428311, 0.94365381401003, 0.9034544427630443, 0.8546247499850489)

In [39]:
#RelGAN
# Self-BLEU: sentence i is scored against every other generated sentence.
# Slicing builds each leave-one-out reference list directly, avoiding a deep
# copy of the whole corpus per sentence; the token lists themselves are shared.
relgan_list_of_references_self = [
    relgan_split[:idx] + relgan_split[idx + 1:] for idx in range(len(relgan_split))
]
relgan_list_of_hypotheses_self = list(relgan_split) # list of hypotheses that corresponds to list of references.

In [40]:
# Self-BLEU-2 .. Self-BLEU-5 for RelGAN, with uniform weights (1/n per order).
relgan_self_bleu_2, relgan_self_bleu_3, relgan_self_bleu_4, relgan_self_bleu_5 = (
    nltk.translate.bleu_score.corpus_bleu(
        relgan_list_of_references_self,
        relgan_list_of_hypotheses_self,
        weights=(1 / order,) * order,
    )
    for order in (2, 3, 4, 5)
)

In [41]:
relgan_self_bleu_2, relgan_self_bleu_3, relgan_self_bleu_4, relgan_self_bleu_5

(0.9330169448267662,
 0.8876048812318107,
 0.8492562523503902,
 0.8107911151607344)

### MS-Jaccard

In [16]:
# Build the multiset-distance helper on a private copy of the tokenised test
# sentences; the same `msd` object is reused for every generator below.
# NOTE(review): the results below are keyed 2..5, so the upper n-gram order
# presumably comes from a MultisetDistances default - confirm.
references = copy.deepcopy(test_split)
msd = MultisetDistances(references=references, min_n=2)

In [47]:
#ARAML
# Jaccard score of the ARAML samples against the test references, returned as
# a dict keyed by n-gram order. A copy is passed in so `araml_split` stays untouched.
sentences = copy.deepcopy(araml_split)
msj_distance_araml = msd.get_jaccard_score(sentences=sentences)
msj_distance_araml

Jaccard distances preprocess upto 5!


{2: 0.3383283632549752,
 3: 0.18929606786510203,
 4: 0.11984781790783489,
 5: 0.08511991346970271}

In [17]:
#iVAE
# Jaccard score of the iVAE samples against the test references, returned as
# a dict keyed by n-gram order. A copy is passed in so `ivae_split` stays untouched.
sentences = copy.deepcopy(ivae_split)
msj_distance_ivae = msd.get_jaccard_score(sentences=sentences)
msj_distance_ivae

Jaccard distances preprocess upto 5!


{2: 0.27812985338599944,
 3: 0.177075995775008,
 4: 0.12291388330762909,
 5: 0.09182173463276583}

In [49]:
#RelGAN
# Jaccard score of the RelGAN samples against the test references, returned as
# a dict keyed by n-gram order. A copy is passed in so `relgan_split` stays untouched.
sentences = copy.deepcopy(relgan_split)
msj_distance_relgan = msd.get_jaccard_score(sentences=sentences)
msj_distance_relgan

Jaccard distances preprocess upto 5!


{2: 0.18099028527353633,
 3: 0.1014203752872681,
 4: 0.05895023285808583,
 5: 0.03738392501448418}