# Experiment 2 Synthetic Text Evaluation

### Imports

In [1]:
import pandas as pd
import nltk
import copy
from multiset_distances import MultisetDistances
from random import sample

### Load synthetic data

In [2]:
# Read the two synthetic-text CSVs (iVAE first, then AugLy) into DataFrames.
ivae, augly = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../../data/synthetic_data_ivae.csv',
        '../../data/synthetic_data_augly.csv',
    )
)

In [3]:
# Peek at the first five iVAE rows.
ivae.head(5)

Unnamed: 0,identifier,synthetic_text
0,21,# ikdoenietmeermee is een gevaarlijke uitspraa...
1,21,Het is ook heel erg veel IC ! !
2,21,Het spiekbriefje van Famke Louise kwam heel ev...
3,21,Ik doe niet meer mee . En dan ook de # avondkl...
4,21,Ik heb een goede reden voor # ikdoenietmeermee...


In [4]:
# Print (identifier, value) for every iVAE row whose text is not a plain str (e.g. NaN).
for ident, text in zip(ivae['identifier'], ivae['synthetic_text']):
    if type(text) is str:
        continue
    print(ident, text)

2461 nan
5927 nan
9974 nan
13001 nan
17563 nan
21096 nan
35302 nan
45203 nan


In [5]:
# Discard every row that has a missing value in any column, then show how many rows remain.
ivae = ivae.dropna(axis=0, how='any')
ivae.shape[0]

20992

In [6]:
# Peek at the first five AugLy rows.
augly.head(5)

Unnamed: 0,identifier,synthetic_text
0,21,Als de zejstig plisseer gevaccineerd zijn meer...
1,21,Als de zestig plisseer gevaccineerd zijn meer ...
2,21,Als de zestig lpisseer gevaccineerd zjin meer ...
3,21,Als ed zestig plisseer gevaccineerd zijn meer ...
4,21,Tals d9e zestig pliseer gevaccineerd ijn meer ...


In [7]:
# Print (identifier, value) for every AugLy row whose text is not a plain str (e.g. NaN).
for ident, text in zip(augly['identifier'], augly['synthetic_text']):
    if type(text) is str:
        continue
    print(ident, text)

In [8]:
# Discard every row that has a missing value in any column, then show how many rows remain.
augly = augly.dropna(axis=0, how='any')
augly.shape[0]

21000

### General result inspection

Check for duplicates within the generated texts (total number of texts vs. number of distinct texts).

In [9]:
# (total number of iVAE texts, number of distinct iVAE texts)
ivae_texts = list(ivae['synthetic_text'])
len(ivae_texts), len(set(ivae_texts))

(20992, 15903)

In [10]:
# (total number of AugLy texts, number of distinct AugLy texts)
augly_texts = list(augly['synthetic_text'])
len(augly_texts), len(set(augly_texts))

(21000, 20843)

### Check Synthetic Text Examples

In [29]:
# Print the real COVID-19 post(s) with identifier 136, for comparison with the synthetic examples below.
# `test_data` is otherwise only created in the BLEU section further down, so on a fresh kernel
# (Restart & Run All runs cells top to bottom) this cell used to fail with a NameError.
# Loading the reference data here makes the cell self-sufficient; the later cell simply rebuilds
# the same frames.
all_data = pd.read_csv('active_learning_data_to_test_imbalanced_with_identifiers.csv')
test_data = all_data[all_data['set'] == 'covid']
for post in test_data.loc[test_data['identifier'] == 136, 'clean_post']:
    print(post)

Beste mensen van # ikdoenietmeermee : ik doe niet meer mee voor jullie als je straks in het ziekenhuis terecht komt ! ! 


In [37]:
# Show one randomly drawn iVAE text generated for identifier 136.
ivae_candidates = list(ivae[ivae['identifier'] == 136]['synthetic_text'])
for text in sample(ivae_candidates, 1):
    print(text, '\n')

Wat een bullshit . . . # ikdoenietmeermee en ik heb een goede reden voor # ikdoenietmeermee 



In [13]:
# Show one randomly drawn AugLy text generated for identifier 136.
augly_candidates = list(augly[augly['identifier'] == 136]['synthetic_text'])
for text in sample(augly_candidates, 1):
    print(text, '\n')

Beste jensen van#ikdoenietmeermee: ik dke niet memer mee voor jullie als je straks in het ziekenhjis terecht komt!! 



### BLEU

In [11]:
# The real COVID-19 posts act as the reference corpus for the metrics below.
all_data = pd.read_csv('active_learning_data_to_test_imbalanced_with_identifiers.csv')
# Keep only the rows whose `set` column equals 'covid'.
test_data = all_data.loc[all_data['set'] == 'covid']

In [12]:
def _tokenize(texts):
    """Split every text in `texts` on whitespace and return the list of token lists."""
    return [text.split() for text in texts]


# Token lists for both synthetic corpora and for the real reference posts.
ivae_split = _tokenize(ivae['synthetic_text'])
augly_split = _tokenize(augly['synthetic_text'])
test_split = _tokenize(test_data['clean_post'])

In [38]:
#iVAE
# corpus_bleu expects one list of references per hypothesis. Every synthetic sentence is scored
# against the same real corpus, so each slot points at the single `test_split` object.
# The previous version deep-copied the whole reference corpus once per hypothesis, which
# multiplies memory by len(ivae_split) without changing the score (corpus_bleu only reads
# its inputs).
ivae_list_of_references = [test_split] * len(ivae_split) # list of references for all sentences in corpus.
ivae_list_of_hypotheses = copy.deepcopy(ivae_split) # list of hypotheses that corresponds to list of references.

In [None]:
# BLEU-2 .. BLEU-5 with uniform n-gram weights; (1/4,) * 4 matches corpus_bleu's default weights.
ivae_bleu_2, ivae_bleu_3, ivae_bleu_4, ivae_bleu_5 = (
    nltk.translate.bleu_score.corpus_bleu(
        ivae_list_of_references, ivae_list_of_hypotheses, weights=(1/order,) * order
    )
    for order in range(2, 6)
)

In [None]:
# Show the iVAE BLEU-2 .. BLEU-5 scores as one tuple.
(
    ivae_bleu_2,
    ivae_bleu_3,
    ivae_bleu_4,
    ivae_bleu_5,
)

In [14]:
#AugLy
# corpus_bleu expects one list of references per hypothesis. Every synthetic sentence is scored
# against the same real corpus, so each slot points at the single `test_split` object.
# The previous version deep-copied the whole reference corpus once per hypothesis, which
# multiplies memory by len(augly_split) without changing the score (corpus_bleu only reads
# its inputs).
augly_list_of_references = [test_split] * len(augly_split) # list of references for all sentences in corpus.
augly_list_of_hypotheses = copy.deepcopy(augly_split) # list of hypotheses that corresponds to list of references.

In [15]:
# BLEU-2 .. BLEU-5 with uniform n-gram weights; (1/4,) * 4 matches corpus_bleu's default weights.
augly_bleu_2, augly_bleu_3, augly_bleu_4, augly_bleu_5 = (
    nltk.translate.bleu_score.corpus_bleu(
        augly_list_of_references, augly_list_of_hypotheses, weights=(1/order,) * order
    )
    for order in range(2, 6)
)

In [16]:
# Show the AugLy BLEU-2 .. BLEU-5 scores as one tuple.
(
    augly_bleu_2,
    augly_bleu_3,
    augly_bleu_4,
    augly_bleu_5,
)

(0.5466311121636129,
 0.4466941357204492,
 0.36285362909205826,
 0.2931781608751932)

### Self-BLEU

In [None]:
#iVAE
# Self-BLEU: each synthetic sentence is a hypothesis and all *other* synthetic sentences
# (by position, so exact duplicates elsewhere in the corpus still count) are its references.
# The leave-one-out lists are built by slicing, which copies only pointers to the token lists.
# The previous version deep-copied the entire corpus on every iteration, duplicating every
# token list len(ivae_split) times for no change in the result (corpus_bleu only reads its inputs).
# NOTE: the outer structure still holds about len(ivae_split)**2 list slots, so this cell remains
# memory-hungry for large corpora.
ivae_list_of_references_self = [
    ivae_split[:idx] + ivae_split[idx + 1:]
    for idx in range(len(ivae_split))
]
ivae_list_of_hypotheses_self = copy.deepcopy(ivae_split) # list of hypotheses that corresponds to list of references.

In [None]:
# Self-BLEU-2 .. Self-BLEU-5 with uniform n-gram weights; (1/4,) * 4 matches corpus_bleu's default weights.
ivae_self_bleu_2, ivae_self_bleu_3, ivae_self_bleu_4, ivae_self_bleu_5 = (
    nltk.translate.bleu_score.corpus_bleu(
        ivae_list_of_references_self, ivae_list_of_hypotheses_self, weights=(1/order,) * order
    )
    for order in range(2, 6)
)

In [None]:
# Show the iVAE Self-BLEU-2 .. Self-BLEU-5 scores as one tuple.
(
    ivae_self_bleu_2,
    ivae_self_bleu_3,
    ivae_self_bleu_4,
    ivae_self_bleu_5,
)

In [None]:
#AugLy
# Self-BLEU: each synthetic sentence is a hypothesis and all *other* synthetic sentences
# (by position, so exact duplicates elsewhere in the corpus still count) are its references.
# The leave-one-out lists are built by slicing, which copies only pointers to the token lists.
# The previous version deep-copied the entire corpus on every iteration, duplicating every
# token list len(augly_split) times for no change in the result (corpus_bleu only reads its inputs).
# NOTE: the outer structure still holds about len(augly_split)**2 list slots, so this cell remains
# memory-hungry for large corpora.
augly_list_of_references_self = [
    augly_split[:idx] + augly_split[idx + 1:]
    for idx in range(len(augly_split))
]
augly_list_of_hypotheses_self = copy.deepcopy(augly_split) # list of hypotheses that corresponds to list of references.

In [None]:
# Self-BLEU-2 .. Self-BLEU-5 with uniform n-gram weights; (1/4,) * 4 matches corpus_bleu's default weights.
augly_self_bleu_2, augly_self_bleu_3, augly_self_bleu_4, augly_self_bleu_5 = (
    nltk.translate.bleu_score.corpus_bleu(
        augly_list_of_references_self, augly_list_of_hypotheses_self, weights=(1/order,) * order
    )
    for order in range(2, 6)
)

In [None]:
# Show the AugLy Self-BLEU-2 .. Self-BLEU-5 scores as one tuple.
(
    augly_self_bleu_2,
    augly_self_bleu_3,
    augly_self_bleu_4,
    augly_self_bleu_5,
)

### MS-Jaccard

In [17]:
# Give MultisetDistances its own copy of the tokenised real posts as the reference corpus.
references = [list(tokens) for tokens in test_split]
msd = MultisetDistances(min_n=2, references=references)

In [34]:
#iVAE
# Score a private copy of the tokenised iVAE texts against the references held by `msd`.
sentences = [list(tokens) for tokens in ivae_split]
msj_distance_ivae = msd.get_jaccard_score(sentences=sentences)
msj_distance_ivae

Jaccard distances preprocess upto 5!


{2: 0.260817789138788,
 3: 0.1613468445806015,
 4: 0.10636419051501686,
 5: 0.07525170460253774}

In [18]:
#AugLy
# Score a private copy of the tokenised AugLy texts against the references held by `msd`.
sentences = [list(tokens) for tokens in augly_split]
msj_distance_augly = msd.get_jaccard_score(sentences=sentences)
msj_distance_augly

Jaccard distances preprocess upto 5!


{2: 0.31635117309311134,
 3: 0.24598635032496105,
 4: 0.19206631670967542,
 5: 0.15032826514524514}