## Imports

In [35]:
import pandas as pd
import numpy as np
from tqdm import tqdm

from transformers import DistilBertForSequenceClassification, DistilBertTokenizerFast
from utils.search_funcs import get_prediction
from utils.evaluation_metrics import *

## Load Data

In [25]:
# ORIGINAL EDITS
orig_sents = pd.read_csv('C:/Users/jimli/Desktop/imdb_reviews.csv').head(100)['Source_Sentences'].tolist()
orig_sent_set = set(orig_sents)
print(len(orig_sents))

100


In [20]:
# MICE EDITS
mice_df = pd.read_csv("C:/Users/jimli/Documents/thesis/source_code/Counterfactual-Editor/Edits/imdb/mice_imdb_edits.csv", sep='\t')[['orig_input', 'edited_input']]
mice_orig_sents = mice_df['orig_input'].tolist()
mice_sents = mice_df['edited_input'].tolist()

mice_sents_dict = {k: v for k, v in zip(mice_orig_sents, mice_sents) if k in orig_sent_set}

95


In [21]:
# POLYJUICE EDITS
pj_sents = pd.read_csv("C:/Users/jimli/Documents/thesis/source_code/Counterfactual-Editor/Edits/imdb/polyjuice_imdb_edits.csv", sep='\t')['counter_sents'].tolist()
print(len(pj_sents))

100


In [23]:
# GNN EDITS
gnn_sents = pd.read_csv("C:/Users/jimli/Desktop/gnn_imdb_MUG_edits2.csv")['counter_sents'].tolist()
print(len(gnn_sents))

100


## Evaluation

In [27]:
fl_model, fl_tokenizer = model_init('t5-base', cuda=not torch.cuda.is_available())
sents = []
for i in tqdm(range(100)):
    orig_sent = orig_sents[i]
    mice_sent = mice_sents_dict.get(orig_sent, None)
    pj_sent = pj_sents[i]
    gnn_sent = gnn_sents[i]

    if mice_sent is None:
        continue
    try:
        gnn_fluency =  abs(
                        1 - sent_scoring(fl_model, fl_tokenizer, gnn_sent, cuda=False)[0] / sent_scoring(
                            fl_model, fl_tokenizer, orig_sent, cuda=False)[0]
                        )
        gnn_minimality = edit_distance(orig_sent.lower(), gnn_sent.lower())

        pj_fluency =  abs(
                        1 - sent_scoring(fl_model, fl_tokenizer, pj_sent, cuda=False)[0] / sent_scoring(
                            fl_model, fl_tokenizer, orig_sent, cuda=False)[0]
                        )
        pj_minimality = edit_distance(orig_sent.lower(), pj_sent.lower())

        mice_fluency =  abs(
                        1 - sent_scoring(fl_model, fl_tokenizer, mice_sent, cuda=False)[0] / sent_scoring(
                            fl_model, fl_tokenizer, orig_sent, cuda=False)[0]
                        )
        mice_minimality = edit_distance(orig_sent.lower(), mice_sent.lower())

    except:
        continue

    sents.append((orig_sent, mice_sent, pj_sent, gnn_sent, mice_fluency, pj_fluency, gnn_fluency, mice_minimality, pj_minimality, gnn_minimality))


 32%|███▏      | 32/100 [02:18<05:33,  4.90s/it]Token indices sequence length is longer than the specified maximum sequence length for this model (527 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 100/100 [06:52<00:00,  4.12s/it]


In [31]:
sents_df = pd.DataFrame(sents, columns=['orig_sent', 'mice_sent', 'pj_sent', 'gnn_sent', 'mice_fluency', 'pj_fluency', 'gnn_fluency', 'mice_minimality', 'pj_minimality', 'gnn_minimality'])
print(sents_df.head())
print(sents_df.shape[0])

                                           orig_sent  \
0  When I see a movie, I usually seek entertainme...   
1  For late-80s cheese, this really isn't so bad....   
2  I watch many movies, but presently my genre nu...   
3  Call it manipulative drivel if you will, but I...   
4  I thought this was a really well written film....   

                                           mice_sent  \
0  When I see a movie, I usually seek entertainme...   
1  For late-80s  humor, this really isn't so bad....   
2  I watch many movies, but presently my genre nu...   
3   Bad humor if you will, but I  was sorely disa...   
4  I  thought this was a  terrible film. I've hea...   

                                             pj_sent  \
0  When I see a movie, I usually seek entertainme...   
1  I got to know when I was about to get to Junio...   
2                                     title( "Medo")   
3  Call it manipulative drivel if you will, but I...   
4  I thought this was a really well written fi

In [32]:
filtered_sents_df = sents_df[
    (sents_df['mice_minimality'] > sents_df['gnn_minimality']) & (sents_df['pj_minimality'] > sents_df['gnn_minimality']) & (sents_df['gnn_fluency'] < sents_df['pj_fluency']) & (sents_df['gnn_fluency'] < sents_df['mice_fluency'])
]
filtered_sents_df.shape[0]

31

In [33]:
filtered_sents_df.to_csv("C:/Users/jimli/Desktop/filtered_imdb_sents.csv", index=False)

In [36]:
model_path = "C:/Users/jimli/Desktop/imdb_bert_predictor2"
predictor = DistilBertForSequenceClassification.from_pretrained(model_path)
tokenizer= DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')



In [43]:
filtered_sents_df['mice_label_flip'] = filtered_sents_df['mice_label_flip'].apply(lambda x: 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_sents_df['mice_label_flip'] = filtered_sents_df['mice_label_flip'].apply(lambda x: 1)


In [40]:
filtered_sents_df['pj_label_flip'] = filtered_sents_df.apply(lambda x: int(get_prediction(predictor, tokenizer, x['orig_sent']) != get_prediction(predictor, tokenizer, x['pj_sent'])), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_sents_df['pj_label_flip'] = filtered_sents_df.apply(lambda x: int(get_prediction(predictor, tokenizer, x['orig_sent']) != get_prediction(predictor, tokenizer, x['pj_sent'])), axis=1)


In [41]:
filtered_sents_df['gnn_label_flip'] = filtered_sents_df.apply(lambda x: int(get_prediction(predictor, tokenizer, x['orig_sent']) != get_prediction(predictor, tokenizer, x['gnn_sent'])), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_sents_df['gnn_label_flip'] = filtered_sents_df.apply(lambda x: int(get_prediction(predictor, tokenizer, x['orig_sent']) != get_prediction(predictor, tokenizer, x['gnn_sent'])), axis=1)


In [44]:
filtered_sents_df.head()

Unnamed: 0,orig_sent,mice_sent,pj_sent,gnn_sent,mice_fluency,pj_fluency,gnn_fluency,mice_minimality,pj_minimality,gnn_minimality,mice_label_flip,pj_label_flip,gnn_label_flip
8,'Radio' is a beautiful movie based on a real s...,'Radio' is a pretty bad movie that I only wat...,"told a story, how great person always.",'radio' is a beautiful movie based on a real s...,0.058631,0.849228,0.032284,0.31,0.99,0.006667,1,0,1
9,This is one of the best films I have ever seen...,This is one of the best films I have ever seen...,This is one of the worst films I have ever see...,this is crap of the best films i have ever see...,0.107492,0.851975,0.035598,0.188525,0.852459,0.008197,1,1,1
12,I've been strangely attracted to this film sin...,I've been looking forward to this film since ...,[ANSWER],i've been strangely attracted to this film sin...,0.030744,0.984899,0.009921,0.204545,1.0,0.007576,1,1,1
16,Here's another of the 1940's westerns that I w...,I thought this movie was crap. I won't let an...,This is one of the straight up films you will ...,here's another of the 1940's westerns that i w...,0.202065,0.908451,0.101182,0.467213,0.97541,0.02459,1,0,1
17,Featuring some amazing and wonderful character...,"With some great acting and music, a new ca...",Another idea would have been the foundation fo...,featuring some amazing and wonderful character...,0.138934,0.896311,0.013809,0.165138,0.798165,0.009174,1,0,1


In [47]:
final_sents_df = filtered_sents_df[filtered_sents_df['gnn_label_flip'] == 1]
final_sents_df.shape[0]

26

In [50]:
final_sents_df['sentence_length'] = final_sents_df['orig_sent'].apply(lambda x: len(x.split()))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_sents_df['sentence_length'] = final_sents_df['orig_sent'].apply(lambda x: len(x.split()))


In [51]:
final_sents_df.to_csv("C:/Users/jimli/Desktop/final_imdb_sents.csv", index=False)

In [58]:
metrics_df = final_sents_df.sort_values('sentence_length', ascending=True, axis=0, ignore_index=True).head(4)
metrics_df.to_csv("C:/Users/jimli/Desktop/metrics_imdb_sents.csv", index=False)

In [71]:
best_metric_texts = metrics_df.iloc[3].to_dict()
print('Original:', best_metric_texts['orig_sent'])
print("\n\n")
print('MiCE:', best_metric_texts['mice_sent'])
print("\n\n")
print('Polyjuice:', best_metric_texts['pj_sent'])
print("\n\n")
print('Ours:', best_metric_texts['gnn_sent'])


Original: This movie will likely be too sentimental for many viewers, especially contemporary audiences. Nevertheless I enjoyed this film thanks mostly to the down-to-earth charm of William Holden, one of my favorite stars, and the dazzling beauty of Jennifer Jones. There are some truly heartwarming scenes between the pair and the talent of these two actors rescues what in lesser hands could've been trite lines. The cinematography of Hong Kong from the period of filming is another highlight of this movie. All in all, a better than average romantic drama, 7/10.



MiCE: This movie will likely be too  harsh for many  conservative,  conservative audiences.  Personally I enjoyed this film thanks mostly to the  brilliant acting of William  Powell, both of  whom have the dazzling beauty of Jennifer Jones. There are some truly heart warming scenes between the pair and the talent of these two actors  enhances what in  less than average hands could've been trite lines. The  beautiful performanc

In [61]:
nlp = spacy.load('en_core_web_sm')

In [69]:
doc = 'romantic shameful'
doc = nlp(doc)
for token in doc:
    print(token.text, token.pos_)

romantic ADJ
shameful ADJ
