In [None]:
import pandas as pd
import os
import json
import re
from datasets import Dataset
from torch.utils.data import DataLoader

from transformers import T5ForConditionalGeneration, T5Tokenizer, MT5ForConditionalGeneration, MT5Tokenizer
from transformers import Seq2SeqTrainingArguments
from transformers import DataCollatorForSeq2Seq
from transformers import Seq2SeqTrainer
# Set two Weights & Biases environment variables to the string "true".
# NOTE(review): presumably these silence / disable wandb logging for the
# transformers training utilities imported above — confirm against the
# installed library versions.
os.environ['WANDB_SILENT']="true"
os.environ["WANDB_DISABLED"] = "true"



In [None]:
def collator(batch):
    """Tokenize one batch of rows for the seq2seq model.

    The ``inputs`` column is encoded as the source text and the ``ans_sent``
    column as the target text, in a single call to the notebook-level
    ``tokenizer``. The call uses ``max_length=512`` with ``padding='max_length'``
    and truncation enabled, and asks for PyTorch tensors.

    Args:
        batch: mapping that provides the ``inputs`` and ``ans_sent`` columns.

    Returns:
        The object returned by ``tokenizer`` for this batch.
    """
    source_texts = batch['inputs']
    target_texts = batch['ans_sent']
    return tokenizer(
        source_texts,
        text_target=target_texts,
        return_tensors="pt",
        max_length=512,
        padding='max_length',
        truncation=True,
    )

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
# Identifier handed to AutoTokenizer.from_pretrained; only the tokenizer is
# loaded from it.
model_id="google/flan-t5-small"
# Local directory the seq2seq model weights are loaded from.
saved_model = "./Outputs/Trial-COT"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(saved_model)
# Move the model to the GPU (requires CUDA). As the last expression of the
# cell, the returned module is also echoed as the cell output.
model.cuda()

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=384, bias=False)
              (k): Linear(in_features=512, out_features=384, bias=False)
              (v): Linear(in_features=512, out_features=384, bias=False)
              (o): Linear(in_features=384, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 6)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=512, out_features=1024, bias=False)
              (wi_1): Linear(in_features=512, out_features=1024, bias=False)
              (wo): 

In [None]:
# Read the test split from CSV and wrap the frame in a `datasets.Dataset`,
# which is the object the mapping and DataLoader cells operate on.
test_df = pd.read_csv('test-cot.csv')
test_data = Dataset.from_pandas(test_df)

In [None]:
# Tokenize the whole test set with `collator`, in batches of 8 across 4 worker
# processes, dropping the original columns from the mapped result.
# NOTE(review): `test_tokenized` is not referenced by any later cell visible in
# this notebook; the inference loop tokenizes `test_data['inputs']` itself.
# Confirm whether this cell is still needed.
test_tokenized = test_data.map(collator, remove_columns=test_data.column_names, batch_size=8, num_proc=4, batched=True)

Map (num_proc=4):   0%|          | 0/1274 [00:00<?, ? examples/s]

In [None]:
from tqdm import tqdm

# Batch the raw (untokenized) rows; each batch exposes the `inputs` column as
# the list of strings that is tokenized inside the loop.
dataloader = DataLoader(test_data, batch_size=8)

# Generate batch by batch and collect the decoded text of every example.
predictions = []
for data in tqdm(dataloader):
    # Pad only up to the longest example of the batch (still truncated at 512
    # tokens) rather than always padding to 512, and put the tensors on
    # whatever device the model lives on instead of assuming CUDA.
    inputs = tokenizer(
        data['inputs'],
        return_tensors="pt",
        max_length=512,
        padding='longest',
        truncation=True,
    ).to(model.device)
    # Hand the attention mask to `generate` explicitly so padded positions are
    # masked by construction rather than inferred from the pad token id.
    output_ids = model.generate(
        input_ids=inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_length=512,
    )
    predictions.extend(tokenizer.batch_decode(output_ids, skip_special_tokens=True))


100%|█████████████████████████████████████████| 160/160 [04:23<00:00,  1.65s/it]


In [None]:
# The predicted answer is the last space-separated token of each generated
# sentence. Surrounding whitespace is stripped first so that a generation
# ending in a space or newline does not yield an empty answer.
predicted_ans = [p.strip().rsplit(" ", 1)[-1] for p in predictions]

In [None]:
# Exact-match flag per example: 1 when the extracted token equals the value in
# the `ans` column, otherwise 0.
count = [int(pred == gold) for pred, gold in zip(predicted_ans, test_data['ans'])]

In [None]:
# Exact-match accuracy: the share of examples whose flag in `count` is 1.
sum(count)/len(count)

0.8131868131868132

In [None]:
# Attach both the extracted answers (`predictions`) and the full generated
# sentences (`predictions_sent`): the analysis cells further down select the
# `predictions` column, so it has to exist on a fresh Restart & Run All.
# An already-present column is dropped first, so re-running this cell replaces
# the column instead of trying to add it a second time.
for column_name, column_values in (
    ("predictions", predicted_ans),
    ("predictions_sent", predictions),
):
    if column_name in test_data.column_names:
        test_data = test_data.remove_columns(column_name)
    test_data = test_data.add_column(column_name, column_values)

In [None]:
# Convert the dataset (now carrying the prediction columns) to pandas with the
# library's own converter instead of feeding the Dataset row by row to the
# generic DataFrame constructor.
df = test_data.to_pandas()

In [None]:
# Plain-text preview of the reference sentence next to the generated sentence
# for the first rows.
print(df.loc[:, ['ans_sent', 'predictions_sent']].head())

                                            ans_sent  \
0  The answer can be found in the sentence: 'Idah...   
1  The answer can be found in the sentence: 'For ...   
2  The actual answer can be found in the sentence...   
3  The answer can be found in the sentence: 'We'v...   
4  The actual answer can be found in the sentence...   

                                    predictions_sent  
0  The answer can be found in the sentence: 'Ital...  
1  The answer can be found in the sentence: 'For ...  
2  The actual answer can be found in the sentence...  
3  The answer can be found in the sentence: 'We'v...  
4  The actual answer can be found in the sentence...  


In [None]:
# Reference sentence of the third row (position 2).
df['ans_sent'].iloc[2]

"The actual answer can be found in the sentence: 'An ex-money processing manager for Brink's Company in Alabama who had access to bags and bags of quarters swapped out coins for beads and made off with nearly $200,000 in 2014, per the FBI.'. However, the answer is paraphrased  which is common for large numbers. The number: '200,000' is paraphrased to '200K' after dividing 200000 by 1000. So the answer is 200"

In [None]:
# Generated sentence of the third row (position 2), for comparison with its reference.
df['predictions_sent'].iloc[2]

"The actual answer can be found in the sentence: 'An ex-money processing manager for Brink's Company in Alabama who had access to bags and bags of quarters swapped out coins for beads and made off with nearly $200,000 in 2014, per the FBI.'. However, the answer is paraphrased which is common for large numbers. The number: '200,000' is paraphrased to '200K' after dividing 200000 by 1000. So the answer is 200"

In [None]:
# Work on an explicit copy of the columns needed for error analysis, so adding
# a column below writes to an independent frame and does not raise
# SettingWithCopyWarning.
analysis = df[['calculation','ans','predictions','ans_sent','predictions_sent']].copy()
# The operation label is the text before the first "(" of `calculation`,
# e.g. "Subtract(109,9)" -> "Subtract".
analysis['operation'] = analysis['calculation'].str.split("(").str[0].str.strip()
# Rows whose extracted prediction differs from the reference answer. This is
# also a copy, so later column assignments on `missed` are warning-free.
missed = analysis[analysis['ans'] != analysis['predictions']].copy()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  analysis['operation'] =  analysis['calculation'].apply(lambda x: x.split("(")[0].strip())


In [None]:
 # Recompute the operation label (text before the first "(" of `calculation`)
 # through `assign`, which builds a new frame instead of writing into a slice
 # and therefore does not raise SettingWithCopyWarning.
 missed = missed.assign(operation=missed['calculation'].apply(lambda x: x.split("(")[0].strip()))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missed['operation'] =  missed['calculation'].apply(lambda x: x.split("(")[0].strip())


In [None]:
# Per-operation miss rate = missed rows / all rows of that operation.
# The missed counts are reindexed onto every operation with a fill of 0, so an
# operation without any miss reports 0.0 instead of NaN from index misalignment.
total_by_operation = analysis['operation'].value_counts()
missed_by_operation = missed['operation'].value_counts().reindex(total_by_operation.index, fill_value=0)
(missed_by_operation / total_by_operation).sort_index()

Add           0.750000
Copy          0.131336
Divide        1.000000
Multiply      1.000000
Paraphrase    0.156250
Round         0.820513
SRound        1.000000
Span          1.000000
Subtract      0.958333
Trans         0.129032
Name: operation, dtype: float64

In [None]:
# Number of test rows per operation label; this is the denominator of the
# per-operation miss rate.
analysis['operation'].value_counts()

Copy          868
Trans         217
Paraphrase     96
Round          39
Subtract       24
Add            16
Multiply        6
Span            4
Divide          3
SRound          1
Name: operation, dtype: int64

In [None]:
# df.to_csv('./Outputs/Trial-v2/predictions-v2.csv',index=False)
# Show every missed example whose operation label is "Subtract".
missed.loc[missed['operation'] == 'Subtract']

Unnamed: 0,calculation,ans,predictions,ans_sent,predictions_sent,operation
174,"Subtract(109,9)",100,109,So the answer is 100,The answer can be found in the sentence: 'Mark...,Subtract
183,"Subtract(2014,2003)",11,5,The news published in the year 2014 and the ev...,"The answer can be found in the sentence: 'Now,...",Subtract
256,"Subtract(93,3)",90,93,So the answer is 90,The answer can be found in the sentence: 'Rese...,Subtract
273,"Subtract(Trans(Six),Span(him))",5,6,So the answer is 5,The answer can be found in the sentence: 'Six ...,Subtract
293,"Subtract(2016,2000)",16,2016,So the answer is 16,The answer can be found in the sentence: 'His ...,Subtract
294,"Subtract(2015,1985)",30,105,The news published in the year 2015 and the ev...,The news published in the year 2015 and the ev...,Subtract
295,"Subtract(2019,1969)",50,66,The news published in the year 2019 and the ev...,The news published in the year 2019 and the ev...,Subtract
338,"Subtract(2008,50)",1958,23,So the answer is 1958,The answer can be found in the sentence: 'Fans...,Subtract
435,"Subtract(23,Add(Span(Mother),Span(Baby)))",21,23,So the answer is 21,The answer can be found in the sentence: 'A su...,Subtract
494,"Subtract(4,2)",2,13,So the answer is 2,The answer can be found in the sentence: 'Befo...,Subtract


In [None]:
# Missed "Subtract" examples whose generated sentence contains the text
# 'The news published'.
subtract = missed[
    missed['operation'].eq('Subtract')
    & missed['predictions_sent'].str.contains('The news published')
]

In [None]:
# Reference sentence of the third selected row (position 2).
subtract['ans_sent'].iloc[2]

'The news published in the year 2010 and the event mentioned in the news happened in the year 1975, so the year mentioned in the headline comes from a subtraction of 2010-1975 = 35. so the answer is 35'

In [None]:
# Generated sentence of the same row (position 2), to compare with its reference.
subtract['predictions_sent'].iloc[2]

'The news published in the year 2010 and the event mentioned in the news happened in the year 1975, so the year mentioned in the headline comes from a subtraction of 2010-1975 = 76. so the answer is 75'

In [None]:
Add           0.625000
Copy          0.104839
Divide        1.000000
Multiply      1.000000
Paraphrase    0.214286
Round         0.794872
SRound        1.000000
Span          0.750000
Subtract      0.821429
Trans         0.103139