In [1]:
from google.colab import drive
import pandas as pd
import os
import json
import re

In [2]:
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
cd /content/drive/MyDrive/Shared-Tasks/Num-Eval

/content/drive/MyDrive/Shared-Tasks/Num-Eval


In [4]:
ls Dataset

Dev_Headline_Generation.json  Train_Headline_Generation.json
Dev_Numerical_Reasoning.json  Train_Numerical_Reasoning.json


In [5]:
def file_loader(json_path):
  with open(json_path) as f:
    contents = f.read()
  return json.loads(contents)

In [6]:
numerical_reasoning_dev = "Dev_Numerical_Reasoning.json"
numerical_reasoning_train = "Train_Numerical_Reasoning.json"

numerical_data_dev_path = os.path.join("Dataset", numerical_reasoning_dev)
numerical_data_train_path = os.path.join("Dataset", numerical_reasoning_train)

numerical_data_train = file_loader(numerical_data_train_path)
df_train = pd.DataFrame.from_dict(numerical_data_train)

numerical_data_dev = file_loader(numerical_data_dev_path)
df_dev = pd.DataFrame.from_dict(numerical_data_dev)

In [7]:
df_dev.shape

(2572, 4)

In [8]:
df_train.shape

(21157, 4)

In [9]:
df_dev['context'] = df_dev['news'].apply(lambda x: re.sub(r'\([^)]*\)', '', x, 1).strip())
df_dev = df_dev.rename({'masked headline':'question'})


In [18]:
df_dev.iloc[0]['context']

'Police are still hunting for an  armed and dangerous  man who shot and killed three people and wounded five others at the California quarry where he worked this morning. He also shot and injured another woman in an attempted carjacking later. The suspect is identified as 45-year-old Shareef Allman. The San Jose Mercury News says he moonlighted as producer of a show called Real 2 Real on something called CreaTV in the area. (He interviews Jesse Jackson in one YouTube clip, which is in the gallery. Allman, who spread a message of non-violence, also has written a book called Amazing Grace about female victims of domestic violence, notes the Mercury News. The shooting spree occurred about 4:30am local time when a man left a safety meeting at the Lehigh Southwest Cement Permanente Plant, then returned with a rifle and a handgun, according to police. He fled on foot, and a massive manhunt is under way in the San Jose area. Schools are on lockdown, notes AP.'

In [19]:
df_dev.iloc[0]['masked headline']

'____rd Victim Dead in Quarry Shooting; Manhunt Still On'

In [None]:
!pip install transformers
!pip install datasets

In [15]:
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline

model_name = "deepset/roberta-base-squad2"

# a) Get predictions
nlp = pipeline('question-answering', model=model_name, tokenizer=model_name)

# # b) Load model & tokenizer
# model = AutoModelForQuestionAnswering.from_pretrained(model_name)
# tokenizer = AutoTokenizer.from_pretrained(model_name)


Downloading (…)lve/main/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/496M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

In [None]:
from datasets import Dataset
from torch.utils.data import DataLoader

In [None]:
def collator(batch):

  context = batch['context']
  question = batch['question']
  ans = batch['ans']


  return {'context': context, 'question': question, 'answer': ans}

In [None]:
dataset = dataset.map(collator, remove_columns=dataset.column_names, batch_size=8, num_proc=4, batched=True)

Map (num_proc=4):   0%|          | 0/2572 [00:00<?, ? examples/s]

In [None]:
dev_batched = DataLoader(dataset, batch_size=8)

In [None]:
predictions = []
for _, item in df_dev.iterrows():

  ans = item['ans']
  item = {'question':item['masked headline'],'context':item['context']}
  res = nlp(item)
  predictions.append((res['answer'],ans))



In [None]:
predictions_only = [p for p,t in predictions]

In [None]:
df_dev['predictions'] = predictions_only

In [None]:
df_dev.head()

Unnamed: 0,news,masked headline,calculation,ans,context,question,predictions
0,"(Oct 5, 2011 12:11 PM CDT) Police are still h...",____rd Victim Dead in Quarry Shooting; Manhunt...,Trans(three),b'3',Police are still hunting for an armed and dan...,rd Victim Dead in Quarry Shooting; Manhunt Sti...,three
1,"(Mar 4, 2014 11:30 AM) The New York Times fol...",NYT Corrects 1853 Piece After ____ Years Win,Copy(12),b'12',The New York Times followed the Best Picture w...,NYT Corrects 1853 Piece After Years Win,—just 161 years late
2,"(Nov 4, 2008 3:19 PM) Stocks rallied on Elect...",Stocks Up ____ in Election Rally,"Round(305.45,0)",b'305',Stocks rallied on Election Day as investors ap...,Stocks Up in Election Rally,investors applauded the looming conclusion to ...
3,"(Dec 24, 2014 11:19 AM) Turns out you won't e...",You Can Watch The Interview at ____pm,Copy(1),b'1',Turns out you won't even have to leave your ho...,You Can Watch The Interview at pm,1pm
4,"(Oct 16, 2014 3:02 AM CDT) Tristen Kurilla, t...","Murder Suspect, ____, Will Stay in Adult Jail",Copy(10),b'10',"Tristen Kurilla, the Pennsylvania 10-year-old ...","Murder Suspect, , Will Stay in Adult Jail",Tristen Kurilla


In [None]:
df_dev.to_csv('predictions-trial.csv',index=False)

In [55]:
predictions = pd.read_csv('predictions-trial.csv')
predictions['predictions'] = predictions['predictions'].fillna("0")

In [98]:
def text_to_digit(text):
    # Define a dictionary to map textual representations to digits
    text_to_digit_map = {
        "zero": 0,
        "one": 1,
        "two": 2,
        "three": 3,
        "four": 4,
        "five": 5,
        "six": 6,
        "seven": 7,
        "eight": 8,
        "nine": 9,
        "third": 3,
        "second": 2,
    }

    # Split the input text into words
    words = text.split()

    # Initialize an empty list to store the converted digits
    digits = []

    # Iterate through the words and convert them to digits
    for word in words:
        # Use the dictionary to look up the digit, or keep the word as is
        digit = text_to_digit_map.get(word.lower(), word)
        digits.append(str(digit))  # Convert to string for consistency

    # Join the converted words/digits back into a string
    converted_text = " ".join(digits)

    return converted_text



def clean(sentence):

  sentence = text_to_digit(sentence)
  match = re.search(r'\d+\.\d+|\d+', sentence)

  # Check if a digit was found and extract it
  if match:
      first_digit = float(match.group())
      if first_digit==round(first_digit):
        first_digit = int(first_digit)
      return str(first_digit)
  else:
      return sentence

In [99]:
predictions['ans'] = predictions['ans'].apply(lambda x: x.replace("b'",''))
predictions['ans'] = predictions['ans'].apply(lambda x: x.replace("'",''))
predictions['predictions-clean'] = predictions['predictions'].apply(lambda x: clean(x))

In [100]:
df = predictions[['ans', 'predictions','predictions-clean']]

In [101]:
match = df[df['ans']==df['predictions-clean']]

In [102]:
match.shape[0]/df_dev.shape[0]

0.4630637636080871

In [103]:
df

Unnamed: 0,ans,predictions,predictions-clean
0,3,three,3
1,12,—just 161 years late,161
2,305,investors applauded the looming conclusion to ...,investors applauded the looming conclusion to ...
3,1,1pm,1
4,10,Tristen Kurilla,Tristen Kurilla
...,...,...,...
2567,94,Fred Beckey,Fred Beckey
2568,18,18th,18
2569,15,15,15
2570,3,Wissam Al Mana,Wissam Al Mana


In [None]:
.to_csv('predictions-trial.csv',index=False)