In [None]:
import pandas as pd
import os
import json
import re
from datasets import Dataset
from torch.utils.data import DataLoader

# Set both Weights & Biases switches in one place (presumably to keep wandb
# from logging or prompting in later training code -- confirm against the
# wandb version in use).
os.environ.update({"WANDB_SILENT": "true", "WANDB_DISABLED": "true"})

In [None]:
from datasets import load_from_disk
# Load the dataset stored under ./Dataset/train (path is relative to the
# notebook's working directory).
train = load_from_disk('./Dataset/train')

In [None]:
# Materialise the loaded dataset as a pandas DataFrame; every later step in
# this notebook works on this frame.
# NOTE(review): `train.to_pandas()` may be a faster equivalent -- confirm it
# yields the same columns/dtypes before switching.
df = pd.DataFrame(train)

In [None]:
df.shape

(21157, 6)

In [None]:
def check_calculation(item, target_string):
    """Decide whether ``item`` should be handled by the ``target_string`` operation.

    Args:
        item: Mapping-like row exposing ``'calculation'`` and (for
            ``'Subtract'`` only) ``'masked headline'``.
        target_string: Operation name to look for, e.g. ``'Copy'`` or ``'Round'``.

    Returns:
        bool: ``True`` when the calculation string contains ``target_string``
        and has exactly one opening parenthesis (i.e. it is a single, flat
        operation). For ``'Subtract'`` the headline must additionally contain
        ``'Year'``.

    Note:
        The check is a plain substring test, so ``'Round'`` also matches
        ``'SRound(...)'``.
    """
    calculation = item['calculation']

    # Nested or multi-step calculations (more than one '(') are not handled.
    if target_string not in calculation or calculation.count('(') != 1:
        return False

    # Subtraction is only explained for headlines that mention 'Year'.
    if target_string == 'Subtract':
        return 'Year' in item['masked headline']

    return True

def get_ans_sent(item):
    """Build the rationale sentence for one row.

    The first operation (in the order listed below) accepted by
    ``check_calculation`` decides which builder produces the text; the
    builder's result is returned as-is, including ``None``. Rows matching no
    operation get the bare ``"So the answer is ..."`` sentence.
    """
    # Order is significant: the first accepted operation wins.
    dispatch = (
        ("Copy", get_copy_placeholder),
        ("Trans", get_trans_placeholder),
        ("Span", get_span_placeholder),
        ("Round", get_round_placeholder),
        ("Paraphrase", get_paraphrase_placeholder),
        ("Subtract", get_subtract_placeholder),
        ("SRound", get_round_placeholder),
    )

    builder = next(
        (fn for name, fn in dispatch if check_calculation(item, name)),
        None,
    )
    if builder is None:
        return f"So the answer is {item['ans']}"
    return builder(item)

In [None]:

def filter_unique_string(df, target_string):
    """Return the rows whose calculation is a single, flat ``target_string`` call.

    Args:
        df: DataFrame with a string column named ``'calculation'``.
        target_string: Operation name to look for (plain substring test).

    Returns:
        DataFrame: The rows of ``df`` whose ``'calculation'`` contains
        ``target_string`` and has exactly one opening parenthesis. All columns
        are kept, including when no row (or an empty frame) matches.
    """
    def _is_flat_match(calculation):
        return target_string in calculation and calculation.count('(') == 1

    # astype(bool) keeps this a boolean row mask even for an empty frame,
    # where the mask would otherwise keep the column's dtype and be
    # interpreted as a (empty) list of column labels.
    mask = df['calculation'].apply(_is_flat_match).astype(bool)

    return df[mask]

In [None]:
def date_to_sentence(month, day, year):
    """Render a (month, day, year) triple of strings as a publication sentence.

    Args:
        month: Month token; non-letters are discarded. Three-letter
            abbreviations ("Jan" ... "Dec") are expanded, anything else is
            used unchanged.
        day: Day token; non-digits are discarded before ``int`` conversion.
        year: Year token; non-digits are discarded before ``int`` conversion.

    Returns:
        str: ``"The news was published on <day><suffix> <month> in the year of <year>."``

    Raises:
        ValueError: If ``day`` or ``year`` contains no digits.
    """
    abbreviations = ("Jan", "Feb", "Mar", "Apr", "May", "Jun",
                     "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")
    full_names = ("January", "February", "March", "April", "May", "June",
                  "July", "August", "September", "October", "November", "December")
    expand = dict(zip(abbreviations, full_names))

    # Tokens come straight from the timestamp text, so they may still carry
    # punctuation such as "(" or ",".
    month_key = re.sub(r'[^a-zA-Z]+', '', month)
    day_number = int(re.sub(r'[^0-9]+', '', day))
    year_number = int(re.sub(r'[^0-9]+', '', year))

    month_name = expand[month_key] if month_key in expand else month_key

    # 11th, 12th and 13th are exceptions to the last-digit rule.
    last_digit = day_number % 10
    if 11 <= day_number <= 13 or last_digit not in (1, 2, 3):
        suffix = "th"
    else:
        suffix = ("st", "nd", "rd")[last_digit - 1]

    return f"The news was published on {day_number}{suffix} {month_name} in the year of {year_number}."



In [None]:
def convert_timestamp(news):
    """Turn the first parenthesised group of ``news`` into a publication sentence.

    The first three whitespace-separated tokens of that group are passed to
    ``date_to_sentence`` as month, day and year. This is best-effort: any
    failure (no parenthesised group, too few tokens, unparsable values,
    non-string input) yields an empty string.
    """
    try:
        stamp = re.search(r'\([^)]*\)', news).group()
        fields = stamp.split()
        return date_to_sentence(fields[0], fields[1], fields[2])
    except Exception:
        return ""




In [None]:
def prepare_input(item):
    """Build the model input text for one row.

    The first parenthesised group in ``item['news']`` (the leading timestamp)
    is replaced by ``"<published> News:"`` and the fill-in-the-blank prompt
    with the masked headline is appended.

    Args:
        item: Mapping-like row exposing ``'published'``, ``'news'`` and
            ``'masked headline'``.

    Returns:
        str: The assembled prompt, or ``""`` if the row cannot be processed
        (missing key, non-string news, ...).
    """
    try:
        prefix = f"{item['published']} News:"

        # count=1: only the first parenthetical is swapped out; later
        # parentheticals are part of the article and must be kept.
        # A callable replacement inserts `prefix` literally (no backslash
        # escape processing).
        news_with_time = re.sub(r'\([^)]*\)', lambda _match: prefix, item['news'], count=1)

        return f"{news_with_time} Fill in the blank: {item['masked headline']}"

    except Exception:
        # Best-effort: an unusable row yields an empty input rather than
        # aborting the whole DataFrame.apply.
        return ""



In [None]:
import re
def find_ans(news, ans):
    """Return the first sentence of ``news`` that contains ``ans``.

    "Sept." is rewritten to "Sept" in both arguments before splitting (so the
    abbreviation's period is not taken as a sentence end). Sentences are split
    after '.', '!' or '?' followed by one or more spaces, and the comparison is
    a case-insensitive substring test.

    Returns:
        str | None: The matching sentence (with "Sept." already normalised),
        or ``None`` if no sentence contains ``ans``.
    """
    text = news.replace("Sept.", "Sept")
    needle = ans.replace("Sept.", "Sept").lower()

    candidates = re.split(r'(?<=[.!?]) +', text)

    return next(
        (sentence for sentence in candidates if needle in sentence.lower()),
        None,
    )


In [None]:
def get_copy_placeholder(item):
    """Rationale for answers that appear verbatim in the article.

    Looks up the first sentence of ``item['news']`` containing ``item['ans']``
    and quotes it with parenthesised text removed.

    Returns:
        str | None: The rationale, or ``None`` when no sentence contains the
        answer.
    """
    supporting = find_ans(item['news'], item['ans'])
    if supporting is None:
        return None

    quoted = re.sub(r'\([^)]*\)', '', supporting).strip()
    return f"The answer can be found in the sentence: '{quoted}'. so the answer is {item['ans']}"

In [None]:
def get_trans_placeholder(item):
    """Rationale for answers written out in words in the article.

    The operand of the calculation (text after the first '(' with ')' removed)
    is looked up in the article and the rationale states that it was converted
    to the numeric ``item['ans']``.

    Args:
        item: Mapping-like row (dict or pandas Series) exposing
            ``'calculation'``, ``'news'`` and ``'ans'``.

    Returns:
        str | None: The rationale, or ``None`` when the operand is not found in
        the article or the row cannot be processed (the calculation is printed
        in the latter case).
    """
    calculation = None
    try:
        # Key access (not attribute access) so plain dicts work as well as
        # pandas rows, consistent with the other helpers.
        calculation = item['calculation']

        ans = calculation.split('(')[1].replace(")", "")

        sent = find_ans(item['news'], ans)
        if sent is None:
            return None

        sent = re.sub(r'\([^)]*\)', '', sent).strip()
        return f"The answer can be found in the sentence: '{sent}'. However, the answer is not in numerical form, hence '{ans}' is converted into its equivalent numerical form {item['ans']}. so the answer is {item['ans']}"

    except Exception:
        # Report the offending calculation; the handler itself cannot raise.
        print(calculation)
        return None

In [None]:
def get_paraphrase_placeholder(item):
    """Rationale for large numbers that the headline abbreviates with 'K'.

    The number is taken from the first parenthesised group of the calculation:
    the first comma-grouped integer if one is present, otherwise the text
    before the first comma, otherwise the whole operand. It must parse as an
    integer once commas are removed.

    Args:
        item: Mapping-like row (dict or pandas Series) exposing
            ``'calculation'``, ``'news'`` and ``'ans'``.

    Returns:
        str | None: The rationale, or ``None`` when the number is not found in
        the article or the row cannot be processed (the calculation is printed
        in the latter case).
    """
    calculation = None
    try:
        # Key access (not attribute access) so plain dicts work as well as
        # pandas rows, consistent with the other helpers.
        calculation = item['calculation']
        operand = re.findall(r'\([^)]*\)', calculation)[0]

        # 1-3 digits followed by any number of ",ddd" groups, e.g. "30,000".
        number = re.search(r'\b\d{1,3}(,\d{3})*\b', operand)
        if number is not None:
            ans = number.group()
        elif "," in operand:
            ans = operand.split(",")[0].replace("(", "")
        else:
            ans = operand.replace("(", "").replace(")", "")

        sent = find_ans(item['news'], ans)

        # Raises ValueError (handled below) when the operand is not an integer.
        ans_num = int(ans.replace(',', ""))

        if sent is None:
            return None

        sent = re.sub(r'\([^)]*\)', '', sent).strip()
        return f"The actual answer can be found in the sentence: '{sent}'. However, the answer is paraphrased  which is common for large numbers. The number: '{ans}' is paraphrased to '{item['ans']}K' after dividing {ans_num} by 1000. So the answer is {item['ans']}"

    except Exception:
        # Report the offending calculation; the handler itself cannot raise.
        print(calculation)
        return None

In [None]:
def get_round_placeholder(item):
    """Rationale for answers obtained by rounding a number from the article.

    The original number is the text before the first comma inside the first
    parenthesised group of the calculation, with both parentheses removed so
    that single-operand calculations such as ``Round(42.7)`` are handled too.
    It must parse as a float once commas are removed.

    Args:
        item: Mapping-like row (dict or pandas Series) exposing
            ``'calculation'``, ``'news'`` and ``'ans'``.

    Returns:
        str | None: The rationale, or ``None`` when the number is not found in
        the article or the row cannot be processed (the calculation and answer
        are printed in the latter case).
    """
    calculation = answer = None
    try:
        # Key access (not attribute access) so plain dicts work as well as
        # pandas rows, consistent with the other helpers.
        calculation = item['calculation']
        answer = item['ans']

        operand = re.findall(r'\([^)]*\)', calculation)[0]

        # Strip ')' as well as '(': without a comma the first piece is the
        # whole group and would otherwise keep its closing parenthesis.
        ans = operand.split(",")[0].replace("(", "").replace(")", "")

        sent = find_ans(item['news'], ans)

        # Validation only: raises ValueError (handled below) for non-numbers.
        float(ans.replace(',', ""))

        if sent is None:
            return None

        sent = re.sub(r'\([^)]*\)', '', sent).strip()
        return f"The actual answer can be found in the sentence: '{sent}'. However, the answer is rounded which is a common way of representing floating numbers in headlines or summaries. The number: '{ans}' is rounded to '{answer}' which is the nearest integer or floating number to the original number {ans}. So the answer is {answer}"

    except Exception:
        # Report the offending row; the handler itself cannot raise.
        print(calculation, answer)
        return None

In [None]:
def get_span_placeholder(item):
    """Rationale for 'Span' rows whose answer is ``"1"``.

    Only rows with ``item['ans'] == "1"`` are explained. The span text (the
    first parenthesised group of the calculation, parentheses removed) must
    occur in the article; the wording then depends on which marker the masked
    headline contains, checked in this order: ``"No."``, ``"M"``, ``"st"``,
    otherwise a generic explanation.

    Args:
        item: Mapping-like row exposing ``'calculation'``, ``'news'``,
            ``'masked headline'`` and ``'ans'``.

    Returns:
        str | None: The rationale, or ``None`` when the answer is not ``"1"``,
        the calculation has no parenthesised operand, or the span is not found
        in the article.
    """
    if item['ans'] != "1":
        return None

    groups = re.findall(r'\([^)]*\)', item['calculation'])
    if not groups:
        # No operand to look up: decline instead of raising IndexError.
        return None

    ans = groups[0].replace("(", "").replace(")", "")

    sent = find_ans(item['news'], ans)
    if sent is None:
        return None

    sent = re.sub(r'\([^)]*\)', '', sent).strip()
    headline = item['masked headline']

    # NOTE(review): the "M" and "st" tests are plain substring checks on the
    # whole headline, so they also fire on words such as "Man" or "first" --
    # confirm whether they should be tied to the blank instead.
    if "No." in headline:
        return f"No. 1 typically refers to the topmost or the best-ranked item in a list or a competition. The presence of 'No.' in the headline gives a clue that the answer is 1 which is supported by '{ans}' from the sentence '{sent}'. So the answer is {item['ans']}"

    if "M" in headline:
        return f"M often stand for million. The letter 'M' in the headline indicates that the answer refers an amount and '{ans}' can be transformed to {item['ans']} million. So the answer is {item['ans']}"

    if "st" in headline:
        return f"1st typically refers to the topmost or the best-ranked item in a list or a competition. The presence of 'st' in the headline gives a clue that the answer is  1 which is supported by '{ans}' from the sentence '{sent}'. So the answer is {item['ans']}"

    return f"The clue for the answer can be found in the sentence: '{sent}' where '{ans}' can refer to a person, an object, or an event. So the answer is {item['ans']}"

In [None]:
def get_subtract_placeholder(item):
    """Rationale for answers obtained by subtracting two years.

    The two comma-separated operands of the calculation are read as
    ``year_a`` and ``year_b`` (surrounding whitespace ignored). A rationale is
    produced only when ``year_a`` occurs in ``item['published']`` and
    ``year_b`` occurs in ``item['news']``.

    Args:
        item: Mapping-like row (dict or pandas Series) exposing
            ``'calculation'``, ``'published'``, ``'news'`` and ``'ans'``.

    Returns:
        str | None: The rationale, or ``None`` when the years are not found
        where expected or the row cannot be processed (the calculation is
        printed in the latter case).
    """
    calculation = None
    try:
        # Key access (not attribute access) so plain dicts work as well as
        # pandas rows, consistent with the other helpers.
        calculation = item['calculation']
        operand = re.findall(r'\([^)]*\)', calculation)[0]

        # operand always starts with '(' and ends with ')'.
        parts = operand[1:-1].split(',')
        year_a = parts[0].strip()
        year_b = parts[1].strip()

        if year_a in item['published'] and year_b in item['news']:
            return f"The news published in the year {year_a} and the event mentioned in the news happened in the year {year_b}, so the year mentioned in the headline comes from a subtraction of {year_a}-{year_b} = {int(year_a)-int(year_b)}. so the answer is {item['ans']}"

        return None

    except Exception:
        # Report the offending calculation; the handler itself cannot raise.
        print(calculation)
        return None

In [None]:
# Keep only the four columns that the rationale helpers above read from a row.
df = df[['news','masked headline','calculation','ans']]

In [None]:
# The publication sentence depends only on the article text, so map the
# 'news' column directly instead of building a full row object per record.
df['published'] = df['news'].apply(convert_timestamp)

In [None]:
# Build one rationale per row; rows a helper could not explain end up as NaN.
df['ans_sent'] = df.apply(get_ans_sent, axis=1)

Round(42.7) 42.8
Round(141.53) 142


In [None]:
df[df['ans_sent'].isna()].shape

(162, 6)

In [None]:
df[(df['ans_sent'].isna()) & (df['calculation'].str.contains('Trans'))].iloc[2].news

"(Nov 22, 2010  5:08 PM) So the Nissan Leaf's efficiency technically isn't measured in gallons, but the feds say it's running at the equivalent of 99 mpg, reports the AP—106 city and 92 highway. The EPA says the Leaf can go about 73 miles on a full charge (that's down from Nissan's estimates of 100 miles) and run an average $561 a year in electricity."

In [None]:
a = df[(df['ans_sent'].isna()) & (df['calculation'].str.contains('Trans'))].iloc[3]
get_trans_placeholder(a)

In [None]:
# Rows for which no rationale could be generated (ans_sent is missing).
missed = df[df['ans_sent'].isna()]

In [None]:
# Rows that did get a rationale. Take an explicit copy: a column is added to
# this frame later, and writing to a filtered slice of `df` triggers pandas'
# SettingWithCopyWarning.
training_df = df[df['ans_sent'].notna()].copy()

In [None]:
training_df.shape

(20995, 6)

In [None]:
# Add the model input text. `assign` returns a new frame, so nothing is
# written into a slice of `df` (avoids SettingWithCopyWarning).
training_df = training_df.assign(inputs=training_df.apply(prepare_input, axis=1))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  training_df['inputs'] = training_df.apply(lambda item: prepare_input(item),axis=1)


In [None]:
# training_df.to_csv('test-cot.csv', index=False)

In [None]:
training_df['ans_sent'].iloc[0]

"The actual answer can be found in the sentence: '1, Walmart will no longer offer 30,000 of its employees health insurance.'. However, the answer is paraphrased  which is common for large numbers. The number: '30,000' is paraphrased to '30K' after dividing 30000 by 1000. So the answer is 30"

In [None]:
training_df['masked headline'].iloc[0]

'____K Walmart Part-Timers to Lose Health Insurance'

In [None]:
training_df['ans_sent'].iloc[1]

"The answer can be found in the sentence: 'As Shepard revealed last night on Jimmy Kimmel Live, the whole thing—including the fuel it took to get to the courthouse—cost $142.'. so the answer is 142"

In [None]:
training_df['masked headline'].iloc[1]

'Dax Shepard: Wedding to Kristen Bell Cost $____'

In [None]:
training_df['ans_sent'].iloc[2]

"The answer can be found in the sentence: 'Nancy Reagan, the helpmate, backstage adviser, and fierce protector of Ronald Reagan in his journey from actor to president—and finally during his 10-year battle with Alzheimer's disease—died Sunday at the age of 94, reports the AP, via CBS News.'. so the answer is 94"

In [None]:
filter_unique_string(training_df, 'Add')

Unnamed: 0,news,masked headline,calculation,ans,published,ans_sent,inputs
116,"(Mar 20, 2011 6:00 AM CDT) An 80-year-old wom...",Japan Pulls ____ Survivors From Rubble,"Add(80-year-old woman,16-year-old grandson)",2,The news was published on 20th March in the ye...,So the answer is 2,The news was published on 20th March in the ye...
280,"(Sep 30, 2010 10:58 AM CDT) Next year’s Oscar...",____: Whitest Oscars in a Decade?,"Add(2010,1)",2011,The news was published on 30th September in th...,So the answer is 2011,The news was published on 30th September in th...
292,"(Dec 17, 2010 6:00 PM) America's most promine...","CNN, Tea Party Express to Host ____ Debate","Add(2010,2)",2012,The news was published on 17th December in the...,So the answer is 2012,The news was published on 17th December in the...
454,"(Dec 31, 2013 6:22 AM) You will never, ever, ...",Woman With ____-Character Last Name Wins Fight...,"Add(35,1)",36,The news was published on 31st December in the...,So the answer is 36,The news was published on 31st December in the...
543,"(Jan 4, 2013 12:27 PM) Historically speaking,...",Get Ready for Another Black President in ____,"Add(2013,3)",2016,The news was published on 4th January in the y...,So the answer is 2016,The news was published on 4th January in the y...
...,...,...,...,...,...,...,...
19998,"(Nov 8, 2017 7:16 AM) Axios calls it a big s...","Amid 2017 Election Wave, Big Implications for ...","Add(2017,1)",2018,The news was published on 8th November in the ...,So the answer is 2018,The news was published on 8th November in the ...
20202,"(Nov 18, 2016 12:29 PM) Cobb County, Ga., jus...",____ Georgia Residents to Have Homes Demolishe...,"Add(15,16)",31,The news was published on 18th November in the...,So the answer is 31,The news was published on 18th November in the...
20484,"(Oct 19, 2017 6:53 PM CDT) Imagine living in ...",Target Has Stores in 49 States. It's About to ...,"Add(49,1)",50,The news was published on 19th October in the ...,So the answer is 50,The news was published on 19th October in the ...
20581,"(Jan 24, 2017 12:59 PM) Parisian streets have...",How to Buy a Piece of Parisian Real Estate for...,"Add(60,15)",75,The news was published on 24th January in the ...,So the answer is 75,The news was published on 24th January in the ...
