Note: Only Dataset-v2 is used

Process Kaggle Data: Split headlines into sarcastic and non-sarcastic

In [1]:
import json

In [2]:
def parse_data(file):
    for l in open(file,'r'):
        yield json.loads(l)

def parse_data_with_key(file, key):
    '''
    Output: column with title 'key'
    '''
    for l in open(file,'r'):
        yield json.loads(l)[key]

def parse_data_with_key_and_cond(file, key, cond_key, cond_val):
    '''
    Output: column with title 'key', rows where 'cond_key' column has value 'cond_val'
    '''
    for l in open(file,'r'):
        row = json.loads(l)
        if row[cond_key] == cond_val:
            yield json.loads(l)[key]

In [3]:
PATH = '../data/Sarcasm_Headlines_Dataset_v2.json'
sarcastic_headlines = list(parse_data_with_key_and_cond(
    PATH, 'headline', 'is_sarcastic', 1))
non_sarcastic_headlines = list(parse_data_with_key_and_cond(
    PATH, 'headline', 'is_sarcastic', 0))

f = open('../data/sarcastic_headlines.json', 'w')
f.write(json.dumps(sarcastic_headlines))
f = open('../data/non_sarcastic_headlines.json', 'w')
f.write(json.dumps(non_sarcastic_headlines))

953274

Generate headlines through response from LLM (Preparation)

In [4]:
from requests import Response
from json import JSONDecodeError

import requests
import json
from tqdm import tqdm
import time
import shutil

In [5]:
# Prepare input data

# Get all sarcastic headlines
f = open('../data/sarcastic_headlines.json', 'r')
sarcastic_headlines = json.loads(f.read())
f.close()

# Get all non-sarcastic headlines
f = open('../data/non_sarcastic_headlines.json', 'r')
non_sarcastic_headlines = json.loads(f.read())
f.close()

print(len(sarcastic_headlines), len(non_sarcastic_headlines))

13634 14985


Generate headlines through response from LLM (Functions)

In [6]:
class ResponseRecorder:
    def __init__(self, path):
        self.final_results = []
        self.initial_length = 0
        self.buffer = []
        self.path = path

        try:
            f = open(self.path, 'r')
            data = f.read()
            f.close()
            self.final_results = json.loads(data)
            self.initial_length = len(self.final_results)
            print("Existing data found. Loaded successfully.")
        except FileNotFoundError:
            print("File not found. Will create new file when writing.")
    
    def set_buffer(self, buffer: list[str]) -> None:
        self.buffer = buffer
        print("Head of response:")
        print(buffer if len(buffer) < 5 else buffer[:5])
        print(f"Length of response: {len(buffer)}")
        print("Run \'pop_buffer()\' after checking response.")
    
    def pop_buffer(self):
        self.final_results = self.final_results + self.buffer
    
    def save(self, expected_length):
        if self.initial_length + expected_length != len(self.final_results):
            raise ValueError("Updated document has unexpected length. "
                "Save Aborted. "
                f"Existing file has length: {self.initial_length} "
                f"Added section has length: {len(self.final_results) - self.initial_length}")
        f = open(self.path, 'w')
        f.write(json.dumps(self.final_results))
        f.close()

        # Reload
        f = open(self.path, 'r')
        data = f.read()
        self.final_results = json.loads(data)
        self.initial_length = len(self.final_results)
        self.buffer = []

In [7]:
def send_request(
    llm_auth: str,
    model: str,
    query: str
) -> Response:
    response = requests.post(
        url="https://openrouter.ai/api/v1/chat/completions",
        headers={
            "Authorization": "Bearer " + llm_auth,
            "Content-Type": "application/json",
        },
        data=json.dumps({
            "model": model,
            "messages": [
            {
                "role": "user",
                "content": query
            }
            ],
        })
    )
    return response

In [8]:
def get_query(question, headlines: list[str]) -> str:
    return question + headlines.__str__()

def split_list(l):
    mid = len(l) // 2
    return (l[:mid], l[mid:])

def handle_response(
    response: Response, 
    response_handler: ResponseRecorder,
    expected_length: int
) -> None:
    '''
    Parse response from online LLM and save result to local file.
    '''
    data = json.loads(response.content)
    batch_result = json.loads(data['choices'][0]['message']['content'])
    if type(batch_result) != list or len(batch_result) == 0 or type(batch_result[0]) != str:
        raise TypeError("Batch result cannot be parsed to correct type or is empty.")
    response_handler.set_buffer(batch_result)
    response_handler.pop_buffer()
    response_handler.save(expected_length)

def generate_headlines(
        input_headlines: list[str],
        save_path: str,
        start: int,
        end: int,
        step: int,
        llm_auth: str,
        model: str,
        question: str
    ) -> float:

    response_handler = ResponseRecorder(save_path)

    start_time = time.time()

    for i in tqdm(range(start, end, step)):
        if i + step >= end:
            headline_subset = input_headlines[i:end]
            expected_length = end - i
        else:
            headline_subset = input_headlines[i:i+step]
            expected_length = step
        
        req_start_time = time.time()
        try:
            response = send_request(
                llm_auth,
                model,
                get_query(question, headline_subset)
            )
            handle_response(response, response_handler, expected_length)
        except JSONDecodeError:
            # likely because the response is too long.
            # hence break the input into 2 parts, reducing input size by half.
            headline_subset1, headline_subset2 = split_list(headline_subset)
            print("Failed to parse JSON response. Trying to reduce step size...")
            response1 = send_request(
                llm_auth,
                model,
                get_query(question, headline_subset1)
            )
            handle_response(response1, response_handler, expected_length // 2)
            response = send_request(
                llm_auth,
                model,
                get_query(question, headline_subset2)
            )
            handle_response(response, response_handler, expected_length - expected_length // 2)
            
        req_end_time = time.time()
        print(f"Request took {(req_end_time - req_start_time):.2f} seconds.")

    end_time = time.time()
    return end_time - start_time

def get_output_file_length(save_path):
    '''
    For verification purpose only. To implement logic, 
    use ResponseRecorder class instead.
    '''
    try:
        f = open(save_path, 'r')
    except FileNotFoundError:
        return 0
    data = f.read()
    output = json.loads(data)
    return len(output)

Generate headlines through response from LLM (Execution)

In [None]:
# Change to your own OpenRouter API auth key
llm_auth = "change-to-your-own-auth-key"

# See more models on https://openrouter.ai/models
model = "deepseek/deepseek-chat:free"

# Try prompt engineering if you are not satisfied with the output
sar_to_non_sar_question = (
    "Here is a list of sarcastic headlines. Read all of them in order. "
    "Convert each of them to a non-sarcastic headline while preserving the original "
    "meaning as much as possible. The format of your response should exactly be a "
    "string that can be parsed by json to a python list of strings, without any "
    "additional comments."
)
non_sar_to_sar_question = (
    "Here is a list of non-sarcastic headlines. Read all of them in order. "
    "Convert each of them to a sarcastic headline while preserving the original "
    "meaning as much as possible. The format of your response should exactly be a "
    "string that can be parsed by json to a python list of strings, without any "
    "additional comments."
)

non_sar_save_path = "../data/non_sarcastic_generated.json"
non_sar_backup_path = "../data/non_sarcastic_generated_backup.json"
sar_save_path = "../data/sarcastic_generated.json"
sar_backup_path = "../data/sarcastic_generated_backup.json"

In [36]:
# (Optional) Back up previously generated result before running the next round
shutil.copyfile(non_sar_save_path, non_sar_backup_path)

'non_sarcastic_generated_backup.json'

In [None]:
# Generation of non-sarcastic headlines. Done in batches.
# Number of sarcastic headlines as input: 13634

# Input index range: [start, end)
start = 2000   # start index (included)
end = 3000   # end index (excluded)
step = 100  # batch size: number of headlines sent in a single request

# Output file is FRAGILE! If a certain batch fails, check output file length 
# immediately. Set start index to be exactly output file length when you re-run 
# this cell to avoid repeated feed of the same input.
if get_output_file_length(non_sar_save_path) != start:
    raise ValueError("Start index is set incorrectly.")

running_time = generate_headlines(sarcastic_headlines, non_sar_save_path, start, end, step, llm_auth, model, sar_to_non_sar_question)

Existing data found. Loaded successfully.


  0%|          | 0/1 [00:00<?, ?it/s]

100%|██████████| 1/1 [00:21<00:00, 21.45s/it]

Head of response:
['Apartment decorated to appear as if it reflects a well-rounded life', 'Friend who was struggling is now making some progress', 'Stephen Miller visits children in local ICE detention center after a day of speechwriting', 'Entomologists retract discovery of new spider species, realizing it was just dust and hair', "Man's priorities are completely different from his grandfather's"]
Length of response: 50
Run 'pop_buffer()' after checking response.
Request took 21.45 seconds.





In [14]:
# Verify that length of updated json file is as expected
get_output_file_length(non_sar_save_path)

2000

In [None]:
# Performance check
running_time

59.54799795150757

In [None]:
# Save performance check results
# Known issue: always need to save twice and remove the first save,
# as the first will only replicate the previous result.
f = open('query_perf.txt', 'a')
f.write(f"count: 300; batch size: 100; time: {running_time}s\n")

54

In [None]:
'''___________Below are for generation of sarcastic headlines___________'''

In [44]:
# (Optional) Back up previously generated result before running the next round
shutil.copyfile(sar_save_path, sar_backup_path)

'sarcastic_generated_backup.json'

In [40]:
# Generation of sarcastic headlines. Done in batches.
# Number of non-sarcastic headlines as input: 14985

# Input index range: [start, end)
start = 0   # start index (included)
end = 1000   # end index (excluded)
step = 100  # batch size: number of headlines sent in a single request

# Output file is FRAGILE! If a certain batch fails, check output file length 
# immediately. Set start index to be exactly output file length when you re-run 
# this cell to avoid repeated feed of the same input.
if get_output_file_length(sar_save_path) != start:
    raise ValueError("Start index is set incorrectly.")

running_time = generate_headlines(non_sarcastic_headlines, sar_save_path, start, end, step, llm_auth, model, non_sar_to_sar_question)

File not found. Will create new file when writing.


  0%|          | 0/10 [00:00<?, ?it/s]

 10%|█         | 1/10 [00:34<05:13, 34.79s/it]

Head of response:
['Oh Great, Congress Finally Nails Gender and Racial Equality', 'Because We All Needed More Veggie Recipes', 'What a Shocker: My White Inheritance', 'Taxes Are So Relaxing, Here Are 5 Ways to Make Them Less Stressful', 'Groundbreaking News: Lots of Parents Know This Scenario']
Length of response: 100
Run 'pop_buffer()' after checking response.
Request took 34.79 seconds.


 20%|██        | 2/10 [02:16<09:54, 74.27s/it]

Head of response:
["'sleeping on it' really does solve all your problems, because who needs effort?", 'toeing the race line: because labels are the most important thing in life.', "how an essay on 'sexual paranoia' caused a frenzy at northwestern university, because academia is so calm otherwise.", 'my disastrous search for the perfect swimsuit, because swimsuits are the pinnacle of human achievement.', '10 big space-saving ideas for small kitchens, because who doesn’t love a cluttered countertop?']
Length of response: 100
Run 'pop_buffer()' after checking response.
Request took 101.90 seconds.


 30%|███       | 3/10 [03:44<09:22, 80.43s/it]

Head of response:
['Oh great, another riveting stage door story: Wiesenthal', 'Because aging in prison is just a walk in the park', "Gwen Stefani bares her soul, because that's what the AMAs are for", 'Scott Pruitt gets yet another glowing profile, because why not?', 'Death to shoppers? Al-Shabaab really knows how to make a statement']
Length of response: 100
Run 'pop_buffer()' after checking response.
Request took 87.76 seconds.


 40%|████      | 4/10 [05:06<08:06, 81.04s/it]

Head of response:
['Oh sure, mom pulling a gun on teens threatening her son is totally normal parenting', 'HuffPollster: Sorry Bernie fans, but a Sanders comeback is as likely as a snowball in hell', 'I cannot do this alone: Because the Down Syndrome community just loves being dependent on allies', 'The Grid: Because what the world really needs is AI-designed websites for everyone', 'City offers free pot for the poor: Because nothing solves poverty like a good high']
Length of response: 100
Run 'pop_buffer()' after checking response.
Request took 81.97 seconds.


 50%|█████     | 5/10 [06:26<06:42, 80.58s/it]

Head of response:
["Jenna Fischer finally spills the groundbreaking secret of what Pam told Michael in his 'Office' goodbye episode", "Some of Amazon's suitors have absolutely no regrets about their past decisions", "Donald Trump just can't stop promoting his own perfect health", '13 totally original questions you’ve never thought to ask when hiring a web design company', 'Twitter: A paradise for perpetrators and a nightmare for sexual violence survivors']
Length of response: 100
Run 'pop_buffer()' after checking response.
Request took 79.77 seconds.


 60%|██████    | 6/10 [07:16<04:41, 70.45s/it]

Head of response:
["Oh great, the FDA is finally getting around to caring about women's sexual health", 'Your nightly routine is definitely not aging you, says no one ever', "Thousands of people are just casually strolling to demand the Nicaraguan president's resignation", 'The Republican debate in Utah is canceled because who needs democracy anyway?', 'A drug company prioritizing work-life balance over profit? Groundbreaking.']
Length of response: 100
Run 'pop_buffer()' after checking response.
Request took 50.79 seconds.


 70%|███████   | 7/10 [08:06<03:10, 63.57s/it]

Head of response:
['Oh great, another round of side-splitting parental humor', "Finally, the 'Broad City' scene we’ve all been desperately missing", 'Kelly Rowland’s groundbreaking advice for moms-to-be, because no one else has ever shared that', "Because nothing says 'decency' like politics today", 'Nothing like sneaking a cybersecurity bill into a spending package, very classy']
Length of response: 100
Run 'pop_buffer()' after checking response.
Request took 49.39 seconds.


 80%|████████  | 8/10 [09:05<02:04, 62.11s/it]

Head of response:
['Oh sure, the school bus is totally the last bastion of tech-free paradise', 'Shocking: Russia apparently tried to cause chaos in 2016 elections, says Senate committee', 'Wow, Sarah Palin is absolutely crystal clear in her message—what even is she saying?', 'Christie campaign donors generously find a new home for their money: his super PAC', 'McConnell assures us Republicans have enough votes for the tax bill—what a surprise']
Length of response: 100
Run 'pop_buffer()' after checking response.
Request took 58.99 seconds.


 90%|█████████ | 9/10 [10:06<01:01, 61.91s/it]

Head of response:
["Oh sure, for clean air and a safe climate future, because *that's* totally under control", 'How Nebraska can return to college football greatness—just sprinkle some magic dust!', 'Leaked report: Jerusalem at boiling point, because who doesn’t love a good simmering conflict?', 'Here’s a brand new thing you didn’t know about ‘The Office’—because we definitely needed *one more*', "Maxine Waters to women’s convention: Trump is 'most dishonorable and despicable' president ever—shocking, right?"]
Length of response: 100
Run 'pop_buffer()' after checking response.
Request took 61.48 seconds.


100%|██████████| 10/10 [11:20<00:00, 68.02s/it]

Head of response:
['Oh great, using new technology to give voice to the voiceless, as if that hasn’t been tried before', 'Will Ferrell is *so* thrilled about the USA-Germany game, it’s not like he’s a comedian or anything', 'Good girls have abortions too, because obviously nobody thought of that before', 'Democrats split over opposing a government funding bill that doesn’t protect Dreamers—who could have guessed?', 'Bill Paxton discovers his revolutionary past on ‘Who Do You Think You Are?’—riveting stuff']
Length of response: 100
Run 'pop_buffer()' after checking response.
Request took 73.38 seconds.





In [57]:
# Verify that length of updated json file is as expected
get_output_file_length(sar_save_path)

1000

In [42]:
# Performance check
running_time

680.2946770191193

In [63]:
non_sarcastic_headlines[999]

'lesson for urban cities: how chicagoans stand up for quality schools'

In [56]:
f = open(sar_save_path, 'r')
data = f.read()
sar_gen = json.loads(data)
sar_gen[999]

'Lesson for urban cities: how Chicagoans stand up for quality schools—because cities are easy to fix'