In [1]:
# import libraries

from collections import Counter
import requests
import json
import os
from tqdm import tqdm
import pandas as pd
import json

# Change to the parent directory of the notebook
os.chdir('..')

## Load and Transform Dataset + convert to DF | Transform

In [2]:
# Load the dataset.json file
file_path = 'Data/dataset.json'
with open(file_path, 'r') as file:
    data = json.load(file)


In [3]:

rows = []
for row in data.items():
    post_id = row[1]['post_id']
    post_tokens = row[1]['post_tokens']
    rationales = row[1]['rationales']
    annotators = row[1]['annotators']
    tweet_text = " ".join(post_tokens)

    # Aggregate all rationales
    combined_rationale = [any(rat[i] for rat in rationales) for i in range(len(post_tokens))] if rationales else [0] * len(post_tokens)

    # Extract key features
    key_features = [post_tokens[i] for i, is_key in enumerate(combined_rationale) if is_key]

    annotator_data = {}
    all_targets = set()  # To store unique targets
    labels = []  # To collect labels for voting

    for index, annotator in enumerate(annotators, start=1):  # Use index as annotator_id
        annotator_data[f"annotator_{index}_label"] = annotator['label']
        annotator_data[f"annotator_{index}_target"] = ", ".join(annotator['target']) if annotator['target'] else None

        # Collect all targets
        all_targets.update(annotator['target'])

        # Collect labels for voting
        labels.append(annotator['label'])

    # Combine all unique targets
    combined_target = ", ".join(sorted(all_targets))

    # Perform majority voting for the label
    label_counts = Counter(labels)
    voted_label = label_counts.most_common(1)[0][0] if labels else None

    # Create the row
    row_data = {
        "post_id": post_id,
        "tweet_text": tweet_text,
        "key_features": key_features,
        "target": combined_target,
        "label": voted_label,
    }
    row_data.update(annotator_data)  # Add annotator-related fields

    rows.append(row_data)
df = pd.DataFrame(rows)

df.tail()

Unnamed: 0,post_id,tweet_text,key_features,target,label,annotator_1_label,annotator_1_target,annotator_2_label,annotator_2_target,annotator_3_label,annotator_3_target
20143,9989999_gab,if ur still on twitter tell carlton i said his...,"[alcoholic, drug, addicted, teenage, single, m...","Men, None, Other, Women",offensive,offensive,"Men, Women, Other",offensive,,normal,
20144,9990225_gab,when i first got on here and said i hate trump...,"[i, hate, trump, i, was, hating, niggers]","African, None, Other",offensive,offensive,African,offensive,"African, Other",normal,
20145,9991681_gab,was macht der moslem wenn der zion gegen seine...,[],"Islam, None, Other",normal,offensive,Islam,normal,Other,normal,
20146,9992513_gab,it is awful look at world demographics asians ...,"[asians, are, fucking, everywhere, white, geno...","Asian, Hispanic",hatespeech,hatespeech,Hispanic,hatespeech,Asian,offensive,Asian
20147,9998729_gab,the jewish globalist elite have only imported ...,"[imported, few, million, muslims, to, multicul...","African, Islam, Jewish",offensive,hatespeech,"African, Islam",offensive,"Islam, Jewish",offensive,"African, Islam, Jewish"


In [4]:
# filder data
data = [
    {key: d[key] for key in ('post_id', 'tweet_text', 'key_features', 'target', 'label')}
    for d in rows
]

In [5]:
# see data
data[10000]

{'post_id': '1178534874602496000_twitter',
 'tweet_text': '<user> <user> this is exactly the same as the argument that people who get raped when they were dressed scantily were asking for it you are lame af',
 'key_features': [],
 'target': 'None',
 'label': 'normal'}

# LLM for Explaination Generation

## Configure LLM

In [6]:
# Define LLM inference function to use later
API_URL = "https://inf.cl.uni-trier.de/chat/"

def llm(model_name, system_prompt, input_query):
    # Construct the request payload
    payload = {
        "messages": [
            {"content": system_prompt, "role": "system"},
            {"content": input_query, "role": "user"}
        ],
        "model": model_name,
        "options": {}
    }
    
    # Set the request headers
    headers = {
        "accept": "application/json",
        "Content-Type": "application/json"
    }
    
    # Send the POST request
    response = requests.post(API_URL, headers=headers, data=json.dumps(payload))
    
    # Process the response
    if response.status_code == 200:
        # print("Response received successfully:")
        response = response.json() # json.dumps(, indent=4)
    else:
        print(f"Failed to retrieve response. Status code: {response.status_code}")

    return response


In [7]:
# Picking one row from dataset and converting it into string.
user_input = json.dumps(data[10000])
user_input

'{"post_id": "1178534874602496000_twitter", "tweet_text": "<user> <user> this is exactly the same as the argument that people who get raped when they were dressed scantily were asking for it you are lame af", "key_features": [], "target": "None", "label": "normal"}'

In [8]:
prompt_1 = """
Your Job is to analyze the tweet and write **Explaination** on it by taking inspiration from the examples bellow. 
Label -> Normal

1.
"i know this is somewhat hypocritical but what the fuck is the deal with queer guys and slightly too small white tshirts"

Explanation:
The tweet has been labeled normal because the user is solely expressing an observation. However the use of vulgar terms such as “fuck” may indicate that the user has a rather negative outlook towards topics concerning queers. Given that the context is not clear it may be understood as a neutral observation. The targets used within the dataset do not contain the word “queer” but the word homosexual. 

2.
“wore a nipsey blue suit today only the niggers in the work place feel the energy”

This tweet has been labeled normal because it seems to be a positive reference of paying tribute to the rapper nipsey hussle who is known for a specific shade of blue that is affiliated to the crib gang culture of LA.

Explanation:
The use of the term "niggers" is offensive and rooted in racism. Even if no direct harm or attack is evident, it perpetuates a derogatory term, which many systems and definitions include as hate speech. However, disagreement in labeling often arises due to contextual interpretations. Example for that could be Cultural or Social Context that goes Intended Meaning. In the tweet addition The phrase might be interpreted as describing a situation where the "energy" of the outfit is appreciated by a certain group, which could be seen as neutral, despite the offensive language.

3. rafiullah shaheed a young man from levi force was shot dead by unknown men in muslim bagh a levies officer was killed in a bomb blast in lashkar gah on sunday while militants were also killed on the same day three people including mullahaniyev were killed in chaman"

Explanation:

The statement is a factual report of events without any explicit derogatory, hateful, or harmful language targeting a specific individual or group based on characteristics like race, religion, or ethnicity. It lacks the intention to incite hate or violence, which is key to labeling content as hate speech.

Label -> Offensive
 
“pocahontas jumps in bed with moslem terrorists interesting combo”

Explanation:
This statement is labeled offensive because it uses derogatory language and stereotypes to insult and demean individuals or groups. Referring to someone as "Pocahontas" in a pejorative context is disrespectful, and associating them with "Muslim terrorists" perpetuates harmful stereotypes and biases, making the comment offensive and inappropriate.

iIslamophobic you for real and how is islam a race you half wit

It is labled offensive beacuse it contains an insult.


treat the immigrants as drug traffickers they will soon have a change of mind
This statement is offensive because it unjustly stereotypes immigrants as drug traffickers, promoting a harmful generalization that dehumanizes and criminalizes an entire group of people. Such language fosters prejudice, discrimination, and hostility toward immigrants. Furthermore it incitement to

 
Label -> Hate Speech
1)you niggas is fake asl that why you dont see none round me

Explanation:
This statement is classified as hate speech because it uses a racial slur ("niggas"), which is historically and culturally offensive, particularly against Black individuals. Even when used in some contexts as reclaimed language, its use in this statement appears derogatory, contributing to a hostile and discriminatory tone. The inclusion of aggressive language further amplifies its hateful nature.


2) "online nazi haha guys the left can not meme <number> white privilege diversity is our strength race is just skin color the holocaust racist sexist bigot homophobe islamophobe anti semite"

Explanation:
This statement is hate speech because it mocks and trivializes serious societal issues, such as racism, sexism, and discrimination, using a sarcastic tone to dismiss terms like "white privilege," "diversity is our strength," and "anti-Semite." By doing so, it undermines the significance of these issues and belittles those advocating for equality. References to "Nazi" and "the Holocaust" in a dismissive context further minimize historical atrocities and signal alignment with extremist ideologies. Overall, the statement ridicules efforts to address inequality and promotes hostility, making it hate speech.

3) "pro tip the niggers do not care if you never owned slaves granddaddy fought the natzis or great great grandad ground the confederacy into dust with the boot heel of the proto bolshevik yankee gubment force of weight and arms behind it they hate you because you are white and you exist altright"

Explanation:
This statement is classified as hate speech because it uses the racial slur ("niggers"). The statement further vilifies an entire group by making sweeping generalizations, such as claiming that Black people "hate you because you are white and you exist." This kind of language fosters an "us vs. them" narrative, portraying Black people as inherently hostile toward white people, which promotes racial division and resentment. The inclusion of terms like "proto Bolshevik Yankee gubment force" and "altright" reflects alignment with extremist ideologies often associated with white nationalism and discriminatory rhetoric. Additionally, references to historical events such as "granddaddy fought the Nazis" and "great great grandad ground the Confederacy" are weaponized to evoke a sense of pride in white heritage while simultaneously vilifying Black individuals. 
"""

In [9]:
prompt_2 = """
As a hate speech specialist, your task is to analyze tweets to determine if they contain hate speech, offensive content, 
or are neutral. Hate speech can be conveyed through any form of expression. 
Hate speech is “discriminatory” (biased, bigoted, or intolerant) or “pejorative” (prejudiced, contemptuous, or demeaning) of an individual or group. 
Hate speech calls out real or perceived “identity factors” of an individual or a group, including: “religion, ethnicity, nationality, race, color, descent, gender,” 
but also characteristics such as language, economic or social origin, disability, health status, or sexual orientation, among many others.

Here are some examples to illustrate:
- "Nigga", "Bitch" // Offensive: This term is offensive as it historically derogates African Americans and Women.
- "A negro moslem doctor got struck off for negligence when a white <number> year old kid with down syndrome died; the guardian's response is to help promote a crowdfunding campaign for her" // Hate speech: This statement is hate speech as it discriminates based on race and religion, promoting negative stereotypes.
- "Smiling cuz I am young black rich and I am handsome!" // No hate speech: This expression is a positive self-description and does not target or degrade any other group.

For each tweet provided, you will analyze:
- tweet_text: The text of the tweet
- key_features: Significant words or phrases identified in the tweet
- target: The intended audience of the tweet
- label: The classification of the tweet (hate speech, offensive, or neutral)

Your explanation should discuss:
- The context and user's intention behind the tweet.
- Why the tweet falls into its assigned category based on the information provided.
- How key features and the specific content of the tweet support your classification.

This careful analysis is vital for improving our understanding and detection of hate speech, providing nuanced insights into how such speech can be identified and classified accurately.

# Notes:
1. Each explanation should be a single, comprehensive paragraph.
2. Include insights into the tweet's context and the user's intent.
3. Analyze the content through the lens of the provided hate speech definition, discussing the role of key features in your decision.
4. Your detailed assessment will help refine the AI's capabilities in recognizing and differentiating between hate speech, offensive content, and neutral statements.
"""

In [10]:
prompt_3 = """
You are tasked with providing a detailed explanation of why a given text is classified as hate speech, offensive, or neutral, based on annotations provided. Follow a step-by-step reasoning process to justify the classification, considering the context, user's intent, and linguistic features.

The dataset provides the following information:
- tweet_text: The main user text requiring explanation.
- key_features: Words or phrases crucial to the decision.
- target: The audience or group targeted by the text.
- label: The classification of the text (hate speech, offensive, or neutral).

Your job is to:
1. Start by identifying the label of the text and summarizing what it indicates about the content.
2. Analyze the key features and explain how they support the classification.
3. Consider the target and evaluate how the text affects or refers to this group.
4. Examine the overall context and tone to understand the user's intent.
5. Conclude with a detailed explanation, justifying the classification and the reasoning behind it.

Guidelines:
- If the label is neutral, but you believe the text could be hate speech, explain why it is still considered neutral.
- If the label is offensive, describe the nature of the offense and how the key features contribute to it.
- Ensure the explanation is in one clear paragraph, summarizing the reasoning process.
- Highlight what helped in making the decision and why the text is classified as it is.

Here are examples for guidance:

1. **Hate Speech**:
   - tweet_text: "All [group] people should be eliminated."
   - key_features: ["eliminated", "[group] people"]
   - target: A specific ethnic or religious group
   - label: Hate Speech
   Reasoning: 
   Step 1: The label is hate speech, which indicates that the content is intended to incite harm or violence against a group.  
   Step 2: Key features like "eliminated" express a clear call for harm, targeting "[group] people."  
   Step 3: The target is a specific ethnic or religious group, making this statement particularly harmful.  
   Step 4: The context and tone show a violent intention to exclude and harm the group.  
   Conclusion: This text qualifies as hate speech due to its explicit incitement of violence and harm directed toward a specific group.

2. **Offensive**:
   - tweet_text: "Why are men so bad at taking care of kids?"
   - key_features: ["men", "bad at taking care"]
   - target: Men
   - label: Offensive
   Reasoning:
   Step 1: The label is offensive, which suggests the text contains harmful stereotypes or derogatory assumptions.  
   Step 2: Key features like "bad at taking care" perpetuate a stereotype that questions men’s abilities in caregiving.  
   Step 3: The target is men, a group generalized and negatively characterized in this statement.  
   Step 4: While the tone is not explicitly harmful, the language dismisses and demeans the target group.  
   Conclusion: This text is offensive because it perpetuates a harmful stereotype about men, although it does not incite direct harm.

3. **Neutral**:
   - tweet_text: "What’s everyone’s favorite food?"
   - key_features: None
   - target: None
   - label: Neutral
   Reasoning:
   Step 1: The label is neutral, meaning the text does not contain offensive or harmful language.  
   Step 2: There are no key features or elements that suggest hostility, harm, or stereotypes.  
   Step 3: The content is inclusive and general, without targeting any group or individual.  
   Step 4: The context and tone are friendly and open-ended, with no intent to harm or offend.  
   Conclusion: This text is neutral because it is non-controversial, lacks harmful language, and does not target any individual or group.

Now, write an explanation for the given input using this reasoning framework.
"""

In [11]:
# Function to select a range of 10 rows
def pick_inputs(data, start_row, num_inputs=10):
    if start_row + num_inputs > len(data):
        raise ValueError("Start row plus number of inputs exceeds dataset size.")
    return list(enumerate(data[start_row:start_row + num_inputs], start=start_row + 1))

In [12]:
inputs = pick_inputs(data, start_row=100, num_inputs=10)

In [13]:
inputs[1][1]

{'post_id': '1179053462463635456_twitter',
 'tweet_text': 'wore a nipsey blue suit today only the niggers in the work place feel the energy 🏁 💙',
 'key_features': [],
 'target': 'African, None',
 'label': 'normal'}

In [14]:
import pandas as pd
from tqdm import tqdm
import random
import time


# Models, prompts, and inputs
models = [
    "mistral:7b-instruct-v0.2-q6_K",
    "llama3.3:70b-instruct-q6_K",
    "qwen2:72b-instruct-q6_K"
]
prompts = [
    prompt_1, 
    prompt_2, 
    prompt_3
]
inputs = pick_inputs(data, start_row=100, num_inputs=10)

# Run all combinations with progress bar and retry logic
results = []

# Total number of combinations
total_combinations = len(inputs) * len(prompts) * len(models)

# Use tqdm for progress tracking
with tqdm(total=total_combinations, desc="Processing LLM responses") as pbar:
    for model in models:
        for prompt in prompts:
            for input_tupple in inputs:
                attempt = 0
                time_taken = 0.0  # Initialize time taken
                input_text = input_tupple[1]
                row_number = input_tupple[0]
                while attempt < 3:  # Retry up to 3 times
                    try:
                        start_time = time.time()  # Start timing
                        
                        response = llm(model, prompt, json.dumps(input_text))
                        end_time = time.time()  # End timing
                        time_taken = round(end_time - start_time, 2)  # Calculate time taken
                        explanation = response['response']
                        results.append({
                            "Row Number": row_number,
                            "Model": model,
                            "Prompt": prompt,
                            "Input": input_text,
                            "Response": explanation,
                            "Time Taken (s)": time_taken
                        })
                        break
                    except Exception as e:
                        attempt += 1
                        if attempt == 3:
                            # Log error and continue
                            results.append({
                                "Row Number": row_number,
                                "Model": model,
                                "Prompt": prompt,
                                "Input": input_text,
                                "Response": "error",
                                "Time Taken (s)": time_taken
                            })
                            print(f"Error after 3 retries: {e}")
                pbar.update(1)

# Convert results to DataFrame
df_results = pd.DataFrame(results)
df_results = df_results.sort_values(by="Row Number", ascending=True)
# Save results to CSV
df_results.to_csv("llm_results_with_retries.csv", index=False)

# Display the first few rows
df_results.head()


Processing LLM responses:  79%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                   | 71/90 [1:16:47<37:52, 119.63s/it]

Failed to retrieve response. Status code: 504
Failed to retrieve response. Status code: 504


Processing LLM responses:  80%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                | 72/90 [1:46:48<3:07:09, 623.84s/it]

Failed to retrieve response. Status code: 504
Error after 3 retries: 'Response' object is not subscriptable
Failed to retrieve response. Status code: 504


Processing LLM responses:  83%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                        | 75/90 [2:05:22<1:40:52, 403.53s/it]

Failed to retrieve response. Status code: 504


Processing LLM responses:  84%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                                     | 76/90 [2:18:48<2:02:17, 524.14s/it]

Failed to retrieve response. Status code: 504


Processing LLM responses:  87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                | 78/90 [2:40:43<1:48:49, 544.13s/it]

Failed to retrieve response. Status code: 504


Processing LLM responses:  91%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                     | 82/90 [3:14:14<57:32, 431.58s/it]

Failed to retrieve response. Status code: 502


Processing LLM responses: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 90/90 [3:40:53<00:00, 147.26s/it]


Unnamed: 0,Row Number,Model,Prompt,Input,Response,Time Taken (s)
0,101,mistral:7b-instruct-v0.2-q6_K,\nYour Job is to analyze the tweet and write *...,"{'post_id': '1178650031076909056_twitter', 'tw...",Explanation:\nThe tweet expresses a observati...,7.52
10,101,mistral:7b-instruct-v0.2-q6_K,"\nAs a hate speech specialist, your task is to...","{'post_id': '1178650031076909056_twitter', 'tw...","This tweet, with text ""i know this is somewha...",19.02
30,101,llama3.3:70b-instruct-q6_K,\nYour Job is to analyze the tweet and write *...,"{'post_id': '1178650031076909056_twitter', 'tw...","Explanation:\nThe tweet has been labeled as ""n...",124.07
20,101,mistral:7b-instruct-v0.2-q6_K,\nYou are tasked with providing a detailed exp...,"{'post_id': '1178650031076909056_twitter', 'tw...","Step 1: The label is labeled as ""normal"" or n...",18.95
50,101,llama3.3:70b-instruct-q6_K,\nYou are tasked with providing a detailed exp...,"{'post_id': '1178650031076909056_twitter', 'tw...","The label of this text is classified as ""norma...",121.73


In [14]:
df_results = df_results.sort_values(by="Row Number", ascending=True)

# Save the sorted results to CSV
df_results.to_csv("llm_results_sorted_by_row_number.csv", index=False)

Unnamed: 0,Model,Prompt,Input,Response,Time Taken (s)
0,mistral:7b-instruct-v0.2-q6_K,\nYour Job is to analyze the tweet and write *...,"{'post_id': '1178650031076909056_twitter', 'tw...",Explanation:\nThe tweet expresses a question ...,74.41
1,mistral:7b-instruct-v0.2-q6_K,\nYour Job is to analyze the tweet and write *...,"{'post_id': '1179053462463635456_twitter', 'tw...",Explanation:\nThe tweet has been labeled norm...,19.95
2,mistral:7b-instruct-v0.2-q6_K,\nYour Job is to analyze the tweet and write *...,"{'post_id': '1179079421619097600_twitter', 'tw...",Explanation:\n\nThis tweet has been labeled n...,18.67
3,mistral:7b-instruct-v0.2-q6_K,\nYour Job is to analyze the tweet and write *...,"{'post_id': '1178082052513468417_twitter', 'tw...","Explanation:\nThe tweet ""a camel jockey midde...",18.52
4,mistral:7b-instruct-v0.2-q6_K,\nYour Job is to analyze the tweet and write *...,"{'post_id': '1178822517470965766_twitter', 'tw...","Explanation:\nThe tweet ""dwts they all look g...",18.11
5,mistral:7b-instruct-v0.2-q6_K,\nYour Job is to analyze the tweet and write *...,"{'post_id': '1178016367200624640_twitter', 'tw...","Explanation:\nThe tweet ""create a list destab...",18.14
6,mistral:7b-instruct-v0.2-q6_K,\nYour Job is to analyze the tweet and write *...,"{'post_id': '1178504094530424833_twitter', 'tw...",Explanation:\nThe tweet has been labeled offe...,17.92
7,mistral:7b-instruct-v0.2-q6_K,\nYour Job is to analyze the tweet and write *...,"{'post_id': '1178503399962300416_twitter', 'tw...",Explanation:\nThis statement is classified as...,18.85
8,mistral:7b-instruct-v0.2-q6_K,\nYour Job is to analyze the tweet and write *...,"{'post_id': '1178477971960672257_twitter', 'tw...",Explanation:\nThis tweet is labeled as hate s...,18.57
9,mistral:7b-instruct-v0.2-q6_K,\nYour Job is to analyze the tweet and write *...,"{'post_id': '1178819339631431680_twitter', 'tw...",Explanation:\nThis tweet has been labeled off...,17.87


In [16]:
df_results.to_csv("llm_results_with_retries.csv", index=True)

In [None]:
import pandas as pd
from tqdm import tqdm
import random
import time


# Models, prompts, and inputs
models = [
    "mistral:7b-instruct-v0.2-q6_K",
    "llama3.3:70b-instruct-q6_K",
    "qwen2:72b-instruct-q6_K"
]
prompts = [
    prompt_1, 
    prompt_2, 
    prompt_3
]
inputs = pick_inputs(data, start_row=200, num_inputs=50)

# Run all combinations with progress bar and retry logic
results = []

# Total number of combinations
total_combinations = len(inputs) * len(prompts) * len(models)

# Use tqdm for progress tracking
with tqdm(total=total_combinations, desc="Processing LLM responses") as pbar:
    for model in models:
        for prompt in prompts:
            for input_tupple in inputs:
                attempt = 0
                time_taken = 0.0  # Initialize time taken
                input_text = input_tupple[1]
                row_number = input_tupple[0]
                while attempt < 3:  # Retry up to 3 times
                    try:
                        start_time = time.time()  # Start timing
                        
                        response = llm(model, prompt, json.dumps(input_text))
                        end_time = time.time()  # End timing
                        time_taken = round(end_time - start_time, 2)  # Calculate time taken
                        explanation = response['response']
                        results.append({
                            "Row Number": row_number,
                            "Model": model,
                            "Prompt": prompt,
                            "Input": input_text,
                            "Response": explanation,
                            "Time Taken (s)": time_taken
                        })
                        break
                    except Exception as e:
                        attempt += 1
                        if attempt == 3:
                            # Log error and continue
                            results.append({
                                "Row Number": row_number,
                                "Model": model,
                                "Prompt": prompt,
                                "Input": input_text,
                                "Response": "error",
                                "Time Taken (s)": time_taken
                            })
                            print(f"Error after 3 retries: {e}")
                pbar.update(1)

# Convert results to DataFrame
df_results = pd.DataFrame(results)
df_results = df_results.sort_values(by="Row Number", ascending=True)
# Save results to CSV
df_results.to_csv("llm_results_with_retries333.csv", index=False)

# Display the first few rows
df_results.head()


Processing LLM responses:  71%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                                                     | 321/450 [5:27:18<3:48:54, 106.47s/it]

In [None]:
# Convert results to DataFrame
df_results = pd.DataFrame(results)
df_results = df_results.sort_values(by="Row Number", ascending=True)
# Save results to CSV
df_results.to_csv("llm_results_with_retries333.csv", index=False)

# Display the first few rows
df_results.head()