In [2]:
import requests
import json, math, sys, time
import openai, re
from openai import AzureOpenAI
from dotenv import load_dotenv
import os, time, glob
from tqdm import tqdm
from statistics import mode
from sklearn.metrics import confusion_matrix
import pandas as pd
from tqdm import tqdm

load_dotenv()

True

In [3]:
def query_o1(prompt):
    client = AzureOpenAI(
      azure_endpoint = os.getenv("o1_endpoint"), 
      api_key=os.getenv("o1_key"),  
      api_version="2024-02-01"
    )
    
    response = client.chat.completions.create(
        model=os.getenv("o1_mini"), # model = "deployment_name".
        messages=[
            {"role": "user", "content": prompt},
        ]
    )
    return response.choices[0].message.content

In [4]:
def load_query_template(path="./query_template2.txt"):
    with open(path, "r") as f:
        query_template = f.read()
    return query_template

In [5]:
def tag_comments(comment_list):
    tagged_comments = [f"<comment>{i}<\comment>\n" for i in comment_list]
    tagged_comments_str = "".join(tagged_comments)
    return tagged_comments_str

In [6]:
def extract_ann_exp(response):
    pattern = r"<ann>(.*?)</ann> <exp>(.*?)</exp>"
    
    response_list = response.split("\n")
    # AttributeError if one of them does not follow the patter or No match was found
    ann_batch = [re.search(pattern, i).group(1) for i in response_list]
    exp_batch = [re.search(pattern, i).group(2) for i in response_list]


    return ann_batch, exp_batch

In [7]:
def dumb():
    raise openai.RateLimitError 

In [8]:
def get_response(prompt, query_func, num_comments):
    retry = True; 
    max_retries_api_error = 10; num_retries_api_error = 0
    max_retries_bad_request = 5; num_retries_bad_request = 0

    while retry:
        try:
            response = query_func(prompt)
            retry = False
            
        except openai.BadRequestError as e:
            print(e); print(f"{num_retries_bad_request}/{max_retries_bad_request} tries more ...")
            if num_retries_bad_request < max_retries_bad_request: 
                time.sleep(3); num_retries_bad_request+=1
            else:
                response = "<ann>BadRequestError</ann> <exp>BadRequestError</exp>\n" * num_comments
                retry = False
            
        except (openai.RateLimitError, KeyError, openai.Timeout, openai.APIConnectionError, openai.APIError) as e:
            print(e); print(f"{num_retries_api_error}/{max_retries_api_error} tries more ...")
            if num_retries_api_error < max_retries_api_error: 
                time.sleep(3); num_retries_api_error+=1
            else:
                response = "<ann>APIError</ann> <exp>APIError</exp>\n" * num_comments
                retry = False

    return response

In [9]:
def write_to_file(filepath, content):
    with open(filepath, "a") as f:
        f.write(content)

In [10]:
def annotate_comments(ids_all, comments_all, filepath, 
                      batch_size=5, query_func=query_o1, return_results=False):
    
    query_template = load_query_template()
    write_to_file(filepath, "id\tannotation\texplanation\ttime\n")
    all_csv_content = ""
    for start in range(0, len(comments_all), batch_size):
        retry = True; max_retries = 10; num_retries = 0
        
        ids_batch = ids_all[start: start+batch_size]
        comments_batch = comments_all[start: start+batch_size]
        tagged_comments = tag_comments(comments_batch)

        num_comments = len(comments_batch)
        prompt = query_template + tagged_comments
        response = get_response(prompt, query_func, num_comments)
        try:
            ann_batch, exp_batch = extract_ann_exp(response)
        except AttributeError:
            ann_batch, exp_batch = ['None']*num_comments, ['None']*num_comments
            
        time_list = [time.time()]*num_comments
        csv_content = [f"{ids_batch[i]}\t'{ann_batch[i]}'\t{exp_batch[i]}\t{time_list[i]}\n" for i in range(len(ids_batch))]
        csv_content = "".join(csv_content)
        all_csv_content += csv_content
        write_to_file(filepath, csv_content)

    if return_results: return csv_content
        

In [11]:
samori_folder = f"{os.getenv('OAK')}/samori"
comments_folder = f"{samori_folder}/tiktok/comments"
# persons_folder = f"{samori_folder}/tiktok/persons"
persons_folder = "."

In [12]:
comments_csv_path = f"{comments_folder}/cleaned/comments_5_cleaned.csv"

In [28]:
df = pd.read_csv(comments_csv_path)
df = df.sample(n=5, random_state=43)

ids_all = list(df['id'])
comments_all = list(df['text'])
start_time = time.time()

annotate_comments(ids_all, comments_all, 
                  f"{persons_folder}/gpt_annotations_{start_time}.txt", 
                  batch_size=5, query_func=query_o1)

TypeError: catching classes that do not inherit from BaseException is not allowed

In [12]:
import re 

In [12]:
df = pd.read_csv("./gpt_annotations_1740707468.6469207.txt", delimiter="\t")

In [13]:
df

Unnamed: 0,id,annotation,explanation,time
0,7.323784e+18,'100',The comment focuses on the speaker’s personal ...,1740707000.0
1,7.323225e+18,'001',The comment describes deaths of others due to ...,1740707000.0
2,7.245067e+18,'001',The comment focuses on Old Jimmy’s fear relate...,1740707000.0
3,7.191048e+18,'010',The comment directly addresses another individ...,1740707000.0
4,7.358853e+18,'101',The comment reflects the speaker’s personal ex...,1740707000.0
5,7.252892e+18,'100',The comment uses a first-person plural perspec...,1740707000.0
6,7.255314e+18,'000',The comment does not express a central drug-re...,1740707000.0
7,7.256563e+18,'001',The comment primarily describes others' experi...,1740707000.0
8,7.260791e+18,'000',The comment is a personal compliment and does ...,1740707000.0
9,7.246782e+18,'100',The comment expresses the speaker's own commit...,1740707000.0


In [160]:
comments_big

['I take 2mg daily, not so bad',
 'the shit they take is laced with f3nt',
 'sorry to hear you lost your son to opioid, I lost mine to fent poisoining',
 'I take 2mg daily, not so bad',
 'the shit they take is laced with f3nt',
 'sorry to hear you lost your son to opioid, I lost mine to fent poisoining',
 'I take 2mg daily, not so bad',
 'the shit they take is laced with f3nt',
 'sorry to hear you lost your son to opioid, I lost mine to fent poisoining',
 'I take 2mg daily, not so bad',
 'the shit they take is laced with f3nt',
 'sorry to hear you lost your son to opioid, I lost mine to fent poisoining',
 'I take 2mg daily, not so bad',
 'the shit they take is laced with f3nt',
 'sorry to hear you lost your son to opioid, I lost mine to fent poisoining']

In [156]:
pd.DataFrame()

ValueError: DataFrame constructor not properly called!

In [141]:
import openai
openai.BadRequestError

openai.BadRequestError

In [146]:
AttributeError

AttributeError

In [2]:
def main(comments_path, output_folder):
    machine_id = int(os.environ.get("SLURM_ARRAY_TASK_ID"))
    
    df = pd.read_csv(comments_path)
    df = df.sample(n=10000, random_state=43)
    
    all_ids = list(df['id'])
    comments = list(df['text'])

    # all_ids = all_ids[:30]
    # comments = comments[:30]
    
    # time.sleep(machine_id*)
    # ids, annotations, reasonings = annotate_comments(all_ids, comments)
    ids, responses = annotate_comments(all_ids, comments)
        
    gpt_df = pd.DataFrame()
    gpt_df["id"] = ids
    gpt_df["response"] = responses
    # gpt_df["annotation"] = annotations
    # gpt_df["reasoning"] = reasonings
    gpt_df.to_csv(f"{output_folder}/gpt_annotation_mono_machine_{machine_id}.csv", index=None)

# if __name__ == "__main__":
#     input_path = sys.argv[1]
#     output_folder = sys.argv[2]

#     main(input_path, output_folder)



In [101]:
with open("./query_template2.txt", "r") as f:
        query_template = f.read()


In [158]:
comments_big = ["I take 2mg daily, not so bad", 
                "the shit they take is laced with f3nt", 
                "sorry to hear you lost your son to opioid, I lost mine to fent poisoining"]*5
# comments_big

In [106]:
def tag_comments(comment_list):
    tagged_comments = [f"<comment>{i}<\comment>\n" for i in comment_list]
    tagged_comments_str = "".join(tagged_comments)
    return tagged_comments_str

In [113]:
batch_size = 5
for start in range(0, len(comments_big), batch_size):
    comments = comments_big[start: start+batch_size]

    tagged_comments = tag_comments(comments)

    prompt = query_template + tagged_comments
    response = query_o1(prompt)
    ann_batch, exp_batch = extract_ann_exp(response)

    print(ann_batch, exp_batch)
    
    break
    # tagged_comments

['100', '001', '110', '100', '001'] ["The comment centers on the speaker's own experience with taking 2mg daily, indicating a first-person perspective. There are no second or third person references.", 'The comment primarily describes others\' drug use by stating "they take," which indicates a third-person perspective. There are no first or second person elements.', 'The comment includes both the speaker\'s personal loss ("I lost mine") and directly addresses another individual about their loss ("you lost your son"), making both first and second person perspectives central to the drug-related discussion.', "The comment centers on the speaker's own experience with taking 2mg daily, indicating a first-person perspective. There are no second or third person references.", 'The comment primarily describes others\' drug use by stating "they take," which indicates a third-person perspective. There are no first or second person elements.']


In [95]:
response = query_o1(query_template+tagged_comments_str)
ann_batch, exp_batch = extract_ann_exp(response)

In [97]:
exp_batch

['The comment expresses the speaker’s own experience with drug use, using the first person perspective.',
 "The comment describes others’ drug behavior, using a third-person perspective; 'they' is central to the drug-related content.",
 'The comment includes both the speaker’s personal loss and directly addresses someone else about their loss, making both first and second person perspectives central to the drug-related discussion.']

In [29]:
response_list = response.split("\n")
annotations_batch = 
annotations = []

print(response_list)

['<ann>100</ann> <exp>The comment describes the speaker’s own drug use with “I take 2mg daily,” making the first person perspective central.</exp>', '<ann>001</ann> <exp>The comment describes others’ drug use with “they take,” focusing on third person perspective.</exp>', '<ann>110</ann> <exp>The comment both addresses another individual about their loss (“you lost your son”) and shares the speaker’s own loss (“I lost mine”), making both first and second person perspectives central.</exp>']


In [93]:
def extract_ann_exp(response):
    pattern = r"<ann>(.*?)</ann> <exp>(.*?)</exp>"
    
    response_list = response.split("\n")
    # AttributeError if one of them does not follow the patter or No match was found
    ann_batch = [re.search(pattern, i).group(1) for i in response_list]
    exp_batch = [re.search(pattern, i).group(2) for i in response_list]


    return ann_batch, exp_batch
    

In [94]:
extract_ann_exp(response)

(['100', '001', '110'],
 ['The comment describes the speaker’s own drug use with “I take 2mg daily,” making the first person perspective central.',
  'The comment describes others’ drug use with “they take,” focusing on third person perspective.',
  'The comment both addresses another individual about their loss (“you lost your son”) and shares the speaker’s own loss (“I lost mine”), making both first and second person perspectives central.'])

In [71]:
response

'<ann>100</ann> <exp>The comment describes the speaker’s own drug use with “I take 2mg daily,” making the first person perspective central.</exp>\n<ann>001</ann> <exp>The comment describes others’ drug use with “they take,” focusing on third person perspective.</exp>\n<ann>110</ann> <exp>The comment both addresses another individual about their loss (“you lost your son”) and shares the speaker’s own loss (“I lost mine”), making both first and second person perspectives central.</exp>'

In [28]:
print(response)

<ann>100</ann> <exp>The comment describes the speaker’s own drug use with “I take 2mg daily,” making the first person perspective central.</exp>
<ann>001</ann> <exp>The comment describes others’ drug use with “they take,” focusing on third person perspective.</exp>
<ann>110</ann> <exp>The comment both addresses another individual about their loss (“you lost your son”) and shares the speaker’s own loss (“I lost mine”), making both first and second person perspectives central.</exp>


In [69]:
[re.search(pattern, i).group(2) for i in response_list]

['The comment describes the speaker’s own drug use with “I take 2mg daily,” making the first person perspective central.',
 'The comment describes others’ drug use with “they take,” focusing on third person perspective.',
 'The comment both addresses another individual about their loss (“you lost your son”) and shares the speaker’s own loss (“I lost mine”), making both first and second person perspectives central.']

In [63]:
import re

# s = "<ann>text1</ann> <exp>text2</exp>"

pattern = r"<ann>(.*?)</ann>\s<exp>(.*?)</exp>"
match = re.search(pattern, response_list[2]) 
if match: 
    text1 = match.group(1) 
    text2 = match.group(2) 
    print("Text 1:", text1) 
    print("Text 2:", text2) 
else: print("Pattern not found.")

Text 1: 110
Text 2: The comment both addresses another individual about their loss (“you lost your son”) and shares the speaker’s own loss (“I lost mine”), making both first and second person perspectives central.


In [62]:
match

<re.Match object; span=(0, 219), match='<ann>110</ann> <exp>The comment both addresses an>

In [65]:
response_list[0]

'<ann>100</ann> <exp>The comment describes the speaker’s own drug use with “I take 2mg daily,” making the first person perspective central.</exp>'

In [66]:
print(pattern)

<ann>(.*?)</ann>\s<exp>(.*?)</exp>


In [48]:
s

'<ann>text1</ann> <exp>text2</exp>'