In [9]:
import pandas as pd
from openai import OpenAI
from dotenv import load_dotenv
import os
import tiktoken
import re
import time
from tqdm import tqdm
import ast
import json
import io
from contextlib import redirect_stdout

In [10]:
def chunk_data(data, size):
    """Split ``data`` into consecutive slices of at most ``size`` items.

    Slices are taken in order starting at index 0; the final slice holds
    whatever is left over and may therefore be shorter than ``size``.
    """
    pieces = []
    for offset in range(0, len(data), size):
        pieces.append(data[offset:offset + size])
    return pieces

In [None]:
def split_into_n_lists(full_list, n):
    """Partition ``full_list`` into ``n`` contiguous sublists of near-equal length.

    Every sublist gets ``len(full_list) // n`` elements, and the first
    ``len(full_list) % n`` sublists each receive one additional element so
    that the remainder is spread over the leading sublists.
    """
    quotient, extras = divmod(len(full_list), n)

    parts = []
    cursor = 0
    for position in range(n):
        # Leading sublists absorb the remainder, one element each
        width = quotient + 1 if position < extras else quotient
        parts.append(full_list[cursor:cursor + width])
        cursor += width

    return parts

In [None]:
def convert_dict_to_dataframe(data_dict):
    """Build a wide DataFrame with one row per word pair.

    Parameters
    ----------
    data_dict : dict
        Maps ``(word1, word2)`` tuples to a sequence of scores.

    Returns
    -------
    pandas.DataFrame
        Columns ``word1``, ``word2`` followed by ``similarity_score_1`` ...
        ``similarity_score_k``, where ``k`` is the length of the longest
        score sequence. Shorter sequences are padded with ``None``. An empty
        ``data_dict`` yields an empty frame with only ``word1`` and ``word2``.
    """
    # Size the score columns by the longest entry rather than the first one,
    # so an empty dict or a later, longer score list no longer raises.
    max_scores = max((len(scores) for scores in data_dict.values()), default=0)
    columns = ['word1', 'word2'] + [f'similarity_score_{i + 1}' for i in range(max_scores)]

    # One row per pair, padded so every row matches the column count
    data_list = [
        [word1, word2] + list(scores) + [None] * (max_scores - len(scores))
        for (word1, word2), scores in data_dict.items()
    ]

    return pd.DataFrame(data_list, columns=columns)

In [11]:
def format_prompt(pairs, prompt):
    """Append the word pairs to ``prompt``, separated by `` --- ``.

    Each pair is rendered as ``('first', 'second')`` and the renderings are
    joined with ``', '``. The joined text is placed inside a one-element list
    before being interpolated, so what ends up in the result is that list's
    repr: square brackets around a quoted string.
    """
    rendered = []
    for pair in pairs:
        rendered.append("('{}', '{}')".format(pair[0], pair[1]))
    joined = ', '.join(rendered)

    # Formatting a list uses its repr, which quotes the joined string
    return "{} --- {}".format(prompt, [joined])

In [12]:
def print_prompts(chunks, prompt):
    """Print the prompt produced by ``format_prompt`` for every chunk, in order."""
    for pair_group in chunks:
        rendered = format_prompt(pair_group, prompt)
        print(rendered)

In [None]:
def print_prompts_single(chunks, sample_size, prompt):
    """Print the single-pair prompt for each chunk, ``sample_size`` times.

    Only the first word pair of each chunk is used; it fills the ``{word1}``
    and ``{word2}`` placeholders of ``prompt``.
    """
    for pair_group in chunks:
        for _repeat in range(sample_size):
            # Only the leading pair of the chunk is rendered
            first_pair = pair_group[0]
            print(prompt.format(word1=first_pair[0], word2=first_pair[1]))

In [None]:
def count_tokens_with_tiktoken(chunks, prompt):
    """Return the token count of the formatted prompt for each chunk.

    Relies on a module-level ``encoding`` object exposing ``encode``; it is
    not defined in this part of the notebook (presumably a tiktoken encoding,
    to be confirmed where it is created).
    """
    # Format -> tokenize -> count, once per chunk
    return [len(encoding.encode(format_prompt(chunk, prompt))) for chunk in chunks]

In [None]:
def count_tokens_with_tiktoken_single(chunks, prompt):
    """Return the token count of the single-pair prompt for each chunk.

    Only the first word pair of every chunk is used to fill the ``{word1}``
    and ``{word2}`` placeholders of ``prompt``. Relies on a module-level
    ``encoding`` object exposing ``encode``; it is not defined in this part
    of the notebook (presumably a tiktoken encoding, to be confirmed).
    """
    def _token_length(chunk):
        head = chunk[0]
        text = prompt.format(word1=head[0], word2=head[1])
        return len(encoding.encode(text))

    return [_token_length(chunk) for chunk in chunks]

In [None]:
def get_responses(chunks, prompt, model, sample_size, delay):
    """Query the chat-completions endpoint for every chunk, ``sample_size`` times each.

    Parameters
    ----------
    chunks : sequence
        Groups of word pairs; each group is rendered with ``format_prompt``.
    prompt : str
        Instruction text placed in front of the word pairs.
    model : str
        Model identifier passed to the API.
    sample_size : int
        Number of independent requests issued per chunk.
    delay : float
        Seconds to sleep after every request.

    Returns
    -------
    list of str
        Message content of the first choice of every completion, ordered by
        chunk and then by sample.

    Notes
    -----
    Uses a module-level ``client`` that is not defined in this part of the
    notebook (presumably an ``openai.OpenAI`` instance, to be confirmed).
    Any exception raised by the API call propagates to the caller.
    """
    # Initialize list to store responses
    responses = []

    # Start timing the total processing
    start_time_total = time.time()

    total_iterations = len(chunks) * sample_size

    # The context manager closes the progress bar even when a request raises,
    # so a failed run does not leave a dangling bar behind in the notebook.
    with tqdm(total=total_iterations, desc="Processing", unit="chunk") as pbar:
        # Collect responses for each chunk and each sample
        for chunk in chunks:
            for _ in range(sample_size):
                # Format the prompt
                formatted_prompt = format_prompt(chunk, prompt)
                messages = [{"role": "user", "content": formatted_prompt}]

                # Call the OpenAI API
                completion = client.chat.completions.create(
                    model=model,
                    messages=messages,
                    n=1,
                    stop=None)

                # Store the response content
                responses.append(completion.choices[0].message.content)

                # Update the progress bar after each sample
                pbar.update(1)

                # Delay after each API call
                time.sleep(delay)

    # Total processing time
    end_time_total = time.time()
    print(f"Total time taken: {end_time_total - start_time_total:.2f} seconds")

    return responses

In [None]:
def get_responses_conversational(chunks, prompt, user_content, assistant_content, model, sample_size, delay):
    """Query the chat-completions endpoint using a one-shot conversational layout.

    Every request consists of four messages: ``prompt`` as the system
    message, ``user_content`` and ``assistant_content`` as an example
    exchange, and finally the chunk itself as the user message.

    Parameters
    ----------
    chunks : sequence
        Groups of word pairs; each group is sent as its ``str()`` form
        wrapped in double quotes.
    prompt : str
        System message content.
    user_content : str
        Content of the example user turn.
    assistant_content : str
        Content of the example assistant turn.
    model : str
        Model identifier passed to the API.
    sample_size : int
        Number of independent requests issued per chunk.
    delay : float
        Seconds to sleep after every request.

    Returns
    -------
    list of str
        Message content of the first choice of every completion, ordered by
        chunk and then by sample.

    Notes
    -----
    Uses a module-level ``client`` that is not defined in this part of the
    notebook (presumably an ``openai.OpenAI`` instance, to be confirmed).
    Any exception raised by the API call propagates to the caller.
    """
    # Initialize list to store responses
    responses = []

    # Start timing the total processing
    start_time_total = time.time()

    total_iterations = len(chunks) * sample_size

    # The context manager closes the progress bar even when a request raises,
    # so a failed run does not leave a dangling bar behind in the notebook.
    with tqdm(total=total_iterations, desc="Processing", unit="chunk") as pbar:
        # Collect responses for each chunk and each sample
        for chunk in chunks:
            for _ in range(sample_size):
                # System instruction, one example exchange, then the actual chunk
                messages = [
                    {"role": "system", "content": prompt},
                    {"role": "user", "content": user_content},
                    {"role": "assistant", "content": assistant_content},
                    {"role": "user", "content": f'"{chunk}"'},
                    ]

                # Call the OpenAI API
                completion = client.chat.completions.create(
                    model=model,
                    messages=messages,
                    n=1,
                    stop=None)

                # Store the response content
                responses.append(completion.choices[0].message.content)

                # Update the progress bar after each sample
                pbar.update(1)

                # Delay after each API call
                time.sleep(delay)

    # Total processing time
    end_time_total = time.time()
    print(f"Total time taken: {end_time_total - start_time_total:.2f} seconds")

    return responses

In [None]:
def get_responses_single(prompt, chunks, model, sample_size, delay):
    """Query the chat-completions endpoint with one word pair per request.

    Note that the parameter order (``prompt`` before ``chunks``) differs from
    ``get_responses``.

    Parameters
    ----------
    prompt : str
        Template containing ``{word1}`` and ``{word2}`` placeholders.
    chunks : sequence
        Groups of word pairs; only the first pair of each group is used.
    model : str
        Model identifier passed to the API.
    sample_size : int
        Number of independent requests issued per chunk.
    delay : float
        Seconds to sleep after every request.

    Returns
    -------
    list of str
        Message content of the first choice of every completion, ordered by
        chunk and then by sample.

    Notes
    -----
    Uses a module-level ``client`` that is not defined in this part of the
    notebook (presumably an ``openai.OpenAI`` instance, to be confirmed).
    Any exception raised by the API call propagates to the caller.
    """
    # Initialize list to store responses
    responses = []

    # Start timing the total processing
    start_time_total = time.time()

    total_iterations = len(chunks) * sample_size

    # The context manager closes the progress bar even when a request raises,
    # so a failed run does not leave a dangling bar behind in the notebook.
    with tqdm(total=total_iterations, desc="Processing", unit="chunk") as pbar:
        # Collect responses for each chunk and each sample
        for chunk in chunks:
            for _ in range(sample_size):
                # Extract word pair from chunk
                word_pair = chunk[0]

                # Format the prompt
                formatted_message = prompt.format(word1=word_pair[0], word2=word_pair[1])
                messages = [{"role": "user", "content": formatted_message}]

                # Call the OpenAI API
                completion = client.chat.completions.create(
                    model=model,
                    messages=messages,
                    n=1,
                    stop=None)

                # Store the response content
                responses.append(completion.choices[0].message.content)

                # Update the progress bar after each sample
                pbar.update(1)

                # Delay after each API call
                time.sleep(delay)

    # Total processing time
    end_time_total = time.time()
    print(f"Total time taken: {end_time_total - start_time_total:.2f} seconds")

    return responses

In [None]:
# def get_responses_instruct(chunks, prompt, model, sample_size, delay):
#     # Initialize list to store responses
#     responses = [] 

#     # Start timing the total processing
#     start_time_total = time.time()

#     # Set up the progress bar
#     total_iterations = len(chunks) * sample_size
#     pbar = tqdm(total=total_iterations, desc="Processing", unit="chunk")

#     # Collect responses for each chunk and each sample
#     for chunk in chunks:
#         for _ in range(sample_size):
#             # Format the prompts
#             formatted_prompt = format_prompt(chunk, prompt)

#             # Call the OpenAI API
#             completion = client.completions.create(
#                 model=model,
#                 prompt=formatted_prompt,
#                 n=1,
#                 max_tokens=3000)

#             # Store the response content
#             responses.append(completion.choices[0].text)

#             # Update the progress bar after each sample
#             pbar.update(1)

#             # Delay after each API call
#             time.sleep(delay)

#     # Close the progress bar when done
#     pbar.close()

#     # Total processing time
#     end_time_total = time.time()
#     print(f"Total time taken: {end_time_total - start_time_total:.2f} seconds")

#     return responses

In [None]:
def process_responses(responses): # gpt-3.5-turbo-0125
    """Extract ``(word1, word2, score)`` triples from raw model responses.

    Parameters
    ----------
    responses : iterable of str
        Raw response texts expected to contain tuples such as
        ``('word1', 'word2', 0.5)``.

    Returns
    -------
    dict
        Maps ``(word1, word2)`` (with spaces removed from both words) to the
        list of float scores found for that pair, in order of appearance.
        A pair only appears in the result if at least one of its scores could
        be converted to ``float``; unconvertible scores are reported on
        stdout and skipped.
    """
    data_dict = {}

    # Define regex pattern to handle inconsistencies
    pattern = r"\(\s*'([^']+)'?\s*(?:,\s*)?'([^']+\s?[^']*?)'?\s*,\s*(['\"]?[\d,\.]+['\"]?)\s*\)" 
    # Earlier, stricter variant (comma between the words is mandatory), kept for reference:
    # pattern = r"\(\s*'([^']+)'?\s*,\s*'([^']+\s?[^']*?)'?\s*,\s*(['\"]?[\d,\.]+['\"]?)\s*\)"

    # Process each stored response
    for response in responses:
        # Normalize the response to reduce issues with inconsistent formatting
        normalized_response = response.replace("\n", "").replace("), ", "),")
        for word1, word2, score in re.findall(pattern, normalized_response):
            # Concatenate words separated by whitespace
            key = (word1.replace(" ", ""), word2.replace(" ", ""))
            # Remove quotes if the score is captured as a quoted string
            if score.startswith(("'", '"')) and score.endswith(("'", '"')):
                score = score[1:-1]
            # Replace comma with dot in the score string to handle decimal numbers
            score = score.replace(",", ".")
            # Drop any remaining non-numeric characters (e.g. an unmatched quote)
            score = re.sub(r'[^\d.]', '', score)
            try:
                value = float(score)
            except ValueError as e:
                # Log the error along with the problematic score for debugging
                print(f"Error converting '{score}' to float in the pair {key}: {e}")
                continue
            # Register the key only after a successful conversion, so a bad
            # score cannot leave an empty list behind in the result.
            data_dict.setdefault(key, []).append(value)

    return data_dict

In [None]:
def process_responses_instruct(responses):
    """Extract ``(word1, word2, score)`` triples from raw model responses.

    Variant of the response parser whose pattern accepts words and scores
    that are unquoted, single-quoted or double-quoted.

    Parameters
    ----------
    responses : iterable of str
        Raw response texts expected to contain tuples such as
        ``(word1, word2, 0.5)``.

    Returns
    -------
    dict
        Maps ``(word1, word2)`` to the list of float scores found for that
        pair, in order of appearance. A pair only appears in the result if at
        least one of its scores could be converted to ``float``;
        unconvertible scores are reported on stdout and skipped.
    """
    data_dict = {}

    # Define regex pattern to handle inconsistencies
    pattern = r"\(\s*'?\"?(\w+)'?\"?\s*,\s*'?\"?(\w+)'?\"?\s*,\s*'?\"?([\d\.]+)'?\"?\s*\)" # gpt-3.5-turbo-instruct

    # Process each stored response
    for response in responses:
        # Normalize the response to reduce issues with inconsistent formatting
        normalized_response = response.replace("\n", "").replace("), ", "),")
        for word1, word2, score in re.findall(pattern, normalized_response):
            # The capture groups are \w+ for the words and [\d\.]+ for the
            # score, so they can never contain spaces, quotes or commas and
            # need no further cleanup before conversion.
            key = (word1, word2)
            try:
                value = float(score)
            except ValueError as e:
                # Log the error along with the problematic score for debugging
                print(f"Error converting '{score}' to float in the pair {key}: {e}")
                continue
            # Register the key only after a successful conversion, so a bad
            # score cannot leave an empty list behind in the result.
            data_dict.setdefault(key, []).append(value)

    return data_dict

In [None]:
def process_responses_categorical(responses):
    """Extract ``(word1, word2, label)`` triples from raw model responses.

    Returns a dict mapping ``(word1, word2)`` (with spaces removed from both
    words) to the list of label strings found for that pair, in order of
    appearance. Labels are stored exactly as captured.
    """
    # Earlier, stricter variant (every field must be quoted), kept for reference:
    # pattern = r"\(\s*'([^']*)'\s*,\s*'([^']*)'\s*,\s*'([^']*)'\s*\)"
    pattern = r"\(\s*\'?([^']+?)\'?\s*,\s*\'?([^']+?)\'?\s*,\s*\'?([^']+?)\'?\s*\)"

    labels_by_pair = {}
    for raw_text in responses:
        # Flatten line breaks and tighten the separator between tuples
        flattened = raw_text.replace("\n", "")
        flattened = flattened.replace("), ", "),")

        for first, second, label in re.findall(pattern, flattened):
            # Spaces inside either word are dropped before building the key
            pair = (first.replace(" ", ""), second.replace(" ", ""))
            labels_by_pair.setdefault(pair, []).append(label)

    return labels_by_pair

In [None]:
def print_duplicate_word_pairs(cleaned_nl_simlex, data_dict):
    """Print the word pairs missing from, and repeated in, the extracted data.

    Both inputs are reduced to a single ``Combined_Columns`` column holding
    ``word1 + '_' + word2``. Two frames are printed: first the rows of
    ``cleaned_nl_simlex`` whose combined value is absent from the extracted
    data, then the extracted rows whose combined value occurs more than once.
    ``cleaned_nl_simlex`` itself is not modified (a copy is used).
    """
    ### Reference pairs from the original dataframe
    reference = cleaned_nl_simlex.copy()
    for column in ('word1', 'word2'):
        # Strip spaces so the values line up with the keys of data_dict
        reference[column] = reference[column].replace(" ", "", regex=True)
    reference['Combined_Columns'] = reference['word1'] + '_' + reference['word2']
    reference = reference[['Combined_Columns']]

    ### Pairs extracted from the responses
    extracted = create_dataframe(data_dict)
    extracted['Combined_Columns'] = extracted['word1'] + '_' + extracted['word2']
    extracted = extracted[['Combined_Columns']]

    ### Report
    # Reference pairs that never showed up in the extracted data
    is_covered = reference['Combined_Columns'].isin(extracted['Combined_Columns'])
    print(reference[~is_covered])

    # Extracted pairs whose combined value appears more than once
    is_repeated = extracted.duplicated(subset='Combined_Columns', keep=False)
    print(extracted[is_repeated])

In [None]:
# def get_and_process_chat_completion(chunks, prompt, model, sample_size, delay):
#     # Initialize a dictionary to store data
#     word_dict = {}

#     # Start timing the total processing
#     start_time_total = time.time()

#     # Set up the progress bar
#     total_iterations = len(chunks) * sample_size
#     pbar = tqdm(total=total_iterations, desc="Processing", unit="chunk")

#     # Collect results for each chunk and each sample
#     for chunk in chunks:
#         for sample_index in range(sample_size):
#             # Format the prompts
#             formatted_prompt = format_prompt(chunk, prompt)
#             messages = [{"role": "user", "content": formatted_prompt}]

#             # Call the OpenAI API
#             completion = client.chat.completions.create(
#                 model=model,
#                 messages=messages,
#                 n=1)

#             # Extract content and convert into list
#             content = completion.choices[0].message.content
#             print(content) # Validate
#             list_of_tuples = ast.literal_eval(content)
#             # list_of_tuples = parse_content(completion.choices[0].message.content)

#             # Process word pairs and scores into dictionary
#             for w1, w2, score in list_of_tuples:
#                 if (w1, w2) not in word_dict:
#                     dict[(w1, w2)] = {'word1': w1, 'word2': w2}
#                 dict[(w1, w2)][f'similarity_score_{sample_index+1}'] = score

#             # Update the progress bar after each sample
#             pbar.update(1)

#             # Delay after each API call
#             time.sleep(delay)
            
#     # Close the progress bar when done
#     pbar.close()

#     # Total processing time
#     end_time_total = time.time()
#     print(f"Total time taken: {end_time_total - start_time_total:.2f} seconds")

#     return word_dict

In [None]:
# def get_and_process_chat_completion(chunks, prompt, model, sample_size, delay):
#     # Initialize an empty Pandas DataFrame with word pairs
#     df = pd.DataFrame(columns=['word1', 'word2'])

#     # Start timing the total processing
#     start_time_total = time.time()

#     # Set up the progress bar
#     total_iterations = len(chunks) * sample_size
#     pbar = tqdm(total=total_iterations, desc="Processing", unit="chunk")

#     # Create columns for each sample
#     for sample_index in range(sample_size):
#         df[f'similarity_score_{sample_index+1}'] = None

#     # Collect results for each chunk and each sample
#     for chunk_index, chunk in enumerate(chunks):
#         for sample_index in range(sample_size):
#             # Format the prompts
#             formatted_prompt = format_prompt(chunk, prompt)
#             messages = [{"role": "user", "content": formatted_prompt}]

#             # Call the OpenAI API
#             completion = client.chat.completions.create(
#                 model=model,
#                 messages=messages,
#                 n=1)

#             # Extract content from completion
#             content = parse_content(completion.choices[0].message.content)
#             print(print)
            
#             # Process and add the scores to the DataFrame
#             for row_index, row in enumerate(content):
#                 if chunk_index == 0 and sample_index == 0:
#                     # For the first sample and first chunk, initialize the word pairs
#                     if row_index >= len(df):
#                         df = df.append({'word1': row[0], 'word2': row[1]}, ignore_index=True)
#                 df.loc[row_index, f'similarity_score_{sample_index+1}'] = row[2]
            
#             # Delay after each API call
#             time.sleep(delay)

#     # Close the progress bar when done
#     pbar.close()

#     # Total processing time
#     end_time_total = time.time()
#     print(f"Total time taken: {end_time_total - start_time_total:.2f} seconds")

#     return df

In [None]:
def create_dataframe(data_dict):
    """Build a wide DataFrame with one row per word pair.

    Parameters
    ----------
    data_dict : dict
        Maps ``(word1, word2)`` tuples to a list of scores.

    Returns
    -------
    pandas.DataFrame
        Columns ``word1``, ``word2`` followed by ``similarity_score_1`` ...
        ``similarity_score_k``, where ``k`` is the length of the longest
        score list. Shorter lists are padded with ``None``. An empty
        ``data_dict`` yields an empty frame with only ``word1`` and ``word2``.
    """
    # Determine the maximum number of scores for any word pair;
    # default=0 keeps an empty dict from raising ValueError.
    max_scores = max((len(scores) for scores in data_dict.values()), default=0)

    # Create DataFrame columns based on the maximum number of scores collected
    columns = ['word1', 'word2'] + [f'similarity_score_{i+1}' for i in range(max_scores)]

    # Pad every row with None so all rows match the column count
    data_list = [
        [word1, word2] + scores + [None] * (max_scores - len(scores))
        for (word1, word2), scores in data_dict.items()
    ]

    # Create DataFrame
    return pd.DataFrame(data_list, columns=columns)