### **0. Set-up**

In [1]:
# Import libraries and utils
%run '../../utils.ipynb'

In [2]:
# Load variables from a local .env file (if any) and read the API key
load_dotenv()
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

# Fail fast with a clear message rather than erroring later, once the API run has started
if not OPENAI_API_KEY:
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Define it in the environment or in a .env file."
    )

# Set client, passing the key explicitly so the value read above is the one actually used
# NOTE(review): `client` is not passed to the helper calls below; presumably the helpers
# loaded from utils.ipynb read it as a global — confirm.
client = OpenAI(api_key=OPENAI_API_KEY)

In [3]:
# Read the tab-separated dataset and keep only the rows at positions 666 up to (not including) 999
en_simlex = pd.read_csv("../../../data/dataset/en-simlex-999.txt", sep='\t').iloc[666:999]

# Build the list of (word1, word2) tuples, preserving row order
tuples_list = [
    (first, second)
    for first, second in zip(en_simlex['word1'], en_simlex['word2'])
]

In [4]:
# Display the selected subset as the cell output (visual check of the slice taken above)
en_simlex

Unnamed: 0,word1,word2,POS,SimLex999,conc(w1),conc(w2),concQ,Assoc(USF),SimAssoc333,SD(SimLex)
666,boy,soldier,N,2.15,4.76,4.72,4,0.13,0,1.51
667,belly,abdomen,N,8.13,4.80,4.70,4,0.13,0,1.53
668,guy,girl,N,3.33,4.68,4.85,4,0.13,0,1.90
669,bed,chair,N,3.50,5.00,4.58,4,0.13,0,1.26
670,clothes,jacket,N,5.15,4.76,4.86,4,0.13,0,0.74
...,...,...,...,...,...,...,...,...,...,...
994,join,acquire,V,2.85,2.86,2.93,2,0.00,0,0.99
995,send,attend,V,1.67,2.70,3.17,2,0.00,0,1.44
996,gather,attend,V,4.80,2.75,3.17,2,0.00,0,1.97
997,absorb,withdraw,V,2.97,3.11,3.04,2,0.00,0,1.75


### **1. Define and Evaluate Parameters**

In [5]:
# Prompt template. `{word1}` and `{word2}` are placeholders to be filled in per word pair;
# the wording is kept exactly as-is so that results stay comparable across runs.
prompt = (
    "Rate the semantic similarity of the word pair: [('{word1}'), ('{word2}')] "
    "on a scale from 0 to 10, where 0 represents no semantic similarity, "
    "and 10 represents perfect semantic similarity. Use two decimals. "
    "The response should strictly adhere to the structure: "
    "[('word1', 'word2', <score>)]. "
    "Do not provide additional explanations or context."
)

In [6]:
# --- Run configuration ---

# Number of sublists the word-pair list is split into
# NOTE(review): presumably chosen to match the number of selected word pairs — confirm.
n_sublists = 333

# Sample size used for each chunk
sample_size = 15

# Delay between individual API calls
delay = 5.0

# Model identifier used for the API requests
model = "gpt-3.5-turbo-0125"

In [7]:
# Split the list of word-pair tuples into `n_sublists` sublists
# (helper loaded from utils.ipynb)
chunks = split_into_n_lists(tuples_list, n_sublists)

# Count the number of lists (sanity check: compare with n_sublists)
print(len(chunks))

333


In [8]:
# Print the prompts for each chunk so the formatted text can be inspected before
# any API call is made.
# NOTE(review): the output appears to repeat each chunk's prompt (presumably once per
# sample) — confirm against the helper in utils.ipynb.
print_prompts_single(chunks, sample_size, prompt)

Rate the semantic similarity of the word pair: [('boy'), ('soldier')] on a scale from 0 to 10, where 0 represents no semantic similarity, and 10 represents perfect semantic similarity. Use two decimals. The response should strictly adhere to the structure: [('word1', 'word2', <score>)]. Do not provide additional explanations or context.
Rate the semantic similarity of the word pair: [('boy'), ('soldier')] on a scale from 0 to 10, where 0 represents no semantic similarity, and 10 represents perfect semantic similarity. Use two decimals. The response should strictly adhere to the structure: [('word1', 'word2', <score>)]. Do not provide additional explanations or context.
Rate the semantic similarity of the word pair: [('boy'), ('soldier')] on a scale from 0 to 10, where 0 represents no semantic similarity, and 10 represents perfect semantic similarity. Use two decimals. The response should strictly adhere to the structure: [('word1', 'word2', <score>)]. Do not provide additional explanat

In [9]:
# Load the cl100k_base encoding from tiktoken
# NOTE(review): `encoding` is not passed to the helper below; presumably the helper
# reads it as a global — confirm before removing or renaming this variable.
encoding = tiktoken.get_encoding("cl100k_base")

# Count the tokens of the formatted prompt for each chunk and print the list of counts
token_counts = count_tokens_with_tiktoken_single(chunks, prompt)
print("Token counts for each formatted prompt:", token_counts)

Token counts for each formatted prompt: [75, 77, 75, 74, 76, 74, 74, 75, 75, 75, 74, 75, 75, 76, 74, 75, 75, 75, 74, 75, 74, 74, 75, 75, 74, 75, 75, 76, 74, 74, 75, 75, 74, 74, 74, 75, 75, 75, 75, 76, 75, 74, 74, 75, 74, 75, 74, 76, 74, 74, 74, 75, 75, 77, 75, 75, 74, 75, 75, 75, 75, 74, 76, 74, 74, 74, 75, 76, 75, 75, 74, 76, 75, 76, 74, 74, 74, 76, 77, 76, 75, 74, 76, 75, 74, 74, 74, 75, 75, 75, 75, 75, 77, 74, 76, 74, 74, 74, 74, 75, 75, 76, 74, 74, 75, 75, 76, 77, 76, 75, 75, 74, 75, 74, 76, 76, 76, 74, 75, 75, 75, 74, 74, 74, 74, 74, 74, 74, 74, 76, 77, 74, 75, 74, 74, 74, 75, 75, 76, 75, 75, 75, 75, 76, 74, 74, 75, 75, 74, 75, 75, 75, 75, 76, 75, 75, 74, 74, 75, 74, 74, 74, 74, 75, 75, 75, 74, 75, 75, 74, 75, 74, 75, 75, 76, 74, 74, 74, 74, 74, 75, 74, 74, 74, 75, 74, 77, 74, 74, 77, 74, 75, 74, 74, 74, 75, 74, 74, 75, 75, 76, 74, 75, 74, 74, 74, 76, 76, 76, 75, 76, 76, 76, 77, 75, 75, 74, 76, 74, 74, 76, 74, 75, 76, 76, 75, 75, 75, 75, 75, 74, 74, 74, 75, 75, 75, 76, 74, 75, 74,

In [10]:
# Total number of API requests: one per formatted prompt, repeated `sample_size` times.
# Compare the result against the daily request limit (Max RPD = 10.000, i.e. 10,000).
# `sample_size` is used instead of a hard-coded 15 so the figure follows the configuration,
# and the counts are multiplied directly rather than building a repeated list to measure it.
len(token_counts) * sample_size

4995

In [11]:
# Sum of the per-prompt token counts computed above.
# NOTE(review): this looks like the total for a single pass over the prompts; if every
# prompt is sent `sample_size` times, the tokens actually submitted are a multiple of
# this value — confirm.
print(sum(token_counts))

24915


### **2. Extract and Process Data**

In [12]:
# Process each chunk and get results using the OpenAI API
# NOTE(review): long-running cell — the saved output of a previous run reports roughly
# 11 hours for 4995 requests, so avoid re-running it unintentionally.
response = get_responses_single(prompt, chunks, model, sample_size, delay)

Processing:   0%|          | 0/4995 [00:00<?, ?chunk/s]

Processing: 100%|██████████| 4995/4995 [11:11:19<00:00,  8.06s/chunk]   

Total time taken: 40279.81 seconds





In [13]:
# Define filepath
file_path = '../../../data/response/en/gpt-3.5-turbo-0125/f9-3.json'

# Serialize before touching the filesystem: if `response` cannot be serialized, no empty
# or partial file is left behind that would later block saving under the no-overwrite rule.
serialized_response = json.dumps(response)

# Exclusive-create mode ('x') makes "check that the file is absent" and "create it" a single
# atomic step, so an existing file can never be overwritten.
try:
    with open(file_path, 'x', encoding='utf-8') as f:
        f.write(serialized_response)
except FileExistsError:
    print("File already exists. JSON was not saved to prevent overwriting.")
else:
    # Reported only after the file has been written and closed
    print("File saved successfully.")

File saved successfully.


In [14]:
# Parse the raw responses into a dictionary (regular-expression extraction, done by the helper)
data_dict = process_responses(response)

# Keep only the entries whose number of values differs from the sample size
# (either more or fewer); an empty dict means every entry has exactly `sample_size` values
higher_lower_samples = {
    key: values
    for key, values in data_dict.items()
    if len(values) != sample_size
}
print(higher_lower_samples)

{}


In [15]:
# Process data and print duplicate word pairs
# (helper loaded from utils.ipynb; it receives both the source dataframe and the
# extracted dictionary — TODO confirm exactly what it compares)
print_duplicate_word_pairs(en_simlex, data_dict)

Empty DataFrame
Columns: [Combined_Columns]
Index: []
Empty DataFrame
Columns: [Combined_Columns]
Index: []


In [16]:
# Convert the extracted dictionary to a Pandas DataFrame (helper loaded from utils.ipynb)
df = create_dataframe(data_dict)

# Show results as the cell output
df

Unnamed: 0,word1,word2,similarity_score_1,similarity_score_2,similarity_score_3,similarity_score_4,similarity_score_5,similarity_score_6,similarity_score_7,similarity_score_8,similarity_score_9,similarity_score_10,similarity_score_11,similarity_score_12,similarity_score_13,similarity_score_14,similarity_score_15
0,boy,soldier,3.25,6.25,6.50,3.56,3.50,3.20,3.25,3.50,2.50,6.25,3.50,3.60,6.25,3.50,3.25
1,belly,abdomen,8.00,7.50,8.00,9.00,8.00,8.50,9.00,9.50,8.00,8.00,8.50,9.00,8.00,8.00,9.50
2,guy,girl,6.50,7.50,2.50,6.50,7.50,7.50,8.00,6.50,7.50,8.50,6.50,7.50,8.80,8.20,7.50
3,bed,chair,2.50,0.21,1.50,2.50,2.00,1.70,3.00,2.50,2.00,1.25,2.50,0.15,2.00,2.50,2.50
4,clothes,jacket,6.25,6.50,6.50,5.50,6.50,6.00,6.50,6.25,7.00,6.50,6.20,7.00,6.50,6.00,6.50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
328,join,acquire,3.04,1.80,2.50,2.85,0.15,1.25,0.19,0.06,1.35,0.06,0.22,2.00,2.25,3.85,2.50
329,send,attend,1.00,2.50,0.33,0.21,2.00,1.00,0.10,1.43,2.50,1.00,2.00,2.50,1.25,1.00,1.00
330,gather,attend,0.22,2.00,2.33,2.00,3.33,2.50,0.19,0.33,0.20,0.55,1.20,2.50,0.20,0.10,2.50
331,absorb,withdraw,1.61,0.16,1.67,0.20,0.20,2.14,0.29,0.22,2.14,0.00,1.85,1.50,1.00,0.10,0.01


In [17]:
# Tally the missing values in every column of the results dataframe
count_null_values = df.isna().sum()
print("Null value counts per column:", count_null_values)

Null value counts per column: word1                  0
word2                  0
similarity_score_1     0
similarity_score_2     0
similarity_score_3     0
similarity_score_4     0
similarity_score_5     0
similarity_score_6     0
similarity_score_7     0
similarity_score_8     0
similarity_score_9     0
similarity_score_10    0
similarity_score_11    0
similarity_score_12    0
similarity_score_13    0
similarity_score_14    0
similarity_score_15    0
dtype: int64


In [18]:
# Select every row that contains at least one missing value and display it
rows_with_null = df.loc[df.isna().any(axis="columns")]
rows_with_null

Unnamed: 0,word1,word2,similarity_score_1,similarity_score_2,similarity_score_3,similarity_score_4,similarity_score_5,similarity_score_6,similarity_score_7,similarity_score_8,similarity_score_9,similarity_score_10,similarity_score_11,similarity_score_12,similarity_score_13,similarity_score_14,similarity_score_15


In [19]:
# Define file_path
file_path = '../../../data/prompt/en/gpt-3.5-turbo-0125/f9-3.csv'

# Exclusive-create mode ('x') makes "check that the file is absent" and "create it" a single
# atomic step, so an existing file can never be overwritten.
# newline='' is required when handing an open text handle to DataFrame.to_csv.
try:
    with open(file_path, 'x', newline='', encoding='utf-8') as f:
        df.to_csv(f, index=False)
except FileExistsError:
    print("File already exists. Dataframe was not saved to prevent overwriting.")
else:
    # Reported only after the file has been written and closed
    print("File saved successfully.")

File saved successfully.
