### **0. Set-up**

In [1]:
# Import libraries and utils.
# Runs the shared utils notebook so that its definitions become available here.
# NOTE(review): no import statements are visible in this notebook, so the modules
# used below (os, json, pd, tiktoken, OpenAI, load_dotenv) and the helper functions
# (split_into_n_lists, print_prompts, get_responses, ...) presumably all come from
# this %run — confirm.
%run '../../utils.ipynb'

In [2]:
# Load environment variables (python-dotenv; presumably from a local .env file)
load_dotenv()

# Look up the OpenAI API key in the process environment (None when it is not set)
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')

# Create the API client.
# NOTE(review): OPENAI_API_KEY is not passed to the client; OpenAI() is called without
# arguments, so the client presumably picks the key up from the environment itself — confirm.
client = OpenAI()

In [3]:
# Read the SimLex-999 word pairs from the tab-separated text file
en_simlex = pd.read_csv("../../../data/dataset/en-simlex-999.txt", sep='\t')

# Optional: restrict to a subset for a quick trial run
# en_simlex = en_simlex.head(150)

# Build the list of (word1, word2) tuples, one per row of the frame
tuples_list = [(first, second) for first, second in zip(en_simlex['word1'], en_simlex['word2'])]

In [4]:
# Show results: display the loaded SimLex-999 frame as the cell output
en_simlex

Unnamed: 0,word1,word2,POS,SimLex999,conc(w1),conc(w2),concQ,Assoc(USF),SimAssoc333,SD(SimLex)
0,old,new,A,1.58,2.72,2.81,2,7.25,1,0.41
1,smart,intelligent,A,9.20,1.75,2.46,1,7.11,1,0.67
2,hard,difficult,A,8.77,3.76,2.21,2,5.94,1,1.19
3,happy,cheerful,A,9.55,2.56,2.34,1,5.85,1,2.18
4,hard,easy,A,0.95,3.76,2.07,2,5.82,1,0.93
...,...,...,...,...,...,...,...,...,...,...
994,join,acquire,V,2.85,2.86,2.93,2,0.00,0,0.99
995,send,attend,V,1.67,2.70,3.17,2,0.00,0,1.44
996,gather,attend,V,4.80,2.75,3.17,2,0.00,0,1.97
997,absorb,withdraw,V,2.97,3.11,3.04,2,0.00,0,1.75


### **1. Define and Evaluate Parameters**

In [5]:
# Define the prompt.
# The instruction text is assembled from its paragraphs in order: definition of
# synonyms, near-synonym examples, related-but-dissimilar examples, the 0-10 rating
# scale, a reminder, a note on intuition, and the required output format.
# Every segment except the last ends with a space, so joining on "" yields one
# continuous, single-spaced string.
prompt = "".join([
    "Two words are synonyms if they have very similar meanings. Synonyms represent the same type or category of thing. Here are some examples of synonym pairs: cup/mug, glasses/spectacles, envy/jealousy. ",
    "In practice, word pairs that are not exactly synonymous may still be very similar. Here are some very similar pairs - we could say they are nearly synonyms: alligator/crocodile, love/affection, frog/toad. ",
    "In contrast, although the following word pairs are related, they are not very similar. The words represent entirely different types of things: car/tyre, car/motorway, car/crash. ",
    "Rate the semantic similarity of each word pair on a scale from 0 to 10, where 0 represents no semantic similarity, and 10 represents perfect semantic similarity. ",
    "Remember, things that are related are not necessarily similar. If you are ever unsure, think back to the examples of synonymous pairs (glasses/spectacles), and consider how close the words are (or are not) to being synonymous. ",
    "There is no right answer to these questions. It is perfectly reasonable to use your intuition or gut feeling as a native English speaker, especially when you are asked to rate word pairs that you think are not similar at all. ",
    "The response should strictly adhere to the structure: [('word1', 'word2', <score>), ('word3', 'word4', <score>), ...]. Use two decimals. Do not provide additional explanations or context.",
])

In [6]:
# Model identifier handed to get_responses
model = "gpt-3.5-turbo-0125"

# Number of sublists (chunks) the word-pair list is split into
n_sublists = 50

# Sample size handed to get_responses; also the expected number of scores per word pair
sample_size = 15

# Delay between individual API calls (presumably in seconds — confirm in utils)
delay = 10.0

In [7]:
# Split the list of word-pair tuples into n_sublists chunks
chunks = split_into_n_lists(tuples_list, n_sublists)

# Count the number of lists (sanity check; presumably expected to equal n_sublists)
print(len(chunks))

50


In [8]:
# Print the prompts for each chunk, so the text that will be sent can be inspected
# before any API call is made
print_prompts(chunks, prompt)

Two words are synonyms if they have very similar meanings. Synonyms represent the same type or category of thing. Here are some examples of synonym pairs: cup/mug, glasses/spectacles, envy/jealousy. In practice, word pairs that are not exactly synonymous may still be very similar. Here are some very similar pairs - we could say they are nearly synonyms: alligator/crocodile, love/affection, frog/toad. In contrast, although the following word pairs are related, they are not very similar. The words represent entirely different types of things: car/tyre, car/motorway, car/crash. Rate the semantic similarity of each word pair on a scale from 0 to 10, where 0 represents no semantic similarity, and 10 represents perfect semantic similarity. Remember, things that are related are not necessarily similar. If you are ever unsure, think back to the examples of synonymous pairs (glasses/spectacles), and consider how close the words are (or are not) to being synonymous. There is no right answer to t

In [9]:
# Load the encoding
# NOTE(review): `encoding` is not passed to count_tokens_with_tiktoken below;
# presumably the helper reads it as a notebook-level global — confirm in utils.
encoding = tiktoken.get_encoding("cl100k_base")

# Count the tokens per chunk (one count per formatted prompt)
token_counts = count_tokens_with_tiktoken(chunks, prompt)
print("Token counts for each formatted prompt:", token_counts)

Token counts for each formatted prompt: [445, 454, 448, 454, 456, 448, 441, 446, 446, 448, 445, 442, 439, 444, 442, 451, 448, 446, 455, 447, 445, 452, 455, 443, 443, 445, 449, 437, 454, 446, 449, 448, 444, 450, 440, 444, 446, 447, 446, 443, 446, 439, 439, 450, 445, 442, 451, 446, 437, 440]


In [10]:
# Number of total tokens across all formatted prompts (one prompt per chunk).
# Reuse the counts computed in the previous cell instead of a pasted copy of its
# printed output, so the total cannot go stale when the data, prompt or chunking change.
# NOTE(review): this is the size of a single pass over the chunks; the saved run shows
# 750 progress steps for 50 chunks, so each prompt is presumably sent sample_size
# times — confirm before using this figure as a cost estimate.
# Total compute time: estimated at 2 hours 45 mins; the saved run reports 2:56:01.
tokens = token_counts
sum(tokens)

22306

### **2. Extract and Process Data**

In [11]:
# Process each chunk and get results using the OpenAI API.
# NOTE(review): this is the expensive step — the saved run shows 750 progress steps
# and 10561.54 seconds in total. The result lives only in memory until the next cell
# writes it to disk.
response = get_responses(chunks, prompt, model, sample_size, delay)

Processing:   0%|          | 0/750 [00:00<?, ?chunk/s]

Processing: 100%|██████████| 750/750 [2:56:01<00:00, 14.08s/chunk]  

Total time taken: 10561.54 seconds





In [12]:
# Define filepath
file_path = '../../../data/response/en/gpt-3.5-turbo-0125/f7.json'

# Save the raw responses without ever overwriting an earlier run.
# Mode 'x' (exclusive creation) makes the existence check and the file creation a
# single step: open() raises FileExistsError when the file is already there, so
# there is no gap between "check" and "write" in which the file could appear.
try:
    with open(file_path, 'x', encoding='utf-8') as f:
        json.dump(response, f)
except FileExistsError:
    print("File already exists. JSON was not saved to prevent overwriting.")
else:
    # Reported only after the with-block has closed (and flushed) the file
    print("File saved successfully.")

File saved successfully.


In [15]:
# Parse the raw responses (regular expressions) into a dictionary
data_dict = process_responses(response)

# Collect every entry whose number of values differs from the sample size
higher_lower_samples = {
    pair: scores
    for pair, scores in data_dict.items()
    if len(scores) != sample_size
}
print(higher_lower_samples)

{('actress', 'actor'): [9.0, 8.0, 9.0, 6.0, 9.0, 8.0, 8.0, 8.5, 9.0, 8.0, 8.0, 9.0, 8.0, 8.0], ('actor', 'actress'): [8.0]}


In [16]:
# Process data and print duplicate word pairs.
# The helper receives both the original SimLex frame and the parsed scores; the saved
# output shows two empty DataFrames (presumably one per input, meaning no duplicates).
print_duplicate_word_pairs(en_simlex, data_dict)

Empty DataFrame
Columns: [Combined_Columns]
Index: []
Empty DataFrame
Columns: [Combined_Columns]
Index: []


In [17]:
# Convert dict to Pandas DataFrame
# (the saved output shows one row per word pair and columns similarity_score_1..15)
df = create_dataframe(data_dict)

# Show results
df

Unnamed: 0,word1,word2,similarity_score_1,similarity_score_2,similarity_score_3,similarity_score_4,similarity_score_5,similarity_score_6,similarity_score_7,similarity_score_8,similarity_score_9,similarity_score_10,similarity_score_11,similarity_score_12,similarity_score_13,similarity_score_14,similarity_score_15
0,old,new,0.1,0.1,1.0,0.2,0.1,2.0,0.1,0.2,0.10,0.2,0.1,0.1,0.1,3.0,1.0
1,smart,intelligent,0.9,0.9,1.0,0.9,0.9,9.0,0.9,0.9,0.90,0.9,0.9,0.9,0.9,9.0,0.9
2,hard,difficult,0.3,0.8,0.5,0.7,0.8,7.0,0.8,0.7,0.20,0.9,0.2,0.4,0.9,6.0,0.8
3,happy,cheerful,0.8,0.7,0.2,0.6,0.7,8.0,0.8,0.8,0.90,0.8,0.9,0.8,0.7,6.0,0.7
4,hard,easy,0.1,0.1,0.2,0.1,0.1,3.0,0.1,0.4,0.10,0.1,0.1,0.2,0.1,2.0,0.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,join,acquire,1.0,9.0,0.3,0.1,1.0,0.0,0.0,1.0,0.20,1.0,0.0,0.1,0.0,1.0,0.7
996,send,attend,0.0,0.0,0.1,0.1,0.0,0.0,0.0,0.0,0.05,0.0,0.0,0.1,0.0,0.0,0.1
997,gather,attend,1.0,0.0,0.8,0.1,1.0,0.0,0.0,0.5,0.10,1.0,0.0,0.1,0.0,0.0,0.7
998,absorb,withdraw,0.0,0.0,0.2,0.1,0.0,0.0,0.0,0.0,0.10,0.0,0.0,0.1,0.0,0.0,0.3


In [18]:
# Tally the missing values in every column
count_null_values = df.isna().sum()
print("Null value counts per column:", count_null_values)

Null value counts per column: word1                  0
word2                  0
similarity_score_1     0
similarity_score_2     1
similarity_score_3     1
similarity_score_4     1
similarity_score_5     1
similarity_score_6     1
similarity_score_7     1
similarity_score_8     1
similarity_score_9     1
similarity_score_10    1
similarity_score_11    1
similarity_score_12    1
similarity_score_13    1
similarity_score_14    1
similarity_score_15    2
dtype: int64


In [19]:
# Flag the rows that contain at least one missing value, then show them
has_null = df.isna().any(axis='columns')
rows_with_null = df[has_null]
rows_with_null

Unnamed: 0,word1,word2,similarity_score_1,similarity_score_2,similarity_score_3,similarity_score_4,similarity_score_5,similarity_score_6,similarity_score_7,similarity_score_8,similarity_score_9,similarity_score_10,similarity_score_11,similarity_score_12,similarity_score_13,similarity_score_14,similarity_score_15
127,actress,actor,9.0,8.0,9.0,6.0,9.0,8.0,8.0,8.5,9.0,8.0,8.0,9.0,8.0,8.0,
140,actor,actress,8.0,,,,,,,,,,,,,,


In [20]:
# Manually fix inconsistencies: fill the missing 15th score of actress/actor with 8.00
is_actress_actor = df['word1'].eq('actress') & df['word2'].eq('actor')
df.loc[is_actress_actor, 'similarity_score_15'] = 8.00

# Display the patched row to verify the value
df.loc[is_actress_actor]

Unnamed: 0,word1,word2,similarity_score_1,similarity_score_2,similarity_score_3,similarity_score_4,similarity_score_5,similarity_score_6,similarity_score_7,similarity_score_8,similarity_score_9,similarity_score_10,similarity_score_11,similarity_score_12,similarity_score_13,similarity_score_14,similarity_score_15
127,actress,actor,9.0,8.0,9.0,6.0,9.0,8.0,8.0,8.5,9.0,8.0,8.0,9.0,8.0,8.0,8.0


In [21]:
# Remove the faulty actor/actress row by keeping every row that does not match it
is_actor_actress = df['word1'].eq('actor') & df['word2'].eq('actress')
df = df[~is_actor_actress]
df

Unnamed: 0,word1,word2,similarity_score_1,similarity_score_2,similarity_score_3,similarity_score_4,similarity_score_5,similarity_score_6,similarity_score_7,similarity_score_8,similarity_score_9,similarity_score_10,similarity_score_11,similarity_score_12,similarity_score_13,similarity_score_14,similarity_score_15
0,old,new,0.1,0.1,1.0,0.2,0.1,2.0,0.1,0.2,0.10,0.2,0.1,0.1,0.1,3.0,1.0
1,smart,intelligent,0.9,0.9,1.0,0.9,0.9,9.0,0.9,0.9,0.90,0.9,0.9,0.9,0.9,9.0,0.9
2,hard,difficult,0.3,0.8,0.5,0.7,0.8,7.0,0.8,0.7,0.20,0.9,0.2,0.4,0.9,6.0,0.8
3,happy,cheerful,0.8,0.7,0.2,0.6,0.7,8.0,0.8,0.8,0.90,0.8,0.9,0.8,0.7,6.0,0.7
4,hard,easy,0.1,0.1,0.2,0.1,0.1,3.0,0.1,0.4,0.10,0.1,0.1,0.2,0.1,2.0,0.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,join,acquire,1.0,9.0,0.3,0.1,1.0,0.0,0.0,1.0,0.20,1.0,0.0,0.1,0.0,1.0,0.7
996,send,attend,0.0,0.0,0.1,0.1,0.0,0.0,0.0,0.0,0.05,0.0,0.0,0.1,0.0,0.0,0.1
997,gather,attend,1.0,0.0,0.8,0.1,1.0,0.0,0.0,0.5,0.10,1.0,0.0,0.1,0.0,0.0,0.7
998,absorb,withdraw,0.0,0.0,0.2,0.1,0.0,0.0,0.0,0.0,0.10,0.0,0.0,0.1,0.0,0.0,0.3


In [73]:
# Re-check for rows that still contain at least one missing value
has_null = df.isna().any(axis='columns')
rows_with_null = df[has_null]
rows_with_null

Unnamed: 0,word1,word2,similarity_score_1,similarity_score_2,similarity_score_3,similarity_score_4,similarity_score_5,similarity_score_6,similarity_score_7,similarity_score_8,similarity_score_9,similarity_score_10,similarity_score_11,similarity_score_12,similarity_score_13,similarity_score_14,similarity_score_15


In [74]:
# Show the current frame.
# NOTE(review): the output saved under this cell lists Dutch word pairs
# (e.g. hard/moeilijk), unlike the English pairs shown for df in the other cells,
# and the execution counts here (73, 74) are out of sequence — the output looks
# stale; restart the kernel and re-run all cells before relying on it.
df

Unnamed: 0,word1,word2,similarity_score_1,similarity_score_2,similarity_score_3,similarity_score_4,similarity_score_5,similarity_score_6,similarity_score_7,similarity_score_8,similarity_score_9,similarity_score_10,similarity_score_11,similarity_score_12,similarity_score_13,similarity_score_14,similarity_score_15
0,slim,intelligent,8.76,9.50,9.50,9.60,9.50,8.50,7.84,7.50,9.50,6.00,7.50,8.50,9.00,9.50,9.20
1,hard,moeilijk,3.01,5.00,6.00,2.00,6.00,3.00,4.83,2.50,3.00,3.00,4.00,2.50,6.00,5.00,2.00
2,gelukkig,vrolijk,6.54,7.50,8.00,8.50,9.00,7.00,7.39,7.50,7.50,7.00,8.50,7.50,8.00,8.50,7.50
3,hard,stoer,2.56,3.00,3.00,1.00,5.00,6.00,2.35,3.00,4.00,4.00,4.00,4.50,7.00,6.50,2.00
4,snel,razendsnel,9.43,9.00,9.50,9.50,8.00,9.00,9.17,8.50,8.50,9.00,8.50,8.50,9.00,8.75,9.30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
992,samenvoegen,verwerven,0.41,0.55,3.88,0.69,0.14,0.51,4.00,0.47,3.36,1.70,2.76,0.20,4.69,1.90,0.69
993,sturen,bijwonen,0.05,0.03,0.00,0.06,0.03,0.06,0.00,0.00,0.42,0.06,0.30,0.00,0.05,0.04,0.00
994,verzamelen,bijwonen,0.17,0.46,0.00,0.29,0.14,0.30,2.62,0.57,0.78,1.06,0.11,0.25,0.09,0.75,0.11
995,opnemen,intrekken,0.22,0.21,0.00,0.31,0.12,0.50,1.67,0.18,1.05,0.22,0.30,0.05,0.64,1.28,0.09


In [22]:
# Define file_path
file_path = '../../../data/prompt/en/gpt-3.5-turbo-0125/f7.csv'

# Save the cleaned scores without ever overwriting an earlier export.
# Opening with mode 'x' (exclusive creation) makes the existence check and the file
# creation a single step: open() raises FileExistsError when the file is already there.
# newline='' lets pandas control the line endings itself, as its documentation asks
# for when a file object is passed; utf-8 matches to_csv's default encoding.
try:
    with open(file_path, 'x', newline='', encoding='utf-8') as f:
        df.to_csv(f, index=False)
except FileExistsError:
    print("File already exists. Dataframe was not saved to prevent overwriting.")
else:
    # Reported only after the with-block has closed (and flushed) the file
    print("File saved successfully.")

File saved successfully.
