### **0. Set-up**

In [1]:
# Import libraries and utils
%run '../../utils.ipynb'

In [2]:
# Pull variables from a local .env file into the process environment,
# then read the OpenAI key from it
load_dotenv()
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')

# Build the API client, handing it the key explicitly
# (when the key is None the client falls back to the same environment variable)
client = OpenAI(api_key=OPENAI_API_KEY)

In [3]:
# Read the cleaned Dutch SimLex-999 dataset
dataset_path = "../../../data/dataset/cleaned-nl-simlex-999.csv"
cleaned_nl_simlex = pd.read_csv(dataset_path)

# Optional: restrict to a subset while experimenting
# cleaned_nl_simlex = cleaned_nl_simlex.head(160)

# Build one plain (word1, word2) tuple per row
word_columns = cleaned_nl_simlex[['word1', 'word2']]
tuples_list = list(word_columns.itertuples(index=False, name=None))

In [4]:
# Display the (word1, word2) tuples built from the dataset in the previous cell
tuples_list

[('oud', 'nieuw'),
 ('slim', 'intelligent'),
 ('hard', 'moeilijk'),
 ('gelukkig', 'vrolijk'),
 ('hard', 'stoer'),
 ('snel', 'razendsnel'),
 ('gelukkig', 'blij'),
 ('kort', 'lang'),
 ('dom', 'stom'),
 ('vreemd', 'eigenaardig'),
 ('breed', 'smal'),
 ('makkelijk', 'moeilijk'),
 ('moeilijk', 'gemakkelijk'),
 ('slim', 'dom'),
 ('krankzinnig', 'gek'),
 ('gelukkig', 'kwaad'),
 ('uitgebreid', 'groot'),
 ('moeilijk', 'simpel'),
 ('nieuw', 'vers'),
 ('scherp', 'saai'),
 ('vlug', 'snel'),
 ('dom', 'dwaas'),
 ('prachtig', 'fantastisch'),
 ('eigenaardig', 'vreemd'),
 ('gelukkig', 'boos'),
 ('smal', 'breed'),
 ('eenvoudig', 'gemakkelijk'),
 ('oud', 'vers'),
 ('kennelijk', 'duidelijk'),
 ('betaalbaar', 'goedkoop'),
 ('leuk', 'grootmoedig'),
 ('raar', 'vreemd'),
 ('vreemd', 'normaal'),
 ('slecht', 'immoreel'),
 ('verdrietig', 'grappig'),
 ('prachtig', 'geweldig'),
 ('schuldig', 'beschaamd'),
 ('mooi', 'prachtig'),
 ('zelfverzekerd', 'zeker'),
 ('dom', 'onderontwikkeld'),
 ('groot', 'flexibel'),
 ('aar

### **1. Define and Evaluate Parameters**

In [5]:
# Instruction text (Dutch) placed in front of every batch of word pairs.
# It asks for a similarity score from 0 to 10 with two decimals, returned
# strictly as a list of ('word1', 'word2', <score>) tuples without explanation.
prompt = "".join([
    "Beoordeel de semantische gelijkenis van elk woordpaar op een schaal van 0 tot 10, ",
    "waarbij 0 geen semantische gelijkenis vertegenwoordigt, en 10 perfecte semantische gelijkenis. ",
    "Gebruik twee decimalen. Het antwoord moet strikt voldoen aan de structuur: ",
    "[('woord1', 'woord2', <score>), ('woord3', 'woord4', <score>),] ",
    " Geef geen extra uitleg of context.",
])

In [6]:
# --- Request settings ---
# Model identifier sent with every API call
model = "gpt-3.5-turbo-0125"
# Seconds to wait between individual API calls
delay = 15.0

# --- Sampling settings ---
# Number of samples collected per word pair
sample_size = 15
# Number of sublists the word-pair list is split into
n_sublists = 25

In [7]:
# Split the word-pair tuples into n_sublists sublists
# (split_into_n_lists is not defined in this notebook; presumably it comes from utils.ipynb)
chunks = split_into_n_lists(tuples_list, n_sublists)

# Sanity check: print how many sublists were produced
print(len(chunks))

25


In [322]:
# Print the prompt for each chunk so the exact text can be inspected
# (print_prompts is defined outside this notebook; presumably in utils.ipynb)
print_prompts(chunks, prompt)

Beoordeel de semantische gelijkenis van elk woordpaar met een score op een schaal van 0 tot 10, waarbij 0 geen semantische gelijkenis vertegenwoordigt, en 10 perfecte semantische gelijkenis. Gebruik twee decimalen. Het antwoord moet strikt voldoen aan de structuur: [(woord1, woord2, <score>), (woord3, woord4, <score>),]  Geef geen extra uitleg of context. --- ('oud', 'nieuw'), ('slim', 'intelligent'), ('hard', 'moeilijk'), ('gelukkig', 'vrolijk'), ('hard', 'stoer'), ('snel', 'razendsnel'), ('gelukkig', 'blij'), ('kort', 'lang'), ('dom', 'stom'), ('vreemd', 'eigenaardig'), ('breed', 'smal'), ('makkelijk', 'moeilijk'), ('moeilijk', 'gemakkelijk'), ('slim', 'dom'), ('krankzinnig', 'gek'), ('gelukkig', 'kwaad'), ('uitgebreid', 'groot'), ('moeilijk', 'simpel'), ('nieuw', 'vers'), ('scherp', 'saai'), ('vlug', 'snel'), ('dom', 'dwaas'), ('prachtig', 'fantastisch'), ('eigenaardig', 'vreemd'), ('gelukkig', 'boos'), ('smal', 'breed'), ('eenvoudig', 'gemakkelijk'), ('oud', 'vers'), ('kennelijk', 

In [323]:
# Tokenizer encoding used for counting
# NOTE(review): `encoding` is not referenced again in the visible cells; presumably the
# helper below reads it from the shared namespace - confirm before removing.
encoding = tiktoken.get_encoding("cl100k_base")

# One token count per chunk, for the instruction text combined with that chunk
token_counts = count_tokens_with_tiktoken(chunks, prompt)

# Report the counts
print(f"Token counts for each formatted prompt: {token_counts}")

Token counts for each formatted prompt: [489, 481, 476, 450, 444, 452, 447, 453, 464, 459, 456, 451, 472, 467, 491, 442, 458, 479, 469, 477, 498, 492, 501, 492, 498]


### **2. Extract and Process Data**

In [325]:
# Get results from API, reusing the saved responses when they are available.
# `response` is read by the save cell and by process_responses below, so it must be
# assigned here; with the call commented out those cells fail on a fresh kernel.
# The recorded run made 375 calls and took well over an hour, so the API is only
# queried when no saved copy exists.
response_cache_path = '../../../data/response/nl/gpt-3.5-turbo-0125/f1.json'

if os.path.exists(response_cache_path):
    # Reload the raw responses written by the save cell
    # NOTE(review): assumes the responses survive a JSON round trip unchanged - confirm
    with open(response_cache_path) as f:
        response = json.load(f)
else:
    response = get_responses(chunks, prompt, model, sample_size, delay)

Processing:   0%|          | 0/375 [00:00<?, ?chunk/s]

Processing: 100%|██████████| 375/375 [1:25:17<00:00, 13.65s/chunk]

Total time taken: 5117.52 seconds





In [15]:
# Define filepath for the raw API responses
file_path = '../../../data/response/nl/gpt-3.5-turbo-0125/f1.json'

# Only write when the file does not exist yet, so earlier results are never overwritten
if not os.path.exists(file_path):
    # Serialize before opening the file: if `response` is undefined or cannot be
    # serialized, the error is raised before the file is created. Opening first would
    # leave an empty file behind, and every later run would then refuse to save.
    serialized_response = json.dumps(response)
    with open(file_path, 'w') as f:
        f.write(serialized_response)
    print("File saved successfully.")
else:
    print("File already exists. JSON was not saved to prevent overwriting.")

In [343]:
# Parse the raw responses into a dictionary keyed by word pair
data_dict = process_responses(response)

# Collect every word pair whose number of scores differs from the sample size
higher_lower_samples = {}
for pair, scores in data_dict.items():
    if len(scores) != sample_size:
        higher_lower_samples[pair] = scores

# Show the deviating word pairs
print(higher_lower_samples)

{('rondzwerven', 'dwalen'): [9.0, 8.0, 6.0, 5.5, 7.0, 8.0, 9.0, 6.0, 7.0, 6.0, 8.0, 6.0, 6.0, 6.0], ('slagen', 'falen'): [3.0, 2.0, 4.0, 5.5, 3.0, 8.0, 3.0, 1.0, 2.0, 2.0, 6.0, 3.0, 3.0, 7.0], ('besteden', 'redden'): [3.0, 2.0, 1.5, 2.0, 2.0, 2.0, 1.0, 2.0, 2.0, 1.0, 1.0, 1.0, 2.0, 2.0], ('vertrekken', 'gaan'): [7.0, 7.0, 7.0, 6.0, 7.0, 7.0, 6.0, 6.0, 7.0, 6.0, 7.5, 4.0, 4.0, 6.0], ('komen', 'bijwonen'): [3.0, 2.0, 3.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 1.0, 2.0, 2.0, 3.0], ('weten', 'geloven'): [5.0, 2.5, 5.0, 4.0, 4.0, 6.0, 5.0, 4.0, 4.0, 4.0, 5.0, 2.0, 6.0, 6.0], ('verzamelen', 'ontmoeten'): [4.0, 3.0, 6.5, 3.5, 5.0, 5.0, 6.0, 5.0, 7.0, 4.0, 3.5, 4.0, 4.0, 5.0], ('maken', 'verdienen'): [1.0, 2.0, 2.0, 1.5, 3.0, 2.0, 1.0, 3.0, 2.0, 1.0, 1.5, 1.0, 3.0, 3.0], ('vergeten', 'negeren'): [7.0, 4.0, 6.5, 5.0, 5.0, 7.0, 7.0, 7.0, 6.0, 6.0, 8.5, 4.0, 5.0, 4.0], ('vermenigvuldigen', 'toevoegen'): [2.0, 1.5, 4.5, 2.5, 4.0, 5.0, 4.5, 4.0, 3.0, 4.0, 2.0, 3.5, 2.0, 3.0], ('krimpen', 'groeien'): [

In [52]:
# Print duplicate word pairs for the dataset and for the processed responses
# (print_duplicate_word_pairs is defined outside this notebook; presumably in utils.ipynb)
print_duplicate_word_pairs(cleaned_nl_simlex, data_dict)

Empty DataFrame
Columns: [Combined_Columns]
Index: []
Empty DataFrame
Columns: [Combined_Columns]
Index: []


In [37]:
# Convert the word-pair -> scores dictionary to a Pandas DataFrame
# (create_dataframe is defined outside this notebook; presumably in utils.ipynb)
df = create_dataframe(data_dict)

# Display the resulting DataFrame
df

Unnamed: 0,word1,word2,similarity_score_1,similarity_score_2,similarity_score_3,similarity_score_4,similarity_score_5,similarity_score_6,similarity_score_7,similarity_score_8,similarity_score_9,similarity_score_10,similarity_score_11,similarity_score_12,similarity_score_13,similarity_score_14,similarity_score_15
0,oud,nieuw,0.1,0.10,0.01,0.00,0.05,0.2,0.10,0.10,0.0,0.05,0.0,2.0,0.03,0.10,0.18
1,slim,intelligent,0.9,0.95,0.95,8.00,9.50,0.9,0.95,0.95,0.4,0.95,9.0,9.0,0.97,0.95,0.79
2,hard,moeilijk,0.4,0.20,0.10,2.00,2.00,0.1,0.05,0.20,0.1,0.10,2.0,3.0,0.00,0.05,0.39
3,gelukkig,vrolijk,0.8,0.80,0.80,6.00,8.50,0.7,0.85,0.85,0.6,0.80,8.0,8.0,0.83,0.80,0.76
4,hard,stoer,0.7,0.40,0.45,2.00,6.00,0.5,0.05,0.65,0.4,0.20,4.0,6.0,0.61,0.50,0.44
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
992,samenvoegen,verwerven,3.0,1.50,2.11,2.06,4.00,3.0,3.00,4.00,3.0,2.00,4.8,1.0,1.00,5.50,4.00
993,sturen,bijwonen,0.0,1.00,1.56,0.00,1.00,0.0,0.00,1.00,1.0,1.00,0.0,1.0,1.00,0.00,1.00
994,verzamelen,bijwonen,1.0,1.00,1.56,0.00,5.00,0.0,3.00,3.00,1.0,2.00,0.0,1.0,4.00,2.50,1.33
995,opnemen,intrekken,1.0,2.00,1.11,0.00,1.00,0.0,1.00,2.00,2.0,1.00,0.0,1.0,2.00,0.00,1.00


In [38]:
# Number of missing values in each column
count_null_values = df.isna().sum()

# Report the per-column counts
print(f"Null value counts per column: {count_null_values}")

Null value counts per column: word1                   0
word2                   0
similarity_score_1      0
similarity_score_2      0
similarity_score_3      0
similarity_score_4      0
similarity_score_5      0
similarity_score_6      0
similarity_score_7      0
similarity_score_8      0
similarity_score_9      0
similarity_score_10     0
similarity_score_11     0
similarity_score_12     0
similarity_score_13     0
similarity_score_14     0
similarity_score_15    21
dtype: int64


In [39]:
# Flag every row that has a missing value in any column, then keep only those rows
has_null = df.isna().any(axis="columns")
rows_with_null = df.loc[has_null]

# Display the incomplete rows
rows_with_null

Unnamed: 0,word1,word2,similarity_score_1,similarity_score_2,similarity_score_3,similarity_score_4,similarity_score_5,similarity_score_6,similarity_score_7,similarity_score_8,similarity_score_9,similarity_score_10,similarity_score_11,similarity_score_12,similarity_score_13,similarity_score_14,similarity_score_15
817,rondzwerven,dwalen,9.0,8.0,6.0,5.5,7.0,8.0,9.0,6.0,7.0,6.0,8.0,6.0,6.0,6.0,
818,slagen,falen,3.0,2.0,4.0,5.5,3.0,8.0,3.0,1.0,2.0,2.0,6.0,3.0,3.0,7.0,
819,besteden,redden,3.0,2.0,1.5,2.0,2.0,2.0,1.0,2.0,2.0,1.0,1.0,1.0,2.0,2.0,
820,vertrekken,gaan,7.0,7.0,7.0,6.0,7.0,7.0,6.0,6.0,7.0,6.0,7.5,4.0,4.0,6.0,
821,komen,bijwonen,3.0,2.0,3.0,3.0,2.0,3.0,2.0,3.0,2.0,3.0,1.0,2.0,2.0,3.0,
822,weten,geloven,5.0,2.5,5.0,4.0,4.0,6.0,5.0,4.0,4.0,4.0,5.0,2.0,6.0,6.0,
823,verzamelen,ontmoeten,4.0,3.0,6.5,3.5,5.0,5.0,6.0,5.0,7.0,4.0,3.5,4.0,4.0,5.0,
824,maken,verdienen,1.0,2.0,2.0,1.5,3.0,2.0,1.0,3.0,2.0,1.0,1.5,1.0,3.0,3.0,
825,vergeten,negeren,7.0,4.0,6.5,5.0,5.0,7.0,7.0,7.0,6.0,6.0,8.5,4.0,5.0,4.0,
826,vermenigvuldigen,toevoegen,2.0,1.5,4.5,2.5,4.0,5.0,4.5,4.0,3.0,4.0,2.0,3.5,2.0,3.0,


In [40]:
# Collect the (word1, word2) pair of every row that still has a missing score
missing_word_pair_list = [
    (first_word, second_word)
    for first_word, second_word in zip(rows_with_null['word1'], rows_with_null['word2'])
]

# Display the pairs that need to be re-queried
missing_word_pair_list

[('rondzwerven', 'dwalen'),
 ('slagen', 'falen'),
 ('besteden', 'redden'),
 ('vertrekken', 'gaan'),
 ('komen', 'bijwonen'),
 ('weten', 'geloven'),
 ('verzamelen', 'ontmoeten'),
 ('maken', 'verdienen'),
 ('vergeten', 'negeren'),
 ('vermenigvuldigen', 'toevoegen'),
 ('krimpen', 'groeien'),
 ('aankomen', 'komen'),
 ('slagen', 'proberen'),
 ('accepteren', 'ontkennen'),
 ('aankomen', 'vertrekken'),
 ('akkoordgaan', 'afwijken'),
 ('sturen', 'ontvangen'),
 ('winnen', 'domineren'),
 ('toevoegen', 'verdelen'),
 ('doden', 'wurgen'),
 ('verwerven', 'krijgen')]

In [43]:
# Format message: the instruction text combined with only the word pairs that miss a sample
formatted_prompt = format_prompt(missing_word_pair_list, prompt)
messages = [{"role": "user", "content": formatted_prompt}]

# Make API call (a single completion for the missing word pairs).
# The call has to stay enabled: `completion` is read right below and is not assigned
# anywhere else, so with the call commented out this cell fails on a fresh kernel.
completion = client.chat.completions.create(
    model=model,
    messages=messages,
    n=1)

# Store the response content in a one-element list, the form passed to process_responses
missing_word_pairs_response = [completion.choices[0].message.content]

In [44]:
# Parse the re-query response into a dictionary keyed by word pair
# (the original note says process_responses extracts the data with regular expressions;
# the helper itself is defined outside this notebook)
missing_word_pairs_dict = process_responses(missing_word_pairs_response)

# Display the parsed scores
missing_word_pairs_dict

{('rondzwerven', 'dwalen'): [7.7],
 ('slagen', 'falen'): [4.0],
 ('besteden', 'redden'): [1.5],
 ('vertrekken', 'gaan'): [6.0],
 ('komen', 'bijwonen'): [3.0],
 ('weten', 'geloven'): [2.3],
 ('verzamelen', 'ontmoeten'): [1.0],
 ('maken', 'verdienen'): [1.0],
 ('vergeten', 'negeren'): [7.5],
 ('vermenigvuldigen', 'toevoegen'): [1.3],
 ('krimpen', 'groeien'): [2.5],
 ('aankomen', 'komen'): [7.5],
 ('slagen', 'proberen'): [6.0],
 ('accepteren', 'ontkennen'): [1.0],
 ('aankomen', 'vertrekken'): [1.0],
 ('akkoordgaan', 'afwijken'): [1.3],
 ('sturen', 'ontvangen'): [2.0],
 ('winnen', 'domineren'): [3.0],
 ('toevoegen', 'verdelen'): [1.0],
 ('doden', 'wurgen'): [1.9],
 ('verwerven', 'krijgen'): [8.2]}

In [45]:
# Fill the gaps in the last score column with the re-queried values
last_score_col = 'similarity_score_15'

# Visit only the rows whose last score is still missing
for idx in df.index[df[last_score_col].isna()]:
    pair = (df.at[idx, 'word1'], df.at[idx, 'word2'])
    replacement = missing_word_pairs_dict.get(pair)

    # Pairs absent from the re-query result are left untouched
    if replacement is not None:
        # Use the first (and only) re-queried score
        df.at[idx, last_score_col] = replacement[0]

# Show any rows that are still missing a value
print(df[df[last_score_col].isna()])

Empty DataFrame
Columns: [word1, word2, similarity_score_1, similarity_score_2, similarity_score_3, similarity_score_4, similarity_score_5, similarity_score_6, similarity_score_7, similarity_score_8, similarity_score_9, similarity_score_10, similarity_score_11, similarity_score_12, similarity_score_13, similarity_score_14, similarity_score_15]
Index: []


In [46]:
# Mark the rows whose (word1, word2) pair was re-queried
is_requeried_pair = pd.Series(
    [pair in missing_word_pair_list for pair in zip(df['word1'], df['word2'])],
    index=df.index,
    dtype=bool,
)
filtered_df = df[is_requeried_pair]

# Keep the word pair together with the filled-in last score
result = filtered_df.loc[:, ['word1', 'word2', 'similarity_score_15']]

# Show the filled-in values
print(result)

                word1       word2  similarity_score_15
817       rondzwerven      dwalen                  7.7
818            slagen       falen                  4.0
819          besteden      redden                  1.5
820        vertrekken        gaan                  6.0
821             komen    bijwonen                  3.0
822             weten     geloven                  2.3
823        verzamelen   ontmoeten                  1.0
824             maken   verdienen                  1.0
825          vergeten     negeren                  7.5
826  vermenigvuldigen   toevoegen                  1.3
827           krimpen     groeien                  2.5
828          aankomen       komen                  7.5
829            slagen    proberen                  6.0
830        accepteren   ontkennen                  1.0
831          aankomen  vertrekken                  1.0
832       akkoordgaan    afwijken                  1.3
833            sturen   ontvangen                  2.0
834       

In [47]:
# Re-check after filling: select the rows that still contain a missing value
still_incomplete = df.isna().any(axis="columns")
rows_with_null = df.loc[still_incomplete]

# Display the remaining incomplete rows (expected to be empty)
rows_with_null

Unnamed: 0,word1,word2,similarity_score_1,similarity_score_2,similarity_score_3,similarity_score_4,similarity_score_5,similarity_score_6,similarity_score_7,similarity_score_8,similarity_score_9,similarity_score_10,similarity_score_11,similarity_score_12,similarity_score_13,similarity_score_14,similarity_score_15


In [48]:
# Display the DataFrame after the missing scores were filled in
df

Unnamed: 0,word1,word2,similarity_score_1,similarity_score_2,similarity_score_3,similarity_score_4,similarity_score_5,similarity_score_6,similarity_score_7,similarity_score_8,similarity_score_9,similarity_score_10,similarity_score_11,similarity_score_12,similarity_score_13,similarity_score_14,similarity_score_15
0,oud,nieuw,0.1,0.10,0.01,0.00,0.05,0.2,0.10,0.10,0.0,0.05,0.0,2.0,0.03,0.10,0.18
1,slim,intelligent,0.9,0.95,0.95,8.00,9.50,0.9,0.95,0.95,0.4,0.95,9.0,9.0,0.97,0.95,0.79
2,hard,moeilijk,0.4,0.20,0.10,2.00,2.00,0.1,0.05,0.20,0.1,0.10,2.0,3.0,0.00,0.05,0.39
3,gelukkig,vrolijk,0.8,0.80,0.80,6.00,8.50,0.7,0.85,0.85,0.6,0.80,8.0,8.0,0.83,0.80,0.76
4,hard,stoer,0.7,0.40,0.45,2.00,6.00,0.5,0.05,0.65,0.4,0.20,4.0,6.0,0.61,0.50,0.44
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
992,samenvoegen,verwerven,3.0,1.50,2.11,2.06,4.00,3.0,3.00,4.00,3.0,2.00,4.8,1.0,1.00,5.50,4.00
993,sturen,bijwonen,0.0,1.00,1.56,0.00,1.00,0.0,0.00,1.00,1.0,1.00,0.0,1.0,1.00,0.00,1.00
994,verzamelen,bijwonen,1.0,1.00,1.56,0.00,5.00,0.0,3.00,3.00,1.0,2.00,0.0,1.0,4.00,2.50,1.33
995,opnemen,intrekken,1.0,2.00,1.11,0.00,1.00,0.0,1.00,2.00,2.0,1.00,0.0,1.0,2.00,0.00,1.00


In [50]:
# Destination of the processed similarity scores
file_path = '../../../data/prompt/nl/gpt-3.5-turbo-0125/f1.csv'

# Never overwrite an existing export; write the CSV only when the file is absent
if os.path.exists(file_path):
    print("File already exists. Dataframe was not saved to prevent overwriting.")
else:
    df.to_csv(file_path, index=False)
    print("File saved successfully.")

File saved successfully.
