### **0. Set-up**

In [55]:
# Import libraries and utils
%run '../../utils.ipynb'

In [47]:
# Load environment variables and read the OpenAI key from the environment
load_dotenv()
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

# Create the API client, handing it the key that was just read
client = OpenAI(api_key=OPENAI_API_KEY)

In [48]:
# Read the cleaned Dutch SimLex-999 word pairs and skip the first row
# (presumably the ('oud', 'nieuw') pair that serves as the example in the prompt — verify)
simlex_csv_path = "../../../data/dataset/cleaned-nl-simlex-999.csv"
cleaned_nl_simlex = pd.read_csv(simlex_csv_path).iloc[1:]

# Optional: work on a subset only
# cleaned_nl_simlex = cleaned_nl_simlex.head(160)

# Build the list of (word1, word2) tuples
tuples_list = [
    (first_word, second_word)
    for first_word, second_word in zip(cleaned_nl_simlex['word1'], cleaned_nl_simlex['word2'])
]

In [49]:
# Show the loaded word pairs (the first row of the CSV has already been removed)
cleaned_nl_simlex

Unnamed: 0,word1,word2,SimLex999,POS
1,slim,intelligent,8.19,A
2,hard,moeilijk,4.46,A
3,gelukkig,vrolijk,6.49,A
4,hard,stoer,5.69,A
5,snel,razendsnel,7.18,A
...,...,...,...,...
992,samenvoegen,verwerven,3.89,V
993,sturen,bijwonen,1.85,V
994,verzamelen,bijwonen,1.06,V
995,opnemen,intrekken,2.29,V


In [50]:
# Inspect the (word1, word2) tuples that are sent to the model
tuples_list

[('slim', 'intelligent'),
 ('hard', 'moeilijk'),
 ('gelukkig', 'vrolijk'),
 ('hard', 'stoer'),
 ('snel', 'razendsnel'),
 ('gelukkig', 'blij'),
 ('kort', 'lang'),
 ('dom', 'stom'),
 ('vreemd', 'eigenaardig'),
 ('breed', 'smal'),
 ('makkelijk', 'moeilijk'),
 ('moeilijk', 'gemakkelijk'),
 ('slim', 'dom'),
 ('krankzinnig', 'gek'),
 ('gelukkig', 'kwaad'),
 ('uitgebreid', 'groot'),
 ('moeilijk', 'simpel'),
 ('nieuw', 'vers'),
 ('scherp', 'saai'),
 ('vlug', 'snel'),
 ('dom', 'dwaas'),
 ('prachtig', 'fantastisch'),
 ('eigenaardig', 'vreemd'),
 ('gelukkig', 'boos'),
 ('smal', 'breed'),
 ('eenvoudig', 'gemakkelijk'),
 ('oud', 'vers'),
 ('kennelijk', 'duidelijk'),
 ('betaalbaar', 'goedkoop'),
 ('leuk', 'grootmoedig'),
 ('raar', 'vreemd'),
 ('vreemd', 'normaal'),
 ('slecht', 'immoreel'),
 ('verdrietig', 'grappig'),
 ('prachtig', 'geweldig'),
 ('schuldig', 'beschaamd'),
 ('mooi', 'prachtig'),
 ('zelfverzekerd', 'zeker'),
 ('dom', 'onderontwikkeld'),
 ('groot', 'flexibel'),
 ('aardig', 'wreed'),
 ('

### **1. Define and Evaluate Parameters**

In [51]:
# Instruction prompt (Dutch): rate every word pair from 0 to 10 with two decimals,
# answer strictly as a list of (word1, word2, <score>) tuples, no explanation,
# followed by one worked example.
prompt_segments = [
    "Beoordeel de semantische gelijkenis van elk woordpaar op een schaal van 0 tot 10, ",
    "waarbij 0 geen semantische gelijkenis vertegenwoordigt, en 10 perfecte semantische gelijkenis. ",
    "Gebruik twee decimalen. Het antwoord moet strikt voldoen aan de structuur: ",
    "[(woord1, woord2, <score>), (woord3, woord4, <score>),] ",
    " Geef geen extra uitleg of context.",
    " Een voorbeeld van een woordpaar en zijn semantische gelijkenisscore is: [('oud', 'nieuw', 1.94)]",
]
prompt = "".join(prompt_segments)

In [52]:
# Seconds to wait between individual API calls
delay = 15.0

# Number of sublists the word pairs are divided into
n_sublists = 25

# Number of samples requested; every word pair is expected to end up with this many scores
sample_size = 15

# Chat model that is queried for the ratings
model = "gpt-3.5-turbo-0125"

In [53]:
# Split the word-pair list into n_sublists sublists (one prompt is built per sublist)
chunks = split_into_n_lists(tuples_list, n_sublists)

# Sanity check: the number of sublists actually produced
print(len(chunks))

25


In [12]:
# chunks = [
# [('oud', 'nieuw'),
#  ('slim', 'intelligent'),
#  ('hard', 'moeilijk'),],

# [('slecht', 'vreselijk'),
#  ('moeilijk', 'gemakkelijk'),
#  ('slim', 'dom'),],
  
# [('gelukkig', 'vrolijk'),
#   ('hard', 'stoer'),
#   ('gelukkig', 'blij'),],
# ]

In [13]:
# Set the size of chunks
# chunk_size = 90

# Chunk the data
# chunks = chunk_data(tuples_list, chunk_size)

# Count chunks
# print("Count of chunks:", len(chunks))

In [14]:
# Print the prompts for each chunk so the exact text sent to the model can be inspected
# NOTE(review): print_prompts comes from utils; presumably it appends the chunk's pairs to `prompt` — confirm
print_prompts(chunks, prompt)

Beoordeel de semantische gelijkenis van elk woordpaar met een score op een schaal van 0 tot 10, waarbij 0 geen semantische gelijkenis vertegenwoordigt, en 10 perfecte semantische gelijkenis. Gebruik twee decimalen. Het antwoord moet strikt voldoen aan de structuur: [(woord1, woord2, <score>), (woord3, woord4, <score>),]  Geef geen extra uitleg of context. Een voorbeeld van een woordpaar en zijn semantische gelijkenisscore is: [(oud, nieuw, 1.94)] --- ('slim', 'intelligent'), ('hard', 'moeilijk'), ('gelukkig', 'vrolijk'), ('hard', 'stoer'), ('snel', 'razendsnel'), ('gelukkig', 'blij'), ('kort', 'lang'), ('dom', 'stom'), ('vreemd', 'eigenaardig'), ('breed', 'smal'), ('makkelijk', 'moeilijk'), ('moeilijk', 'gemakkelijk'), ('slim', 'dom'), ('krankzinnig', 'gek'), ('gelukkig', 'kwaad'), ('uitgebreid', 'groot'), ('moeilijk', 'simpel'), ('nieuw', 'vers'), ('scherp', 'saai'), ('vlug', 'snel'), ('dom', 'dwaas'), ('prachtig', 'fantastisch'), ('eigenaardig', 'vreemd'), ('gelukkig', 'boos'), ('sma

In [15]:
# Tokenizer encoding used for the token estimate
encoding = tiktoken.get_encoding("cl100k_base")

# Token count of the formatted prompt for every chunk
token_counts = count_tokens_with_tiktoken(chunks, prompt)
print(f"Token counts for each formatted prompt: {token_counts}")

Token counts for each formatted prompt: [521, 513, 507, 478, 478, 482, 478, 483, 498, 487, 491, 479, 503, 500, 520, 472, 490, 511, 498, 511, 527, 515, 532, 523, 529]


### **2. Extract and Process Data**

In [16]:
# Process each chunk and get results using the OpenAI API (call disabled after the original run; later cells still expect `response` to exist in the kernel)
# response = get_responses(chunks, prompt, model, sample_size, delay)

Processing: 100%|██████████| 375/375 [1:38:03<00:00, 15.69s/chunk]

Total time taken: 5883.92 seconds





In [10]:
# Define filepath
file_path = '../../../data/response/nl/gpt-3.5-turbo-0125/f2.json'

# Persist the raw responses exactly once; an existing file is never overwritten
if os.path.exists(file_path):
    print("File already exists. JSON was not saved to prevent overwriting.")
    # The API call cell is commented out, so on a fresh kernel `response` is undefined.
    # Restore it from the saved file so the following cells can run.
    if 'response' not in globals():
        with open(file_path) as f:
            response = json.load(f)
        print("Responses loaded from existing file.")
else:
    # Serialize BEFORE opening the file: if `response` is missing or not serializable,
    # the error is raised here and no empty/partial file is left behind that would
    # block every later save attempt with "File already exists".
    serialized_response = json.dumps(response)
    with open(file_path, 'w') as f:
        f.write(serialized_response)
    print("File saved successfully.")

In [54]:
# Parse the raw responses into a dictionary keyed by word pair
data_dict = process_responses(response)

# Collect the word pairs whose number of scores differs from the sample size
higher_lower_samples = {
    word_pair: scores
    for word_pair, scores in data_dict.items()
    if len(scores) != sample_size
}
print(higher_lower_samples)

{('hysterie', 'verwarring'): [6.75, 6.14, 0.65, 4.41, 6.12, 7.89, 4.5, 0.8, 0.8, 5.9, 7.12, 0.47, 5.0, 5.34], ('hysterie', 'verwardheid'): [7.8], ('pistool', 'dageraad'): [0.0, 0.0, 0.0, 0.0, 0.0, 0.01, 0.01, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], ('maaltijd', 'staart'): [0.0, 0.0, 0.0, 0.0, 0.0, 0.01, 0.04, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], ('camera', 'president'): [0.0, 0.0, 0.0, 0.0, 0.0, 0.01, 0.01, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], ('likeur', 'muziekgroep'): [0.0, 0.0, 0.0, 0.0, 0.0, 0.01, 0.05, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], ('buik', 'ader'): [0.0, 0.0, 0.0, 0.0, 0.0, 0.01, 0.05, 0.0, 0.0, 0.0, 0.0, 0.03, 0.0, 0.0], ('pistool', 'bont'): [0.0, 0.0, 0.0, 0.0, 0.0, 0.01, 0.06, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], ('bank', 'honkbal'): [0.0, 0.0, 0.0, 0.0, 0.0, 0.01, 0.01, 0.0, 0.0, 0.0, 0.0, 0.01, 0.0, 0.0], ('arbeider', 'camera'): [0.0, 0.0, 0.0, 0.0, 0.0, 0.01, 0.03, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], ('dek', 'muis'): [0.0, 0.0, 0.0, 0.0, 0.0, 0.01, 0.01, 0.0, 0.0, 0.0, 0.0, 

In [56]:
# Process data and print duplicate word pairs
# NOTE(review): helper comes from utils; presumably it checks both the dataset and the
# parsed responses for repeated pairs — confirm against its definition
print_duplicate_word_pairs(cleaned_nl_simlex, data_dict)

Empty DataFrame
Columns: [Combined_Columns]
Index: []
Empty DataFrame
Columns: [Combined_Columns]
Index: []


In [57]:
# Convert the parsed dictionary to a pandas DataFrame
# (the output below shows word1, word2 and one similarity_score_<i> column per sample)
df = create_dataframe(data_dict)

# Show results
df

Unnamed: 0,word1,word2,similarity_score_1,similarity_score_2,similarity_score_3,similarity_score_4,similarity_score_5,similarity_score_6,similarity_score_7,similarity_score_8,similarity_score_9,similarity_score_10,similarity_score_11,similarity_score_12,similarity_score_13,similarity_score_14,similarity_score_15
0,slim,intelligent,8.76,9.50,9.50,9.60,9.50,8.50,7.84,7.50,9.50,6.00,7.50,8.50,9.00,9.50,9.20
1,hard,moeilijk,3.01,5.00,6.00,2.00,6.00,3.00,4.83,2.50,3.00,3.00,4.00,2.50,6.00,5.00,2.00
2,gelukkig,vrolijk,6.54,7.50,8.00,8.50,9.00,7.00,7.39,7.50,7.50,7.00,8.50,7.50,8.00,8.50,7.50
3,hard,stoer,2.56,3.00,3.00,1.00,5.00,6.00,2.35,3.00,4.00,4.00,4.00,4.50,7.00,6.50,2.00
4,snel,razendsnel,9.43,9.00,9.50,9.50,8.00,9.00,9.17,8.50,8.50,9.00,8.50,8.50,9.00,8.75,9.30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
992,samenvoegen,verwerven,0.41,0.55,3.88,0.69,0.14,0.51,4.00,0.47,3.36,1.70,2.76,0.20,4.69,1.90,0.69
993,sturen,bijwonen,0.05,0.03,0.00,0.06,0.03,0.06,0.00,0.00,0.42,0.06,0.30,0.00,0.05,0.04,0.00
994,verzamelen,bijwonen,0.17,0.46,0.00,0.29,0.14,0.30,2.62,0.57,0.78,1.06,0.11,0.25,0.09,0.75,0.11
995,opnemen,intrekken,0.22,0.21,0.00,0.31,0.12,0.50,1.67,0.18,1.05,0.22,0.30,0.05,0.64,1.28,0.09


In [58]:
# Tally the missing values in every column
count_null_values = df.isna().sum()
print("Null value counts per column:", count_null_values)

Null value counts per column: word1                   0
word2                   0
similarity_score_1      0
similarity_score_2      1
similarity_score_3      1
similarity_score_4      1
similarity_score_5      1
similarity_score_6      1
similarity_score_7      1
similarity_score_8      1
similarity_score_9      1
similarity_score_10     1
similarity_score_11     1
similarity_score_12     1
similarity_score_13     1
similarity_score_14     1
similarity_score_15    30
dtype: int64


In [59]:
# Keep only the rows in which at least one column is missing
has_missing_value = df.isna().any(axis=1)
rows_with_null = df[has_missing_value]
rows_with_null

Unnamed: 0,word1,word2,similarity_score_1,similarity_score_2,similarity_score_3,similarity_score_4,similarity_score_5,similarity_score_6,similarity_score_7,similarity_score_8,similarity_score_9,similarity_score_10,similarity_score_11,similarity_score_12,similarity_score_13,similarity_score_14,similarity_score_15
577,hysterie,verwarring,6.75,6.14,0.65,4.41,6.12,7.89,4.5,0.8,0.8,5.9,7.12,0.47,5.0,5.34,
600,hysterie,verwardheid,7.8,,,,,,,,,,,,,,
752,pistool,dageraad,0.0,0.0,0.0,0.0,0.0,0.01,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
753,maaltijd,staart,0.0,0.0,0.0,0.0,0.0,0.01,0.04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
754,camera,president,0.0,0.0,0.0,0.0,0.0,0.01,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
755,likeur,muziekgroep,0.0,0.0,0.0,0.0,0.0,0.01,0.05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
756,buik,ader,0.0,0.0,0.0,0.0,0.0,0.01,0.05,0.0,0.0,0.0,0.0,0.03,0.0,0.0,
757,pistool,bont,0.0,0.0,0.0,0.0,0.0,0.01,0.06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
758,bank,honkbal,0.0,0.0,0.0,0.0,0.0,0.01,0.01,0.0,0.0,0.0,0.0,0.01,0.0,0.0,
759,arbeider,camera,0.0,0.0,0.0,0.0,0.0,0.01,0.03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,


In [32]:
# Inspect the raw model responses (a list of response strings) to trace the parsing gaps
response

["[('slim', 'intelligent', 8.76), ('hard', 'moeilijk', 3.01), ('gelukkig', 'vrolijk', 6.54), ('hard', 'stoer', 2.56), ('snel', 'razendsnel', 9.43), ('gelukkig', 'blij', 7.18), ('kort', 'lang', 1.25), ('dom', 'stom', 6.20), ('vreemd', 'eigenaardig', 7.34), ('breed', 'smal', 1.65), ('makkelijk', 'moeilijk', 4.65), ('moeilijk', 'gemakkelijk', 7.12), ('slim', 'dom', 2.34), ('krankzinnig', 'gek', 8.76), ('gelukkig', 'kwaad', 2.34), ('uitgebreid', 'groot', 6.78), ('moeilijk', 'simpel', 6.32), ('nieuw', 'vers', 8.43), ('scherp', 'saai', 2.65), ('vlug', 'snel', 8.53), ('dom', 'dwaas', 7.54), ('prachtig', 'fantastisch', 8.76), ('eigenaardig', 'vreemd', 7.78), ('gelukkig', 'boos', 2.15), ('smal', 'breed', 1.65), ('eenvoudig', 'gemakkelijk', 8.12), ('oud', 'vers', 2.56), ('kennelijk', 'duidelijk', 6.21), ('betaalbaar', 'goedkoop', 8.21), ('leuk', 'grootmoedig', 2.54), ('raar', 'vreemd', 7.89), ('vreemd', 'normaal', 3.67), ('slecht', 'immoreel', 6.54), ('verdrietig', 'grappig', 2.35), ('prachtig',

In [64]:
# Custom single-pair prompt for ('hysterie', 'verwarring'), the dataset pair that is one
# score short. The stray ('hysterie', 'verwardheid') entry in the parsed data has a
# different second word (presumably altered by the model — verify) and is not re-queried.
custom_prompt = ("Beoordeel de semantische gelijkenis van het woordpaar ('hysterie', 'verwarring') met een score op een schaal van 0 tot 10, "
                                      "waarbij 0 geen semantische gelijkenis vertegenwoordigt, en 10 perfecte semantische gelijkenis. "
                                      "Gebruik twee decimalen. Het antwoord moet strikt voldoen aan de structuur: "
                                      "[(woord1, woord2, <score>)] "
                                      "Geef geen extra uitleg of context. "
                                      "Een voorbeeld van een woordpaar en zijn semantische gelijkenisscore is: [('oud', 'nieuw', 1.94)]")


# Extra API call for ('hysterie', 'verwarring')
# NOTE(review): the request itself is commented out, so `completion` must already exist
# in the kernel for the print below to work — it fails with NameError on a fresh kernel.
messages = [{"role": "user", "content": custom_prompt}]
# completion = client.chat.completions.create(
#     model=model,
#     messages=messages,
#     n=1)

# Show results
print(completion.choices[0].message.content)

[(hysterie, verwarring, 6.25)]


In [65]:
# Fill the missing 15th score of ('hysterie', 'verwarring') with the value
# returned by the extra single-pair request
is_hysterie_verwarring = (df['word1'] == 'hysterie') & (df['word2'] == 'verwarring')
df.loc[is_hysterie_verwarring, 'similarity_score_15'] = 6.25

# Show the patched row
df.loc[is_hysterie_verwarring]

Unnamed: 0,word1,word2,similarity_score_1,similarity_score_2,similarity_score_3,similarity_score_4,similarity_score_5,similarity_score_6,similarity_score_7,similarity_score_8,similarity_score_9,similarity_score_10,similarity_score_11,similarity_score_12,similarity_score_13,similarity_score_14,similarity_score_15
577,hysterie,verwarring,6.75,6.14,0.65,4.41,6.12,7.89,4.5,0.8,0.8,5.9,7.12,0.47,5.0,5.34,6.25


In [66]:
# Remove the stray ('hysterie', 'verwardheid') row, which only holds a single score
is_faulty_row = (df['word1'] == 'hysterie') & (df['word2'] == 'verwardheid')
df = df[~is_faulty_row]
df

Unnamed: 0,word1,word2,similarity_score_1,similarity_score_2,similarity_score_3,similarity_score_4,similarity_score_5,similarity_score_6,similarity_score_7,similarity_score_8,similarity_score_9,similarity_score_10,similarity_score_11,similarity_score_12,similarity_score_13,similarity_score_14,similarity_score_15
0,slim,intelligent,8.76,9.50,9.50,9.60,9.50,8.50,7.84,7.50,9.50,6.00,7.50,8.50,9.00,9.50,9.20
1,hard,moeilijk,3.01,5.00,6.00,2.00,6.00,3.00,4.83,2.50,3.00,3.00,4.00,2.50,6.00,5.00,2.00
2,gelukkig,vrolijk,6.54,7.50,8.00,8.50,9.00,7.00,7.39,7.50,7.50,7.00,8.50,7.50,8.00,8.50,7.50
3,hard,stoer,2.56,3.00,3.00,1.00,5.00,6.00,2.35,3.00,4.00,4.00,4.00,4.50,7.00,6.50,2.00
4,snel,razendsnel,9.43,9.00,9.50,9.50,8.00,9.00,9.17,8.50,8.50,9.00,8.50,8.50,9.00,8.75,9.30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
992,samenvoegen,verwerven,0.41,0.55,3.88,0.69,0.14,0.51,4.00,0.47,3.36,1.70,2.76,0.20,4.69,1.90,0.69
993,sturen,bijwonen,0.05,0.03,0.00,0.06,0.03,0.06,0.00,0.00,0.42,0.06,0.30,0.00,0.05,0.04,0.00
994,verzamelen,bijwonen,0.17,0.46,0.00,0.29,0.14,0.30,2.62,0.57,0.78,1.06,0.11,0.25,0.09,0.75,0.11
995,opnemen,intrekken,0.22,0.21,0.00,0.31,0.12,0.50,1.67,0.18,1.05,0.22,0.30,0.05,0.64,1.28,0.09


In [67]:
# Re-check which rows still contain at least one missing value
still_missing = df.isna().any(axis=1)
rows_with_null = df[still_missing]
rows_with_null

Unnamed: 0,word1,word2,similarity_score_1,similarity_score_2,similarity_score_3,similarity_score_4,similarity_score_5,similarity_score_6,similarity_score_7,similarity_score_8,similarity_score_9,similarity_score_10,similarity_score_11,similarity_score_12,similarity_score_13,similarity_score_14,similarity_score_15
752,pistool,dageraad,0.0,0.0,0.0,0.0,0.0,0.01,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
753,maaltijd,staart,0.0,0.0,0.0,0.0,0.0,0.01,0.04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
754,camera,president,0.0,0.0,0.0,0.0,0.0,0.01,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
755,likeur,muziekgroep,0.0,0.0,0.0,0.0,0.0,0.01,0.05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
756,buik,ader,0.0,0.0,0.0,0.0,0.0,0.01,0.05,0.0,0.0,0.0,0.0,0.03,0.0,0.0,
757,pistool,bont,0.0,0.0,0.0,0.0,0.0,0.01,0.06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
758,bank,honkbal,0.0,0.0,0.0,0.0,0.0,0.01,0.01,0.0,0.0,0.0,0.0,0.01,0.0,0.0,
759,arbeider,camera,0.0,0.0,0.0,0.0,0.0,0.01,0.03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
760,dek,muis,0.0,0.0,0.0,0.0,0.0,0.01,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
849,analyseren,evalueren,6.47,8.84,7.12,0.74,6.04,7.29,0.73,7.16,8.75,7.81,0.82,9.11,0.81,0.77,


In [68]:
# Collect the (word1, word2) pairs of every row that still has a gap
missing_word_pair_list = list(
    rows_with_null[['word1', 'word2']].itertuples(index=False, name=None)
)
missing_word_pair_list

[('pistool', 'dageraad'),
 ('maaltijd', 'staart'),
 ('camera', 'president'),
 ('likeur', 'muziekgroep'),
 ('buik', 'ader'),
 ('pistool', 'bont'),
 ('bank', 'honkbal'),
 ('arbeider', 'camera'),
 ('dek', 'muis'),
 ('analyseren', 'evalueren'),
 ('argumenteren', 'rationaliseren'),
 ('verliezen', 'houden'),
 ('vergelijken', 'analyseren'),
 ('desorganiseren', 'organiseren'),
 ('gaan', 'verkopen'),
 ('nemen', 'bezitten'),
 ('leren', 'luisteren'),
 ('vernietigen', 'bouwen'),
 ('creëren', 'bouwen'),
 ('stelen', 'kopen'),
 ('doden', 'ophangen'),
 ('vergeten', 'weten'),
 ('creëren', 'voorstellen'),
 ('doen', 'gebeuren'),
 ('winnen', 'bereiken'),
 ('geven', 'weigeren'),
 ('verdienen', 'nemen'),
 ('krijgen', 'zetten')]

In [69]:
# Build one prompt that covers all word pairs with a missing score
formatted_prompt = format_prompt(missing_word_pair_list, prompt)
messages = [dict(role="user", content=formatted_prompt)]

# Single extra request to the OpenAI API (one completion)
completion = client.chat.completions.create(model=model, messages=messages, n=1)

# Wrap the reply text in a list for the response parser
reply_text = completion.choices[0].message.content
missing_word_pairs_response = [reply_text]

In [70]:
# Parse the single extra response into a dictionary keyed by word pair;
# each value is a list holding the one newly returned score (see output below)
missing_word_pairs_dict = process_responses(missing_word_pairs_response)
missing_word_pairs_dict

{('pistool', 'dageraad'): [0.0],
 ('maaltijd', 'staart'): [0.0],
 ('camera', 'president'): [0.0],
 ('likeur', 'muziekgroep'): [0.0],
 ('buik', 'ader'): [0.0],
 ('pistool', 'bont'): [0.0],
 ('bank', 'honkbal'): [0.0],
 ('arbeider', 'camera'): [0.0],
 ('dek', 'muis'): [0.0],
 ('analyseren', 'evalueren'): [6.82],
 ('argumenteren', 'rationaliseren'): [4.4],
 ('verliezen', 'houden'): [0.0],
 ('vergelijken', 'analyseren'): [2.33],
 ('desorganiseren', 'organiseren'): [5.45],
 ('gaan', 'verkopen'): [0.0],
 ('nemen', 'bezitten'): [4.41],
 ('leren', 'luisteren'): [4.36],
 ('vernietigen', 'bouwen'): [0.0],
 ('creëren', 'bouwen'): [5.36],
 ('stelen', 'kopen'): [0.0],
 ('doden', 'ophangen'): [0.0],
 ('vergeten', 'weten'): [3.94],
 ('creëren', 'voorstellen'): [6.11],
 ('doen', 'gebeuren'): [1.54],
 ('winnen', 'bereiken'): [1.63],
 ('geven', 'weigeren'): [3.04],
 ('verdienen', 'nemen'): [0.0],
 ('krijgen', 'zetten'): [0.0]}

In [71]:
# Visit only the rows whose 15th score is missing and fill them from the extra response
unfilled_rows = df[df['similarity_score_15'].isna()]
for row_label, first_word, second_word in zip(
    unfilled_rows.index, unfilled_rows['word1'], unfilled_rows['word2']
):
    # Pairs the model did not return stay NaN and show up in the check below
    if (first_word, second_word) not in missing_word_pairs_dict:
        continue
    # The parsed value is a list; its first element is the scalar score
    df.at[row_label, 'similarity_score_15'] = missing_word_pairs_dict[(first_word, second_word)][0]

# Print to check if there are any NaN values left in 'similarity_score_15'
print(df[df['similarity_score_15'].isna()])

Empty DataFrame
Columns: [word1, word2, similarity_score_1, similarity_score_2, similarity_score_3, similarity_score_4, similarity_score_5, similarity_score_6, similarity_score_7, similarity_score_8, similarity_score_9, similarity_score_10, similarity_score_11, similarity_score_12, similarity_score_13, similarity_score_14, similarity_score_15]
Index: []


In [72]:
# Flag the rows whose (word1, word2) pair was part of the extra request
was_requeried = pd.Series(
    [word_pair in missing_word_pair_list for word_pair in zip(df['word1'], df['word2'])],
    index=df.index,
    dtype=bool,
)
filtered_df = df[was_requeried]

# Keep the pair columns and the score that was filled in
result = filtered_df[['word1', 'word2', 'similarity_score_15']]

# Show the filled-in values
print(result)

              word1           word2  similarity_score_15
752         pistool        dageraad                 0.00
753        maaltijd          staart                 0.00
754          camera       president                 0.00
755          likeur     muziekgroep                 0.00
756            buik            ader                 0.00
757         pistool            bont                 0.00
758            bank         honkbal                 0.00
759        arbeider          camera                 0.00
760             dek            muis                 0.00
849      analyseren       evalueren                 6.82
850    argumenteren  rationaliseren                 4.40
851       verliezen          houden                 0.00
852     vergelijken      analyseren                 2.33
853  desorganiseren     organiseren                 5.45
854            gaan        verkopen                 0.00
855           nemen        bezitten                 4.41
856           leren       luist

In [73]:
# Final check: no row should contain a missing value any more
remaining_gaps = df.isna().any(axis=1)
rows_with_null = df[remaining_gaps]
rows_with_null

Unnamed: 0,word1,word2,similarity_score_1,similarity_score_2,similarity_score_3,similarity_score_4,similarity_score_5,similarity_score_6,similarity_score_7,similarity_score_8,similarity_score_9,similarity_score_10,similarity_score_11,similarity_score_12,similarity_score_13,similarity_score_14,similarity_score_15


In [74]:
# Last look at the completed score table before it is written to disk
df

Unnamed: 0,word1,word2,similarity_score_1,similarity_score_2,similarity_score_3,similarity_score_4,similarity_score_5,similarity_score_6,similarity_score_7,similarity_score_8,similarity_score_9,similarity_score_10,similarity_score_11,similarity_score_12,similarity_score_13,similarity_score_14,similarity_score_15
0,slim,intelligent,8.76,9.50,9.50,9.60,9.50,8.50,7.84,7.50,9.50,6.00,7.50,8.50,9.00,9.50,9.20
1,hard,moeilijk,3.01,5.00,6.00,2.00,6.00,3.00,4.83,2.50,3.00,3.00,4.00,2.50,6.00,5.00,2.00
2,gelukkig,vrolijk,6.54,7.50,8.00,8.50,9.00,7.00,7.39,7.50,7.50,7.00,8.50,7.50,8.00,8.50,7.50
3,hard,stoer,2.56,3.00,3.00,1.00,5.00,6.00,2.35,3.00,4.00,4.00,4.00,4.50,7.00,6.50,2.00
4,snel,razendsnel,9.43,9.00,9.50,9.50,8.00,9.00,9.17,8.50,8.50,9.00,8.50,8.50,9.00,8.75,9.30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
992,samenvoegen,verwerven,0.41,0.55,3.88,0.69,0.14,0.51,4.00,0.47,3.36,1.70,2.76,0.20,4.69,1.90,0.69
993,sturen,bijwonen,0.05,0.03,0.00,0.06,0.03,0.06,0.00,0.00,0.42,0.06,0.30,0.00,0.05,0.04,0.00
994,verzamelen,bijwonen,0.17,0.46,0.00,0.29,0.14,0.30,2.62,0.57,0.78,1.06,0.11,0.25,0.09,0.75,0.11
995,opnemen,intrekken,0.22,0.21,0.00,0.31,0.12,0.50,1.67,0.18,1.05,0.22,0.30,0.05,0.64,1.28,0.09


In [75]:
# Define file_path
file_path = '../../../data/prompt/nl/gpt-3.5-turbo-0125/f2.csv'

# Write the completed scores once; an existing file is left untouched
if os.path.exists(file_path):
    print("File already exists. Dataframe was not saved to prevent overwriting.")
else:
    df.to_csv(file_path, index=False)
    print("File saved successfully.")

File saved successfully.
