### **0. Set-up**

In [313]:
# Import libraries and utils
%run '../utils.ipynb'

In [314]:
#  Get api key
load_dotenv()
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

# Set client
client = OpenAI()

In [315]:
# Load dataframe
cleaned_nl_simlex = pd.read_csv("../../data/cleaned_data/cleaned-nl-simlex-999.csv")

# Select subset
# nl_simlex = nl_simlex.head(160)

# Convert to tuple
tuples_list = list(zip(cleaned_nl_simlex['word1'], cleaned_nl_simlex['word2']))

In [327]:
cleaned_nl_simlex

Unnamed: 0,word1,word2,SimLex999,POS
0,oud,nieuw,1.94,A
1,slim,intelligent,8.19,A
2,hard,moeilijk,4.46,A
3,gelukkig,vrolijk,6.49,A
4,hard,stoer,5.69,A
...,...,...,...,...
992,samenvoegen,verwerven,3.89,V
993,sturen,bijwonen,1.85,V
994,verzamelen,bijwonen,1.06,V
995,opnemen,intrekken,2.29,V


### **1. Define and Evaluate Parameters**

In [316]:
# Define model
model = "gpt-3.5-turbo-0125"

# Set sample size
sample_size = 15

# Delay between individual API calls
delay = 5.0

In [317]:
# Define the prompt
prompt = ("Beoordeel de semantische gelijkenis van elk woordpaar met een score op een schaal van 0 tot 10, "
          "waarbij 0 geen semantische gelijkenis vertegenwoordigt, en 10 perfecte semantische gelijkenis. "
          "Gebruik twee decimalen. Het antwoord moet strikt voldoen aan de structuur: "
          "[(woord1, woord2, <score>), (woord3, woord4, <score>),] "
          " Geef geen extra uitleg of context.")

In [319]:
# Split the list
chunks = split_into_n_lists(tuples_list, n_sublists)

# Count the number of lists
print(len(chunks))

25


In [320]:
# chunks = [
# [('oud', 'nieuw'),
#  ('slim', 'intelligent'),
#  ('hard', 'moeilijk'),],

# [('slecht', 'vreselijk'),
#  ('moeilijk', 'gemakkelijk'),
#  ('slim', 'dom'),],
  
# [('gelukkig', 'vrolijk'),
#   ('hard', 'stoer'),
#   ('gelukkig', 'blij'),],
# ]

In [321]:
# Set the size of chunks
# chunk_size = 90

# Chunk the data
# chunks = chunk_data(tuples_list, chunk_size)

# Count chunks
# print("Count of chunks:", len(chunks))

In [322]:
# Print the prompts for each chunk
print_prompts(chunks, prompt)

Beoordeel de semantische gelijkenis van elk woordpaar met een score op een schaal van 0 tot 10, waarbij 0 geen semantische gelijkenis vertegenwoordigt, en 10 perfecte semantische gelijkenis. Gebruik twee decimalen. Het antwoord moet strikt voldoen aan de structuur: [(woord1, woord2, <score>), (woord3, woord4, <score>),]  Geef geen extra uitleg of context. --- ('oud', 'nieuw'), ('slim', 'intelligent'), ('hard', 'moeilijk'), ('gelukkig', 'vrolijk'), ('hard', 'stoer'), ('snel', 'razendsnel'), ('gelukkig', 'blij'), ('kort', 'lang'), ('dom', 'stom'), ('vreemd', 'eigenaardig'), ('breed', 'smal'), ('makkelijk', 'moeilijk'), ('moeilijk', 'gemakkelijk'), ('slim', 'dom'), ('krankzinnig', 'gek'), ('gelukkig', 'kwaad'), ('uitgebreid', 'groot'), ('moeilijk', 'simpel'), ('nieuw', 'vers'), ('scherp', 'saai'), ('vlug', 'snel'), ('dom', 'dwaas'), ('prachtig', 'fantastisch'), ('eigenaardig', 'vreemd'), ('gelukkig', 'boos'), ('smal', 'breed'), ('eenvoudig', 'gemakkelijk'), ('oud', 'vers'), ('kennelijk', 

In [323]:
# Load the encoding
encoding = tiktoken.get_encoding("cl100k_base")  # Common encoding for GPT models

# Count the tokens per chunk
token_counts = count_tokens_with_tiktoken(chunks, prompt)
print("Token counts for each formatted prompt:", token_counts)

Token counts for each formatted prompt: [489, 481, 476, 450, 444, 452, 447, 453, 464, 459, 456, 451, 472, 467, 491, 442, 458, 479, 469, 477, 498, 492, 501, 492, 498]


### **2. Similarity Scores: 4-6**

In [None]:
# Set sample size
sample_size = 3

In [55]:
# Process each chunk and get results using the OpenAI API
# response = get_responses(chunks, prompt, model, sample_size, delay)

Processing: 100%|██████████| 75/75 [16:26<00:00, 13.15s/chunk]

Total time taken: 986.41 seconds





In [92]:
# Define filepath
file_path = '../../data/response/nl/r-f1-nl-s4-6.json'

# Check if the file already exists
if not os.path.exists(file_path):
    with open(file_path, 'w') as f:
        json.dump(response, f)
        print("File saved successfully.")
else:
    print("File already exists. JSON was not saved to prevent overwriting.")

In [243]:
# 1 sample = 217.16 seconds
# 3 samples = (12 chunks)
# 10 samples = 2002.08 seconds (12 chunks)
# 20 samples = 2657.10 seconds (3 chunks)

In [100]:
# Extract data with regular expressions into dictionary
data_dict = process_responses(response)
data_dict

{('oud', 'nieuw'): [1.2, 0.1, 1.0],
 ('slim', 'intelligent'): [9.0, 0.95, 0.95],
 ('hard', 'moeilijk'): [2.0, 0.15, 0.05],
 ('gelukkig', 'vrolijk'): [7.0, 0.8, 0.8],
 ('hard', 'stoer'): [4.0, 0.2, 0.1],
 ('snel', 'razendsnel'): [7.5, 0.9, 1.0],
 ('gelukkig', 'blij'): [8.5, 0.85, 0.9],
 ('kort', 'lang'): [1.0, 0.05, 0.1],
 ('dom', 'stom'): [7.0, 0.9, 0.8],
 ('vreemd', 'eigenaardig'): [9.0, 0.9, 1.0],
 ('breed', 'smal'): [1.0, 0.15, 0.1],
 ('makkelijk', 'moeilijk'): [2.0, 0.1, 0.05],
 ('moeilijk', 'gemakkelijk'): [9.0, 0.9, 0.95],
 ('slim', 'dom'): [2.0, 0.05, 0.1],
 ('krankzinnig', 'gek'): [9.0, 0.85, 1.0],
 ('gelukkig', 'kwaad'): [1.0, 0.1, 0.0],
 ('uitgebreid', 'groot'): [2.0, 0.8, 0.7],
 ('moeilijk', 'simpel'): [8.0, 0.95, 0.9],
 ('nieuw', 'vers'): [5.5, 0.9, 0.9],
 ('scherp', 'saai'): [1.0, 0.1, 0.05],
 ('vlug', 'snel'): [7.0, 0.85, 0.9],
 ('dom', 'dwaas'): [7.0, 0.85, 0.9],
 ('prachtig', 'fantastisch'): [8.5, 0.9, 0.8],
 ('eigenaardig', 'vreemd'): [9.0, 0.95, 1.0],
 ('gelukkig', 'b

In [101]:
# Check for values higher/lower then sample size
higher_lower_samples = {key: value for key, value in data_dict.items() if len(value) < sample_size or len(value) > sample_size}
higher_lower_samples

{}

In [131]:
# Create combined column for original dataframe
combined_cleaned_nl_simlex = cleaned_nl_simlex.copy()
combined_cleaned_nl_simlex['Combined_Columns'] = combined_cleaned_nl_simlex['word1'] + '_' + combined_cleaned_nl_simlex['word2']
combined_cleaned_nl_simlex = combined_cleaned_nl_simlex[['Combined_Columns']]

# Create combined column for extracted dataframe
df_combined = create_dataframe(data_dict)
df_combined['Combined_Columns'] = df_combined['word1'] + '_' + df_combined['word2']
df_combined = df_combined[['Combined_Columns']]

# Find values in df1 that are not in df2
missing_values = combined_cleaned_nl_simlex[~combined_cleaned_nl_simlex['Combined_Columns'].isin(df_combined['Combined_Columns'])]
print(missing_values)

# Check for duplicate word pairws
duplicate_combinations = df_combined.duplicated(subset='Combined_Columns', keep=False)

# Print rows with duplicate word pairs
print(df_combined[duplicate_combinations])

Empty DataFrame
Columns: [Combined_Columns]
Index: []
Empty DataFrame
Columns: [Combined_Columns]
Index: []


In [132]:
# Convert dict to Pandas DataFrame
df = create_dataframe(data_dict)

# Show results
df

Unnamed: 0,word1,word2,similarity_score_1,similarity_score_2,similarity_score_3
0,oud,nieuw,1.2,0.10,1.00
1,slim,intelligent,9.0,0.95,0.95
2,hard,moeilijk,2.0,0.15,0.05
3,gelukkig,vrolijk,7.0,0.80,0.80
4,hard,stoer,4.0,0.20,0.10
...,...,...,...,...,...
992,samenvoegen,verwerven,0.0,3.00,4.00
993,sturen,bijwonen,0.0,1.00,1.00
994,verzamelen,bijwonen,0.0,2.50,1.00
995,opnemen,intrekken,0.0,1.00,1.00


In [133]:
# Count null values
count_null_values = df.isnull().sum()
print("Null value counts per column:", count_null_values)

Null value counts per column: word1                 0
word2                 0
similarity_score_1    0
similarity_score_2    0
similarity_score_3    0
dtype: int64


In [134]:
# Check for rows with at least one null value
rows_with_null = df[df.isnull().any(axis=1)]
rows_with_null

Unnamed: 0,word1,word2,similarity_score_1,similarity_score_2,similarity_score_3


In [135]:
# Inspect response -> the missing value is because the score of 'arts', 'dokter' is between quotations and not recognizes by RE ('10')
# response

In [136]:
# Add missing value manually
# df.loc[(df['word1'] == 'arts') & (df['word2'] == 'dokter'), 'similarity_score_3'] = 10.00

# # Check value
# df.loc[(df['word1'] == 'arts') & (df['word2'] == 'dokter')]

In [137]:
# Check for rows with at least one null value
# rows_with_null = df[df.isnull().any(axis=1)]
# rows_with_null

In [138]:
# Define file_path
file_path = '../../data/prompt_data/nl/p1-f1-nl-s4-6.csv'

# Check if the file already exists
if not os.path.exists(file_path):
    df.to_csv(file_path, index=False)
    print("File saved successfully.")
else:
    print("File already exists. Dataframe was not saved to prevent overwriting.")

File saved successfully.


### **3. Similarity Scores: 7-10**

In [139]:
# Set sample size
sample_size = 4

In [140]:
# Process each chunk and get results using the OpenAI API
# response = get_responses(chunks, prompt, model, sample_size, delay)

Processing:   0%|          | 0/100 [00:00<?, ?chunk/s]

Processing: 100%|██████████| 100/100 [22:46<00:00, 13.67s/chunk]

Total time taken: 1366.69 seconds





In [141]:
# Define filepath
file_path = '../../data/response/nl/r-f1-nl-s7-10.json'

# Check if the file already exists
if not os.path.exists(file_path):
    with open(file_path, 'w') as f:
        json.dump(response, f)
        print("File saved successfully.")
else:
    print("File already exists. JSON was not saved to prevent overwriting.")

File saved successfully.


In [147]:
# Extract data with regular expressions into dictionary
data_dict = process_responses(response)
data_dict

{('oud', 'nieuw'): [0.2, 4.0, 0.1, 0.1],
 ('slim', 'intelligent'): [0.95, 9.0, 0.9, 0.95],
 ('hard', 'moeilijk'): [0.1, 1.0, 0.2, 0.2],
 ('gelukkig', 'vrolijk'): [0.7, 7.0, 0.8, 0.8],
 ('hard', 'stoer'): [0.1, 4.0, 0.2, 0.4],
 ('snel', 'razendsnel'): [1.0, 8.0, 0.9, 1.0],
 ('gelukkig', 'blij'): [0.8, 6.0, 0.7, 0.8],
 ('kort', 'lang'): [0.2, 1.0, 0.1, 0.1],
 ('dom', 'stom'): [0.9, 6.0, 0.7, 0.95],
 ('vreemd', 'eigenaardig'): [1.0, 9.0, 0.9, 0.9],
 ('breed', 'smal'): [0.1, 2.0, 0.2, 0.2],
 ('makkelijk', 'moeilijk'): [0.2, 3.0, 0.2, 0.9],
 ('moeilijk', 'gemakkelijk'): [0.1, 2.0, 0.8, 0.9],
 ('slim', 'dom'): [0.1, 2.0, 0.3, 0.2],
 ('krankzinnig', 'gek'): [1.0, 9.0, 0.8, 1.0],
 ('gelukkig', 'kwaad'): [0.1, 1.0, 0.2, 0.2],
 ('uitgebreid', 'groot'): [0.4, 6.0, 0.3, 0.2],
 ('moeilijk', 'simpel'): [0.7, 8.0, 0.7, 0.9],
 ('nieuw', 'vers'): [0.8, 6.0, 0.9, 0.9],
 ('scherp', 'saai'): [0.1, 1.0, 0.1, 0.1],
 ('vlug', 'snel'): [0.8, 8.0, 0.8, 0.9],
 ('dom', 'dwaas'): [0.9, 8.0, 0.7, 0.9],
 ('prachtig

In [148]:
# Check for values higher/lower then sample size
higher_lower_samples = {key: value for key, value in data_dict.items() if len(value) < sample_size or len(value) > sample_size}
higher_lower_samples

{}

In [149]:
# Create combined column for original dataframe
combined_cleaned_nl_simlex = cleaned_nl_simlex.copy()
combined_cleaned_nl_simlex['Combined_Columns'] = combined_cleaned_nl_simlex['word1'] + '_' + combined_cleaned_nl_simlex['word2']
combined_cleaned_nl_simlex = combined_cleaned_nl_simlex[['Combined_Columns']]

# Create combined column for extracted dataframe
df_combined = create_dataframe(data_dict)
df_combined['Combined_Columns'] = df_combined['word1'] + '_' + df_combined['word2']
df_combined = df_combined[['Combined_Columns']]

# Find values in df1 that are not in df2
missing_values = combined_cleaned_nl_simlex[~combined_cleaned_nl_simlex['Combined_Columns'].isin(df_combined['Combined_Columns'])]
print(missing_values)

# Check for duplicate word pairws
duplicate_combinations = df_combined.duplicated(subset='Combined_Columns', keep=False)

# Print rows with duplicate word pairs
print(df_combined[duplicate_combinations])

Empty DataFrame
Columns: [Combined_Columns]
Index: []
Empty DataFrame
Columns: [Combined_Columns]
Index: []


In [150]:
# Convert dict to Pandas DataFrame
df = create_dataframe(data_dict)

# Show results
df

Unnamed: 0,word1,word2,similarity_score_1,similarity_score_2,similarity_score_3,similarity_score_4
0,oud,nieuw,0.20,4.0,0.1,0.10
1,slim,intelligent,0.95,9.0,0.9,0.95
2,hard,moeilijk,0.10,1.0,0.2,0.20
3,gelukkig,vrolijk,0.70,7.0,0.8,0.80
4,hard,stoer,0.10,4.0,0.2,0.40
...,...,...,...,...,...,...
992,samenvoegen,verwerven,5.00,4.0,2.0,5.00
993,sturen,bijwonen,1.00,1.0,0.0,0.00
994,verzamelen,bijwonen,4.00,3.0,0.0,2.00
995,opnemen,intrekken,3.00,2.0,0.0,0.00


In [151]:
# Count null values
count_null_values = df.isnull().sum()
print("Null value counts per column:", count_null_values)

Null value counts per column: word1                 0
word2                 0
similarity_score_1    0
similarity_score_2    0
similarity_score_3    0
similarity_score_4    0
dtype: int64


In [152]:
# Check for rows with at least one null value
rows_with_null = df[df.isnull().any(axis=1)]
rows_with_null

Unnamed: 0,word1,word2,similarity_score_1,similarity_score_2,similarity_score_3,similarity_score_4


In [153]:
# Define file_path
file_path = '../../data/prompt_data/nl/p1-f1-nl-s7-10.csv'

# Check if the file already exists
if not os.path.exists(file_path):
    df.to_csv(file_path, index=False)
    print("File saved successfully.")
else:
    print("File already exists. Dataframe was not saved to prevent overwriting.")

File saved successfully.


### **4. Similarity Scores: 11-20**

In [154]:
# Set sample size
sample_size = 10

In [155]:
# Process each chunk and get results using the OpenAI API
# response = get_responses(chunks, prompt, model, sample_size, delay)

Processing: 100%|██████████| 250/250 [56:50<00:00, 13.64s/chunk]

Total time taken: 3410.74 seconds





In [156]:
# Define filepath
file_path = '../../data/response/nl/r-f1-nl-s11-20.json'

# Check if the file already exists
if not os.path.exists(file_path):
    with open(file_path, 'w') as f:
        json.dump(response, f)
        print("File saved successfully.")
else:
    print("File already exists. JSON was not saved to prevent overwriting.")

File saved successfully.


In [260]:
# Extract data with regular expressions into dictionary
data_dict = process_responses(response)
data_dict

Error converting '8.00.' to float in the pair ('weten', 'begrijpen'): could not convert string to float: '8.00.'


{('oud', 'nieuw'): [1.0, 1.0, 0.08, 0.12, 0.0, 0.1, 0.01, 0.1, 0.1, 0.14],
 ('slim', 'intelligent'): [1.0,
  1.0,
  0.1,
  0.97,
  10.0,
  0.9,
  7.5,
  0.95,
  0.95,
  0.38],
 ('hard', 'moeilijk'): [0.1, 0.0, 0.03, 0.24, 0.0, 0.2, 0.01, 0.15, 0.1, 0.13],
 ('gelukkig', 'vrolijk'): [0.1, 1.0, 0.1, 0.89, 7.5, 0.7, 6.5, 0.7, 0.7, 0.57],
 ('hard', 'stoer'): [0.2, 0.0, 0.05, 0.36, 3.0, 0.3, 3.0, 0.3, 0.4, 0.17],
 ('snel', 'razendsnel'): [1.0,
  0.67,
  0.1,
  0.95,
  10.0,
  0.9,
  9.0,
  0.9,
  0.8,
  0.71],
 ('gelukkig', 'blij'): [0.8, 0.67, 0.08, 0.87, 7.5, 0.8, 8.0, 0.8, 0.8, 0.57],
 ('kort', 'lang'): [0.1, 0.0, 0.06, 0.37, 0.0, 0.1, 1.0, 0.1, 0.1, 0.17],
 ('dom', 'stom'): [0.9, 0.67, 0.1, 0.89, 5.0, 0.8, 6.5, 0.8, 0.7, 0.6],
 ('vreemd', 'eigenaardig'): [1.0,
  1.0,
  0.1,
  0.87,
  10.0,
  0.9,
  9.0,
  0.9,
  0.9,
  0.83],
 ('breed', 'smal'): [0.1, 0.0, 0.03, 0.13, 0.0, 0.1, 1.0, 0.1, 0.1, 0.17],
 ('makkelijk', 'moeilijk'): [0.1,
  0.0,
  0.07,
  0.25,
  0.0,
  0.2,
  1.0,
  0.2,
  0.

In [261]:
# Check for values higher/lower then sample size
higher_lower_samples = {key: value for key, value in data_dict.items() if len(value) < sample_size or len(value) > sample_size}
higher_lower_samples

{('overeenkomst', 'afspraak'): [6.0, 0.4, 0.5, 0.7, 8.0, 8.0, 6.0, 6.5, 6.0],
 ('overeenkomt', 'afspraak'): [3.0],
 ('weten', 'begrijpen'): [6.0, 6.0, 7.0, 8.0, 7.0, 7.5, 7.0, 8.0, 7.5]}

In [262]:
# Add the value 3.0 to the list for the key ('overeenkomst', 'afspraak')
data_dict[('overeenkomst', 'afspraak')].append(3.0)

# Remove the incorrect key ('overeenkomt', 'afspraak')
del data_dict[('overeenkomt', 'afspraak')]

In [263]:
# Check results
higher_lower_samples = {key: value for key, value in data_dict.items() if len(value) < sample_size or len(value) > sample_size}
higher_lower_samples

{('weten', 'begrijpen'): [6.0, 6.0, 7.0, 8.0, 7.0, 7.5, 7.0, 8.0, 7.5]}

In [229]:
# Create combined column for original dataframe
combined_cleaned_nl_simlex = cleaned_nl_simlex.copy()
combined_cleaned_nl_simlex['Combined_Columns'] = combined_cleaned_nl_simlex['word1'] + '_' + combined_cleaned_nl_simlex['word2']
combined_cleaned_nl_simlex = combined_cleaned_nl_simlex[['Combined_Columns']]

# Create combined column for extracted dataframe
df_combined = create_dataframe(data_dict)
df_combined['Combined_Columns'] = df_combined['word1'] + '_' + df_combined['word2']
df_combined = df_combined[['Combined_Columns']]

# Find values in df1 that are not in df2
missing_values = combined_cleaned_nl_simlex[~combined_cleaned_nl_simlex['Combined_Columns'].isin(df_combined['Combined_Columns'])]
print(missing_values)

# Check for duplicate word pairws
duplicate_combinations = df_combined.duplicated(subset='Combined_Columns', keep=False)

# Print rows with duplicate word pairs
print(df_combined[duplicate_combinations])

                   Combined_Columns
519      pot_huishoudelijk apparaat
791               kiezen_stemmen op
907            vermoeden_doen alsof
982      samenwerken_aansluiten bij
988  binnentreden_verschuldigd zijn
Empty DataFrame
Columns: [Combined_Columns]
Index: []


In [272]:
# Convert dict to Pandas DataFrame
df = create_dataframe(data_dict)

# Show results
df

Unnamed: 0,word1,word2,similarity_score_1,similarity_score_2,similarity_score_3,similarity_score_4,similarity_score_5,similarity_score_6,similarity_score_7,similarity_score_8,similarity_score_9,similarity_score_10
0,oud,nieuw,1.0,1.0,0.08,0.12,0.00,0.1,0.01,0.10,0.10,0.14
1,slim,intelligent,1.0,1.0,0.10,0.97,10.00,0.9,7.50,0.95,0.95,0.38
2,hard,moeilijk,0.1,0.0,0.03,0.24,0.00,0.2,0.01,0.15,0.10,0.13
3,gelukkig,vrolijk,0.1,1.0,0.10,0.89,7.50,0.7,6.50,0.70,0.70,0.57
4,hard,stoer,0.2,0.0,0.05,0.36,3.00,0.3,3.00,0.30,0.40,0.17
...,...,...,...,...,...,...,...,...,...,...,...,...
992,samenvoegen,verwerven,3.0,4.0,4.00,0.00,1.39,3.0,3.00,2.50,7.00,3.00
993,sturen,bijwonen,1.0,1.0,1.00,0.10,0.00,0.0,1.00,0.00,1.00,1.00
994,verzamelen,bijwonen,2.0,2.0,2.00,0.12,0.00,0.0,1.00,0.00,2.00,2.00
995,opnemen,intrekken,2.0,4.0,1.00,0.00,0.00,1.0,1.00,0.00,1.00,1.00


In [273]:
# Count null values
count_null_values = df.isnull().sum()
print("Null value counts per column:", count_null_values)

Null value counts per column: word1                  0
word2                  0
similarity_score_1     0
similarity_score_2     0
similarity_score_3     0
similarity_score_4     0
similarity_score_5     0
similarity_score_6     0
similarity_score_7     0
similarity_score_8     0
similarity_score_9     0
similarity_score_10    1
dtype: int64


In [274]:
# Check for rows with at least one null value
rows_with_null = df[df.isnull().any(axis=1)]
rows_with_null

Unnamed: 0,word1,word2,similarity_score_1,similarity_score_2,similarity_score_3,similarity_score_4,similarity_score_5,similarity_score_6,similarity_score_7,similarity_score_8,similarity_score_9,similarity_score_10
870,weten,begrijpen,6.0,6.0,7.0,8.0,7.0,7.5,7.0,8.0,7.5,


In [275]:
# Inspect response
response

["[('oud', 'nieuw', 1.00), ('slim', 'intelligent', 1.00), ('hard', 'moeilijk', 0.10), ('gelukkig', 'vrolijk', 0.10), ('hard', 'stoer', 0.20), ('snel', 'razendsnel', 1.00), ('gelukkig', 'blij', 0.80), ('kort', 'lang', 0.10), ('dom', 'stom', 0.90), ('vreemd', 'eigenaardig', 1.00), ('breed', 'smal', 0.10), ('makkelijk', 'moeilijk', 0.10), ('moeilijk', 'gemakkelijk', 0.90), ('slim', 'dom', 0.10), ('krankzinnig', 'gek', 1.00), ('gelukkig', 'kwaad', 0.10), ('uitgebreid', 'groot', 0.10), ('moeilijk', 'simpel', 0.90), ('nieuw', 'vers', 0.80), ('scherp', 'saai', 0.10), ('vlug', 'snel', 0.90), ('dom', 'dwaas', 0.90), ('prachtig', 'fantastisch', 0.90), ('eigenaardig', 'vreemd', 0.90), ('gelukkig', 'boos', 0.10), ('smal', 'breed', 0.10), ('eenvoudig', 'gemakkelijk', 0.90), ('oud', 'vers', 0.10), ('kennelijk', 'duidelijk', 0.10), ('betaalbaar', 'goedkoop', 1.00), ('leuk', 'grootmoedig', 0.10), ('raar', 'vreemd', 0.90), ('vreemd', 'normaal', 0.10), ('slecht', 'immoreel', 0.10), ('verdrietig', 'grapp

In [276]:
# Manually fix inconsistencies
df.loc[(df['word1'] == 'weten') & (df['word2'] == 'begrijpen'), 'similarity_score_10'] = 8.00

# Check value
df.loc[(df['word1'] == 'weten') & (df['word2'] == 'begrijpen')]

Unnamed: 0,word1,word2,similarity_score_1,similarity_score_2,similarity_score_3,similarity_score_4,similarity_score_5,similarity_score_6,similarity_score_7,similarity_score_8,similarity_score_9,similarity_score_10
870,weten,begrijpen,6.0,6.0,7.0,8.0,7.0,7.5,7.0,8.0,7.5,8.0


In [277]:
# Check for rows with at least one null value
rows_with_null = df[df.isnull().any(axis=1)]
rows_with_null

Unnamed: 0,word1,word2,similarity_score_1,similarity_score_2,similarity_score_3,similarity_score_4,similarity_score_5,similarity_score_6,similarity_score_7,similarity_score_8,similarity_score_9,similarity_score_10


In [278]:
# Define file_path
file_path = '../../data/prompt_data/nl/p1-f1-nl-s11-20.csv'

# Check if the file already exists
if not os.path.exists(file_path):
    df.to_csv(file_path, index=False)
    print("File saved successfully.")
else:
    print("File already exists. Dataframe was not saved to prevent overwriting.")

File saved successfully.


### **4. Create Final DatFrame**

In [294]:
# Load dataframes
sim_1_3 = pd.read_csv("../../data/prompt_data/nl/p1-f1-nl-s1-3.csv")
sim_4_6 = pd.read_csv("../../data/prompt_data/nl/p1-f1-nl-s4-6.csv")
sim_7_10 = pd.read_csv("../../data/prompt_data/nl/p1-f1-nl-s7-10.csv")
sim_11_20 = pd.read_csv("../../data/prompt_data/nl/p1-f1-nl-s11-20.csv")

In [297]:
# Rename columns
sim_1_3.columns = ['word1', 'word2', 'similarity_score_1', 'similarity_score_2', 'similarity_score_3']
sim_4_6.columns = ['word1', 'word2', 'similarity_score_4', 'similarity_score_5', 'similarity_score_6']
sim_7_10.columns = ['word1', 'word2', 'similarity_score_7', 'similarity_score_8', 'similarity_score_9', 'similarity_score_10']
sim_11_20.columns = ['word1', 'word2', 'similarity_score_11', 'similarity_score_12', 'similarity_score_13', 'similarity_score_14', 'similarity_score_15', 'similarity_score_16', 'similarity_score_17', 'similarity_score_18', 'similarity_score_19', 'similarity_score_20']

In [298]:
# Join dataframes
joined_df = pd.concat([sim_1_3, sim_4_6, sim_7_10, sim_11_20]).drop_duplicates(subset=['word1', 'word2'])

# Show results
joined_df

Unnamed: 0,word1,word2,similarity_score_1,similarity_score_2,similarity_score_3,similarity_score_4,similarity_score_5,similarity_score_6,similarity_score_7,similarity_score_8,...,similarity_score_11,similarity_score_12,similarity_score_13,similarity_score_14,similarity_score_15,similarity_score_16,similarity_score_17,similarity_score_18,similarity_score_19,similarity_score_20
0,oud,nieuw,3.0,0.10,0.10,,,,,,...,,,,,,,,,,
1,slim,intelligent,9.5,0.95,0.95,,,,,,...,,,,,,,,,,
2,hard,moeilijk,1.0,0.10,0.30,,,,,,...,,,,,,,,,,
3,gelukkig,vrolijk,7.5,0.70,0.70,,,,,,...,,,,,,,,,,
4,hard,stoer,5.0,0.40,0.40,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
519,pot,huishoudelijkapparaat,,,,,,,,,...,7.0,7.0,7.0,5.00,6.00,5.0,7.5,6.0,7.00,5.0
791,kiezen,stemmenop,,,,,,,,,...,0.0,1.0,3.0,1.00,3.00,0.0,0.0,0.0,0.01,2.0
907,vermoeden,doenalsof,,,,,,,,,...,0.0,0.0,2.0,1.00,1.00,3.0,5.0,2.0,2.00,6.5
982,samenwerken,aansluitenbij,,,,,,,,,...,5.0,5.0,4.0,0.38,3.93,7.0,6.0,0.0,5.00,5.0


### **5. Similarity Scores: 1-15**

In [324]:
# Define number of sublists
n_sublists = 25

In [325]:
# Process each chunk and get results using the OpenAI API
response = get_responses(chunks, prompt, model, sample_size, delay)

Processing:   0%|          | 0/375 [00:00<?, ?chunk/s]

Processing: 100%|██████████| 375/375 [1:25:17<00:00, 13.65s/chunk]

Total time taken: 5117.52 seconds





In [326]:
# Define filepath
file_path = '../../data/response/nl/response-f1.json'

# Check if the file already exists
if not os.path.exists(file_path):
    with open(file_path, 'w') as f:
        json.dump(response, f)
        print("File saved successfully.")
else:
    print("File already exists. JSON was not saved to prevent overwriting.")

File saved successfully.


In [None]:
# 1 sample = 217.16 seconds
# 3 samples = (12 chunks)
# 10 samples = 2002.08 seconds (12 chunks)
# 15 samples = 5117.52 seconds (25 chunks) -> 1.5 hours
# 20 samples = 2657.10 seconds (3 chunks)

In [330]:
# Extract data with regular expressions into dictionary
data_dict = process_responses(response)

# Check for values higher/lower then sample size
higher_lower_samples = {key: value for key, value in data_dict.items() if len(value) < sample_size or len(value) > sample_size}
higher_lower_samples

{('rondzwerven', 'dwalen'): [9.0,
  8.0,
  6.0,
  5.5,
  7.0,
  8.0,
  9.0,
  6.0,
  7.0,
  6.0,
  8.0,
  6.0,
  6.0,
  6.0],
 ('slagen', 'falen'): [3.0,
  2.0,
  4.0,
  5.5,
  3.0,
  8.0,
  3.0,
  1.0,
  2.0,
  2.0,
  6.0,
  3.0,
  3.0,
  7.0],
 ('besteden', 'redden'): [3.0,
  2.0,
  1.5,
  2.0,
  2.0,
  2.0,
  1.0,
  2.0,
  2.0,
  1.0,
  1.0,
  1.0,
  2.0,
  2.0],
 ('vertrekken', 'gaan'): [7.0,
  7.0,
  7.0,
  6.0,
  7.0,
  7.0,
  6.0,
  6.0,
  7.0,
  6.0,
  7.5,
  4.0,
  4.0,
  6.0],
 ('komen', 'bijwonen'): [3.0,
  2.0,
  3.0,
  3.0,
  2.0,
  3.0,
  2.0,
  3.0,
  2.0,
  3.0,
  1.0,
  2.0,
  2.0,
  3.0],
 ('weten', 'geloven'): [5.0,
  2.5,
  5.0,
  4.0,
  4.0,
  6.0,
  5.0,
  4.0,
  4.0,
  4.0,
  5.0,
  2.0,
  6.0,
  6.0],
 ('verzamelen', 'ontmoeten'): [4.0,
  3.0,
  6.5,
  3.5,
  5.0,
  5.0,
  6.0,
  5.0,
  7.0,
  4.0,
  3.5,
  4.0,
  4.0,
  5.0],
 ('maken', 'verdienen'): [1.0,
  2.0,
  2.0,
  1.5,
  3.0,
  2.0,
  1.0,
  3.0,
  2.0,
  1.0,
  1.5,
  1.0,
  3.0,
  3.0],
 ('vergeten', 

In [332]:
# Create combined column for original dataframe
combined_cleaned_nl_simlex = cleaned_nl_simlex.copy()
combined_cleaned_nl_simlex['Combined_Columns'] = combined_cleaned_nl_simlex['word1'] + '_' + combined_cleaned_nl_simlex['word2']
combined_cleaned_nl_simlex = combined_cleaned_nl_simlex[['Combined_Columns']]

# Create combined column for extracted dataframe
df_combined = create_dataframe(data_dict)
df_combined['Combined_Columns'] = df_combined['word1'] + '_' + df_combined['word2']
df_combined = df_combined[['Combined_Columns']]

# Find values in df1 that are not in df2
missing_values = combined_cleaned_nl_simlex[~combined_cleaned_nl_simlex['Combined_Columns'].isin(df_combined['Combined_Columns'])]
print(missing_values)

# Check for duplicate word pairws
duplicate_combinations = df_combined.duplicated(subset='Combined_Columns', keep=False)

# Print rows with duplicate word pairs
print(df_combined[duplicate_combinations])

                   Combined_Columns
519      pot_huishoudelijk apparaat
791               kiezen_stemmen op
907            vermoeden_doen alsof
982      samenwerken_aansluiten bij
988  binnentreden_verschuldigd zijn
Empty DataFrame
Columns: [Combined_Columns]
Index: []


In [333]:
# Convert dict to Pandas DataFrame
df = create_dataframe(data_dict)

# Show results
df

Unnamed: 0,word1,word2,similarity_score_1,similarity_score_2,similarity_score_3,similarity_score_4,similarity_score_5,similarity_score_6,similarity_score_7,similarity_score_8,similarity_score_9,similarity_score_10,similarity_score_11,similarity_score_12,similarity_score_13,similarity_score_14,similarity_score_15
0,oud,nieuw,0.1,0.10,0.01,0.00,0.05,0.2,0.10,0.10,0.0,0.05,0.0,2.0,0.03,0.10,0.18
1,slim,intelligent,0.9,0.95,0.95,8.00,9.50,0.9,0.95,0.95,0.4,0.95,9.0,9.0,0.97,0.95,0.79
2,hard,moeilijk,0.4,0.20,0.10,2.00,2.00,0.1,0.05,0.20,0.1,0.10,2.0,3.0,0.00,0.05,0.39
3,gelukkig,vrolijk,0.8,0.80,0.80,6.00,8.50,0.7,0.85,0.85,0.6,0.80,8.0,8.0,0.83,0.80,0.76
4,hard,stoer,0.7,0.40,0.45,2.00,6.00,0.5,0.05,0.65,0.4,0.20,4.0,6.0,0.61,0.50,0.44
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
992,samenvoegen,verwerven,3.0,1.50,2.11,2.06,4.00,3.0,3.00,4.00,3.0,2.00,4.8,1.0,1.00,5.50,4.00
993,sturen,bijwonen,0.0,1.00,1.56,0.00,1.00,0.0,0.00,1.00,1.0,1.00,0.0,1.0,1.00,0.00,1.00
994,verzamelen,bijwonen,1.0,1.00,1.56,0.00,5.00,0.0,3.00,3.00,1.0,2.00,0.0,1.0,4.00,2.50,1.33
995,opnemen,intrekken,1.0,2.00,1.11,0.00,1.00,0.0,1.00,2.00,2.0,1.00,0.0,1.0,2.00,0.00,1.00


In [334]:
# Count null values
count_null_values = df.isnull().sum()
print("Null value counts per column:", count_null_values)

Null value counts per column: word1                   0
word2                   0
similarity_score_1      0
similarity_score_2      0
similarity_score_3      0
similarity_score_4      0
similarity_score_5      0
similarity_score_6      0
similarity_score_7      0
similarity_score_8      0
similarity_score_9      0
similarity_score_10     0
similarity_score_11     0
similarity_score_12     0
similarity_score_13     0
similarity_score_14     0
similarity_score_15    21
dtype: int64


In [335]:
# Check for rows with at least one null value
rows_with_null = df[df.isnull().any(axis=1)]
rows_with_null

Unnamed: 0,word1,word2,similarity_score_1,similarity_score_2,similarity_score_3,similarity_score_4,similarity_score_5,similarity_score_6,similarity_score_7,similarity_score_8,similarity_score_9,similarity_score_10,similarity_score_11,similarity_score_12,similarity_score_13,similarity_score_14,similarity_score_15
817,rondzwerven,dwalen,9.0,8.0,6.0,5.5,7.0,8.0,9.0,6.0,7.0,6.0,8.0,6.0,6.0,6.0,
818,slagen,falen,3.0,2.0,4.0,5.5,3.0,8.0,3.0,1.0,2.0,2.0,6.0,3.0,3.0,7.0,
819,besteden,redden,3.0,2.0,1.5,2.0,2.0,2.0,1.0,2.0,2.0,1.0,1.0,1.0,2.0,2.0,
820,vertrekken,gaan,7.0,7.0,7.0,6.0,7.0,7.0,6.0,6.0,7.0,6.0,7.5,4.0,4.0,6.0,
821,komen,bijwonen,3.0,2.0,3.0,3.0,2.0,3.0,2.0,3.0,2.0,3.0,1.0,2.0,2.0,3.0,
822,weten,geloven,5.0,2.5,5.0,4.0,4.0,6.0,5.0,4.0,4.0,4.0,5.0,2.0,6.0,6.0,
823,verzamelen,ontmoeten,4.0,3.0,6.5,3.5,5.0,5.0,6.0,5.0,7.0,4.0,3.5,4.0,4.0,5.0,
824,maken,verdienen,1.0,2.0,2.0,1.5,3.0,2.0,1.0,3.0,2.0,1.0,1.5,1.0,3.0,3.0,
825,vergeten,negeren,7.0,4.0,6.5,5.0,5.0,7.0,7.0,7.0,6.0,6.0,8.5,4.0,5.0,4.0,
826,vermenigvuldigen,toevoegen,2.0,1.5,4.5,2.5,4.0,5.0,4.5,4.0,3.0,4.0,2.0,3.5,2.0,3.0,


In [339]:
# Inspect response
response[-1]

"('brengen', 'sturen', 4.00), \n('opnemen', 'leren', 1.67), \n('verwerven', 'vinden', 2.33), \n('vertrekken', 'verschijnen', 1.00), \n('creëren', 'vernietigen', 1.33), \n('beginnen', 'gaan', 3.00), \n('krijgen', 'kopen', 3.00), \n('verzamelen', 'opslaan', 6.50), \n('vervangen', 'herstellen', 3.00), \n('samenvoegen', 'toevoegen', 6.50), \n('samenvoegen', 'trouwen', 1.00), \n('accepteren', 'bezorgen', 1.33), \n('bijvoegen', 'samenvoegen', 6.00), \n('plaatsen', 'ophangen', 4.00), \n('gaan', 'binnenkomen', 2.00), \n('communiceren', 'bidden', 1.00), \n('geven', 'stelen', 1.33), \n('toevoegen', 'bouwen', 1.33), \n('brengen', 'herstellen', 3.33), \n('begrijpen', 'voldoen', 1.00), \n('belichamen', 'beslissen', 1.00), \n('organiseren', 'worden', 1.00), \n('geven', 'weten', 2.00), \n('zeggen', 'verifiëren', 1.33), \n('samenwerken', 'aansluiten bij', 2.00), \n('regelen', 'vereisen', 1.33), \n('lenen', 'willen', 1.33), \n('onderzoeken', 'streven', 2.00), \n('negeren', 'verkennen', 1.33), \n('breng

In [None]:
# Manually fix inconsistencies
# df.loc[(df['word1'] == 'weten') & (df['word2'] == 'begrijpen'), 'similarity_score_10'] = 8.00

# Check value
# df.loc[(df['word1'] == 'weten') & (df['word2'] == 'begrijpen')]

Unnamed: 0,word1,word2,similarity_score_1,similarity_score_2,similarity_score_3,similarity_score_4,similarity_score_5,similarity_score_6,similarity_score_7,similarity_score_8,similarity_score_9,similarity_score_10
870,weten,begrijpen,6.0,6.0,7.0,8.0,7.0,7.5,7.0,8.0,7.5,8.0


In [337]:
# Check for rows with at least one null value
rows_with_null = df[df.isnull().any(axis=1)]
rows_with_null

In [338]:
# Define file_path
file_path = '../../data/prompt_data/nl/prompt-f1.csv'

# Check if the file already exists
if not os.path.exists(file_path):
    df.to_csv(file_path, index=False)
    print("File saved successfully.")
else:
    print("File already exists. Dataframe was not saved to prevent overwriting.")

File saved successfully.
