### **0. Set-up**

In [1]:
# Import libraries and utils
%run '../../utils.ipynb'

In [2]:
#  Get api key
load_dotenv()
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

# Set client
client = OpenAI()

In [3]:
# Load dataframe
cleaned_nl_simlex = pd.read_csv("../../../data/dataset/cleaned-nl-simlex-999.csv")

# Remove first three word pairs
cleaned_nl_simlex = cleaned_nl_simlex.iloc[3:]

# Select subset
# cleaned_nl_simlex = cleaned_nl_simlex.head(150)

# Convert to tuple
tuples_list = list(zip(cleaned_nl_simlex['word1'], cleaned_nl_simlex['word2']))

In [4]:
# Show results
tuples_list

[('gelukkig', 'vrolijk'),
 ('hard', 'stoer'),
 ('snel', 'razendsnel'),
 ('gelukkig', 'blij'),
 ('kort', 'lang'),
 ('dom', 'stom'),
 ('vreemd', 'eigenaardig'),
 ('breed', 'smal'),
 ('makkelijk', 'moeilijk'),
 ('moeilijk', 'gemakkelijk'),
 ('slim', 'dom'),
 ('krankzinnig', 'gek'),
 ('gelukkig', 'kwaad'),
 ('uitgebreid', 'groot'),
 ('moeilijk', 'simpel'),
 ('nieuw', 'vers'),
 ('scherp', 'saai'),
 ('vlug', 'snel'),
 ('dom', 'dwaas'),
 ('prachtig', 'fantastisch'),
 ('eigenaardig', 'vreemd'),
 ('gelukkig', 'boos'),
 ('smal', 'breed'),
 ('eenvoudig', 'gemakkelijk'),
 ('oud', 'vers'),
 ('kennelijk', 'duidelijk'),
 ('betaalbaar', 'goedkoop'),
 ('leuk', 'grootmoedig'),
 ('raar', 'vreemd'),
 ('vreemd', 'normaal'),
 ('slecht', 'immoreel'),
 ('verdrietig', 'grappig'),
 ('prachtig', 'geweldig'),
 ('schuldig', 'beschaamd'),
 ('mooi', 'prachtig'),
 ('zelfverzekerd', 'zeker'),
 ('dom', 'onderontwikkeld'),
 ('groot', 'flexibel'),
 ('aardig', 'wreed'),
 ('ongeduldig', 'nerveus'),
 ('groot', 'breed'),
 ('

### **1. Define and Evaluate Parameters**

In [5]:
# Define the prompt
prompt = ("Beoordeel de semantische gelijkenis van elk woordpaar op een schaal van 0 tot 10, waarbij 0 geen semantische gelijkenis vertegenwoordigt, en 10 perfecte semantische gelijkenis.")
user_content = ("[('oud', 'nieuw'), ('slim', 'intelligent'), ('hard', 'moeilijk')]")
assistant_content = ("[('oud', 'nieuw', 1.94), ('slim', 'intelligent', 8.19), ('hard', 'moeilijk', 4.46)]")

In [6]:
# Define model
model = "gpt-3.5-turbo-0125"

# Set sample size
sample_size = 15

# Delay between individual API calls
delay = 15.0

# Define number of sublists
n_sublists = 35

In [7]:
# Split the list
chunks = split_into_n_lists(tuples_list, n_sublists)

# Count the number of lists
print(len(chunks))

35


In [8]:
# Print the prompts for each chunk
print_prompts(chunks, prompt)

Beoordeel de semantische gelijkenis van elk woordpaar op een schaal van 0 tot 10, waarbij 0 geen semantische gelijkenis vertegenwoordigt, en 10 perfecte semantische gelijkenis. --- ["('gelukkig', 'vrolijk'), ('hard', 'stoer'), ('snel', 'razendsnel'), ('gelukkig', 'blij'), ('kort', 'lang'), ('dom', 'stom'), ('vreemd', 'eigenaardig'), ('breed', 'smal'), ('makkelijk', 'moeilijk'), ('moeilijk', 'gemakkelijk'), ('slim', 'dom'), ('krankzinnig', 'gek'), ('gelukkig', 'kwaad'), ('uitgebreid', 'groot'), ('moeilijk', 'simpel'), ('nieuw', 'vers'), ('scherp', 'saai'), ('vlug', 'snel'), ('dom', 'dwaas'), ('prachtig', 'fantastisch'), ('eigenaardig', 'vreemd'), ('gelukkig', 'boos'), ('smal', 'breed'), ('eenvoudig', 'gemakkelijk'), ('oud', 'vers'), ('kennelijk', 'duidelijk'), ('betaalbaar', 'goedkoop'), ('leuk', 'grootmoedig'), ('raar', 'vreemd')"]
Beoordeel de semantische gelijkenis van elk woordpaar op een schaal van 0 tot 10, waarbij 0 geen semantische gelijkenis vertegenwoordigt, en 10 perfecte sem

In [9]:
# Load the encoding
encoding = tiktoken.get_encoding("cl100k_base")

# Count the tokens per chunk
token_counts = count_tokens_with_tiktoken(chunks, prompt)
print("Token counts for each formatted prompt:", token_counts)

Token counts for each formatted prompt: [329, 336, 317, 317, 307, 288, 299, 305, 301, 301, 297, 311, 313, 299, 302, 287, 298, 312, 311, 311, 315, 288, 290, 297, 308, 321, 290, 325, 320, 323, 327, 337, 331, 329, 332]


### **2. Extract and Process Data**

In [10]:
# Process each chunk and get results using the OpenAI API
response = get_responses_conversational(chunks, prompt, user_content, assistant_content, model, sample_size, delay)

Processing:   0%|          | 0/525 [00:00<?, ?chunk/s]

Processing: 100%|██████████| 525/525 [3:23:47<00:00, 23.29s/chunk]  

Total time taken: 12227.79 seconds





In [11]:
# Define filepath
file_path = '../../../data/response/nl/gpt-3.5-turbo-0125/f8.json'

# Check if the file already exists
if not os.path.exists(file_path):
    with open(file_path, 'w') as f:
        json.dump(response, f)
        print("File saved successfully.")
else:
    print("File already exists. JSON was not saved to prevent overwriting.")

File saved successfully.


In [12]:
# Extract data with regular expressions into dictionary
data_dict = process_responses(response)

# Check for values higher/lower then sample size
higher_lower_samples = {key: value for key, value in data_dict.items() if len(value) < sample_size or len(value) > sample_size}
print(higher_lower_samples)

{('dom', 'onderontwikkeld'): [4.34, 3.35, 2.21, 2.17, 4.5, 3.19, 3.5, 4.36, 2.55, 3.06, 3.26, 4.88, 3.39, 4.71], ('dom', 'onderontwikkel'): [2.5], ('enorm', 'sprakeloos'): [1.89, 1.43, 2.11, 1.25, 1.83, 1.5, 2.88, 1.46, 1.07, 2.11, 0.41, 1.0, 3.25, 1.54], ('enorm', 'spraakeloos'): [3.63], ('verjaardag', 'datum'): [2.66, 6.5, 2.5, 5.67, 2.71, 2.42, 3.51, 5.17, 5.11, 2.15, 2.73, 6.2, 1.0, 1.76], ('vergoeding', 'betaling'): [5.47, 6.34, 5.1, 7.24, 2.38, 5.42, 4.09, 6.24, 5.23, 3.35, 4.18, 5.0, 2.0, 3.16], ('bijbel', 'psalm'): [0.94, 4.0, 4.86, 4.03, 2.81, 4.35, 2.76, 2.19, 3.84, 3.71, 2.45, 3.15, 1.75, 2.96], ('uitgang', 'deuropening'): [6.09, 3.0, 7.2, 2.94, 6.48, 1.15, 6.45, 4.05, 6.02, 2.32, 3.18, 3.2, 6.0, 3.04], ('man', 'schildwacht'): [4.38, 1.4, 1.8, 2.44, 3.38, 2.5, 2.71, 2.18, 2.38, 1.61, 3.36, 3.2, 4.75, 1.71], ('gangpad', 'hal'): [5.16, 3.0, 5.7, 5.16, 1.77, 3.04, 2.98, 2.83, 1.11, 4.58, 2.55, 2.8, 2.75, 1.67], ('whisky', 'jenever'): [5.0, 7.0, 6.5, 6.86, 2.37, 5.12, 6.08, 7.1,

In [13]:
# Process data and print duplicate word pairs
print_duplicate_word_pairs(cleaned_nl_simlex, data_dict)

Empty DataFrame
Columns: [Combined_Columns]
Index: []
Empty DataFrame
Columns: [Combined_Columns]
Index: []


In [25]:
# Convert dict to Pandas DataFrame
df = create_dataframe(data_dict)

# Show results
df

Unnamed: 0,word1,word2,similarity_score_1,similarity_score_2,similarity_score_3,similarity_score_4,similarity_score_5,similarity_score_6,similarity_score_7,similarity_score_8,similarity_score_9,similarity_score_10,similarity_score_11,similarity_score_12,similarity_score_13,similarity_score_14,similarity_score_15
0,gelukkig,vrolijk,7.69,7.94,6.78,7.69,6.88,7.14,7.48,6.87,8.25,8.27,7.56,7.84,5.81,5.78,7.50
1,hard,stoer,4.31,6.04,5.30,5.08,5.43,6.00,6.05,3.75,4.04,7.36,6.38,6.12,4.23,5.04,7.50
2,snel,razendsnel,8.77,8.30,7.93,9.84,9.42,9.00,8.84,8.50,9.52,9.09,8.24,8.82,8.93,8.42,9.50
3,gelukkig,blij,6.62,7.43,7.81,6.73,8.43,7.14,8.02,6.07,8.68,7.64,7.16,6.76,8.68,7.73,8.50
4,kort,lang,2.31,4.32,4.98,5.22,2.38,3.13,4.47,3.80,2.08,4.09,0.24,5.18,5.72,3.00,4.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
991,samenvoegen,verwerven,2.22,0.00,4.94,0.68,2.84,1.08,1.10,1.50,0.99,3.02,0.00,0.00,2.01,2.34,2.34
992,sturen,bijwonen,0.06,0.00,0.47,0.07,0.57,1.08,1.08,1.14,0.00,1.40,0.00,0.00,0.78,0.65,0.22
993,verzamelen,bijwonen,0.66,0.00,0.74,0.03,0.64,1.08,1.08,1.22,4.07,2.22,0.00,0.00,2.12,1.78,1.16
994,opnemen,intrekken,1.52,1.43,1.98,0.42,0.35,1.08,1.09,1.42,1.22,1.79,0.00,0.00,1.56,0.12,1.86


In [26]:
# Count null values
count_null_values = df.isnull().sum()
print("Null value counts per column:", count_null_values)

Null value counts per column: word1                   0
word2                   0
similarity_score_1      0
similarity_score_2      2
similarity_score_3      2
similarity_score_4      2
similarity_score_5      2
similarity_score_6      2
similarity_score_7      2
similarity_score_8      2
similarity_score_9      2
similarity_score_10     2
similarity_score_11     2
similarity_score_12     2
similarity_score_13     2
similarity_score_14     2
similarity_score_15    18
dtype: int64


In [27]:
# Check for rows with at least one null value
rows_with_null = df[df.isnull().any(axis=1)]
rows_with_null

Unnamed: 0,word1,word2,similarity_score_1,similarity_score_2,similarity_score_3,similarity_score_4,similarity_score_5,similarity_score_6,similarity_score_7,similarity_score_8,similarity_score_9,similarity_score_10,similarity_score_11,similarity_score_12,similarity_score_13,similarity_score_14,similarity_score_15
36,dom,onderontwikkeld,4.34,3.35,2.21,2.17,4.5,3.19,3.5,4.36,2.55,3.06,3.26,4.88,3.39,4.71,
58,dom,onderontwikkel,2.5,,,,,,,,,,,,,,
104,enorm,sprakeloos,1.89,1.43,2.11,1.25,1.83,1.5,2.88,1.46,1.07,2.11,0.41,1.0,3.25,1.54,
117,enorm,spraakeloos,3.63,,,,,,,,,,,,,,
481,verjaardag,datum,2.66,6.5,2.5,5.67,2.71,2.42,3.51,5.17,5.11,2.15,2.73,6.2,1.0,1.76,
482,vergoeding,betaling,5.47,6.34,5.1,7.24,2.38,5.42,4.09,6.24,5.23,3.35,4.18,5.0,2.0,3.16,
483,bijbel,psalm,0.94,4.0,4.86,4.03,2.81,4.35,2.76,2.19,3.84,3.71,2.45,3.15,1.75,2.96,
484,uitgang,deuropening,6.09,3.0,7.2,2.94,6.48,1.15,6.45,4.05,6.02,2.32,3.18,3.2,6.0,3.04,
485,man,schildwacht,4.38,1.4,1.8,2.44,3.38,2.5,2.71,2.18,2.38,1.61,3.36,3.2,4.75,1.71,
486,gangpad,hal,5.16,3.0,5.7,5.16,1.77,3.04,2.98,2.83,1.11,4.58,2.55,2.8,2.75,1.67,


In [17]:
# Define the prompt
prompt = ("Beoordeel de semantische gelijkenis van elk woordpaar op een schaal van 0 tot 10, waarbij 0 geen semantische gelijkenis vertegenwoordigt, en 10 perfecte semantische gelijkenis.")
user_content_1 = ("[('oud', 'nieuw'), ('slim', 'intelligent'), ('hard', 'moeilijk')]")
assistant_content = ("[('oud', 'nieuw', 1.94), ('slim', 'intelligent', 8.19), ('hard', 'moeilijk', 4.46)]")
user_content_2 = "[('dom', 'onderontwikkeld'), ('enorm', 'sprakeloos')]"

# Define the message
messages = [
    {"role": "system", "content": prompt},
    {"role": "user", "content": user_content_1},
    {"role": "assistant", "content": assistant_content},
    {"role": "user", "content": user_content_2},
    ]

# Call the OpenAI API
# completion = client.chat.completions.create(
#     model=model,
#     messages=messages,
#     n=1,
#     stop=None)

# Show results
print(completion.choices[0].message.content)

[('dom', 'onderontwikkeld', 6.52), ('enorm', 'sprakeloos', 1.23)]


In [28]:
# Manually fix inconsistencies
df.loc[(df['word1'] == 'dom') & (df['word2'] == 'onderontwikkeld'), 'similarity_score_15'] = 6.52

# Check value
df.loc[(df['word1'] == 'dom') & (df['word2'] == 'onderontwikkeld')]

Unnamed: 0,word1,word2,similarity_score_1,similarity_score_2,similarity_score_3,similarity_score_4,similarity_score_5,similarity_score_6,similarity_score_7,similarity_score_8,similarity_score_9,similarity_score_10,similarity_score_11,similarity_score_12,similarity_score_13,similarity_score_14,similarity_score_15
36,dom,onderontwikkeld,4.34,3.35,2.21,2.17,4.5,3.19,3.5,4.36,2.55,3.06,3.26,4.88,3.39,4.71,6.52


In [29]:
# Manually fix inconsistencies
df.loc[(df['word1'] == 'enorm') & (df['word2'] == 'sprakeloos'), 'similarity_score_15'] = 1.23

# Check value
df.loc[(df['word1'] == 'enorm') & (df['word2'] == 'sprakeloos')]

Unnamed: 0,word1,word2,similarity_score_1,similarity_score_2,similarity_score_3,similarity_score_4,similarity_score_5,similarity_score_6,similarity_score_7,similarity_score_8,similarity_score_9,similarity_score_10,similarity_score_11,similarity_score_12,similarity_score_13,similarity_score_14,similarity_score_15
104,enorm,sprakeloos,1.89,1.43,2.11,1.25,1.83,1.5,2.88,1.46,1.07,2.11,0.41,1.0,3.25,1.54,1.23


In [30]:
# Drop faulty row
df = df[~((df['word1'] == 'dom') & (df['word2'] == 'onderontwikkel'))]
df = df[~((df['word1'] == 'enorm') & (df['word2'] == 'spraakeloos'))]

# Show results
df

Unnamed: 0,word1,word2,similarity_score_1,similarity_score_2,similarity_score_3,similarity_score_4,similarity_score_5,similarity_score_6,similarity_score_7,similarity_score_8,similarity_score_9,similarity_score_10,similarity_score_11,similarity_score_12,similarity_score_13,similarity_score_14,similarity_score_15
0,gelukkig,vrolijk,7.69,7.94,6.78,7.69,6.88,7.14,7.48,6.87,8.25,8.27,7.56,7.84,5.81,5.78,7.50
1,hard,stoer,4.31,6.04,5.30,5.08,5.43,6.00,6.05,3.75,4.04,7.36,6.38,6.12,4.23,5.04,7.50
2,snel,razendsnel,8.77,8.30,7.93,9.84,9.42,9.00,8.84,8.50,9.52,9.09,8.24,8.82,8.93,8.42,9.50
3,gelukkig,blij,6.62,7.43,7.81,6.73,8.43,7.14,8.02,6.07,8.68,7.64,7.16,6.76,8.68,7.73,8.50
4,kort,lang,2.31,4.32,4.98,5.22,2.38,3.13,4.47,3.80,2.08,4.09,0.24,5.18,5.72,3.00,4.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
991,samenvoegen,verwerven,2.22,0.00,4.94,0.68,2.84,1.08,1.10,1.50,0.99,3.02,0.00,0.00,2.01,2.34,2.34
992,sturen,bijwonen,0.06,0.00,0.47,0.07,0.57,1.08,1.08,1.14,0.00,1.40,0.00,0.00,0.78,0.65,0.22
993,verzamelen,bijwonen,0.66,0.00,0.74,0.03,0.64,1.08,1.08,1.22,4.07,2.22,0.00,0.00,2.12,1.78,1.16
994,opnemen,intrekken,1.52,1.43,1.98,0.42,0.35,1.08,1.09,1.42,1.22,1.79,0.00,0.00,1.56,0.12,1.86


In [31]:
# Check for rows with at least one null value
rows_with_null = df[df.isnull().any(axis=1)]
rows_with_null

Unnamed: 0,word1,word2,similarity_score_1,similarity_score_2,similarity_score_3,similarity_score_4,similarity_score_5,similarity_score_6,similarity_score_7,similarity_score_8,similarity_score_9,similarity_score_10,similarity_score_11,similarity_score_12,similarity_score_13,similarity_score_14,similarity_score_15
481,verjaardag,datum,2.66,6.5,2.5,5.67,2.71,2.42,3.51,5.17,5.11,2.15,2.73,6.2,1.0,1.76,
482,vergoeding,betaling,5.47,6.34,5.1,7.24,2.38,5.42,4.09,6.24,5.23,3.35,4.18,5.0,2.0,3.16,
483,bijbel,psalm,0.94,4.0,4.86,4.03,2.81,4.35,2.76,2.19,3.84,3.71,2.45,3.15,1.75,2.96,
484,uitgang,deuropening,6.09,3.0,7.2,2.94,6.48,1.15,6.45,4.05,6.02,2.32,3.18,3.2,6.0,3.04,
485,man,schildwacht,4.38,1.4,1.8,2.44,3.38,2.5,2.71,2.18,2.38,1.61,3.36,3.2,4.75,1.71,
486,gangpad,hal,5.16,3.0,5.7,5.16,1.77,3.04,2.98,2.83,1.11,4.58,2.55,2.8,2.75,1.67,
487,whisky,jenever,5.0,7.0,6.5,6.86,2.37,5.12,6.08,7.1,6.07,7.62,7.09,2.85,7.0,5.05,
488,bloed,merg,4.06,2.0,5.7,2.16,2.4,3.38,1.82,3.55,3.54,3.05,3.91,1.9,2.0,2.56,
489,olie,nerts,0.63,1.2,3.1,2.15,1.96,1.15,1.87,1.45,1.16,1.88,1.18,1.05,2.0,0.18,
490,vloer,dek,3.75,3.5,5.8,3.53,5.38,3.65,4.93,4.3,2.21,4.26,3.0,3.6,3.0,2.89,


In [32]:
# Count null values
count_null_values = df.isnull().sum()
print("Null value counts per column:", count_null_values)

Null value counts per column: word1                   0
word2                   0
similarity_score_1      0
similarity_score_2      0
similarity_score_3      0
similarity_score_4      0
similarity_score_5      0
similarity_score_6      0
similarity_score_7      0
similarity_score_8      0
similarity_score_9      0
similarity_score_10     0
similarity_score_11     0
similarity_score_12     0
similarity_score_13     0
similarity_score_14     0
similarity_score_15    14
dtype: int64


In [33]:
# Extract missing word pairs
missing_word_pair_list = list(zip(rows_with_null['word1'], rows_with_null['word2']))
missing_word_pair_list

[('verjaardag', 'datum'),
 ('vergoeding', 'betaling'),
 ('bijbel', 'psalm'),
 ('uitgang', 'deuropening'),
 ('man', 'schildwacht'),
 ('gangpad', 'hal'),
 ('whisky', 'jenever'),
 ('bloed', 'merg'),
 ('olie', 'nerts'),
 ('vloer', 'dek'),
 ('dak', 'vloer'),
 ('boek', 'artikel'),
 ('kiezen', 'stemmenop'),
 ('luisteren', 'gehoorzamen')]

In [34]:
# Define the prompt
prompt = ("Beoordeel de semantische gelijkenis van elk woordpaar op een schaal van 0 tot 10, waarbij 0 geen semantische gelijkenis vertegenwoordigt, en 10 perfecte semantische gelijkenis.")
user_content_1 = ("[('oud', 'nieuw'), ('slim', 'intelligent'), ('hard', 'moeilijk')]")
assistant_content = ("[('oud', 'nieuw', 1.94), ('slim', 'intelligent', 8.19), ('hard', 'moeilijk', 4.46)]")
user_content_2 = f'"{missing_word_pair_list}"'

In [35]:
# Define the message
messages = [
    {"role": "system", "content": prompt},
    {"role": "user", "content": user_content_1},
    {"role": "assistant", "content": assistant_content},
    {"role": "user", "content": user_content_2},
    ]

# Call the OpenAI API
completion = client.chat.completions.create(
    model=model,
    messages=messages,
    n=1,
    stop=None)

# Store the response content
missing_word_pairs_response = [completion.choices[0].message.content]

In [36]:
# Extract data with regular expressions into dictionary
missing_word_pairs_dict = process_responses(missing_word_pairs_response)
missing_word_pairs_dict

{('verjaardag', 'datum'): [3.16],
 ('vergoeding', 'betaling'): [7.14],
 ('bijbel', 'psalm'): [2.78],
 ('uitgang', 'deuropening'): [7.26],
 ('man', 'schildwacht'): [4.67],
 ('gangpad', 'hal'): [5.16],
 ('whisky', 'jenever'): [6.91],
 ('bloed', 'merg'): [4.18],
 ('olie', 'nerts'): [1.39],
 ('vloer', 'dek'): [3.42],
 ('dak', 'vloer'): [1.82],
 ('boek', 'artikel'): [4.17],
 ('kiezen', 'stemmenop'): [4.74],
 ('luisteren', 'gehoorzamen'): [8.81]}

In [37]:
# Iterate over the DataFrame and replace missing values
for index, row in df.iterrows():
    word_pair = (row['word1'], row['word2'])
    # Check if the current value is NaN
    if pd.isna(row['similarity_score_15']):
        if word_pair in missing_word_pairs_dict:
            # Extract the first element from the list to get the scalar value
            df.at[index, 'similarity_score_15'] = missing_word_pairs_dict[word_pair][0]

# Print to check if there are any NaN values left in 'similarity_score_15'
print(df[df['similarity_score_15'].isna()])

Empty DataFrame
Columns: [word1, word2, similarity_score_1, similarity_score_2, similarity_score_3, similarity_score_4, similarity_score_5, similarity_score_6, similarity_score_7, similarity_score_8, similarity_score_9, similarity_score_10, similarity_score_11, similarity_score_12, similarity_score_13, similarity_score_14, similarity_score_15]
Index: []


In [38]:
# Filter the DataFrame based on pairs
filtered_df = df[df.apply(lambda row: (row['word1'], row['word2']) in missing_word_pair_list, axis=1)]

# Select only the similarity_score_15 column
result = filtered_df[['word1', 'word2', 'similarity_score_15']]

# Print or output the result
print(result)

          word1        word2  similarity_score_15
481  verjaardag        datum                 3.16
482  vergoeding     betaling                 7.14
483      bijbel        psalm                 2.78
484     uitgang  deuropening                 7.26
485         man  schildwacht                 4.67
486     gangpad          hal                 5.16
487      whisky      jenever                 6.91
488       bloed         merg                 4.18
489        olie        nerts                 1.39
490       vloer          dek                 3.42
491         dak        vloer                 1.82
621        boek      artikel                 4.17
790      kiezen    stemmenop                 4.74
795   luisteren  gehoorzamen                 8.81


In [39]:
# Check for rows with at least one null value
rows_with_null = df[df.isnull().any(axis=1)]
rows_with_null

Unnamed: 0,word1,word2,similarity_score_1,similarity_score_2,similarity_score_3,similarity_score_4,similarity_score_5,similarity_score_6,similarity_score_7,similarity_score_8,similarity_score_9,similarity_score_10,similarity_score_11,similarity_score_12,similarity_score_13,similarity_score_14,similarity_score_15


In [40]:
# Count null values
count_null_values = df.isnull().sum()
print("Null value counts per column:", count_null_values)

Null value counts per column: word1                  0
word2                  0
similarity_score_1     0
similarity_score_2     0
similarity_score_3     0
similarity_score_4     0
similarity_score_5     0
similarity_score_6     0
similarity_score_7     0
similarity_score_8     0
similarity_score_9     0
similarity_score_10    0
similarity_score_11    0
similarity_score_12    0
similarity_score_13    0
similarity_score_14    0
similarity_score_15    0
dtype: int64


In [41]:
df

Unnamed: 0,word1,word2,similarity_score_1,similarity_score_2,similarity_score_3,similarity_score_4,similarity_score_5,similarity_score_6,similarity_score_7,similarity_score_8,similarity_score_9,similarity_score_10,similarity_score_11,similarity_score_12,similarity_score_13,similarity_score_14,similarity_score_15
0,gelukkig,vrolijk,7.69,7.94,6.78,7.69,6.88,7.14,7.48,6.87,8.25,8.27,7.56,7.84,5.81,5.78,7.50
1,hard,stoer,4.31,6.04,5.30,5.08,5.43,6.00,6.05,3.75,4.04,7.36,6.38,6.12,4.23,5.04,7.50
2,snel,razendsnel,8.77,8.30,7.93,9.84,9.42,9.00,8.84,8.50,9.52,9.09,8.24,8.82,8.93,8.42,9.50
3,gelukkig,blij,6.62,7.43,7.81,6.73,8.43,7.14,8.02,6.07,8.68,7.64,7.16,6.76,8.68,7.73,8.50
4,kort,lang,2.31,4.32,4.98,5.22,2.38,3.13,4.47,3.80,2.08,4.09,0.24,5.18,5.72,3.00,4.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
991,samenvoegen,verwerven,2.22,0.00,4.94,0.68,2.84,1.08,1.10,1.50,0.99,3.02,0.00,0.00,2.01,2.34,2.34
992,sturen,bijwonen,0.06,0.00,0.47,0.07,0.57,1.08,1.08,1.14,0.00,1.40,0.00,0.00,0.78,0.65,0.22
993,verzamelen,bijwonen,0.66,0.00,0.74,0.03,0.64,1.08,1.08,1.22,4.07,2.22,0.00,0.00,2.12,1.78,1.16
994,opnemen,intrekken,1.52,1.43,1.98,0.42,0.35,1.08,1.09,1.42,1.22,1.79,0.00,0.00,1.56,0.12,1.86


In [42]:
# Define file_path
file_path = '../../../data/prompt/nl/gpt-3.5-turbo-0125/f8.csv'

# Check if the file already exists
if not os.path.exists(file_path):
    df.to_csv(file_path, index=False)
    print("File saved successfully.")
else:
    print("File already exists. Dataframe was not saved to prevent overwriting.")

File saved successfully.
