### **0. Set-up**

In [1]:
# Run the shared utilities notebook. No other cell in this notebook imports
# anything, so the libraries (pd, os, json, tiktoken, OpenAI, load_dotenv) and
# the helper functions used below presumably all come from here — TODO confirm.
%run '../../utils.ipynb'

In [2]:
# Load environment variables and read the OpenAI API key.
# NOTE(review): the key is not passed to OpenAI() below, so the client
# presumably picks it up from the environment itself — confirm.
load_dotenv()
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')

# Create the OpenAI client
client = OpenAI()

In [3]:
# Load the cleaned Dutch SimLex-999 dataset and keep only the rows at
# positions 664 up to and including 996, the part handled by this notebook
cleaned_nl_simlex = pd.read_csv("../../../data/dataset/cleaned-nl-simlex-999.csv").iloc[664:997]

# Build the list of (word1, word2) tuples that will be sent to the model
tuples_list = [
    (first_word, second_word)
    for first_word, second_word in zip(cleaned_nl_simlex['word1'], cleaned_nl_simlex['word2'])
]

In [4]:
# Display the complete list of (word1, word2) tuples built above
tuples_list

[('jongen', 'soldaat'),
 ('buik', 'onderbuik'),
 ('kerel', 'meisje'),
 ('bed', 'stoel'),
 ('kleding', 'mantel'),
 ('pistool', 'mes'),
 ('blik', 'metaal'),
 ('fles', 'container'),
 ('kip', 'kalkoen'),
 ('vlees', 'broodje'),
 ('arm', 'botten'),
 ('hals', 'ruggengraat'),
 ('appel', 'citroen'),
 ('lijden', 'verdriet'),
 ('opdracht', 'taak'),
 ('nacht', 'dageraad'),
 ('diner', 'soep'),
 ('kalf', 'stier'),
 ('sneeuw', 'storm'),
 ('nagel', 'hand'),
 ('hond', 'paard'),
 ('arm', 'hals'),
 ('kogel', 'kanon'),
 ('griep', 'koorts'),
 ('vergoeding', 'salaris'),
 ('zenuw', 'hersenen'),
 ('beest', 'dier'),
 ('diner', 'kip'),
 ('meisje', 'dienstmeisje'),
 ('kind', 'jongen'),
 ('alcohol', 'wijn'),
 ('neus', 'mond'),
 ('weg', 'steeg'),
 ('bel', 'deur'),
 ('doos', 'hoed'),
 ('geloof', 'indruk'),
 ('vooroordeel', 'mening'),
 ('aandacht', 'bewustzijn'),
 ('woede', 'stemming'),
 ('elegantie', 'stijl'),
 ('schoonheid', 'leeftijd'),
 ('boek', 'thema'),
 ('vriend', 'moeder'),
 ('vitamine', 'ijzer'),
 ('auto', 

### **1. Define and Evaluate Parameters**

In [5]:
# Define the Dutch prompt template. {word1} and {word2} are placeholders that
# get replaced by the two words of one pair (see the printed prompts further
# down). The model is asked for a similarity score from 0 to 10 with two
# decimals, answered strictly as [('woord1', 'woord2', <score>)] and without
# any extra explanation.
prompt = ("Beoordeel de semantische gelijkenis van het woordpaar: [('{word1}'), ('{word2}')] op een schaal van 0 tot 10, "
          "waarbij 0 geen semantische gelijkenis vertegenwoordigt, en 10 perfecte semantische gelijkenis. "
          "Gebruik twee decimalen. Het antwoord moet strikt voldoen aan de structuur: [('woord1', 'woord2', <score>)]. "
          "Geef geen extra uitleg of context.")

In [6]:
# Model used for all requests
model = "gpt-3.5-turbo-0125"

# Number of scores requested per word pair
sample_size = 15

# Delay between individual API calls (presumably in seconds — confirm in utils)
delay = 3.0

# Number of sublists: one per word pair, so that every prompt holds exactly
# one pair. Derived from the data instead of hard-coded so that it cannot
# drift from the row slice selected when the dataset was loaded.
n_sublists = len(tuples_list)

In [7]:
# Split the list of word pairs into n_sublists chunks
chunks = split_into_n_lists(tuples_list, n_sublists)

# Print the number of chunks as a sanity check against n_sublists
print(len(chunks))

333


In [8]:
# Print the formatted prompts for every chunk as a visual check before any API
# calls are made. NOTE(review): judging by the output, each prompt is printed
# once per sample (sample_size times) — confirm in utils.
print_prompts_single(chunks, sample_size, prompt)

Beoordeel de semantische gelijkenis van het woordpaar: [('jongen'), ('soldaat')] op een schaal van 0 tot 10, waarbij 0 geen semantische gelijkenis vertegenwoordigt, en 10 perfecte semantische gelijkenis. Gebruik twee decimalen. Het antwoord moet strikt voldoen aan de structuur: [('woord1', 'woord2', <score>)]. Geef geen extra uitleg of context.
Beoordeel de semantische gelijkenis van het woordpaar: [('jongen'), ('soldaat')] op een schaal van 0 tot 10, waarbij 0 geen semantische gelijkenis vertegenwoordigt, en 10 perfecte semantische gelijkenis. Gebruik twee decimalen. Het antwoord moet strikt voldoen aan de structuur: [('woord1', 'woord2', <score>)]. Geef geen extra uitleg of context.
Beoordeel de semantische gelijkenis van het woordpaar: [('jongen'), ('soldaat')] op een schaal van 0 tot 10, waarbij 0 geen semantische gelijkenis vertegenwoordigt, en 10 perfecte semantische gelijkenis. Gebruik twee decimalen. Het antwoord moet strikt voldoen aan de structuur: [('woord1', 'woord2', <scor

In [9]:
# Load the cl100k_base tokenizer encoding.
# NOTE(review): `encoding` is not passed to the helper below, so the helper
# presumably reads it as a global — confirm in utils.
encoding = tiktoken.get_encoding("cl100k_base")

# Count the tokens of each chunk's formatted prompt
token_counts = count_tokens_with_tiktoken_single(chunks, prompt)
print("Token counts for each formatted prompt:", token_counts)

Token counts for each formatted prompt: [108, 109, 109, 107, 109, 108, 108, 107, 108, 110, 107, 111, 109, 109, 109, 109, 108, 108, 109, 108, 108, 107, 109, 109, 110, 110, 108, 107, 113, 107, 108, 107, 107, 107, 108, 109, 110, 111, 108, 110, 111, 108, 108, 108, 108, 110, 110, 110, 111, 109, 109, 108, 109, 111, 117, 109, 109, 109, 110, 110, 110, 108, 111, 107, 110, 109, 109, 108, 108, 108, 110, 109, 109, 107, 108, 107, 110, 109, 110, 110, 110, 108, 107, 107, 107, 110, 108, 110, 110, 109, 107, 112, 107, 109, 108, 108, 107, 108, 109, 108, 108, 109, 107, 107, 108, 108, 107, 109, 109, 108, 107, 109, 108, 109, 110, 111, 113, 112, 108, 109, 110, 109, 109, 109, 111, 108, 111, 109, 109, 112, 109, 110, 111, 108, 109, 108, 110, 111, 109, 108, 109, 109, 110, 109, 109, 109, 108, 109, 109, 108, 109, 108, 109, 112, 108, 109, 109, 109, 108, 111, 109, 110, 114, 110, 109, 109, 109, 110, 111, 109, 109, 111, 109, 111, 110, 112, 108, 108, 109, 109, 110, 107, 108, 108, 110, 110, 110, 109, 110, 111, 109, 109,

In [10]:
# Total number of API requests: one per prompt per sample. This has to stay
# below the daily request limit (max RPD = 10,000). Computed from sample_size
# rather than a hard-coded 15, and without building a replicated list.
len(token_counts) * sample_size

4995

In [11]:
# Total number of prompt tokens over all chunks, counting each prompt once
# (not multiplied by sample_size)
sum(token_counts)

36407

### **2. Extract and Process Data**

In [51]:
# Process each chunk and get results using the OpenAI API.
# The full run takes roughly 12 hours, so reuse the responses saved by an
# earlier run when they exist; only call the API when no saved file is found.
# This keeps `response` defined on a fresh kernel (Restart & Run All).
# NOTE(review): assumes the structure returned by get_responses_single
# survives a JSON round trip unchanged (e.g. no tuples or non-string dict
# keys) — confirm against process_responses in utils.
response_path = '../../../data/response/nl/gpt-3.5-turbo-0125/f9-3.json'
if os.path.exists(response_path):
    with open(response_path) as f:
        response = json.load(f)
else:
    response = get_responses_single(prompt, chunks, model, sample_size, delay)

Processing: 100%|██████████| 4995/4995 [11:53:41<00:00,  8.57s/chunk]    

Total time taken: 42821.16 seconds





In [52]:
# Define filepath
file_path = '../../../data/response/nl/gpt-3.5-turbo-0125/f9-3.json'

# Save the raw responses, but never overwrite an existing file
if os.path.exists(file_path):
    print("File already exists. JSON was not saved to prevent overwriting.")
else:
    # Serialize before opening the file: if `response` is missing or cannot be
    # serialized, no empty or truncated file is left behind that would block
    # every later attempt to save.
    serialized_response = json.dumps(response)
    with open(file_path, 'w') as f:
        f.write(serialized_response)
    print("File saved successfully.")

File saved successfully.


In [46]:
# Extract the scores from the raw responses into a dictionary
data_dict = process_responses(response)

# Collect the entries whose number of scores is higher or lower than the sample size
higher_lower_samples = {
    pair: scores
    for pair, scores in data_dict.items()
    if len(scores) != sample_size
}
print(higher_lower_samples)

{('slecht', 'truc'): [0.1, 0.1, 2.56, 0.0, 0.0, 1.5, 1.55, 1.47, 2.3, 1.45], ('bad', 'truc'): [0.0, 0.0, 0.0, 0.0, 0.0], ('slecht', 'vrouw'): [0.1, 1.2, 0.1, 0.1, 0.12, 0.1, 0.2, 0.1, 0.1], ('bad', 'vrouw'): [0.23, 1.03, 0.15, 0.1, 0.1, 0.05], ('bad', 'ballon'): [0.0, 0.0, 0.0, 0.0, 1.2, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], ('slecht', 'ballon'): [0.0, 0.0], ('besteden', 'redden'): [0.0, 0.0, 0.0, 0.0, 0.11, 0.1, 0.0, 0.0, 0.0, 0.1, 0.0, 0.0, 0.0, 0.0], ('investeren', 'redden'): [0.0]}


In [47]:
# Process data and print duplicate word pairs.
# NOTE(review): the helper's logic lives in utils; the two empty DataFrames in
# the output presumably mean that no duplicates were found — confirm.
print_duplicate_word_pairs(cleaned_nl_simlex, data_dict)

Empty DataFrame
Columns: [Combined_Columns]
Index: []
Empty DataFrame
Columns: [Combined_Columns]
Index: []


In [48]:
# Convert the dict of scores into a DataFrame; judging by the output, this
# gives one row per word pair and one similarity_score_<i> column per sample
df = create_dataframe(data_dict)

# Show results
df

Unnamed: 0,word1,word2,similarity_score_1,similarity_score_2,similarity_score_3,similarity_score_4,similarity_score_5,similarity_score_6,similarity_score_7,similarity_score_8,similarity_score_9,similarity_score_10,similarity_score_11,similarity_score_12,similarity_score_13,similarity_score_14,similarity_score_15
0,jongen,soldaat,3.25,3.25,0.20,3.20,3.25,2.56,3.21,2.50,1.25,2.50,2.45,2.50,3.20,3.25,2.33
1,buik,onderbuik,9.00,8.50,9.00,8.50,8.50,8.50,9.00,9.00,8.50,8.33,8.50,8.00,9.00,8.50,9.50
2,kerel,meisje,0.15,2.00,2.00,0.20,5.50,2.25,2.10,1.43,0.16,3.20,2.33,1.02,1.23,0.22,0.15
3,bed,stoel,3.20,2.50,1.20,2.50,2.50,2.50,2.25,2.50,1.40,2.50,2.50,2.50,2.50,0.30,2.50
4,kleding,mantel,7.50,8.50,7.20,6.75,7.80,7.50,6.25,8.50,6.25,6.25,7.50,7.50,6.75,7.50,7.50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
332,samenvoegen,verwerven,1.20,0.00,2.50,2.00,0.00,0.10,0.20,0.00,0.00,2.50,0.20,0.00,2.50,0.00,0.00
333,sturen,bijwonen,0.00,1.20,0.00,0.00,0.00,1.23,2.27,0.00,0.00,0.10,0.00,0.00,0.00,0.00,0.00
334,verzamelen,bijwonen,0.00,0.10,0.00,0.00,0.00,0.00,0.00,0.00,1.20,0.00,0.00,0.00,0.00,0.00,0.00
335,opnemen,intrekken,0.10,2.50,2.33,2.50,3.45,1.50,0.00,2.50,2.14,1.25,1.67,2.50,2.50,0.20,2.50


In [49]:
# Count the missing values in every column
count_null_values = df.isna().sum()
print("Null value counts per column:", count_null_values)

Null value counts per column: word1                  0
word2                  0
similarity_score_1     0
similarity_score_2     1
similarity_score_3     2
similarity_score_4     2
similarity_score_5     2
similarity_score_6     3
similarity_score_7     4
similarity_score_8     4
similarity_score_9     4
similarity_score_10    5
similarity_score_11    6
similarity_score_12    6
similarity_score_13    6
similarity_score_14    7
similarity_score_15    8
dtype: int64


In [50]:
# Select the rows in which at least one value is missing
has_null = df.isna().any(axis="columns")
rows_with_null = df[has_null]
rows_with_null

Unnamed: 0,word1,word2,similarity_score_1,similarity_score_2,similarity_score_3,similarity_score_4,similarity_score_5,similarity_score_6,similarity_score_7,similarity_score_8,similarity_score_9,similarity_score_10,similarity_score_11,similarity_score_12,similarity_score_13,similarity_score_14,similarity_score_15
73,slecht,truc,0.1,0.1,2.56,0.0,0.0,1.5,1.55,1.47,2.3,1.45,,,,,
74,bad,truc,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,
84,slecht,vrouw,0.1,1.2,0.1,0.1,0.12,0.1,0.2,0.1,0.1,,,,,,
85,bad,vrouw,0.23,1.03,0.15,0.1,0.1,0.05,,,,,,,,,
108,bad,ballon,0.0,0.0,0.0,0.0,1.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
109,slecht,ballon,0.0,0.0,,,,,,,,,,,,,
158,besteden,redden,0.0,0.0,0.0,0.0,0.11,0.1,0.0,0.0,0.0,0.1,0.0,0.0,0.0,0.0,
159,investeren,redden,0.0,,,,,,,,,,,,,,


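The issue is that GPT interpreted the Dutch word "bad" as the English word "bad" and translated it into "slecht", which produced word pairs and rows that do not belong in SimLex. In the same way, "besteden" was returned once as "investeren". Both substitutions can be found in the response JSON file.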

In [51]:
# Inspect the original dataset rows at positions 70-84 (after re-indexing from
# 0) to check how the affected word pairs are spelled in the dataset itself
cleaned_nl_simlex.reset_index(drop=True).iloc[70:85]

Unnamed: 0,word1,word2,SimLex999,POS
70,nacht,hoofdstuk,1.2,N
71,vervuiling,president,0.88,N
72,pistool,truc,0.94,N
73,bad,truc,1.06,N
74,dieet,appel,1.72,N
75,cent,vrouw,1.08,N
76,hoofdstuk,staart,2.09,N
77,cursus,buik,0.72,N
78,volkslied,rietje,0.87,N
79,tandarts,kolonel,0.7,N


In [52]:
# Work on a copy of the dataset in which all spaces are stripped from both
# word columns
test = cleaned_nl_simlex.copy()
for column in ('word1', 'word2'):
    test[column] = test[column].replace(" ", "", regex=True)

# Anti-join: left-merge the model results with the dataset, keep only the rows
# of df whose word pair has no counterpart in the dataset, and drop the
# indicator column again
result = (
    df.merge(test, on=['word1', 'word2'], how='left', indicator=True)
    .loc[lambda merged: merged['_merge'] == 'left_only']
    .drop(columns=['_merge'])
)

# Show which word pairs do not belong
result

Unnamed: 0,word1,word2,similarity_score_1,similarity_score_2,similarity_score_3,similarity_score_4,similarity_score_5,similarity_score_6,similarity_score_7,similarity_score_8,similarity_score_9,similarity_score_10,similarity_score_11,similarity_score_12,similarity_score_13,similarity_score_14,similarity_score_15,SimLex999,POS
73,slecht,truc,0.1,0.1,2.56,0.0,0.0,1.5,1.55,1.47,2.3,1.45,,,,,,,
84,slecht,vrouw,0.1,1.2,0.1,0.1,0.12,0.1,0.2,0.1,0.1,,,,,,,,
109,slecht,ballon,0.0,0.0,,,,,,,,,,,,,,,
159,investeren,redden,0.0,,,,,,,,,,,,,,,,


In [53]:
# Word pairs returned by GPT that are absent from the SimLex subset
faulty_pairs = [
    ('slecht', 'truc'),
    ('slecht', 'vrouw'),
    ('slecht', 'ballon'),
    ('investeren', 'redden'),
]

# Keep every row whose (word1, word2) combination is not a faulty pair
is_faulty = df.set_index(['word1', 'word2']).index.isin(faulty_pairs)
df = df[~is_faulty]

# Show results
df

Unnamed: 0,word1,word2,similarity_score_1,similarity_score_2,similarity_score_3,similarity_score_4,similarity_score_5,similarity_score_6,similarity_score_7,similarity_score_8,similarity_score_9,similarity_score_10,similarity_score_11,similarity_score_12,similarity_score_13,similarity_score_14,similarity_score_15
0,jongen,soldaat,3.25,3.25,0.20,3.20,3.25,2.56,3.21,2.50,1.25,2.50,2.45,2.50,3.20,3.25,2.33
1,buik,onderbuik,9.00,8.50,9.00,8.50,8.50,8.50,9.00,9.00,8.50,8.33,8.50,8.00,9.00,8.50,9.50
2,kerel,meisje,0.15,2.00,2.00,0.20,5.50,2.25,2.10,1.43,0.16,3.20,2.33,1.02,1.23,0.22,0.15
3,bed,stoel,3.20,2.50,1.20,2.50,2.50,2.50,2.25,2.50,1.40,2.50,2.50,2.50,2.50,0.30,2.50
4,kleding,mantel,7.50,8.50,7.20,6.75,7.80,7.50,6.25,8.50,6.25,6.25,7.50,7.50,6.75,7.50,7.50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
332,samenvoegen,verwerven,1.20,0.00,2.50,2.00,0.00,0.10,0.20,0.00,0.00,2.50,0.20,0.00,2.50,0.00,0.00
333,sturen,bijwonen,0.00,1.20,0.00,0.00,0.00,1.23,2.27,0.00,0.00,0.10,0.00,0.00,0.00,0.00,0.00
334,verzamelen,bijwonen,0.00,0.10,0.00,0.00,0.00,0.00,0.00,0.00,1.20,0.00,0.00,0.00,0.00,0.00,0.00
335,opnemen,intrekken,0.10,2.50,2.33,2.50,3.45,1.50,0.00,2.50,2.14,1.25,1.67,2.50,2.50,0.20,2.50


In [54]:
# Count the missing values that remain in every column after dropping the faulty rows
count_null_values = df.isna().sum()
print("Null value counts per column:", count_null_values)

Null value counts per column: word1                  0
word2                  0
similarity_score_1     0
similarity_score_2     0
similarity_score_3     0
similarity_score_4     0
similarity_score_5     0
similarity_score_6     1
similarity_score_7     2
similarity_score_8     2
similarity_score_9     2
similarity_score_10    2
similarity_score_11    2
similarity_score_12    2
similarity_score_13    2
similarity_score_14    3
similarity_score_15    4
dtype: int64


In [55]:
# Select the remaining rows that still miss at least one score
incomplete = df.isna().any(axis="columns")
rows_with_null = df.loc[incomplete]
rows_with_null

Unnamed: 0,word1,word2,similarity_score_1,similarity_score_2,similarity_score_3,similarity_score_4,similarity_score_5,similarity_score_6,similarity_score_7,similarity_score_8,similarity_score_9,similarity_score_10,similarity_score_11,similarity_score_12,similarity_score_13,similarity_score_14,similarity_score_15
74,bad,truc,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,
85,bad,vrouw,0.23,1.03,0.15,0.1,0.1,0.05,,,,,,,,,
108,bad,ballon,0.0,0.0,0.0,0.0,1.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
158,besteden,redden,0.0,0.0,0.0,0.0,0.11,0.1,0.0,0.0,0.0,0.1,0.0,0.0,0.0,0.0,


In [56]:
# Collect the (word1, word2) tuples of the incomplete rows so they can be requested again
missing_tuples_list = list(
    rows_with_null[['word1', 'word2']].itertuples(index=False, name=None)
)

# Show results
missing_tuples_list

[('bad', 'truc'), ('bad', 'vrouw'), ('bad', 'ballon'), ('besteden', 'redden')]

In [57]:
# To prevent GPT from literally translating the word "bad" into "slecht",
# specify that the prompt is about a Dutch word pair ("Nederlandse woordpaar")
prompt = ("Beoordeel de semantische gelijkenis van het Nederlandse woordpaar: [('{word1}'), ('{word2}')] op een schaal van 0 tot 10, "
          "waarbij 0 geen semantische gelijkenis vertegenwoordigt, en 10 perfecte semantische gelijkenis. "
          "Gebruik twee decimalen. Het antwoord moet strikt voldoen aan de structuur: [('woord1', 'woord2', <score>)]. "
          "Geef geen extra uitleg of context.")

In [58]:
# Number of sublists: one per missing word pair, so that every prompt holds
# exactly one pair. Derived from the list instead of hard-coded so that it
# stays correct when the set of incomplete rows changes between runs.
n_sublists = len(missing_tuples_list)

# Split the list of missing word pairs into chunks
missing_chunks = split_into_n_lists(missing_tuples_list, n_sublists)

# Print the number of chunks as a sanity check
print(len(missing_chunks))

4


In [59]:
# Print the formatted prompts for the missing chunks as a visual check that
# the adjusted prompt template is being used before any API calls are made
print_prompts_single(missing_chunks, sample_size, prompt)

Beoordeel de semantische gelijkenis van het Nederlandse woordpaar: [('bad'), ('truc')] op een schaal van 0 tot 10, waarbij 0 geen semantische gelijkenis vertegenwoordigt, en 10 perfecte semantische gelijkenis. Gebruik twee decimalen. Het antwoord moet strikt voldoen aan de structuur: [('woord1', 'woord2', <score>)]. Geef geen extra uitleg of context.
Beoordeel de semantische gelijkenis van het Nederlandse woordpaar: [('bad'), ('truc')] op een schaal van 0 tot 10, waarbij 0 geen semantische gelijkenis vertegenwoordigt, en 10 perfecte semantische gelijkenis. Gebruik twee decimalen. Het antwoord moet strikt voldoen aan de structuur: [('woord1', 'woord2', <score>)]. Geef geen extra uitleg of context.
Beoordeel de semantische gelijkenis van het Nederlandse woordpaar: [('bad'), ('truc')] op een schaal van 0 tot 10, waarbij 0 geen semantische gelijkenis vertegenwoordigt, en 10 perfecte semantische gelijkenis. Gebruik twee decimalen. Het antwoord moet strikt voldoen aan de structuur: [('woord1

In [33]:
# Process each missing chunk and get results using the OpenAI API.
# NOTE(review): the call is commented out, so `missing_response` (used by the
# next cell) only exists if this line was run earlier in the same kernel
# session; on a fresh kernel it has to be re-enabled first.
# missing_response = get_responses_single(prompt, missing_chunks, model, sample_size, delay)

Processing: 100%|██████████| 60/60 [04:13<00:00,  4.23s/chunk]

Total time taken: 253.93 seconds





In [60]:
# Extract the scores of the re-requested word pairs from the raw responses
# into a dictionary
missing_data_dict = process_responses(missing_response)

# Convert dict to Pandas DataFrame
missing_df = create_dataframe(missing_data_dict)

# Show results
missing_df

Unnamed: 0,word1,word2,similarity_score_1,similarity_score_2,similarity_score_3,similarity_score_4,similarity_score_5,similarity_score_6,similarity_score_7,similarity_score_8,similarity_score_9,similarity_score_10,similarity_score_11,similarity_score_12,similarity_score_13,similarity_score_14,similarity_score_15
0,bad,truc,0.13,0.0,0.1,0.15,0.0,1.25,0.1,0.0,1.12,1.2,0.0,1.42,1.24,0.0,1.0
1,bad,vrouw,0.21,0.23,1.23,0.05,1.2,0.15,0.1,0.12,2.35,0.2,0.12,0.02,0.1,0.12,1.2
2,bad,ballon,0.1,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,besteden,redden,0.2,0.1,1.3,0.1,0.1,2.0,0.1,0.0,0.3,0.1,0.1,1.2,0.0,0.2,0.0


In [62]:
# Overwrite every incomplete row of df with its freshly requested counterpart,
# keeping the row at its original index
for _, replacement in missing_df.iterrows():
    # Rows of df that hold the same word pair as the replacement row
    same_pair = (df['word1'] == replacement['word1']) & (df['word2'] == replacement['word2'])
    for idx in df[same_pair].index:
        df.loc[idx] = replacement

# Show the replaced rows
was_missing = df.set_index(['word1', 'word2']).index.isin(missing_tuples_list)
df[was_missing].reset_index(drop=True)

Unnamed: 0,word1,word2,similarity_score_1,similarity_score_2,similarity_score_3,similarity_score_4,similarity_score_5,similarity_score_6,similarity_score_7,similarity_score_8,similarity_score_9,similarity_score_10,similarity_score_11,similarity_score_12,similarity_score_13,similarity_score_14,similarity_score_15
0,bad,truc,0.13,0.0,0.1,0.15,0.0,1.25,0.1,0.0,1.12,1.2,0.0,1.42,1.24,0.0,1.0
1,bad,vrouw,0.21,0.23,1.23,0.05,1.2,0.15,0.1,0.12,2.35,0.2,0.12,0.02,0.1,0.12,1.2
2,bad,ballon,0.1,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,besteden,redden,0.2,0.1,1.3,0.1,0.1,2.0,0.1,0.0,0.3,0.1,0.1,1.2,0.0,0.2,0.0


In [66]:
# Reset the index so that it is contiguous again after the faulty rows were
# dropped
df = df.reset_index(drop=True)
df

Unnamed: 0,word1,word2,similarity_score_1,similarity_score_2,similarity_score_3,similarity_score_4,similarity_score_5,similarity_score_6,similarity_score_7,similarity_score_8,similarity_score_9,similarity_score_10,similarity_score_11,similarity_score_12,similarity_score_13,similarity_score_14,similarity_score_15
0,jongen,soldaat,3.25,3.25,0.20,3.20,3.25,2.56,3.21,2.50,1.25,2.50,2.45,2.50,3.20,3.25,2.33
1,buik,onderbuik,9.00,8.50,9.00,8.50,8.50,8.50,9.00,9.00,8.50,8.33,8.50,8.00,9.00,8.50,9.50
2,kerel,meisje,0.15,2.00,2.00,0.20,5.50,2.25,2.10,1.43,0.16,3.20,2.33,1.02,1.23,0.22,0.15
3,bed,stoel,3.20,2.50,1.20,2.50,2.50,2.50,2.25,2.50,1.40,2.50,2.50,2.50,2.50,0.30,2.50
4,kleding,mantel,7.50,8.50,7.20,6.75,7.80,7.50,6.25,8.50,6.25,6.25,7.50,7.50,6.75,7.50,7.50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
328,samenvoegen,verwerven,1.20,0.00,2.50,2.00,0.00,0.10,0.20,0.00,0.00,2.50,0.20,0.00,2.50,0.00,0.00
329,sturen,bijwonen,0.00,1.20,0.00,0.00,0.00,1.23,2.27,0.00,0.00,0.10,0.00,0.00,0.00,0.00,0.00
330,verzamelen,bijwonen,0.00,0.10,0.00,0.00,0.00,0.00,0.00,0.00,1.20,0.00,0.00,0.00,0.00,0.00,0.00
331,opnemen,intrekken,0.10,2.50,2.33,2.50,3.45,1.50,0.00,2.50,2.14,1.25,1.67,2.50,2.50,0.20,2.50


In [67]:
# Final check: select the rows that still miss at least one score (expected: none)
still_incomplete = df.isna().any(axis="columns")
rows_with_null = df[still_incomplete]
rows_with_null

Unnamed: 0,word1,word2,similarity_score_1,similarity_score_2,similarity_score_3,similarity_score_4,similarity_score_5,similarity_score_6,similarity_score_7,similarity_score_8,similarity_score_9,similarity_score_10,similarity_score_11,similarity_score_12,similarity_score_13,similarity_score_14,similarity_score_15


In [68]:
# Final check: count the missing values in every column (expected: all zero)
count_null_values = df.isna().sum()
print("Null value counts per column:", count_null_values)

Null value counts per column: word1                  0
word2                  0
similarity_score_1     0
similarity_score_2     0
similarity_score_3     0
similarity_score_4     0
similarity_score_5     0
similarity_score_6     0
similarity_score_7     0
similarity_score_8     0
similarity_score_9     0
similarity_score_10    0
similarity_score_11    0
similarity_score_12    0
similarity_score_13    0
similarity_score_14    0
similarity_score_15    0
dtype: int64


In [69]:
# Target location of the processed scores
file_path = '../../../data/prompt/nl/gpt-3.5-turbo-0125/f9-3.csv'

# Write the dataframe to CSV, but never overwrite an existing file
if os.path.exists(file_path):
    print("File already exists. Dataframe was not saved to prevent overwriting.")
else:
    df.to_csv(file_path, index=False)
    print("File saved successfully.")

File saved successfully.
