### **0. Set-up**

In [1]:
# Import libraries and utils.
# Executes the shared utils notebook. Later cells use names that are not defined
# in this notebook (e.g. pd, os, json, OpenAI, tiktoken and the helper functions);
# presumably they are all provided by this run — verify against utils.ipynb.
%run '../../utils.ipynb'

In [2]:
# Load variables from a .env file into the environment and read the API key
load_dotenv()
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

# Fail fast with an explicit message when the key is missing, instead of
# relying on a less obvious error when the client is created or first used
if not OPENAI_API_KEY:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; add it to the environment or to a .env file."
    )

# Set client; the key is passed explicitly so the value read above is actually used
client = OpenAI(api_key=OPENAI_API_KEY)

In [3]:
# Read the cleaned SimLex word-pair file
cleaned_nl_simlex = pd.read_csv("../../../data/dataset/cleaned-nl-simlex-999.csv")

# Keep only the slice handled by this run; the alternative slices are
#   .iloc[331:664] and .iloc[664:997]
cleaned_nl_simlex = cleaned_nl_simlex.iloc[:331]

# Build the list of (word1, word2) tuples, one per row
tuples_list = list(
    cleaned_nl_simlex[['word1', 'word2']].itertuples(index=False, name=None)
)

In [4]:
# Display the selected subset (columns word1, word2, SimLex999, POS) as a sanity check
cleaned_nl_simlex

Unnamed: 0,word1,word2,SimLex999,POS
0,oud,nieuw,1.94,A
1,slim,intelligent,8.19,A
2,hard,moeilijk,4.46,A
3,gelukkig,vrolijk,6.49,A
4,hard,stoer,5.69,A
...,...,...,...,...
326,hart,operatie,1.80,N
327,vrouw,secretaresse,1.84,N
328,man,vader,4.31,N
329,strand,eiland,2.72,N


### **1. Define and Evaluate Parameters**

In [5]:
# Define the prompt template (Dutch text).
# {word1} and {word2} are placeholders that get filled in per word pair. The model
# is asked to rate the semantic similarity of the pair on a scale from 0 to 10 with
# two decimals, to answer strictly as [('woord1', 'woord2', <score>)], and to give
# no extra explanation or context.
prompt = ("Beoordeel de semantische gelijkenis van het woordpaar: [('{word1}'), ('{word2}')] op een schaal van 0 tot 10, "
          "waarbij 0 geen semantische gelijkenis vertegenwoordigt, en 10 perfecte semantische gelijkenis. "
          "Gebruik twee decimalen. Het antwoord moet strikt voldoen aan de structuur: [('woord1', 'woord2', <score>)]. "
          "Geef geen extra uitleg of context.")

In [6]:
# Define model
model = "gpt-3.5-turbo-0125"

# Set sample size: the number of scores collected per word pair
# (the resulting dataframe has columns similarity_score_1 ... similarity_score_15)
# sample_size = 5
sample_size = 15

# Delay between individual API calls
# (presumably in seconds — confirm in get_responses_single)
delay = 3.0

# Define number of sublists.
# 331 matches the size of the subset selected above, so presumably each sublist
# holds a single word pair; 997 looks like the value for the full dataset — confirm.
# n_sublists = 997
n_sublists = 331

In [7]:
# Split the list of word-pair tuples into n_sublists sublists
chunks = split_into_n_lists(tuples_list, n_sublists)

# Count the number of lists (expected to equal n_sublists)
print(len(chunks))

331


In [8]:
# Print the prompts for each chunk so the formatted text can be inspected before
# any API call is made. The recorded output repeats the same prompt, presumably
# once per sample — confirm in print_prompts_single.
print_prompts_single(chunks, sample_size, prompt)

Beoordeel de semantische gelijkenis van het woordpaar: [('oud'), ('nieuw')] op een schaal van 0 tot 10, waarbij 0 geen semantische gelijkenis vertegenwoordigt, en 10 perfecte semantische gelijkenis. Gebruik twee decimalen. Het antwoord moet strikt voldoen aan de structuur: [('woord1', 'woord2', <score>)]. Geef geen extra uitleg of context.
Beoordeel de semantische gelijkenis van het woordpaar: [('oud'), ('nieuw')] op een schaal van 0 tot 10, waarbij 0 geen semantische gelijkenis vertegenwoordigt, en 10 perfecte semantische gelijkenis. Gebruik twee decimalen. Het antwoord moet strikt voldoen aan de structuur: [('woord1', 'woord2', <score>)]. Geef geen extra uitleg of context.
Beoordeel de semantische gelijkenis van het woordpaar: [('oud'), ('nieuw')] op een schaal van 0 tot 10, waarbij 0 geen semantische gelijkenis vertegenwoordigt, en 10 perfecte semantische gelijkenis. Gebruik twee decimalen. Het antwoord moet strikt voldoen aan de structuur: [('woord1', 'woord2', <score>)]. Geef geen

In [9]:
# Load the encoding
# NOTE(review): `encoding` is not passed to the helper below; presumably the
# helper reads it as a global — confirm in utils.ipynb.
encoding = tiktoken.get_encoding("cl100k_base")

# Count the tokens per chunk (one count per formatted prompt)
token_counts = count_tokens_with_tiktoken_single(chunks, prompt)
print("Token counts for each formatted prompt:", token_counts)

Token counts for each formatted prompt: [107, 108, 109, 110, 107, 109, 110, 107, 107, 111, 108, 111, 112, 107, 111, 111, 110, 110, 107, 109, 108, 108, 110, 111, 110, 108, 112, 106, 110, 111, 111, 108, 108, 110, 111, 109, 110, 110, 110, 110, 108, 109, 111, 108, 108, 112, 108, 107, 109, 112, 112, 112, 108, 109, 110, 109, 108, 113, 107, 108, 111, 107, 111, 110, 111, 108, 107, 109, 108, 108, 110, 110, 106, 107, 109, 109, 109, 109, 108, 109, 109, 111, 107, 110, 111, 108, 108, 108, 109, 112, 111, 108, 111, 109, 110, 109, 107, 108, 109, 108, 108, 109, 109, 110, 108, 107, 110, 108, 108, 112, 108, 109, 107, 108, 113, 107, 108, 111, 108, 108, 109, 106, 116, 107, 108, 108, 109, 107, 110, 109, 109, 107, 108, 107, 109, 109, 112, 108, 108, 109, 107, 108, 108, 108, 109, 109, 108, 108, 109, 106, 108, 107, 108, 109, 107, 107, 109, 107, 108, 108, 106, 107, 107, 109, 109, 109, 108, 106, 107, 108, 109, 109, 110, 107, 109, 109, 107, 106, 107, 108, 110, 107, 108, 109, 109, 107, 108, 108, 109, 109, 109, 112,

In [13]:
# Planned number of API requests: one request per prompt per sample.
# This has to stay below the daily request limit (max RPD = 10,000).
# The counts are multiplied directly instead of building a repeated list, and
# sample_size is used so the estimate follows the configuration rather than a
# hard-coded 15.
len(token_counts) * sample_size

4965

In [10]:
# Number of total tokens over the formatted prompts (one prompt per chunk).
# Note: this counts each prompt once; it is not multiplied by sample_size and
# does not include any tokens of the model's responses.
sum(token_counts)

35947

### **2. Extract and Process Data (1)**

In [12]:
# Process each chunk and get results using the OpenAI API.
# Long-running cell: the recorded run issued 4965 requests and took about
# 11.5 hours (41552 seconds), so avoid re-running it unnecessarily.
response = get_responses_single(prompt, chunks, model, sample_size, delay)

Processing: 100%|██████████| 4965/4965 [11:32:32<00:00,  8.37s/chunk]    

Total time taken: 41552.14 seconds





In [14]:
# Define filepath
file_path = '../../../data/response/nl/gpt-3.5-turbo-0125/f9-1.json'

# Create the file in exclusive mode ('x'): opening fails when the file already
# exists, so the existence check and the write happen in one step and earlier
# results can never be overwritten.
try:
    with open(file_path, 'x') as f:
        json.dump(response, f)
    print("File saved successfully.")
except FileExistsError:
    print("File already exists. JSON was not saved to prevent overwriting.")

File saved successfully.


In [25]:
# Parse the raw responses into a dictionary (extraction via regular expressions)
data_dict = process_responses(response)

# Collect the entries whose number of values differs from the sample size
higher_lower_samples = {
    pair: scores
    for pair, scores in data_dict.items()
    if len(scores) != sample_size
}
print(higher_lower_samples)

{('sluw', 'koppig'): [0.1, 2.5, 0.2, 0.1, 0.2, 0.1, 0.11, 0.0, 0.1, 1.25, 0.2, 0.1, 0.2, 0.1]}


In [26]:
# Process data and print duplicate word pairs.
# The recorded output shows two empty DataFrames, i.e. presumably no duplicates
# were found in either input — confirm in print_duplicate_word_pairs.
print_duplicate_word_pairs(cleaned_nl_simlex, data_dict)

Empty DataFrame
Columns: [Combined_Columns]
Index: []
Empty DataFrame
Columns: [Combined_Columns]
Index: []


In [31]:
# Display the source subset again for comparison with the extracted scores
cleaned_nl_simlex

Unnamed: 0,word1,word2,SimLex999,POS
0,oud,nieuw,1.94,A
1,slim,intelligent,8.19,A
2,hard,moeilijk,4.46,A
3,gelukkig,vrolijk,6.49,A
4,hard,stoer,5.69,A
...,...,...,...,...
326,hart,operatie,1.80,N
327,vrouw,secretaresse,1.84,N
328,man,vader,4.31,N
329,strand,eiland,2.72,N


In [32]:
# Convert dict to Pandas DataFrame.
# Per the displayed output: one row per word pair with columns word1, word2 and
# similarity_score_1 ... similarity_score_15.
df = create_dataframe(data_dict)

# Show results
df

Unnamed: 0,word1,word2,similarity_score_1,similarity_score_2,similarity_score_3,similarity_score_4,similarity_score_5,similarity_score_6,similarity_score_7,similarity_score_8,similarity_score_9,similarity_score_10,similarity_score_11,similarity_score_12,similarity_score_13,similarity_score_14,similarity_score_15
0,oud,nieuw,0.10,1.91,1.00,0.14,2.00,0.25,1.00,0.10,1.00,1.00,1.00,0.10,1.00,1.00,0.20
1,slim,intelligent,8.50,9.00,8.70,9.00,9.00,9.50,8.50,9.00,9.00,9.50,8.50,8.50,8.50,8.50,8.50
2,hard,moeilijk,7.50,7.20,7.50,8.00,9.50,8.00,8.50,8.50,8.50,8.50,8.50,7.50,6.50,7.50,9.00
3,gelukkig,vrolijk,5.80,6.50,7.50,8.50,0.65,7.50,7.50,9.00,7.50,7.50,6.50,7.50,7.50,7.50,7.50
4,hard,stoer,6.00,0.30,6.00,6.50,7.50,7.50,0.60,5.50,5.65,0.30,6.50,0.20,5.00,0.40,6.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
326,hart,operatie,1.75,2.56,1.50,2.80,0.33,2.36,1.67,0.18,0.23,2.56,2.34,3.12,2.34,4.22,2.55
327,vrouw,secretaresse,6.75,7.85,8.75,7.50,7.89,6.82,7.80,7.89,7.25,8.25,6.80,8.75,7.80,7.80,5.32
328,man,vader,7.20,6.50,6.50,8.50,8.40,7.50,8.50,7.50,7.50,7.80,6.75,6.50,6.75,7.50,7.50
329,strand,eiland,2.50,3.50,1.50,0.00,2.50,3.20,0.10,1.20,3.50,3.50,2.50,2.50,2.50,2.50,1.50


In [28]:
# Tally the missing values in every column
count_null_values = df.isna().sum()
print("Null value counts per column:", count_null_values)

Null value counts per column: word1                  0
word2                  0
similarity_score_1     0
similarity_score_2     0
similarity_score_3     0
similarity_score_4     0
similarity_score_5     0
similarity_score_6     0
similarity_score_7     0
similarity_score_8     0
similarity_score_9     0
similarity_score_10    0
similarity_score_11    0
similarity_score_12    0
similarity_score_13    0
similarity_score_14    0
similarity_score_15    1
dtype: int64


In [45]:
# Select every row that is missing at least one value
rows_with_null = df.loc[df.isna().any(axis="columns")]
rows_with_null

Unnamed: 0,word1,word2,similarity_score_1,similarity_score_2,similarity_score_3,similarity_score_4,similarity_score_5,similarity_score_6,similarity_score_7,similarity_score_8,similarity_score_9,similarity_score_10,similarity_score_11,similarity_score_12,similarity_score_13,similarity_score_14,similarity_score_15
102,sluw,koppig,0.1,2.5,0.2,0.1,0.2,0.1,0.11,0.0,0.1,1.25,0.2,0.1,0.2,0.1,


In [46]:
# Custom prompt for word pair 'sluw' 'koppig', the pair for which fewer scores
# than sample_size were extracted. It is built from the shared template so it
# cannot drift from the prompt used in the main run.
custom_prompt = prompt.format(word1='sluw', word2='koppig')

# Make one extra API call for 'sluw' 'koppig'.
# The call must be active: the print below reads `completion`, which is not
# defined anywhere else, so the cell would otherwise fail on a fresh kernel.
messages = [{"role": "user", "content": custom_prompt}]
completion = client.chat.completions.create(
    model=model,
    messages=messages,
    n=1)

# Show results (the returned score may differ between runs)
print(completion.choices[0].message.content)

[('sluw', 'koppig', 0.10)]


In [47]:
# Fill in the missing 15th score for ('sluw', 'koppig') by hand
sluw_koppig_rows = (df['word1'] == 'sluw') & (df['word2'] == 'koppig')
df.loc[sluw_koppig_rows, 'similarity_score_15'] = 0.10

# Show the row to verify the value
df.loc[sluw_koppig_rows]

Unnamed: 0,word1,word2,similarity_score_1,similarity_score_2,similarity_score_3,similarity_score_4,similarity_score_5,similarity_score_6,similarity_score_7,similarity_score_8,similarity_score_9,similarity_score_10,similarity_score_11,similarity_score_12,similarity_score_13,similarity_score_14,similarity_score_15
102,sluw,koppig,0.1,2.5,0.2,0.1,0.2,0.1,0.11,0.0,0.1,1.25,0.2,0.1,0.2,0.1,0.1


In [48]:
# Look again for rows that still miss at least one value
has_missing_value = df.isna().any(axis="columns")
rows_with_null = df[has_missing_value]
rows_with_null

Unnamed: 0,word1,word2,similarity_score_1,similarity_score_2,similarity_score_3,similarity_score_4,similarity_score_5,similarity_score_6,similarity_score_7,similarity_score_8,similarity_score_9,similarity_score_10,similarity_score_11,similarity_score_12,similarity_score_13,similarity_score_14,similarity_score_15


In [49]:
# Display the dataframe of similarity scores as a final check before saving
df

Unnamed: 0,word1,word2,similarity_score_1,similarity_score_2,similarity_score_3,similarity_score_4,similarity_score_5,similarity_score_6,similarity_score_7,similarity_score_8,similarity_score_9,similarity_score_10,similarity_score_11,similarity_score_12,similarity_score_13,similarity_score_14,similarity_score_15
0,oud,nieuw,0.10,1.91,1.00,0.14,2.00,0.25,1.00,0.10,1.00,1.00,1.00,0.10,1.00,1.00,0.20
1,slim,intelligent,8.50,9.00,8.70,9.00,9.00,9.50,8.50,9.00,9.00,9.50,8.50,8.50,8.50,8.50,8.50
2,hard,moeilijk,7.50,7.20,7.50,8.00,9.50,8.00,8.50,8.50,8.50,8.50,8.50,7.50,6.50,7.50,9.00
3,gelukkig,vrolijk,5.80,6.50,7.50,8.50,0.65,7.50,7.50,9.00,7.50,7.50,6.50,7.50,7.50,7.50,7.50
4,hard,stoer,6.00,0.30,6.00,6.50,7.50,7.50,0.60,5.50,5.65,0.30,6.50,0.20,5.00,0.40,6.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
326,hart,operatie,1.75,2.56,1.50,2.80,0.33,2.36,1.67,0.18,0.23,2.56,2.34,3.12,2.34,4.22,2.55
327,vrouw,secretaresse,6.75,7.85,8.75,7.50,7.89,6.82,7.80,7.89,7.25,8.25,6.80,8.75,7.80,7.80,5.32
328,man,vader,7.20,6.50,6.50,8.50,8.40,7.50,8.50,7.50,7.50,7.80,6.75,6.50,6.75,7.50,7.50
329,strand,eiland,2.50,3.50,1.50,0.00,2.50,3.20,0.10,1.20,3.50,3.50,2.50,2.50,2.50,2.50,1.50


In [50]:
# Destination of the processed scores
file_path = '../../../data/prompt/nl/gpt-3.5-turbo-0125/f9-1.csv'

# Never overwrite an existing export; write only when the path is still free
if os.path.exists(file_path):
    print("File already exists. Dataframe was not saved to prevent overwriting.")
else:
    df.to_csv(file_path, index=False)
    print("File saved successfully.")

File saved successfully.


### **3. Extract and Process Data (2)**

### **4. Extract and Process Data (3)**