### **0. Set-up**

In [1]:
# Execute the shared utilities notebook so that its imports and helper
# functions become available in this notebook.
# NOTE(review): names used further down (pd, os, json, tiktoken, OpenAI,
# load_dotenv and the helper functions) are presumably defined there - confirm.
%run '../../utils.ipynb'

In [2]:
# Read the API key from the environment (load_dotenv() is called first so a
# local .env file can supply it).
load_dotenv()
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

# Create the API client. The key is passed explicitly so that the value read
# above is the one in use; previously it was fetched but never referenced.
client = OpenAI(api_key=OPENAI_API_KEY)

In [3]:
# Read the cleaned SimLex-999 word-pair table from disk
cleaned_nl_simlex = pd.read_csv("../../../data/dataset/cleaned-nl-simlex-999.csv")

# Optional: keep only the first rows while experimenting
# cleaned_nl_simlex = cleaned_nl_simlex.head(160)

# Build the list of (word1, word2) tuples, one per row, in row order
tuples_list = list(
    cleaned_nl_simlex[['word1', 'word2']].itertuples(index=False, name=None)
)

In [4]:
# Display the loaded word-pair table (columns: word1, word2, SimLex999, POS)
cleaned_nl_simlex

Unnamed: 0,word1,word2,SimLex999,POS
0,oud,nieuw,1.94,A
1,slim,intelligent,8.19,A
2,hard,moeilijk,4.46,A
3,gelukkig,vrolijk,6.49,A
4,hard,stoer,5.69,A
...,...,...,...,...
992,samenvoegen,verwerven,3.89,V
993,sturen,bijwonen,1.85,V
994,verzamelen,bijwonen,1.06,V
995,opnemen,intrekken,2.29,V


### **1. Define and Evaluate Parameters**

In [24]:
# Instruction text (in Dutch) used for every chunk of word pairs.
# The fragments are joined without a separator, so each fragment carries its
# own leading/trailing spaces; the resulting text is kept exactly as is.
prompt = "".join((
    "Beoordeel de semantische gelijkenis van elk woordpaar met een score op een schaal van 0 tot 5, ",
    "waarbij 0 geen semantische gelijkenis vertegenwoordigt, en 5 perfecte semantische gelijkenis. ",
    "Gebruik twee decimalen. Het antwoord moet strikt voldoen aan de structuur: ",
    "[(woord1, woord2, <score>), (woord3, woord4, <score>),] ",
    " Geef geen extra uitleg of context.",
))

In [7]:
# --- Run configuration -------------------------------------------------------

# Number of sublists the word pairs are split into
n_sublists = 25

# Sample size; later compared against the number of values parsed per entry
sample_size = 15

# Delay between individual API calls (unit not shown here; presumably seconds)
delay = 10.0

# Model identifier
model = "gpt-3.5-turbo-0125"

In [44]:
# Split the list of word pairs into n_sublists sublists using the
# split_into_n_lists helper
chunks = split_into_n_lists(tuples_list, n_sublists)

# Sanity check: print how many sublists were produced
print(len(chunks))

25


In [45]:
# Disabled alternative: a small hand-written `chunks` value (three sublists of
# three word pairs each). Uncommenting it would replace the `chunks` computed
# above - presumably intended for quick trial runs.
# chunks = [
# [('oud', 'nieuw'),
#  ('slim', 'intelligent'),
#  ('hard', 'moeilijk'),],

# [('slecht', 'vreselijk'),
#  ('moeilijk', 'gemakkelijk'),
#  ('slim', 'dom'),],
  
# [('gelukkig', 'vrolijk'),
#   ('hard', 'stoer'),
#   ('gelukkig', 'blij'),],
# ]

In [46]:
# Disabled alternative: build `chunks` with the chunk_data helper from a
# chunk size instead of a fixed number of sublists. Uncommenting these lines
# would replace the `chunks` computed above.

# Set the size of chunks
# chunk_size = 90

# Chunk the data
# chunks = chunk_data(tuples_list, chunk_size)

# Count chunks
# print("Count of chunks:", len(chunks))

In [47]:
# Print the formatted prompt for each chunk so the exact text can be inspected
# before any API call is made (instruction text, then " --- ", then the pairs)
print_prompts(chunks, prompt)

Beoordeel de semantische gelijkenis van elk woordpaar met een score op een schaal van 0 tot 5, waarbij 0 geen semantische gelijkenis vertegenwoordigt, en 5 perfecte semantische gelijkenis. Gebruik twee decimalen. Het antwoord moet strikt voldoen aan de structuur: [(woord1, woord2, <score>), (woord3, woord4, <score>),]  Geef geen extra uitleg of context. --- ('oud', 'nieuw'), ('slim', 'intelligent'), ('hard', 'moeilijk'), ('gelukkig', 'vrolijk'), ('hard', 'stoer'), ('snel', 'razendsnel'), ('gelukkig', 'blij'), ('kort', 'lang'), ('dom', 'stom'), ('vreemd', 'eigenaardig'), ('breed', 'smal'), ('makkelijk', 'moeilijk'), ('moeilijk', 'gemakkelijk'), ('slim', 'dom'), ('krankzinnig', 'gek'), ('gelukkig', 'kwaad'), ('uitgebreid', 'groot'), ('moeilijk', 'simpel'), ('nieuw', 'vers'), ('scherp', 'saai'), ('vlug', 'snel'), ('dom', 'dwaas'), ('prachtig', 'fantastisch'), ('eigenaardig', 'vreemd'), ('gelukkig', 'boos'), ('smal', 'breed'), ('eenvoudig', 'gemakkelijk'), ('oud', 'vers'), ('kennelijk', 'd

In [48]:
# Load the cl100k_base tokenizer from tiktoken.
# NOTE(review): `encoding` is not passed to the call below; presumably
# count_tokens_with_tiktoken reads it as a global - confirm.
encoding = tiktoken.get_encoding("cl100k_base")  # Common encoding for GPT models

# Count the tokens of each chunk's formatted prompt and print the per-chunk list
token_counts = count_tokens_with_tiktoken(chunks, prompt)
print("Token counts for each formatted prompt:", token_counts)

Token counts for each formatted prompt: [489, 481, 476, 450, 444, 452, 447, 453, 464, 459, 456, 451, 472, 467, 491, 442, 458, 479, 469, 477, 498, 492, 501, 492, 498]


### **2. Extract and Process Data**

In [49]:
# Process each chunk and get results using the OpenAI API.
# The call is commented out; the recorded run below took about two hours
# (375 requests, 7035.20 seconds), so it is presumably disabled to avoid
# repeating it by accident. Uncomment to collect the responses again.
# response = get_responses(chunks, prompt, model, sample_size, delay)

Processing: 100%|██████████| 375/375 [1:57:15<00:00, 18.76s/chunk]

Total time taken: 7035.20 seconds





In [50]:
# Location of the raw API responses for this run
file_path = '../../../data/response/nl/gpt-3.5-turbo-0125/f4.json'

if os.path.exists(file_path):
    # Never overwrite previously collected responses
    print("File already exists. JSON was not saved to prevent overwriting.")

    # The cell that produces `response` is commented out, so on a fresh kernel
    # the name is undefined; reuse the saved responses in that case so the
    # cells below still run.
    # NOTE(review): assumes the JSON round-trip yields the structure that
    # process_responses expects - confirm.
    if 'response' not in globals():
        with open(file_path, 'r') as f:
            response = json.load(f)
        print("Responses loaded from existing file.")
else:
    # Serialize BEFORE opening the file: if `response` is undefined or cannot
    # be serialized, no empty/partial file is left behind that would block
    # every later save through the "already exists" branch.
    serialized_response = json.dumps(response)
    with open(file_path, 'w') as f:
        f.write(serialized_response)
    print("File saved successfully.")

File saved successfully.


In [8]:
# Parse the raw responses into a dictionary (process_responses extracts the
# data with regular expressions)
data_dict = process_responses(response)

# Collect every entry whose number of values differs from the sample size
# (an empty dict means all entries have exactly sample_size values)
higher_lower_samples = {
    entry_key: entry_values
    for entry_key, entry_values in data_dict.items()
    if len(entry_values) != sample_size
}
print(higher_lower_samples)

{}


In [9]:
# Print duplicate word pairs via the print_duplicate_word_pairs helper, which
# receives both the source table and the parsed responses
print_duplicate_word_pairs(cleaned_nl_simlex, data_dict)

Empty DataFrame
Columns: [Combined_Columns]
Index: []
Empty DataFrame
Columns: [Combined_Columns]
Index: []


In [10]:
# Convert the parsed dictionary to a pandas DataFrame
# (word1, word2 and one similarity_score_<k> column per sample)
df = create_dataframe(data_dict)

# Show results
df

Unnamed: 0,word1,word2,similarity_score_1,similarity_score_2,similarity_score_3,similarity_score_4,similarity_score_5,similarity_score_6,similarity_score_7,similarity_score_8,similarity_score_9,similarity_score_10,similarity_score_11,similarity_score_12,similarity_score_13,similarity_score_14,similarity_score_15
0,oud,nieuw,0.05,0.25,0.14,0.50,0.01,0.01,0.01,1.00,1.0,1.00,1.00,0.10,0.12,0.14,0.60
1,slim,intelligent,0.80,0.85,0.00,1.00,4.50,4.50,4.96,1.00,1.0,0.67,4.00,0.95,0.95,4.49,0.95
2,hard,moeilijk,0.10,0.40,0.00,0.20,0.50,0.01,1.05,0.00,0.0,0.00,1.00,0.20,0.05,1.10,0.10
3,gelukkig,vrolijk,0.50,0.75,0.71,0.60,3.50,3.50,3.83,1.00,0.0,1.00,3.00,0.90,0.90,3.48,0.85
4,hard,stoer,0.20,0.40,0.29,0.40,2.00,2.00,2.37,0.00,0.0,0.67,2.00,0.70,0.60,3.15,0.40
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
992,samenvoegen,verwerven,2.20,1.25,2.00,0.50,0.40,0.50,0.43,0.60,3.0,0.33,0.36,0.50,0.25,1.53,0.17
993,sturen,bijwonen,0.00,0.04,0.00,0.00,0.05,0.00,0.07,0.00,0.0,0.00,0.00,0.00,0.00,0.06,0.00
994,verzamelen,bijwonen,1.00,0.10,1.00,0.20,0.05,0.10,0.27,0.00,0.0,0.00,0.00,0.00,0.00,0.43,0.00
995,opnemen,intrekken,0.50,0.04,0.00,0.20,0.10,0.20,0.19,0.00,1.0,0.00,0.00,0.00,0.00,0.18,0.00


In [11]:
# Number of missing values in each column
count_null_values = df.isna().sum()
print("Null value counts per column:", count_null_values)

Null value counts per column: word1                  0
word2                  0
similarity_score_1     0
similarity_score_2     0
similarity_score_3     0
similarity_score_4     0
similarity_score_5     0
similarity_score_6     0
similarity_score_7     0
similarity_score_8     0
similarity_score_9     0
similarity_score_10    0
similarity_score_11    0
similarity_score_12    0
similarity_score_13    0
similarity_score_14    0
similarity_score_15    0
dtype: int64


In [12]:
# Keep only the rows that contain a missing value in any column
rows_with_null = df.loc[df.isna().any(axis='columns')]
rows_with_null

Unnamed: 0,word1,word2,similarity_score_1,similarity_score_2,similarity_score_3,similarity_score_4,similarity_score_5,similarity_score_6,similarity_score_7,similarity_score_8,similarity_score_9,similarity_score_10,similarity_score_11,similarity_score_12,similarity_score_13,similarity_score_14,similarity_score_15


In [13]:
# List the (word1, word2) tuples of the rows that contain missing values
missing_word_pair_list = list(
    rows_with_null[['word1', 'word2']].itertuples(index=False, name=None)
)
missing_word_pair_list

[]

In [14]:
# Display the scores table once more before it is written to disk
df

Unnamed: 0,word1,word2,similarity_score_1,similarity_score_2,similarity_score_3,similarity_score_4,similarity_score_5,similarity_score_6,similarity_score_7,similarity_score_8,similarity_score_9,similarity_score_10,similarity_score_11,similarity_score_12,similarity_score_13,similarity_score_14,similarity_score_15
0,oud,nieuw,0.05,0.25,0.14,0.50,0.01,0.01,0.01,1.00,1.0,1.00,1.00,0.10,0.12,0.14,0.60
1,slim,intelligent,0.80,0.85,0.00,1.00,4.50,4.50,4.96,1.00,1.0,0.67,4.00,0.95,0.95,4.49,0.95
2,hard,moeilijk,0.10,0.40,0.00,0.20,0.50,0.01,1.05,0.00,0.0,0.00,1.00,0.20,0.05,1.10,0.10
3,gelukkig,vrolijk,0.50,0.75,0.71,0.60,3.50,3.50,3.83,1.00,0.0,1.00,3.00,0.90,0.90,3.48,0.85
4,hard,stoer,0.20,0.40,0.29,0.40,2.00,2.00,2.37,0.00,0.0,0.67,2.00,0.70,0.60,3.15,0.40
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
992,samenvoegen,verwerven,2.20,1.25,2.00,0.50,0.40,0.50,0.43,0.60,3.0,0.33,0.36,0.50,0.25,1.53,0.17
993,sturen,bijwonen,0.00,0.04,0.00,0.00,0.05,0.00,0.07,0.00,0.0,0.00,0.00,0.00,0.00,0.06,0.00
994,verzamelen,bijwonen,1.00,0.10,1.00,0.20,0.05,0.10,0.27,0.00,0.0,0.00,0.00,0.00,0.00,0.43,0.00
995,opnemen,intrekken,0.50,0.04,0.00,0.20,0.10,0.20,0.19,0.00,1.0,0.00,0.00,0.00,0.00,0.18,0.00


In [15]:
# Destination of the parsed similarity scores for this run
file_path = '../../../data/prompt/nl/gpt-3.5-turbo-0125/f4.csv'

# Write the table only when no file is present yet, so existing results
# are never overwritten
if os.path.exists(file_path):
    print("File already exists. Dataframe was not saved to prevent overwriting.")
else:
    df.to_csv(file_path, index=False)
    print("File saved successfully.")

File saved successfully.
