### **0. Set-up**

In [1]:
# Import libraries and utils
%run '../../utils.ipynb'

In [2]:
# Get API key: load variables from a .env file (if any) and read the key
load_dotenv()
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

# Fail fast with an actionable message rather than a late error from the client
if not OPENAI_API_KEY:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; define it in the environment or in a .env file."
    )

# Set client, passing the key explicitly so the value read above is the one used
client = OpenAI(api_key=OPENAI_API_KEY)

In [3]:
# Read the cleaned Dutch SimLex-999 table and skip its first three word pairs
# NOTE(review): the skipped pairs are presumably the few-shot examples embedded
# in the prompt — confirm against the dataset.
cleaned_nl_simlex = pd.read_csv("../../../data/dataset/cleaned-nl-simlex-999.csv").iloc[3:]

# Optional while experimenting: restrict the run to a subset
# cleaned_nl_simlex = cleaned_nl_simlex.head(160)

# Collect the two word columns as a list of (word1, word2) tuples
tuples_list = list(
    cleaned_nl_simlex[['word1', 'word2']].itertuples(index=False, name=None)
)

In [4]:
# Show results: display the word-pair table after the first three rows were dropped
cleaned_nl_simlex

Unnamed: 0,word1,word2,SimLex999,POS
3,gelukkig,vrolijk,6.49,A
4,hard,stoer,5.69,A
5,snel,razendsnel,7.18,A
6,gelukkig,blij,7.09,A
7,kort,lang,1.78,A
...,...,...,...,...
992,samenvoegen,verwerven,3.89,V
993,sturen,bijwonen,1.85,V
994,verzamelen,bijwonen,1.06,V
995,opnemen,intrekken,2.29,V


### **1. Define and Evaluate Parameters**

In [5]:
# Define the prompt (Dutch). The segments are joined without a separator, so
# every space inside the literals is significant and kept exactly as written.
prompt = "".join([
    # Task: rate the semantic similarity of each word pair on a 0-10 scale
    "Beoordeel de semantische gelijkenis van elk woordpaar op een schaal van 0 tot 10, waarbij 0 geen semantische gelijkenis vertegenwoordigt, en 10 perfecte semantische gelijkenis. ",
    # Precision and the exact structure the answer must follow
    "Gebruik twee decimalen. Het antwoord moet strikt voldoen aan de structuur: ",
    "[(woord1, woord2, <score>), (woord3, woord4, <score>),] ",
    # No extra explanation or context in the answer
    " Geef geen extra uitleg of context.",
    # Three example word pairs with their similarity scores
    " Voorbeelden van woordparen en hun semantische gelijkenisscore zijn: [('oud', 'nieuw', 1.94), ('slim', 'intelligent', 8.19), ('hard', 'moeilijk', 4.46)].",
])

In [6]:
# --- Request settings -------------------------------------------------------
# Model identifier handed to get_responses
model: str = "gpt-3.5-turbo-0125"

# Pause between individual API calls (presumably seconds — confirm in utils)
delay: float = 15.0

# --- Sampling / batching ----------------------------------------------------
# Sample size; later cells expect exactly this many values per entry
sample_size: int = 15

# How many sublists the word-pair list is split into
n_sublists: int = 27

In [29]:
# Split the list of word pairs into `n_sublists` sublists using the
# split_into_n_lists helper (brought in by the utils notebook)
chunks = split_into_n_lists(tuples_list, n_sublists)

# Count the number of lists (sanity check against n_sublists)
print(len(chunks))

27


In [30]:
# Disabled: hand-written replacement for `chunks` with three sublists of three
# word pairs each. NOTE(review): presumably a small smoke test for the
# pipeline — confirm, or delete this cell if it is no longer needed.
# chunks = [
# [('oud', 'nieuw'),
#  ('slim', 'intelligent'),
#  ('hard', 'moeilijk'),],

# [('slecht', 'vreselijk'),
#  ('moeilijk', 'gemakkelijk'),
#  ('slim', 'dom'),],
  
# [('gelukkig', 'vrolijk'),
#   ('hard', 'stoer'),
#   ('gelukkig', 'blij'),],
# ]

In [31]:
# Disabled: alternative way to build `chunks` with the chunk_data helper and a
# chunk size, instead of a fixed number of sublists. Enabling it would
# overwrite the `chunks` produced by split_into_n_lists.

# Set the size of chunks
# chunk_size = 90

# Chunk the data
# chunks = chunk_data(tuples_list, chunk_size)

# Count chunks
# print("Count of chunks:", len(chunks))

In [32]:
# Print the prompts for each chunk (visual check of what will be sent)
# NOTE(review): the output stored with this cell contains the wording
# "met een score op een schaal", which does not occur in `prompt` as currently
# defined — re-run this cell so the displayed prompts match the prompt in use.
print_prompts(chunks, prompt)

Beoordeel de semantische gelijkenis van elk woordpaar met een score op een schaal van 0 tot 10, waarbij 0 geen semantische gelijkenis vertegenwoordigt, en 10 perfecte semantische gelijkenis. Gebruik twee decimalen. Het antwoord moet strikt voldoen aan de structuur: [(woord1, woord2, <score>), (woord3, woord4, <score>),]  Geef geen extra uitleg of context. Voorbeelden van woordparen en hun semantische gelijkenisscore zijn: [(oud, nieuw, 1.94), (slim, intelligent, 8.19), (hard, moeilijk, 4.46)]. --- ('gelukkig', 'vrolijk'), ('hard', 'stoer'), ('snel', 'razendsnel'), ('gelukkig', 'blij'), ('kort', 'lang'), ('dom', 'stom'), ('vreemd', 'eigenaardig'), ('breed', 'smal'), ('makkelijk', 'moeilijk'), ('moeilijk', 'gemakkelijk'), ('slim', 'dom'), ('krankzinnig', 'gek'), ('gelukkig', 'kwaad'), ('uitgebreid', 'groot'), ('moeilijk', 'simpel'), ('nieuw', 'vers'), ('scherp', 'saai'), ('vlug', 'snel'), ('dom', 'dwaas'), ('prachtig', 'fantastisch'), ('eigenaardig', 'vreemd'), ('gelukkig', 'boos'), ('sm

In [33]:
# Load the encoding
# NOTE(review): `encoding` is not passed to the helper below; presumably
# count_tokens_with_tiktoken reads it as a global — confirm before removing
# or renaming this variable.
encoding = tiktoken.get_encoding("cl100k_base")

# Count the tokens per chunk and print one count per formatted prompt
token_counts = count_tokens_with_tiktoken(chunks, prompt)
print("Token counts for each formatted prompt:", token_counts)

Token counts for each formatted prompt: [517, 507, 499, 487, 462, 484, 472, 485, 473, 494, 481, 485, 481, 496, 496, 513, 478, 476, 489, 511, 482, 516, 513, 519, 523, 510, 523]


### **2. Extract and Process Data**

In [34]:
# Process each chunk and get results using the OpenAI API.
# The recorded run took roughly two hours (405 progress steps), so saved
# responses are reused when they exist instead of repeating that run on every
# "Restart & Run All".
# NOTE(review): assumes the JSON round-trip keeps everything that
# process_responses needs — confirm. Delete the file to force a fresh API run.
response_cache_path = '../../../data/response/nl/gpt-3.5-turbo-0125/f3.json'

if os.path.exists(response_cache_path):
    with open(response_cache_path) as cached_file:
        response = json.load(cached_file)
    print("Loaded saved responses from", response_cache_path)
else:
    response = get_responses(chunks, prompt, model, sample_size, delay)

Processing: 100%|██████████| 405/405 [2:07:08<00:00, 18.84s/chunk]  

Total time taken: 7628.28 seconds





In [35]:
# Define filepath
file_path = '../../../data/response/nl/gpt-3.5-turbo-0125/f3.json'

# Serialise before touching the filesystem, so a failure here cannot leave a
# partial file behind that would block later saves
serialized_response = json.dumps(response)

# Make sure the target directory exists; a missing directory would otherwise
# make the save fail after the long API run
os.makedirs(os.path.dirname(file_path), exist_ok=True)

# Exclusive-create mode ('x') refuses to overwrite an existing file in a single
# step, replacing the separate exists-check followed by a truncating open
try:
    with open(file_path, 'x') as f:
        f.write(serialized_response)
except FileExistsError:
    print("File already exists. JSON was not saved to prevent overwriting.")
else:
    print("File saved successfully.")

File saved successfully.


In [8]:
# Parse the raw responses into a dictionary (regex-based extraction in utils)
data_dict = process_responses(response)

# Report every entry whose number of values differs from the sample size
higher_lower_samples = {
    entry_key: entry_values
    for entry_key, entry_values in data_dict.items()
    if len(entry_values) != sample_size
}
print(higher_lower_samples)

{}


In [10]:
# Process data and print duplicate word pairs, using the source table and the
# parsed responses. The helper prints its findings itself; the recorded run
# printed two empty DataFrames.
print_duplicate_word_pairs(cleaned_nl_simlex, data_dict)

Empty DataFrame
Columns: [Combined_Columns]
Index: []
Empty DataFrame
Columns: [Combined_Columns]
Index: []


In [11]:
# Convert dict to Pandas DataFrame. In the recorded output this gives one row
# per word pair with columns word1, word2 and similarity_score_1..15.
df = create_dataframe(data_dict)

# Show results
df

Unnamed: 0,word1,word2,similarity_score_1,similarity_score_2,similarity_score_3,similarity_score_4,similarity_score_5,similarity_score_6,similarity_score_7,similarity_score_8,similarity_score_9,similarity_score_10,similarity_score_11,similarity_score_12,similarity_score_13,similarity_score_14,similarity_score_15
0,gelukkig,vrolijk,6.47,6.67,5.00,7.82,5.00,3.33,4.68,5.48,3.57,6.67,4.23,5.00,7.14,6.67,5.29
1,hard,stoer,1.45,3.33,3.00,3.58,4.00,4.12,3.00,6.23,2.00,5.00,6.15,4.00,2.67,4.17,2.13
2,snel,razendsnel,7.85,10.00,8.00,9.53,7.00,9.45,7.83,9.30,8.00,10.00,9.74,9.00,9.37,10.00,9.34
3,gelukkig,blij,8.10,8.33,6.00,7.76,6.50,7.16,7.27,8.70,7.50,5.83,7.55,7.00,8.37,8.33,7.41
4,kort,lang,1.78,0.00,1.00,1.17,1.00,1.00,1.55,1.86,1.00,1.67,2.84,1.00,1.00,1.67,1.05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
989,samenvoegen,verwerven,0.93,0.26,2.84,1.23,3.98,0.48,0.68,0.00,1.12,2.00,0.33,0.16,0.00,0.19,0.00
990,sturen,bijwonen,0.12,0.05,1.74,0.15,0.00,0.12,0.14,0.00,1.20,1.00,0.14,0.07,0.00,0.06,0.00
991,verzamelen,bijwonen,0.77,0.05,1.28,0.25,2.59,0.32,0.69,1.83,2.09,3.00,0.15,0.10,1.76,0.01,1.76
992,opnemen,intrekken,1.09,0.47,2.67,2.10,0.00,0.14,0.14,5.57,1.24,1.00,0.21,0.21,0.00,0.04,0.00


In [12]:
# Tally the missing values in each column
count_null_values = df.isna().sum(axis="index")
print("Null value counts per column:", count_null_values)

Null value counts per column: word1                  0
word2                  0
similarity_score_1     0
similarity_score_2     0
similarity_score_3     0
similarity_score_4     0
similarity_score_5     0
similarity_score_6     0
similarity_score_7     0
similarity_score_8     0
similarity_score_9     0
similarity_score_10    0
similarity_score_11    0
similarity_score_12    0
similarity_score_13    0
similarity_score_14    0
similarity_score_15    0
dtype: int64


In [13]:
# Keep only the rows in which at least one column is missing
rows_with_null = df.loc[df.isna().any(axis="columns")]
rows_with_null

Unnamed: 0,word1,word2,similarity_score_1,similarity_score_2,similarity_score_3,similarity_score_4,similarity_score_5,similarity_score_6,similarity_score_7,similarity_score_8,similarity_score_9,similarity_score_10,similarity_score_11,similarity_score_12,similarity_score_13,similarity_score_14,similarity_score_15


In [14]:
# Collect the (word1, word2) pairs of the rows that contain missing values
missing_word_pair_list = list(
    rows_with_null[['word1', 'word2']].itertuples(index=False, name=None)
)
missing_word_pair_list

[]

In [15]:
# Define file_path
file_path = '../../../data/prompt/nl/gpt-3.5-turbo-0125/f3.csv'

# Make sure the target directory exists so the save cannot fail on a missing folder
os.makedirs(os.path.dirname(file_path), exist_ok=True)

# Exclusive-create mode ('x') refuses to overwrite an existing file in a single
# step, replacing the separate exists-check followed by a truncating write.
# newline='' and utf-8 mirror what pandas uses when it opens the path itself.
try:
    with open(file_path, 'x', newline='', encoding='utf-8') as f:
        df.to_csv(f, index=False)
except FileExistsError:
    print("File already exists. Dataframe was not saved to prevent overwriting.")
else:
    print("File saved successfully.")

File saved successfully.
