### 0. Set-up

In [19]:
# Import libraries and utils
%run '../../data_processing/simulate/utils.ipynb'

In [20]:
# Read the tab-separated nl-simlex-999 file and collect its (word1, word2) pairs as a list of plain tuples
nl_simlex = pd.read_csv("../../data/nl-simlex-999.txt", sep='\t')
tuples_list = list(nl_simlex[['word1', 'word2']].itertuples(index=False, name=None))

### 1. Sending prompts

In [21]:
# Instruction text for the rating task: it fixes the 0-10 scale, the two-decimal
# precision, and the exact list-of-tuples structure the reply has to follow.
prompt = "".join((
    "Rate the semantic similarity of each word pair on a scale from 0 to 10, ",
    "where 0 represents no semantic similarity, and 10 represents perfect semantic similarity. ",
    "Use two decimals. ",
    "Your response should strictly adhere to the structure: ",
    "[('word1', 'word2', <score>), ('word3', 'word4', <score>), ...]. ",
    "Do not provide additional explanations or context.",
))

In [22]:
# Size handed to chunk_data (presumably the maximum number of word pairs per chunk)
chunk_size = 200

# Split the list of word pairs into chunks of that size
chunks = chunk_data(tuples_list, chunk_size)

# Report how many chunks the split produced
print(f"Count of chunks: {len(chunks)}")

Count of chunks: 5


In [23]:
# Print the prompts for each chunk
# The printed output shows the instruction text, a '---' separator, and then the chunk's word pairs.
# NOTE(review): print_prompts is not defined in this notebook; presumably it comes from the
# utils notebook executed with %run in the set-up cell; confirm the exact formatting there.
print_prompts(chunks, prompt)

Rate the semantic similarity of each word pair on a scale from 0 to 10, where 0 represents no semantic similarity, and 10 represents perfect semantic similarity. Use two decimals. Your response should strictly adhere to the structure: [('word1', 'word2', <score>), ('word3', 'word4', <score>), ...]. Do not provide additional explanations or context.

---

('oud', 'nieuw'), ('slim', 'intelligent'), ('hard', 'moeilijk'), ('gelukkig', 'vrolijk'), ('hard', 'stoer'), ('snel', 'razendsnel'), ('gelukkig', 'blij'), ('kort', 'lang'), ('dom', 'stom'), ('vreemd', 'eigenaardig'), ('breed', 'smal'), ('slecht', 'vreselijk'), ('makkelijk', 'moeilijk'), ('slecht', 'vreselijk'), ('moeilijk', 'gemakkelijk'), ('slim', 'dom'), ('krankzinnig', 'gek'), ('gelukkig', 'kwaad'), ('uitgebreid', 'groot'), ('moeilijk', 'simpel'), ('nieuw', 'vers'), ('scherp', 'saai'), ('vlug', 'snel'), ('dom', 'dwaas'), ('prachtig', 'fantastisch'), ('eigenaardig', 'vreemd'), ('gelukkig', 'boos'), ('smal', 'breed'), ('eenvoudig', 'g

In [18]:

# Tokenizer for the count: cl100k_base, a common encoding for GPT models
# NOTE(review): `encoding` is not passed to the helper below; presumably the helper reads it
# as a notebook-level global. Confirm in the utils notebook before removing this line.
encoding = tiktoken.get_encoding("cl100k_base")

# Token count of each formatted prompt (the helper receives the chunks and the prompt)
token_counts = count_tokens_with_tiktoken(chunks, prompt)
print(f"Token counts for each formatted prompt: {token_counts}")

Token counts for each formatted prompt: [1843, 1772, 1830, 1824, 1995]


### 2. Receiving responses

In [8]:
# Count passed to the simulated API and used to build the score column names
# (presumably the number of simulated ratings per word pair). Defined once so
# the helper call and the column list below cannot drift apart.
num_runs = 20

# Process each chunk and get results using the simulated API
results = process_simulated_responses(chunks, num_runs)

# Arrange columns: the two words first, then similarity_score_1 .. similarity_score_<num_runs>
column_order = ['word1', 'word2'] + [f'similarity_score_{run}' for run in range(1, num_runs + 1)]

# Create DataFrame with specific column order
df = pd.DataFrame(results, columns=column_order)

# Show results (the bare last expression is rendered by the notebook)
df

Unnamed: 0,word1,word2,similarity_score_1,similarity_score_2,similarity_score_3,similarity_score_4,similarity_score_5,similarity_score_6,similarity_score_7,similarity_score_8,...,similarity_score_11,similarity_score_12,similarity_score_13,similarity_score_14,similarity_score_15,similarity_score_16,similarity_score_17,similarity_score_18,similarity_score_19,similarity_score_20
0,oud,nieuw,3.75,7.83,7.64,0.79,5.06,3.64,8.96,4.22,...,6.85,1.61,9.24,9.16,8.46,1.05,7.14,5.13,0.42,2.84
1,slim,intelligent,5.75,7.30,5.55,3.96,3.51,6.16,2.95,8.92,...,7.84,0.46,1.19,7.06,1.61,8.28,9.37,7.01,2.60,6.51
2,hard,moeilijk,5.25,7.71,5.01,2.79,3.82,6.52,2.60,1.34,...,6.76,9.06,3.47,9.55,1.13,8.09,9.57,1.34,0.35,2.39
3,gelukkig,vrolijk,0.45,7.23,5.30,1.07,5.34,2.25,8.00,9.06,...,5.24,2.68,7.22,6.57,8.92,2.58,5.34,0.30,8.57,9.53
4,hard,stoer,4.46,0.07,2.65,0.87,9.58,1.32,8.15,9.48,...,7.45,3.15,2.65,3.39,2.60,6.29,4.09,9.54,0.92,9.52
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
994,samenvoegen,verwerven,4.13,6.18,5.58,5.92,7.99,4.11,1.50,4.01,...,5.20,8.43,1.44,7.79,0.55,0.30,3.51,3.01,5.40,4.80
995,sturen,bijwonen,3.88,8.48,0.76,4.27,6.26,8.14,6.06,1.94,...,4.71,9.71,7.01,7.38,9.87,9.60,4.00,3.04,0.40,0.05
996,verzamelen,bijwonen,7.42,1.03,4.23,9.49,1.21,2.62,6.12,4.38,...,1.62,8.64,6.82,7.67,0.78,3.23,1.19,9.81,3.39,9.04
997,opnemen,intrekken,1.01,0.27,0.45,7.07,1.98,3.51,7.24,4.43,...,1.75,6.48,1.39,5.00,10.00,5.09,4.11,7.29,2.29,0.85


In [26]:
# Show info: DataFrame.info() writes its summary to stdout itself and returns None,
# so it is called directly; wrapping it in print() would add a stray "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 999 entries, 0 to 998
Data columns (total 22 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   word1                999 non-null    object 
 1   word2                999 non-null    object 
 2   similarity_score_1   999 non-null    float64
 3   similarity_score_2   999 non-null    float64
 4   similarity_score_3   999 non-null    float64
 5   similarity_score_4   999 non-null    float64
 6   similarity_score_5   999 non-null    float64
 7   similarity_score_6   999 non-null    float64
 8   similarity_score_7   999 non-null    float64
 9   similarity_score_8   999 non-null    float64
 10  similarity_score_9   999 non-null    float64
 11  similarity_score_10  999 non-null    float64
 12  similarity_score_11  999 non-null    float64
 13  similarity_score_12  999 non-null    float64
 14  similarity_score_13  999 non-null    float64
 15  similarity_score_14  999 non-null    flo