### **0. Set-up**

In [1]:
# Import libraries and utils
# NOTE(review): %run executes the shared utils notebook in this kernel's namespace; the names used
# below without a visible import (os, json, pd, tiktoken, load_dotenv, OpenAI and the helper
# functions) presumably all come from it — confirm.
%run '../../utils.ipynb'

In [2]:
# Get api key (load_dotenv() is called first so the key can also be read from a .env file)
load_dotenv()
OPENROUTER_API_KEY = os.getenv('OPENROUTER_API_KEY')

# Fail fast when the key is missing: os.getenv returns None in that case, and
# passing api_key=None would let the client look for another key (or fail later
# with an unrelated message) instead of pointing at the real problem.
if not OPENROUTER_API_KEY:
    raise RuntimeError(
        "OPENROUTER_API_KEY is not set. Define it in the environment or in a .env file."
    )

# Set client (OpenAI client pointed at the OpenRouter endpoint)
client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=OPENROUTER_API_KEY,
)

In [3]:
# Read the cleaned SimLex-999 file and keep the positional rows 333..665 only
en_simlex = pd.read_csv("../../../data/dataset/cleaned-en-simlex-999.csv").iloc[333:666]

# Pair the two word columns row by row as (word1, word2) tuples
tuples_list = [
    (first_word, second_word)
    for first_word, second_word in zip(en_simlex['word1'], en_simlex['word2'])
]

In [4]:
# Show results: display the (word1, word2) tuples built from the selected subset
tuples_list

[('game', 'fun'),
 ('weekend', 'week'),
 ('couple', 'pair'),
 ('woman', 'wife'),
 ('sheep', 'cattle'),
 ('purse', 'bag'),
 ('ceiling', 'cathedral'),
 ('bean', 'coffee'),
 ('wood', 'paper'),
 ('top', 'side'),
 ('crime', 'fraud'),
 ('pain', 'harm'),
 ('lover', 'companion'),
 ('evening', 'dusk'),
 ('father', 'daughter'),
 ('wine', 'liquor'),
 ('cow', 'goat'),
 ('belief', 'opinion'),
 ('reality', 'illusion'),
 ('pact', 'agreement'),
 ('wealth', 'poverty'),
 ('accident', 'emergency'),
 ('battle', 'conquest'),
 ('friend', 'teacher'),
 ('illness', 'infection'),
 ('game', 'trick'),
 ('brother', 'son'),
 ('aunt', 'nephew'),
 ('worker', 'mechanic'),
 ('doctor', 'orthodontist'),
 ('oak', 'maple'),
 ('bee', 'queen'),
 ('car', 'bicycle'),
 ('goal', 'quest'),
 ('august', 'month'),
 ('army', 'squad'),
 ('cloud', 'weather'),
 ('physician', 'doctor'),
 ('canyon', 'valley'),
 ('river', 'valley'),
 ('sun', 'sky'),
 ('target', 'arrow'),
 ('chocolate', 'pie'),
 ('circumstance', 'situation'),
 ('opinion', '

### **1. Define and Evaluate Parameters**

In [5]:
# Prompt template for a single word pair; the {word1} / {word2} placeholders are
# filled in per pair (see the formatted prompts printed further down).
prompt = " ".join([
    "Rate the semantic similarity of the word pair: [('{word1}'), ('{word2}')] on a scale from 0 to 10,",
    "where 0 represents no semantic similarity, and 10 represents perfect semantic similarity.",
    "Use two decimals.",
    "The response should strictly adhere to the structure: [('word1', 'word2', <score>)].",
    "Do not provide additional explanations or context.",
])

In [6]:
# Run configuration for this notebook

# Model identifier handed to the API helper
model = "mistralai/mixtral-8x7b-instruct"

# Sample size, and the number of sublists the word pairs are split into
sample_size = 15
n_sublists = 333

# Delay between individual API calls
delay = 0.01

In [7]:
# Split list: hand the word-pair tuples and the requested number of sublists to the helper
# NOTE(review): with n_sublists equal to the number of pairs, each chunk presumably holds one pair — confirm against split_into_n_lists.
chunks = split_into_n_lists(tuples_list, n_sublists)

# Count the number of lists (expected to match n_sublists)
print(len(chunks))

333


In [8]:
# Print prompts for each chunk
# NOTE(review): the recorded output repeats the same formatted prompt several times, so the helper
# presumably prints each chunk's prompt once per sample — confirm against print_prompts_single.
print_prompts_single(chunks, sample_size, prompt)

Rate the semantic similarity of the word pair: [('game'), ('fun')] on a scale from 0 to 10, where 0 represents no semantic similarity, and 10 represents perfect semantic similarity. Use two decimals. The response should strictly adhere to the structure: [('word1', 'word2', <score>)]. Do not provide additional explanations or context.
Rate the semantic similarity of the word pair: [('game'), ('fun')] on a scale from 0 to 10, where 0 represents no semantic similarity, and 10 represents perfect semantic similarity. Use two decimals. The response should strictly adhere to the structure: [('word1', 'word2', <score>)]. Do not provide additional explanations or context.
Rate the semantic similarity of the word pair: [('game'), ('fun')] on a scale from 0 to 10, where 0 represents no semantic similarity, and 10 represents perfect semantic similarity. Use two decimals. The response should strictly adhere to the structure: [('word1', 'word2', <score>)]. Do not provide additional explanations or c

In [9]:
# Load encoding
# NOTE(review): `encoding` is not passed to the helper below, so it is presumably read as a
# global inside count_tokens_with_tiktoken_single — confirm.
# NOTE(review): cl100k_base is a tiktoken encoding while `model` is a Mixtral model, so these
# counts are presumably only an approximation of that model's own tokenization — confirm.
encoding = tiktoken.get_encoding("cl100k_base")

# Count tokens per chunk
token_counts = count_tokens_with_tiktoken_single(chunks, prompt)

# Show results
print("Token counts for each formatted prompt:", token_counts)

Token counts for each formatted prompt: [74, 75, 75, 74, 76, 75, 77, 74, 74, 74, 75, 75, 75, 76, 74, 75, 75, 75, 75, 76, 75, 75, 75, 74, 76, 75, 75, 77, 76, 76, 76, 74, 75, 74, 75, 76, 74, 75, 77, 75, 74, 74, 75, 77, 75, 76, 76, 75, 76, 75, 75, 75, 76, 74, 75, 75, 75, 74, 74, 75, 74, 76, 74, 75, 75, 76, 75, 74, 75, 75, 75, 74, 75, 76, 75, 74, 75, 74, 74, 77, 74, 76, 74, 74, 75, 76, 75, 76, 76, 74, 75, 75, 77, 74, 75, 76, 75, 74, 75, 75, 76, 74, 76, 76, 75, 74, 76, 75, 76, 77, 77, 75, 75, 76, 75, 74, 75, 75, 75, 75, 76, 76, 75, 76, 75, 74, 75, 76, 75, 74, 75, 74, 75, 75, 74, 75, 75, 75, 74, 76, 74, 76, 74, 75, 74, 75, 74, 75, 74, 74, 74, 74, 74, 77, 75, 75, 75, 75, 75, 75, 74, 74, 74, 75, 75, 75, 76, 75, 76, 74, 74, 77, 75, 74, 74, 75, 74, 74, 75, 74, 74, 75, 76, 74, 76, 76, 75, 75, 75, 75, 77, 74, 76, 74, 75, 76, 75, 74, 75, 75, 75, 75, 75, 75, 75, 74, 76, 74, 74, 75, 74, 75, 74, 74, 75, 74, 75, 75, 74, 74, 74, 75, 75, 74, 75, 75, 74, 76, 76, 74, 75, 76, 76, 76, 75, 74, 75, 76, 75, 74,

In [10]:
# Total number of requests for the run: one per formatted prompt, repeated
# sample_size times. Compare against the daily request limit (max RPD = 10,000).
# Uses sample_size instead of a hard-coded 15 so the figure follows the
# configuration, and multiplies the length rather than building a repeated list.
len(token_counts) * sample_size

4995

In [11]:
# Number of total tokens: sum of the per-prompt counts, i.e. one pass over all formatted prompts
# NOTE(review): the run presumably sends each prompt sample_size times, so the prompt tokens
# actually sent would be this figure multiplied by sample_size — confirm.
print(sum(token_counts))

24968


### **2. Extract Data**

In [12]:
# Get results from API
# NOTE: long-running cell — the recorded run processed 4995 requests in about 35 minutes.
# NOTE(review): re-running this cell presumably queries the API again even when a saved
# response file already exists; consider loading the saved JSON instead — confirm.
response = get_responses_single(prompt, chunks, model, sample_size, delay)

Processing: 100%|██████████| 4995/4995 [34:59<00:00,  2.38chunk/s]  

Total time taken: 2099.52 seconds





In [13]:
# Define filepath for the raw API responses
file_path = '../../../data/mixtral-8x7b-instruct/response/en/f9-2.json'

# Ensure directory exists
os.makedirs(os.path.dirname(file_path), exist_ok=True)

# Save only when no earlier file is present, so an existing run is never overwritten
if os.path.exists(file_path):
    print("File already exists. JSON was not saved to prevent overwriting.")
else:
    # Serialize BEFORE creating the file: if the response cannot be encoded,
    # no empty/truncated file is left behind that would block later saves
    # through the "already exists" guard above.
    serialized_response = json.dumps(response)
    with open(file_path, 'w') as f:
        f.write(serialized_response)
    print("File saved successfully.")

File already exists. JSON was not saved to prevent overwriting.


### **3. Process and Inspect Data**

In [14]:
# Turn the raw API responses into a dictionary
data_dict = process_responses(response)

# Keep the entries whose number of values is higher or lower than sample_size
higher_lower_samples = {
    pair: scores
    for pair, scores in data_dict.items()
    if len(scores) != sample_size
}

# Show results
print(higher_lower_samples)

{('game', 'fun'): [8.5, 8.5, 8.5, 8.5, 8.2, 8.5, 7.5, 7.5, 8.2, 8.2, 7.5, 7.5, 8.2, 7.5], ('weekend', 'week'): [8.5, 8.5, 8.5], ('couple', 'pair'): [9.0], ('ceiling', 'cathedral'): [1.75, 3.4, 3.4, 1.75, 3.25, 3.4, 3.4, 1.75, 1.5, 7.23, 7.23, 1.5, 1.5, 7.23], ('bean', 'coffee'): [3.0, 4.5, 4.5, 4.5, 7.23], ('wood', 'paper'): [3.2, 3.2, 3.2, 3.2, 3.0, 3.2, 3.2, 3.2, 3.2, 3.2], ('top', 'side'): [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], ('pain', 'harm'): [8.2, 8.2], ('lover', 'companion'): [6.23, 6.23, 6.2, 0.67, 0.67, 0.67, 0.67, 7.2, 7.2, 0.67, 6.2, 0.67, 0.67, 0.67], ('wine', 'liquor'): [8.2, 8.2], ('cow', 'goat'): [7.5, 7.5, 7.5, 7.5, 8.0], ('belief', 'opinion'): [7.5, 7.56, 7.5, 7.56, 7.5, 7.5, 7.56, 8.2, 8.2], ('reality', 'illusion'): [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], ('pact', 'agreement'): [9.23, 9.23, 9.25, 9.23, 9.23, 9.23, 9.23, 9.23, 9.23, 9.23, 9.23, 9.23, 9.23], ('wealth', 'poverty'): [2.15, 1.0, 2.0, 1.0, 1.0, 2.15], ('friend', 'teacher'): [1.0, 1

In [15]:
# Print duplicate word pairs
# NOTE(review): what the helper compares is not visible here; judging by the recorded output it
# prints two tables of combined "word1_word2" labels (the second one empty) — confirm against utils.
print_duplicate_word_pairs(en_simlex, data_dict)

        Combined_Columns
336           woman_wife
347      father_daughter
371        canyon_valley
378        rhythm_melody
387           hill_cliff
394  basketball_baseball
398        anarchy_chaos
402           alley_bowl
406          guitar_drum
411           dad_mother
412       captain_sailor
420          crowd_bunch
457          flower_bulb
464          moon_planet
472          vessel_vein
485          fee_payment
488           man_sentry
491         blood_marrow
492             oil_mink
501         sorrow_shame
506   college_profession
516           meter_inch
553              cab_bus
564          shelter_hut
590       danger_disease
600         molecule_air
620         crowd_parade
623              hip_lip
648             horse_ox
658       author_creator
Empty DataFrame
Columns: [Combined_Columns]
Index: []


In [16]:
# Convert dict to Pandas DataFrame
# (the recorded output shows word1, word2 and numbered similarity_score_<n> columns)
df = create_dataframe(data_dict)

# Show results
# NOTE(review): the recorded output has columns up to similarity_score_32 although sample_size
# is 15, so some pairs apparently collected more scores than requested — confirm.
df

Unnamed: 0,word1,word2,similarity_score_1,similarity_score_2,similarity_score_3,similarity_score_4,similarity_score_5,similarity_score_6,similarity_score_7,similarity_score_8,...,similarity_score_23,similarity_score_24,similarity_score_25,similarity_score_26,similarity_score_27,similarity_score_28,similarity_score_29,similarity_score_30,similarity_score_31,similarity_score_32
0,game,fun,8.50,8.5,8.50,8.50,8.20,8.50,7.50,7.50,...,,,,,,,,,,
1,weekend,week,8.50,8.5,8.50,,,,,,...,,,,,,,,,,
2,couple,pair,9.00,,,,,,,,...,,,,,,,,,,
3,sheep,cattle,7.50,7.2,7.20,7.20,7.20,7.20,7.20,7.23,...,,,,,,,,,,
4,purse,bag,8.50,8.5,8.56,8.50,8.50,8.50,8.50,8.56,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
312,letter,paragraph,1.50,1.5,1.50,1.50,1.50,3.40,3.40,1.73,...,,,,,,,,,,
313,page,paragraph,7.00,7.0,7.00,3.00,7.00,7.00,6.00,7.00,...,,,,,,,,,,
314,steeple,chapel,7.23,8.1,8.20,7.20,7.23,7.20,7.20,7.23,...,,,,,,,,,,
315,muscle,bone,1.00,1.0,1.00,1.00,1.00,1.00,0.00,1.00,...,,,,,,,,,,


In [17]:
# Tally the missing values in every column
count_null_values = df.isna().sum()

# Show results
print("Null value counts per column:", count_null_values)

Null value counts per column: word1                    0
word2                    0
similarity_score_1       0
similarity_score_2      28
similarity_score_3      38
similarity_score_4      42
similarity_score_5      47
similarity_score_6      62
similarity_score_7      73
similarity_score_8      87
similarity_score_9      93
similarity_score_10    109
similarity_score_11    124
similarity_score_12    141
similarity_score_13    150
similarity_score_14    160
similarity_score_15    196
similarity_score_16    315
similarity_score_17    316
similarity_score_18    316
similarity_score_19    316
similarity_score_20    316
similarity_score_21    316
similarity_score_22    316
similarity_score_23    316
similarity_score_24    316
similarity_score_25    316
similarity_score_26    316
similarity_score_27    316
similarity_score_28    316
similarity_score_29    316
similarity_score_30    316
similarity_score_31    316
similarity_score_32    316
dtype: int64


In [18]:
# Select every row that has a missing value in at least one column
rows_with_null = df.loc[df.isna().any(axis="columns")]

# Show results
rows_with_null

Unnamed: 0,word1,word2,similarity_score_1,similarity_score_2,similarity_score_3,similarity_score_4,similarity_score_5,similarity_score_6,similarity_score_7,similarity_score_8,...,similarity_score_23,similarity_score_24,similarity_score_25,similarity_score_26,similarity_score_27,similarity_score_28,similarity_score_29,similarity_score_30,similarity_score_31,similarity_score_32
0,game,fun,8.50,8.5,8.50,8.50,8.20,8.50,7.50,7.50,...,,,,,,,,,,
1,weekend,week,8.50,8.5,8.50,,,,,,...,,,,,,,,,,
2,couple,pair,9.00,,,,,,,,...,,,,,,,,,,
3,sheep,cattle,7.50,7.2,7.20,7.20,7.20,7.20,7.20,7.23,...,,,,,,,,,,
4,purse,bag,8.50,8.5,8.56,8.50,8.50,8.50,8.50,8.56,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
312,letter,paragraph,1.50,1.5,1.50,1.50,1.50,3.40,3.40,1.73,...,,,,,,,,,,
313,page,paragraph,7.00,7.0,7.00,3.00,7.00,7.00,6.00,7.00,...,,,,,,,,,,
314,steeple,chapel,7.23,8.1,8.20,7.20,7.23,7.20,7.20,7.23,...,,,,,,,,,,
315,muscle,bone,1.00,1.0,1.00,1.00,1.00,1.00,0.00,1.00,...,,,,,,,,,,


### **4. Fix Faulty Word Pairs**

In [None]:
# TODO: not implemented yet — no fix is applied in this cell, so word pairs whose number of
# scores differs from sample_size reach the export step unchanged.

### **5. Export Data**

In [None]:
# Destination of the processed dataframe
file_path = '../../../data/mixtral-8x7b-instruct/processed/en/f9-2.csv'

# Create the parent folder when it is missing
os.makedirs(os.path.dirname(file_path), exist_ok=True)

# Never overwrite an earlier export: write only when the file is absent
if os.path.exists(file_path):
    print("File already exists. Dataframe was not saved to prevent overwriting.")
else:
    df.to_csv(file_path, index=False)
    print("File saved successfully.")