### **0. Set-up**

In [1]:
# Import libraries and utils
# Runs the shared utils notebook through IPython's %run magic. The cells shown
# below import nothing themselves, so the names they use (os, json, pd, tiktoken,
# load_dotenv, OpenAI and the helper functions) presumably all come from this
# run -- confirm against utils.ipynb.
%run '../../utils.ipynb'

In [2]:
# Get api key
# load_dotenv() is called before the lookup so that a key kept in a local .env
# file is visible to os.getenv (presumably python-dotenv -- confirm in utils).
load_dotenv()
OPENROUTER_API_KEY = os.getenv('OPENROUTER_API_KEY')

# Fail fast when the key is missing or empty. Passing api_key=None would let the
# OpenAI client fall back to the OPENAI_API_KEY environment variable, which would
# then be sent to the OpenRouter endpoint instead of raising a clear error here.
if not OPENROUTER_API_KEY:
    raise RuntimeError(
        "OPENROUTER_API_KEY is not set; add it to the environment or to the .env file."
    )

# Set client
# OpenAI-compatible client pointed at the OpenRouter API endpoint.
client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=OPENROUTER_API_KEY,
)

In [3]:
# Read the cleaned SimLex-999 CSV and drop its first word pair in one step
# (presumably because that pair serves as the worked example in the prompt --
# confirm against the dataset).
en_simlex = pd.read_csv("../../../data/dataset/cleaned-en-simlex-999.csv").iloc[1:]

# Optional toggle: restrict to a subset while experimenting
# en_simlex = en_simlex.head(100)

# Build the list of plain (word1, word2) tuples, one per remaining row
tuples_list = list(
    en_simlex[['word1', 'word2']].itertuples(index=False, name=None)
)

In [4]:
# Show results
# Bare last expression: the notebook renders the list of (word1, word2) tuples
# built in the previous cell.
tuples_list

[('smart', 'intelligent'),
 ('hard', 'difficult'),
 ('happy', 'cheerful'),
 ('hard', 'easy'),
 ('fast', 'rapid'),
 ('happy', 'glad'),
 ('short', 'long'),
 ('stupid', 'dumb'),
 ('weird', 'strange'),
 ('wide', 'narrow'),
 ('bad', 'awful'),
 ('easy', 'difficult'),
 ('bad', 'terrible'),
 ('hard', 'simple'),
 ('smart', 'dumb'),
 ('insane', 'crazy'),
 ('happy', 'mad'),
 ('large', 'huge'),
 ('hard', 'tough'),
 ('new', 'fresh'),
 ('sharp', 'dull'),
 ('quick', 'rapid'),
 ('dumb', 'foolish'),
 ('wonderful', 'terrific'),
 ('strange', 'odd'),
 ('happy', 'angry'),
 ('narrow', 'broad'),
 ('simple', 'easy'),
 ('old', 'fresh'),
 ('apparent', 'obvious'),
 ('inexpensive', 'cheap'),
 ('nice', 'generous'),
 ('weird', 'normal'),
 ('weird', 'odd'),
 ('bad', 'immoral'),
 ('sad', 'funny'),
 ('wonderful', 'great'),
 ('guilty', 'ashamed'),
 ('beautiful', 'wonderful'),
 ('confident', 'sure'),
 ('dumb', 'dense'),
 ('large', 'big'),
 ('nice', 'cruel'),
 ('impatient', 'anxious'),
 ('big', 'broad'),
 ('strong', 'pro

### **1. Define and Evaluate Parameters**

In [5]:
# Instruction text that accompanies each chunk of word pairs.
# The fragments are joined with a single space; the resulting string fixes the
# 0-10 scale, the two-decimal precision, the exact list-of-tuples reply format
# and one worked example.
prompt = " ".join([
    "Rate the semantic similarity of each word pair on a scale from 0 to 10,",
    "where 0 represents no semantic similarity, and 10 represents perfect semantic similarity.",
    "Use two decimals.",
    "The response should strictly adhere to the structure:",
    "[('word1', 'word2', <score>), ('word3', 'word4', <score>), ...].",
    "Do not provide additional explanations or context.",
    "An example of a word pair and its semantic similarity score is:",
    "[('old', 'new', 1.58)]",
])

In [6]:
# --- Batching ---
# Number of sublists the word-pair list is split into
n_sublists = 25

# --- Sampling ---
# Sample size forwarded to get_responses; also the expected number of scores
# per word pair in the later checks
sample_size = 15

# --- Request settings ---
# Model identifier forwarded to get_responses
model = "meta-llama/llama-4-scout"

# Delay between individual API calls (presumably seconds -- confirm in utils)
delay = 15.0

In [7]:
# Split list
# split_into_n_lists is defined outside this cell (presumably in utils.ipynb);
# it receives the full pair list and the requested number of sublists.
chunks = split_into_n_lists(tuples_list, n_sublists)

# Count the number of lists
# Sanity check: the recorded run printed 25, matching n_sublists.
print(len(chunks))

25


In [8]:
# Print prompts for each chunk
# Lets the exact request text be inspected before any API call is made; the
# recorded output starts with the prompt, then " --- ", then the word pairs.
print_prompts(chunks, prompt)

Rate the semantic similarity of each word pair on a scale from 0 to 10, where 0 represents no semantic similarity, and 10 represents perfect semantic similarity. Use two decimals. The response should strictly adhere to the structure: [('word1', 'word2', <score>), ('word3', 'word4', <score>), ...]. Do not provide additional explanations or context. An example of a word pair and its semantic similarity score is: [('old', 'new', 1.58)] --- ["('smart', 'intelligent'), ('hard', 'difficult'), ('happy', 'cheerful'), ('hard', 'easy'), ('fast', 'rapid'), ('happy', 'glad'), ('short', 'long'), ('stupid', 'dumb'), ('weird', 'strange'), ('wide', 'narrow'), ('bad', 'awful'), ('easy', 'difficult'), ('bad', 'terrible'), ('hard', 'simple'), ('smart', 'dumb'), ('insane', 'crazy'), ('happy', 'mad'), ('large', 'huge'), ('hard', 'tough'), ('new', 'fresh'), ('sharp', 'dull'), ('quick', 'rapid'), ('dumb', 'foolish'), ('wonderful', 'terrific'), ('strange', 'odd'), ('happy', 'angry'), ('narrow', 'broad'), ('si

In [9]:
# Load encoding
# NOTE(review): `encoding` is not passed to the call below; presumably
# count_tokens_with_tiktoken reads it as a global -- confirm in utils.ipynb.
# NOTE(review): cl100k_base is presumably not the tokenizer of the configured
# model (meta-llama/llama-4-scout), so treat the counts as estimates -- confirm.
encoding = tiktoken.get_encoding("cl100k_base")

# Count tokens per chunk
token_counts = count_tokens_with_tiktoken(chunks, prompt)

# Show results
# The recorded run printed 25 counts (one per chunk), ranging from 369 to 397.
print("Token counts for each formatted prompt:", token_counts)

Token counts for each formatted prompt: [392, 393, 397, 378, 386, 379, 376, 386, 387, 391, 390, 390, 380, 379, 391, 392, 382, 377, 386, 379, 379, 380, 378, 383, 369]


### **2. Extract and Process Data**

In [10]:
# Get results from API
# Long-running cell: in the recorded run the progress bar counted 375 steps
# (25 sublists x sample size 15) and the total time was 9274.98 seconds
# (about 2.5 hours). The result is persisted to JSON in the next cell.
response = get_responses(chunks, prompt, model, sample_size, delay)

Processing: 100%|██████████| 375/375 [2:34:34<00:00, 24.73s/chunk]  

Total time taken: 9274.98 seconds





In [11]:
# Target location for the raw API responses
file_path = '../../../data/llama-4-scout/response/en/f2.json'

# Never overwrite an earlier run: write only when nothing exists at the path
if os.path.exists(file_path):
    print("File already exists. JSON was not saved to prevent overwriting.")
else:
    with open(file_path, 'w') as f:
        # Serialize the responses exactly as returned by get_responses
        json.dump(response, f)
        print("File saved successfully.")

File saved successfully.


In [12]:
# Parse the raw responses into a dictionary
data_dict = process_responses(response)

# Keep the entries whose number of values differs from the sample size
# (either more or fewer than expected)
higher_lower_samples = {
    pair: scores
    for pair, scores in data_dict.items()
    if len(scores) != sample_size
}

# Show results
print(higher_lower_samples)

{('smart', 'intelligent'): [9.5, 9.95, 9.95, 9.5, 9.95, 9.5, 9.95, 9.95, 9.95, 9.95, 9.5, 9.95, 9.5, 9.95, 9.5, 9.5], ('hard', 'difficult'): [9.8, 9.83, 9.83, 9.2, 9.8, 9.2, 9.83, 9.88, 9.82, 9.8, 9.25, 9.82, 9.2, 9.9, 9.2, 9.2], ('happy', 'cheerful'): [8.4, 9.23, 8.42, 8.3, 9.3, 8.8, 8.92, 9.21, 9.23, 8.85, 8.42, 8.45, 8.8, 9.2, 8.8, 8.8], ('hard', 'easy'): [0.2, 1.17, 1.17, 1.2, 1.2, 1.2, 1.19, 1.14, 0.05, 1.2, 0.0, 1.19, 1.2, 1.1, 0.0, 0.0], ('fast', 'rapid'): [9.2, 9.62, 9.62, 8.8, 9.5, 9.3, 9.42, 9.55, 9.87, 9.5, 9.35, 9.38, 8.5, 9.5, 9.5, 9.5], ('happy', 'glad'): [8.8, 9.56, 8.65, 8.6, 9.4, 8.9, 8.62, 9.38, 9.46, 8.65, 8.7, 8.21, 8.3, 9.3, 9.3, 9.3], ('short', 'long'): [0.0, 1.31, 1.35, 1.5, 1.5, 2.1, 1.58, 1.58, 0.12, 1.4, 0.0, 1.58, 1.3, 0.0, 0.0, 0.0], ('stupid', 'dumb'): [8.5, 8.53, 8.37, 8.5, 9.2, 8.5, 8.23, 9.23, 8.42, 8.2, 8.35, 8.56, 8.5, 9.8, 8.5, 8.5], ('weird', 'strange'): [8.2, 8.42, 8.23, 7.8, 8.5, 8.2, 8.41, 8.42, 8.56, 8.3, 8.51, 8.34, 8.0, 8.5, 8.2, 8.2], ('wide',

In [13]:
# Print duplicate word pairs
# Helper defined outside this cell; it receives the source DataFrame and the
# processed dictionary. In the recorded run both printed DataFrames were empty.
print_duplicate_word_pairs(en_simlex, data_dict)

Empty DataFrame
Columns: [Combined_Columns]
Index: []
Empty DataFrame
Columns: [Combined_Columns]
Index: []


In [14]:
# Convert dict to Pandas DataFrame
# In the recorded output the frame has word1, word2 and similarity_score_1 to
# similarity_score_18 columns; pairs with fewer scores have empty trailing cells.
df = create_dataframe(data_dict)

# Show results
# Bare last expression so the notebook renders the frame as a table.
df

Unnamed: 0,word1,word2,similarity_score_1,similarity_score_2,similarity_score_3,similarity_score_4,similarity_score_5,similarity_score_6,similarity_score_7,similarity_score_8,similarity_score_9,similarity_score_10,similarity_score_11,similarity_score_12,similarity_score_13,similarity_score_14,similarity_score_15,similarity_score_16,similarity_score_17,similarity_score_18
0,smart,intelligent,9.50,9.95,9.95,9.50,9.95,9.50,9.95,9.95,9.95,9.95,9.50,9.95,9.50,9.95,9.50,9.5,,
1,hard,difficult,9.80,9.83,9.83,9.20,9.80,9.20,9.83,9.88,9.82,9.80,9.25,9.82,9.20,9.90,9.20,9.2,,
2,happy,cheerful,8.40,9.23,8.42,8.30,9.30,8.80,8.92,9.21,9.23,8.85,8.42,8.45,8.80,9.20,8.80,8.8,,
3,hard,easy,0.20,1.17,1.17,1.20,1.20,1.20,1.19,1.14,0.05,1.20,0.00,1.19,1.20,1.10,0.00,0.0,,
4,fast,rapid,9.20,9.62,9.62,8.80,9.50,9.30,9.42,9.55,9.87,9.50,9.35,9.38,8.50,9.50,9.50,9.5,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
994,join,acquire,5.47,5.46,6.23,5.93,5.98,5.96,5.54,5.38,5.49,5.24,5.23,5.95,5.38,5.67,5.01,,,
995,send,attend,4.41,5.78,4.21,4.52,3.66,4.42,4.21,4.17,4.41,4.11,4.56,4.39,4.86,4.76,3.39,,,
996,gather,attend,6.15,6.51,6.35,6.42,6.56,5.92,5.64,5.83,6.21,5.98,6.42,6.33,6.36,6.45,5.88,,,
997,absorb,withdraw,4.36,3.21,3.58,3.95,2.92,3.96,2.68,3.32,2.37,2.98,3.85,2.91,3.21,3.27,3.72,,,


In [15]:
# Per-column tally of missing entries (isna is the pandas alias of isnull)
count_null_values = df.isna().sum()

# Print the tally with its label
print("Null value counts per column:", count_null_values)

Null value counts per column: word1                    0
word2                    0
similarity_score_1       0
similarity_score_2       0
similarity_score_3       1
similarity_score_4       1
similarity_score_5       1
similarity_score_6       1
similarity_score_7       1
similarity_score_8       1
similarity_score_9       1
similarity_score_10      1
similarity_score_11      1
similarity_score_12      1
similarity_score_13      1
similarity_score_14     41
similarity_score_15    201
similarity_score_16    721
similarity_score_17    841
similarity_score_18    919
dtype: int64


In [19]:
# Select every row that has a null in at least one column.
# NOTE(review): the recorded frame has 18 score columns while sample_size is 15,
# so this matches 918 of 998 rows, including pairs that already have 15 or more
# scores. Confirm whether only the first sample_size score columns were meant.
rows_with_null = df.loc[df.isna().any(axis="columns")]

# Display the selection with a fresh 0-based index (rows_with_null is unchanged)
rows_with_null.reset_index(drop=True)

Unnamed: 0,word1,word2,similarity_score_1,similarity_score_2,similarity_score_3,similarity_score_4,similarity_score_5,similarity_score_6,similarity_score_7,similarity_score_8,similarity_score_9,similarity_score_10,similarity_score_11,similarity_score_12,similarity_score_13,similarity_score_14,similarity_score_15,similarity_score_16,similarity_score_17,similarity_score_18
0,smart,intelligent,9.50,9.95,9.95,9.50,9.95,9.50,9.95,9.95,9.95,9.95,9.50,9.95,9.50,9.95,9.50,9.5,,
1,hard,difficult,9.80,9.83,9.83,9.20,9.80,9.20,9.83,9.88,9.82,9.80,9.25,9.82,9.20,9.90,9.20,9.2,,
2,happy,cheerful,8.40,9.23,8.42,8.30,9.30,8.80,8.92,9.21,9.23,8.85,8.42,8.45,8.80,9.20,8.80,8.8,,
3,hard,easy,0.20,1.17,1.17,1.20,1.20,1.20,1.19,1.14,0.05,1.20,0.00,1.19,1.20,1.10,0.00,0.0,,
4,fast,rapid,9.20,9.62,9.62,8.80,9.50,9.30,9.42,9.55,9.87,9.50,9.35,9.38,8.50,9.50,9.50,9.5,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
914,join,acquire,5.47,5.46,6.23,5.93,5.98,5.96,5.54,5.38,5.49,5.24,5.23,5.95,5.38,5.67,5.01,,,
915,send,attend,4.41,5.78,4.21,4.52,3.66,4.42,4.21,4.17,4.41,4.11,4.56,4.39,4.86,4.76,3.39,,,
916,gather,attend,6.15,6.51,6.35,6.42,6.56,5.92,5.64,5.83,6.21,5.98,6.42,6.33,6.36,6.45,5.88,,,
917,absorb,withdraw,4.36,3.21,3.58,3.95,2.92,3.96,2.68,3.32,2.37,2.98,3.85,2.91,3.21,3.27,3.72,,,


In [17]:
# Collect the (word1, word2) tuples of the rows selected in rows_with_null
missing_word_pair_list = list(
    rows_with_null[['word1', 'word2']].itertuples(index=False, name=None)
)

# Display the collected pairs
missing_word_pair_list

[('smart', 'intelligent'),
 ('hard', 'difficult'),
 ('happy', 'cheerful'),
 ('hard', 'easy'),
 ('fast', 'rapid'),
 ('happy', 'glad'),
 ('short', 'long'),
 ('stupid', 'dumb'),
 ('weird', 'strange'),
 ('wide', 'narrow'),
 ('bad', 'awful'),
 ('easy', 'difficult'),
 ('bad', 'terrible'),
 ('hard', 'simple'),
 ('smart', 'dumb'),
 ('insane', 'crazy'),
 ('happy', 'mad'),
 ('large', 'huge'),
 ('hard', 'tough'),
 ('new', 'fresh'),
 ('sharp', 'dull'),
 ('quick', 'rapid'),
 ('dumb', 'foolish'),
 ('wonderful', 'terrific'),
 ('strange', 'odd'),
 ('happy', 'angry'),
 ('narrow', 'broad'),
 ('simple', 'easy'),
 ('old', 'fresh'),
 ('apparent', 'obvious'),
 ('inexpensive', 'cheap'),
 ('nice', 'generous'),
 ('weird', 'normal'),
 ('weird', 'odd'),
 ('bad', 'immoral'),
 ('sad', 'funny'),
 ('wonderful', 'great'),
 ('guilty', 'ashamed'),
 ('beautiful', 'wonderful'),
 ('confident', 'sure'),
 ('dumb', 'dense'),
 ('large', 'big'),
 ('nice', 'cruel'),
 ('impatient', 'anxious'),
 ('big', 'broad'),
 ('strong', 'pro

In [None]:
# Define file_path
# The processed scores belong under the same model directory as the raw
# responses (llama-4-scout). The earlier 'llama-3.3-70b' directory did not match
# the model queried in this notebook: results would have landed in another
# model's folder, or been skipped silently if that folder already held an f2.csv.
file_path = '../../../data/llama-4-scout/processed/en/f2.csv'

# Check if file already exists
# Existing files are never overwritten; the cell only reports that it skipped.
if not os.path.exists(file_path):
    # index=False keeps the DataFrame index out of the CSV
    df.to_csv(file_path, index=False)
    print("File saved successfully.")
else:
    print("File already exists. Dataframe was not saved to prevent overwriting.")