### **0. Set-up**

In [1]:
# Import libraries and utils
%run '../../utils.ipynb'

In [2]:
# Get api key
load_dotenv()
OPENROUTER_API_KEY = os.getenv('OPENROUTER_API_KEY')

# Set client
client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=OPENROUTER_API_KEY,
)

In [3]:
# Load dataframe
en_simlex = pd.read_csv("../../../data/dataset/cleaned-en-simlex-999.csv")

# Select subset
# en_simlex = en_simlex.head(150)

# Convert to tuple
tuples_list = list(zip(en_simlex['word1'], en_simlex['word2']))

In [4]:
# Show results
tuples_list

[('old', 'new'),
 ('smart', 'intelligent'),
 ('hard', 'difficult'),
 ('happy', 'cheerful'),
 ('hard', 'easy'),
 ('fast', 'rapid'),
 ('happy', 'glad'),
 ('short', 'long'),
 ('stupid', 'dumb'),
 ('weird', 'strange'),
 ('wide', 'narrow'),
 ('bad', 'awful'),
 ('easy', 'difficult'),
 ('bad', 'terrible'),
 ('hard', 'simple'),
 ('smart', 'dumb'),
 ('insane', 'crazy'),
 ('happy', 'mad'),
 ('large', 'huge'),
 ('hard', 'tough'),
 ('new', 'fresh'),
 ('sharp', 'dull'),
 ('quick', 'rapid'),
 ('dumb', 'foolish'),
 ('wonderful', 'terrific'),
 ('strange', 'odd'),
 ('happy', 'angry'),
 ('narrow', 'broad'),
 ('simple', 'easy'),
 ('old', 'fresh'),
 ('apparent', 'obvious'),
 ('inexpensive', 'cheap'),
 ('nice', 'generous'),
 ('weird', 'normal'),
 ('weird', 'odd'),
 ('bad', 'immoral'),
 ('sad', 'funny'),
 ('wonderful', 'great'),
 ('guilty', 'ashamed'),
 ('beautiful', 'wonderful'),
 ('confident', 'sure'),
 ('dumb', 'dense'),
 ('large', 'big'),
 ('nice', 'cruel'),
 ('impatient', 'anxious'),
 ('big', 'broad'),

### **1. Define and Evaluate Parameters**

In [5]:
# Define prompt
prompt = ("Classify the semantic similarity of each word pair in the hierarchical categories: "
          "'very dissimilar', 'dissimilar', 'neutral', 'similar', and 'very similar'. "
          "The response should strictly adhere to the structure: "
          "[('word1', 'word2', <classification>), ('word3', 'word4', <classification>), ...]. "
          "Do not provide additional explanations or context.")

In [6]:
# Define model
model = "meta-llama/llama-3.3-70b-instruct"

# Set sample size
sample_size = 15

# Delay between individual API calls
delay = 15.0

# Define number of sublists
n_sublists = 20

In [7]:
# Split list
chunks = split_into_n_lists(tuples_list, n_sublists)

# Count number of lists
print(len(chunks))

20


In [8]:
# Print prompts for each chunk
print_prompts(chunks, prompt)

Classify the semantic similarity of each word pair in the hierarchical categories: 'very dissimilar', 'dissimilar', 'neutral', 'similar', and 'very similar'. The response should strictly adhere to the structure: [('word1', 'word2', <classification>), ('word3', 'word4', <classification>), ...]. Do not provide additional explanations or context. --- ["('old', 'new'), ('smart', 'intelligent'), ('hard', 'difficult'), ('happy', 'cheerful'), ('hard', 'easy'), ('fast', 'rapid'), ('happy', 'glad'), ('short', 'long'), ('stupid', 'dumb'), ('weird', 'strange'), ('wide', 'narrow'), ('bad', 'awful'), ('easy', 'difficult'), ('bad', 'terrible'), ('hard', 'simple'), ('smart', 'dumb'), ('insane', 'crazy'), ('happy', 'mad'), ('large', 'huge'), ('hard', 'tough'), ('new', 'fresh'), ('sharp', 'dull'), ('quick', 'rapid'), ('dumb', 'foolish'), ('wonderful', 'terrific'), ('strange', 'odd'), ('happy', 'angry'), ('narrow', 'broad'), ('simple', 'easy'), ('old', 'fresh'), ('apparent', 'obvious'), ('inexpensive', 

In [9]:
# Load encoding
encoding = tiktoken.get_encoding("cl100k_base")

# Count tokens per chunk
token_counts = count_tokens_with_tiktoken(chunks, prompt)

# Show results
print("Token counts for each formatted prompt:", token_counts)

Token counts for each formatted prompt: [436, 443, 424, 427, 418, 416, 431, 438, 437, 423, 423, 430, 433, 420, 426, 422, 413, 428, 434, 404]


### **2. Extract and Process Data**

In [10]:
# Get results from API
response = get_responses(chunks, prompt, model, sample_size, delay)

Processing: 100%|██████████| 300/300 [2:40:42<00:00, 32.14s/chunk]  

Total time taken: 9642.07 seconds





In [11]:
# Define filepath
file_path = '../../../data/llama-3.3-70b/response/en/f5.json'

# Check if file already exists
if not os.path.exists(file_path):
    with open(file_path, 'w') as f:
        json.dump(response, f)
        print("File saved successfully.")
else:
    print("File already exists. JSON was not saved to prevent overwriting.")

File saved successfully.


In [29]:
# Process data into dictionary
data_dict = process_responses_categorical(response)

# Check for values higher/lower then sample size
higher_lower_samples = {key: value for key, value in data_dict.items() if len(value) < sample_size or len(value) > sample_size}

# Show results
print(higher_lower_samples)

{('bad', 'great'): ['very dissimilar', 'very dissimilar', 'very dissimilar', 'very dissimilar', 'very dissimilar', 'very dissimilar', 'very dissimilar', 'very dissimilar', 'very dissimilar', 'very dissimilar', 'very dissimilar', 'very dissimilar', 'very dissimilar', 'very dissimilar'], ('difficult', 'simple'): ['very dissimilar', 'very dissimilar', 'very dissimilar', 'very dissimilar', 'very dissimilar', 'very dissimilar', 'very dissimilar', 'very dissimilar', 'very dissimilar', 'very dissimilar', 'very dissimilar', 'very dissimilar', 'very dissimilar', 'very dissimilar'], ('necessary', 'important'): ['similar', 'similar', 'similar', 'similar', 'similar', 'very similar', 'similar', 'similar', 'similar', 'similar', 'very similar', 'similar', 'similar', 'similar'], ('bad', 'terrific'): ['very dissimilar', 'very dissimilar', 'very dissimilar', 'very dissimilar', 'very dissimilar', 'very dissimilar', 'very dissimilar', 'very dissimilar', 'very dissimilar', 'very dissimilar', 'very dissimil

In [30]:
# Print duplicate word pairs
print_duplicate_word_pairs(en_simlex, data_dict)

Empty DataFrame
Columns: [Combined_Columns]
Index: []
Empty DataFrame
Columns: [Combined_Columns]
Index: []


In [31]:
# Convert dict to Pandas DataFrame
df = create_dataframe(data_dict)

# Show results
df

Unnamed: 0,word1,word2,similarity_score_1,similarity_score_2,similarity_score_3,similarity_score_4,similarity_score_5,similarity_score_6,similarity_score_7,similarity_score_8,similarity_score_9,similarity_score_10,similarity_score_11,similarity_score_12,similarity_score_13,similarity_score_14,similarity_score_15
0,old,new,dissimilar,dissimilar,dissimilar,dissimilar,dissimilar,dissimilar,dissimilar,dissimilar,dissimilar,dissimilar,dissimilar,dissimilar,dissimilar,dissimilar,dissimilar
1,smart,intelligent,very similar,very similar,very similar,very similar,very similar,very similar,very similar,very similar,very similar,very similar,very similar,very similar,very similar,very similar,very similar
2,hard,difficult,very similar,very similar,very similar,very similar,very similar,very similar,very similar,very similar,very similar,very similar,very similar,very similar,very similar,very similar,very similar
3,happy,cheerful,very similar,very similar,very similar,very similar,very similar,very similar,very similar,very similar,very similar,very similar,very similar,very similar,very similar,very similar,very similar
4,hard,easy,very dissimilar,very dissimilar,very dissimilar,very dissimilar,very dissimilar,very dissimilar,very dissimilar,very dissimilar,very dissimilar,very dissimilar,very dissimilar,very dissimilar,very dissimilar,very dissimilar,very dissimilar
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1005,join,acquire,very similar,very similar,very similar,similar,similar,similar,similar,very similar,similar,similar,very similar,very similar,very similar,very similar,similar
1006,send,attend,similar,dissimilar,neutral,similar,similar,neutral,neutral,neutral,neutral,neutral,similar,dissimilar,dissimilar,similar,similar
1007,gather,attend,very similar,similar,similar,similar,similar,similar,similar,similar,similar,similar,very similar,similar,similar,very similar,similar
1008,absorb,withdraw,dissimilar,very dissimilar,dissimilar,very dissimilar,very dissimilar,very dissimilar,dissimilar,very dissimilar,dissimilar,dissimilar,dissimilar,very dissimilar,very dissimilar,very dissimilar,very dissimilar


In [32]:
# Count null values
count_null_values = df.isnull().sum()

# Show results
print("Null value counts per column:", count_null_values)

Null value counts per column: word1                    0
word2                    0
similarity_score_1       0
similarity_score_2      11
similarity_score_3      11
similarity_score_4      11
similarity_score_5      11
similarity_score_6      11
similarity_score_7      11
similarity_score_8      11
similarity_score_9      11
similarity_score_10     11
similarity_score_11     11
similarity_score_12     11
similarity_score_13     12
similarity_score_14     61
similarity_score_15    402
dtype: int64


In [33]:
# Check for rows with at least one null value
rows_with_null = df[df.isnull().any(axis=1)]

# Show results
rows_with_null

Unnamed: 0,word1,word2,similarity_score_1,similarity_score_2,similarity_score_3,similarity_score_4,similarity_score_5,similarity_score_6,similarity_score_7,similarity_score_8,similarity_score_9,similarity_score_10,similarity_score_11,similarity_score_12,similarity_score_13,similarity_score_14,similarity_score_15
50,bad,great,very dissimilar,very dissimilar,very dissimilar,very dissimilar,very dissimilar,very dissimilar,very dissimilar,very dissimilar,very dissimilar,very dissimilar,very dissimilar,very dissimilar,very dissimilar,very dissimilar,
51,difficult,simple,very dissimilar,very dissimilar,very dissimilar,very dissimilar,very dissimilar,very dissimilar,very dissimilar,very dissimilar,very dissimilar,very dissimilar,very dissimilar,very dissimilar,very dissimilar,very dissimilar,
52,necessary,important,similar,similar,similar,similar,similar,very similar,similar,similar,similar,similar,very similar,similar,similar,similar,
53,bad,terrific,very dissimilar,very dissimilar,very dissimilar,very dissimilar,very dissimilar,very dissimilar,very dissimilar,very dissimilar,very dissimilar,very dissimilar,very dissimilar,very dissimilar,very dissimilar,very dissimilar,
54,mad,glad,very dissimilar,very dissimilar,very dissimilar,very dissimilar,very dissimilar,very dissimilar,very dissimilar,very dissimilar,very dissimilar,very dissimilar,very dissimilar,very dissimilar,very dissimilar,very dissimilar,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
856,appoint,elect,very similar,very similar,very similar,very similar,very similar,very similar,very similar,very similar,very similar,very similar,very similar,very similar,very similar,very similar,
857,engage,marry,similar,similar,similar,similar,similar,similar,similar,similar,similar,similar,similar,similar,similar,similar,
858,ask,pray,similar,similar,similar,similar,similar,similar,similar,similar,similar,similar,similar,similar,similar,similar,
859,go,send,dissimilar,dissimilar,dissimilar,dissimilar,dissimilar,dissimilar,dissimilar,dissimilar,dissimilar,dissimilar,dissimilar,dissimilar,dissimilar,dissimilar,


In [34]:
# Extract missing word pairs
missing_word_pair_list = list(zip(rows_with_null['word1'], rows_with_null['word2']))

# Show results
missing_word_pair_list

[('bad', 'great'),
 ('difficult', 'simple'),
 ('necessary', 'important'),
 ('bad', 'terrific'),
 ('mad', 'glad'),
 ('honest', 'guilty'),
 ('easy', 'tough'),
 ('easy', 'flexible'),
 ('certain', 'sure'),
 ('essential', 'necessary'),
 ('different', 'normal'),
 ('sly', 'clever'),
 ('crucial', 'important'),
 ('harsh', 'cruel'),
 ('childish', 'foolish'),
 ('scarce', 'rare'),
 ('friendly', 'generous'),
 ('fragile', 'frigid'),
 ('long', 'narrow'),
 ('big', 'heavy'),
 ('rough', 'frigid'),
 ('bizarre', 'strange'),
 ('illegal', 'immoral'),
 ('bad', 'guilty'),
 ('modern', 'ancient'),
 ('new', 'ancient'),
 ('dull', 'funny'),
 ('happy', 'young'),
 ('easy', 'big'),
 ('great', 'awful'),
 ('tiny', 'huge'),
 ('polite', 'proper'),
 ('modest', 'ashamed'),
 ('exotic', 'rare'),
 ('dumb', 'clever'),
 ('delightful', 'wonderful'),
 ('noticeable', 'obvious'),
 ('afraid', 'anxious'),
 ('formal', 'proper'),
 ('dreary', 'dull'),
 ('delightful', 'cheerful'),
 ('unhappy', 'mad'),
 ('sad', 'terrible'),
 ('sick', 'cra

In [None]:
# Define file_path
file_path = '../../../data/prompt/en/llama-3.3-70b/f5.csv'

# Check if file already exists
if not os.path.exists(file_path):
    df.to_csv(file_path, index=False)
    print("File saved successfully.")
else:
    print("File already exists. Dataframe was not saved to prevent overwriting.")