### **0. Set-up**

In [1]:
# Import libraries and utils
# Executes the shared utils notebook inside this kernel's namespace.
# NOTE(review): the modules (pd, os, json, load_dotenv, OpenAI) and the helper functions used
# below are not defined in this notebook, so they presumably come from utils.ipynb — confirm.
%run '../../utils.ipynb'

In [2]:
# Get api key from the environment (variables from a local .env file are loaded first)
load_dotenv()
OPENROUTER_API_KEY = os.getenv('OPENROUTER_API_KEY')

# Fail fast when the key is missing: with api_key=None the OpenAI client falls back to the
# OPENAI_API_KEY environment variable, which would send the wrong credential to OpenRouter
if not OPENROUTER_API_KEY:
    raise RuntimeError(
        "OPENROUTER_API_KEY is not set. Add it to your environment or .env file."
    )

# Set client (OpenAI-compatible endpoint served by OpenRouter)
client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=OPENROUTER_API_KEY,
)

In [3]:
# Read the three processed prediction batches (f9-1, f9-2, f9-3), in that order
f9_1, f9_2, f9_3 = (
    pd.read_csv(batch_path)
    for batch_path in (
        "../../../data/gpt-oss-20b/processed/en/f9-1.csv",
        "../../../data/gpt-oss-20b/processed/en/f9-2.csv",
        "../../../data/gpt-oss-20b/processed/en/f9-3.csv",
    )
)

# Stack the batches into one frame with a fresh 0..n-1 index
predicted_batches = [f9_1, f9_2, f9_3]
raw_predicted = pd.concat(predicted_batches, ignore_index=True)

# Read the reference (actual) word pairs
raw_actual = pd.read_csv("../../../data/dataset/cleaned-en-simlex-999.csv")

In [4]:
# Show results
# Bare last expression: the notebook renders the combined predictions as a (truncated) table
raw_predicted

Unnamed: 0,word1,word2,similarity_score_1,similarity_score_2,similarity_score_3,similarity_score_4,similarity_score_5,similarity_score_6,similarity_score_7,similarity_score_8,similarity_score_9,similarity_score_10,similarity_score_11,similarity_score_12,similarity_score_13,similarity_score_14,similarity_score_15
0,old,new,0.00,0.00,0.75,2.00,0.00,0.0,0.10,0.05,1.00,0.00,0.0,0.00,0.10,0.00,0.00
1,smart,intelligent,9.00,8.75,9.50,9.50,9.50,9.5,9.20,8.50,9.50,9.75,9.5,9.00,8.50,9.00,9.00
2,hard,difficult,9.00,9.50,9.75,8.75,9.50,9.0,9.50,9.75,8.75,9.20,9.8,9.50,9.00,9.50,9.50
3,happy,cheerful,8.75,9.00,9.20,8.00,9.00,9.5,9.30,9.80,9.00,8.75,8.5,9.45,8.50,9.00,8.50
4,hard,easy,0.00,0.00,0.00,0.05,0.00,0.0,0.00,0.00,0.10,0.00,0.1,0.00,0.00,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
935,join,acquire,4.00,2.75,4.50,6.50,4.00,4.5,6.25,3.50,3.50,4.00,6.5,4.00,2.50,2.50,7.50
936,send,attend,1.00,0.15,1.30,0.10,0.75,1.5,2.50,0.20,1.50,1.00,2.0,0.05,0.05,0.00,2.50
937,gather,attend,5.50,6.00,7.50,5.00,5.00,4.5,4.50,7.50,7.00,6.25,5.5,7.20,7.50,6.50,6.50
938,absorb,withdraw,2.50,0.20,0.10,0.20,0.05,0.0,1.50,1.50,0.10,0.10,0.1,1.00,0.00,0.50,1.00


### **1. Get all the missing word pairs**

In [5]:
# Create a set of word pairs from raw_actual (constant-time membership tests)
actual_pairs = set(zip(raw_actual['word1'], raw_actual['word2']))

# Filter raw_predicted to only keep word pairs that exist in raw_actual.
# The mask is built by zipping the two columns rather than with a row-wise DataFrame.apply:
# apply(axis=1) materialises a Series per row and returns a DataFrame (not a boolean mask)
# when the frame is empty.
mask = pd.Series(
    [pair in actual_pairs for pair in zip(raw_predicted['word1'], raw_predicted['word2'])],
    index=raw_predicted.index,
    dtype=bool,
)
predicted_filtered = raw_predicted[mask].copy()

print(f"Dropped {(~mask).sum()} invalid word pairs from raw_predicted")

Dropped 0 invalid word pairs from raw_predicted


In [6]:
# Get all the missing word pairs (in raw_actual but NOT in raw_predicted)
predicted_pairs = set(zip(predicted_filtered['word1'], predicted_filtered['word2']))

# Find missing pairs.
# The mask is built by zipping the two columns rather than with a row-wise DataFrame.apply:
# apply(axis=1) materialises a Series per row and returns a DataFrame (not a boolean mask)
# when the frame is empty.
mask_missing = pd.Series(
    [pair not in predicted_pairs for pair in zip(raw_actual['word1'], raw_actual['word2'])],
    index=raw_actual.index,
    dtype=bool,
)
missing_pairs_df = raw_actual[mask_missing].copy()

# Convert to list of tuples (kept in raw_actual order)
missing_word_pairs = list(zip(missing_pairs_df['word1'], missing_pairs_df['word2']))

print(f"Found {len(missing_word_pairs)} missing word pairs in raw_predicted")
missing_word_pairs

Found 59 missing word pairs in raw_predicted


[('new', 'fresh'),
 ('sad', 'funny'),
 ('harsh', 'cruel'),
 ('rough', 'frigid'),
 ('bad', 'guilty'),
 ('bottom', 'top'),
 ('student', 'pupil'),
 ('leg', 'arm'),
 ('actress', 'actor'),
 ('sunset', 'sunrise'),
 ('roof', 'ceiling'),
 ('date', 'calendar'),
 ('mud', 'dirt'),
 ('steak', 'meat'),
 ('dictionary', 'definition'),
 ('boy', 'son'),
 ('loop', 'belt'),
 ('cereal', 'wheat'),
 ('intelligence', 'logic'),
 ('bride', 'princess'),
 ('curve', 'angle'),
 ('mouth', 'tooth'),
 ('wood', 'paper'),
 ('doctor', 'professor'),
 ('bee', 'ant'),
 ('alley', 'bowl'),
 ('sinner', 'saint'),
 ('men', 'fraternity'),
 ('apartment', 'furniture'),
 ('fee', 'payment'),
 ('elbow', 'ankle'),
 ('effort', 'difficulty'),
 ('biography', 'fiction'),
 ('home', 'state'),
 ('president', 'mayor'),
 ('limb', 'leg'),
 ('box', 'cigar'),
 ('night', 'dawn'),
 ('beast', 'animal'),
 ('jail', 'choice'),
 ('task', 'woman'),
 ('hole', 'wife'),
 ('deck', 'mouse'),
 ('princess', 'island'),
 ('inform', 'notify'),
 ('enjoy', 'entertai

### **2. Extract data for missing word pairs**

In [7]:
# Prompt template for a single word pair; {word1} and {word2} are the placeholders to fill in.
# The fragments are joined with single spaces into one instruction string.
prompt = " ".join([
    "Rate the semantic similarity of the word pair: [('{word1}'), ('{word2}')] on a scale from 0 to 10,",
    "where 0 represents no semantic similarity, and 10 represents perfect semantic similarity.",
    "Use two decimals. The response should strictly adhere to the structure: [('word1', 'word2', <score>)].",
    "Do not provide additional explanations or context.",
])

In [8]:
# Identifier of the model to query
model = "openai/gpt-oss-20b"

# Pause between individual API calls (presumably seconds — confirm in the helper)
missing_delay = 1.0

# Number of ratings requested per word pair
missing_sample_size = 15

In [9]:
# Define number of sublists
# One sublist per missing pair, so every chunk should hold exactly one word pair
missing_n_sublists = len(missing_word_pairs)

# Split list
# NOTE(review): split_into_n_lists is defined outside this notebook; its behaviour for an
# empty list / zero sublists is not visible here — confirm before running with no missing pairs.
missing_chunks = split_into_n_lists(missing_word_pairs, missing_n_sublists)

# Count number of lists
print(len(missing_chunks))

59


In [10]:
# Print prompts for each chunk
# NOTE(review): print_prompts_single is defined outside this notebook; judging by the recorded
# output it prints the filled-in prompt repeatedly for every chunk — confirm.
print_prompts_single(missing_chunks, missing_sample_size, prompt)

Rate the semantic similarity of the word pair: [('new'), ('fresh')] on a scale from 0 to 10, where 0 represents no semantic similarity, and 10 represents perfect semantic similarity. Use two decimals. The response should strictly adhere to the structure: [('word1', 'word2', <score>)]. Do not provide additional explanations or context.
Rate the semantic similarity of the word pair: [('new'), ('fresh')] on a scale from 0 to 10, where 0 represents no semantic similarity, and 10 represents perfect semantic similarity. Use two decimals. The response should strictly adhere to the structure: [('word1', 'word2', <score>)]. Do not provide additional explanations or context.
Rate the semantic similarity of the word pair: [('new'), ('fresh')] on a scale from 0 to 10, where 0 represents no semantic similarity, and 10 represents perfect semantic similarity. Use two decimals. The response should strictly adhere to the structure: [('word1', 'word2', <score>)]. Do not provide additional explanations o

In [11]:
# Get results from API
# Expensive cell: the recorded run issued 885 requests and took about 44 minutes.
# NOTE(review): get_responses_single is defined outside this notebook.
missing_response = get_responses_single(prompt, missing_chunks, model, missing_sample_size, missing_delay)

Processing: 100%|██████████| 885/885 [43:46<00:00,  2.97s/chunk]

Total time taken: 2626.36 seconds





In [12]:
# Destination of the raw API responses
file_path = '../../../data/gpt-oss-20b/response/en/f9-post.json'

# Create the parent directory when it is not there yet
os.makedirs(os.path.dirname(file_path), exist_ok=True)

# Never overwrite an earlier dump: write only when nothing exists at file_path
if os.path.exists(file_path):
    print("File already exists. JSON was not saved to prevent overwriting.")
else:
    with open(file_path, 'w') as f:
        json.dump(missing_response, f)
        print("File saved successfully.")

File already exists. JSON was not saved to prevent overwriting.


### **3. Inspect Missing Word Pairs**

In [56]:
# Parse the raw responses into a {(word1, word2): [scores]} dictionary
missing_data_dict = process_responses(missing_response)

# Collect the pairs whose number of scores differs from the requested sample size
# (either more or fewer than missing_sample_size)
missing_higher_lower_samples = {
    pair: scores
    for pair, scores in missing_data_dict.items()
    if len(scores) != missing_sample_size
}

# Show results
print(missing_higher_lower_samples)

{('accomplish', 'become'): [2.0, 3.75, 2.7, 2.0, 3.0, 2.0, 2.5, 0.15, 1.5, 3.0, 2.5, 3.5, 3.5, 1.75], ('acish', 'become'): [2.5]}


In [57]:
# Convert dict to Pandas DataFrame
# NOTE(review): create_dataframe is defined outside this notebook; the recorded output shows one
# row per pair with columns word1, word2 and similarity_score_1..15, with NaN where a pair has
# fewer scores.
missing_df = create_dataframe(missing_data_dict)

# Show results
missing_df

Unnamed: 0,word1,word2,similarity_score_1,similarity_score_2,similarity_score_3,similarity_score_4,similarity_score_5,similarity_score_6,similarity_score_7,similarity_score_8,similarity_score_9,similarity_score_10,similarity_score_11,similarity_score_12,similarity_score_13,similarity_score_14,similarity_score_15
0,new,fresh,8.0,6.5,8.5,7.5,7.0,8.5,8.5,8.5,7.5,8.5,8.5,7.5,7.5,8.5,8.75
1,sad,funny,2.0,2.0,0.0,0.05,0.0,0.1,0.15,0.1,0.1,0.0,2.0,1.5,0.0,1.0,0.1
2,harsh,cruel,8.5,9.0,7.3,7.4,7.5,8.0,7.5,8.0,7.5,8.5,7.25,8.5,7.5,7.5,7.8
3,rough,frigid,0.2,0.1,0.0,0.1,0.1,1.0,0.05,2.4,0.38,0.1,0.5,0.1,1.0,1.0,0.5
4,bad,guilty,6.5,6.5,7.5,7.5,6.5,3.5,5.5,6.3,6.5,6.5,3.0,6.75,6.5,5.0,6.5
5,bottom,top,0.0,0.0,0.1,0.1,0.0,1.0,0.0,0.0,0.05,0.0,0.0,0.1,0.05,0.2,0.0
6,student,pupil,9.75,9.5,9.5,9.7,9.25,9.0,9.5,9.0,9.2,8.5,9.5,9.5,9.2,9.25,9.8
7,leg,arm,6.0,5.5,5.5,8.5,6.0,6.5,8.0,8.0,7.5,7.5,4.5,6.75,7.0,6.8,7.5
8,actress,actor,9.0,7.0,8.5,8.5,8.5,9.0,9.0,9.8,8.5,9.0,8.5,8.5,8.75,8.5,9.0
9,sunset,sunrise,5.5,7.5,7.5,7.75,8.5,5.2,6.5,5.0,7.5,7.5,5.5,5.0,8.0,7.5,7.8


In [58]:
# Tally the NaN entries of every column
null_flags = missing_df.isna()
count_null_values = null_flags.sum()

# Show results
print("Null value counts per column:", count_null_values)

Null value counts per column: word1                  0
word2                  0
similarity_score_1     0
similarity_score_2     1
similarity_score_3     1
similarity_score_4     1
similarity_score_5     1
similarity_score_6     1
similarity_score_7     1
similarity_score_8     1
similarity_score_9     1
similarity_score_10    1
similarity_score_11    1
similarity_score_12    1
similarity_score_13    1
similarity_score_14    1
similarity_score_15    2
dtype: int64


In [59]:
# Select every row that has a NaN in at least one column
missing_has_null = missing_df.isna().any(axis=1)
rows_with_null = missing_df[missing_has_null]

# Show results
rows_with_null

Unnamed: 0,word1,word2,similarity_score_1,similarity_score_2,similarity_score_3,similarity_score_4,similarity_score_5,similarity_score_6,similarity_score_7,similarity_score_8,similarity_score_9,similarity_score_10,similarity_score_11,similarity_score_12,similarity_score_13,similarity_score_14,similarity_score_15
54,accomplish,become,2.0,3.75,2.7,2.0,3.0,2.0,2.5,0.15,1.5,3.0,2.5,3.5,3.5,1.75,
55,acish,become,2.5,,,,,,,,,,,,,,


### **3.1. Re-query Word Pairs with Incomplete Samples**

In [60]:
# Collect the (word1, word2) tuples of the incomplete rows
missing_missing_word_pair_list = [
    (first_word, second_word)
    for first_word, second_word in zip(rows_with_null['word1'], rows_with_null['word2'])
]

# Show results
missing_missing_word_pair_list

[('accomplish', 'become'), ('acish', 'become')]

In [61]:
# Drop unknown word pairs
# The model can garble a word in its answer (in the recorded run 'acish' instead of
# 'accomplish'). Validating against actual_pairs removes any such pair, whereas a hard-coded
# tuple only matches the garbled output of one particular run.
missing_missing_word_pair_list = [
    pair for pair in missing_missing_word_pair_list
    if pair in actual_pairs
]

# Show results
missing_missing_word_pair_list

[('accomplish', 'become')]

In [62]:
# Pause between individual API calls of the re-query
missing_missing_delay = 1.0

# Number of ratings requested per re-queried word pair
missing_missing_sample_size = 15

In [63]:
# Define number of sublists
# One sublist per pair that still needs a complete set of samples
missing_missing_n_sublists = len(missing_missing_word_pair_list)

# Split list
# NOTE(review): split_into_n_lists is defined outside this notebook; its behaviour when the
# list is empty (nothing to re-query) is not visible here — confirm.
missing_missing_chunks = split_into_n_lists(missing_missing_word_pair_list, missing_missing_n_sublists)

# Count number of lists
print(len(missing_missing_chunks))

1


In [42]:
# Print prompts for each chunk
# Reuses the same prompt template as the first pass.
# NOTE(review): print_prompts_single is defined outside this notebook.
print_prompts_single(missing_missing_chunks, missing_missing_sample_size, prompt)

Rate the semantic similarity of the word pair: [('accomplish'), ('become')] on a scale from 0 to 10, where 0 represents no semantic similarity, and 10 represents perfect semantic similarity. Use two decimals. The response should strictly adhere to the structure: [('word1', 'word2', <score>)]. Do not provide additional explanations or context.
Rate the semantic similarity of the word pair: [('accomplish'), ('become')] on a scale from 0 to 10, where 0 represents no semantic similarity, and 10 represents perfect semantic similarity. Use two decimals. The response should strictly adhere to the structure: [('word1', 'word2', <score>)]. Do not provide additional explanations or context.
Rate the semantic similarity of the word pair: [('accomplish'), ('become')] on a scale from 0 to 10, where 0 represents no semantic similarity, and 10 represents perfect semantic similarity. Use two decimals. The response should strictly adhere to the structure: [('word1', 'word2', <score>)]. Do not provide a

In [43]:
# Get results from API
# Re-queries only the pairs with an incomplete sample set, using the same prompt and model
# as the first pass.
# NOTE(review): get_responses_single is defined outside this notebook.
missing_missing_response = get_responses_single(prompt, missing_missing_chunks, model, missing_missing_sample_size, missing_missing_delay)

Processing: 100%|██████████| 15/15 [00:43<00:00,  2.88s/chunk]

Total time taken: 43.19 seconds





In [64]:
# Process data into dictionary
# Parses the re-queried responses with the same helper as the first pass
missing_missing_data_dict = process_responses(missing_missing_response)

# Show results
missing_missing_data_dict

{('accomplish', 'become'): [2.5,
  2.5,
  2.5,
  2.5,
  2.5,
  4.0,
  2.5,
  2.5,
  2.5,
  3.0,
  2.0,
  4.5,
  2.5,
  2.5,
  3.0]}

In [65]:
# Convert dict to Pandas DataFrame
# Same helper as for the first pass, so the columns should line up with missing_df
# (word1, word2, similarity_score_1..15 in the recorded output)
missing_missing_df = create_dataframe(missing_missing_data_dict)

# Show results
missing_missing_df

Unnamed: 0,word1,word2,similarity_score_1,similarity_score_2,similarity_score_3,similarity_score_4,similarity_score_5,similarity_score_6,similarity_score_7,similarity_score_8,similarity_score_9,similarity_score_10,similarity_score_11,similarity_score_12,similarity_score_13,similarity_score_14,similarity_score_15
0,accomplish,become,2.5,2.5,2.5,2.5,2.5,4.0,2.5,2.5,2.5,3.0,2.0,4.5,2.5,2.5,3.0


In [66]:
# Tally the NaN entries of every column of the re-queried frame
requery_null_flags = missing_missing_df.isna()
count_null_values = requery_null_flags.sum()

# Show results
print("Null value counts per column:", count_null_values)

Null value counts per column: word1                  0
word2                  0
similarity_score_1     0
similarity_score_2     0
similarity_score_3     0
similarity_score_4     0
similarity_score_5     0
similarity_score_6     0
similarity_score_7     0
similarity_score_8     0
similarity_score_9     0
similarity_score_10    0
similarity_score_11    0
similarity_score_12    0
similarity_score_13    0
similarity_score_14    0
similarity_score_15    0
dtype: int64


In [67]:
# Select every re-queried row that still has a NaN in at least one column
missing_missing_has_null = missing_missing_df.isna().any(axis=1)
rows_with_null = missing_missing_df[missing_missing_has_null]

# Show results
rows_with_null

Unnamed: 0,word1,word2,similarity_score_1,similarity_score_2,similarity_score_3,similarity_score_4,similarity_score_5,similarity_score_6,similarity_score_7,similarity_score_8,similarity_score_9,similarity_score_10,similarity_score_11,similarity_score_12,similarity_score_13,similarity_score_14,similarity_score_15


### **4. Add Missing Word Pairs to raw_predicted**

In [72]:
# Swap first-pass rows that were re-queried for their complete counterparts.
# Without this step the re-queried scores in missing_missing_df are never used and the
# incomplete first-pass row (with NaN scores) ends up in the final data.
# Pairs that were not re-queried (including unknown pairs) are left untouched here.
requeried_pairs = set(zip(missing_missing_df['word1'], missing_missing_df['word2']))
was_requeried = pd.Series(
    [pair in requeried_pairs for pair in zip(missing_df['word1'], missing_df['word2'])],
    index=missing_df.index,
    dtype=bool,
)
missing_df_complete = pd.concat(
    [missing_df[~was_requeried], missing_missing_df],
    ignore_index=True,
)

# Concatenate raw_predicted with the completed missing pairs
raw_predicted_complete = pd.concat([raw_predicted, missing_df_complete], ignore_index=True)

print(f"Original raw_predicted: {len(raw_predicted)} rows")
print(f"Missing pairs: {len(missing_df_complete)} rows")
print(f"Complete raw_predicted: {len(raw_predicted_complete)} rows")

# Sort by word1 and word2 for better organization
raw_predicted_complete = raw_predicted_complete.sort_values(['word1', 'word2']).reset_index(drop=True)

Original raw_predicted: 940 rows
Missing pairs: 60 rows
Complete raw_predicted: 1000 rows


In [73]:
# Number the rows of raw_actual and remember the position of every word pair
raw_actual['order'] = range(len(raw_actual))
actual_pair_keys = zip(raw_actual['word1'], raw_actual['word2'])
order_map = dict(zip(actual_pair_keys, raw_actual['order']))


def _position_in_actual(row):
    """Return the raw_actual position of the row's word pair, or infinity when it is unknown."""
    return order_map.get((row['word1'], row['word2']), float('inf'))


# Attach the position to every predicted row
raw_predicted_complete['order'] = raw_predicted_complete.apply(_position_in_actual, axis=1)

# Reorder to match raw_actual (unknown pairs sort last), then discard the helper column
ordered_predicted = raw_predicted_complete.sort_values('order')
ordered_predicted = ordered_predicted.drop('order', axis=1)
raw_predicted_complete = ordered_predicted.reset_index(drop=True)

# Show results
raw_predicted_complete

Unnamed: 0,word1,word2,similarity_score_1,similarity_score_2,similarity_score_3,similarity_score_4,similarity_score_5,similarity_score_6,similarity_score_7,similarity_score_8,similarity_score_9,similarity_score_10,similarity_score_11,similarity_score_12,similarity_score_13,similarity_score_14,similarity_score_15
0,old,new,0.00,0.00,0.75,2.00,0.00,0.0,0.10,0.05,1.00,0.00,0.0,0.00,0.10,0.00,0.00
1,smart,intelligent,9.00,8.75,9.50,9.50,9.50,9.5,9.20,8.50,9.50,9.75,9.5,9.00,8.50,9.00,9.00
2,hard,difficult,9.00,9.50,9.75,8.75,9.50,9.0,9.50,9.75,8.75,9.20,9.8,9.50,9.00,9.50,9.50
3,happy,cheerful,8.75,9.00,9.20,8.00,9.00,9.5,9.30,9.80,9.00,8.75,8.5,9.45,8.50,9.00,8.50
4,hard,easy,0.00,0.00,0.00,0.05,0.00,0.0,0.00,0.00,0.10,0.00,0.1,0.00,0.00,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,send,attend,1.00,0.15,1.30,0.10,0.75,1.5,2.50,0.20,1.50,1.00,2.0,0.05,0.05,0.00,2.50
996,gather,attend,5.50,6.00,7.50,5.00,5.00,4.5,4.50,7.50,7.00,6.25,5.5,7.20,7.50,6.50,6.50
997,absorb,withdraw,2.50,0.20,0.10,0.20,0.05,0.0,1.50,1.50,0.10,0.10,0.1,1.00,0.00,0.50,1.00
998,attend,arrive,6.50,4.25,6.50,4.50,5.50,6.0,5.67,6.00,5.00,5.40,6.3,6.00,3.50,5.25,6.25


In [74]:
# Drop unknown word pairs
# Filter by membership in raw_actual instead of slicing off the last row: the positional
# slice relies on exactly one unknown pair being sorted last, and it removes another (valid)
# row every time the cell is re-run. This version is idempotent.
known_pairs = set(zip(raw_actual['word1'], raw_actual['word2']))
is_known_pair = pd.Series(
    [
        pair in known_pairs
        for pair in zip(raw_predicted_complete['word1'], raw_predicted_complete['word2'])
    ],
    index=raw_predicted_complete.index,
    dtype=bool,
)
raw_predicted_complete = raw_predicted_complete[is_known_pair].reset_index(drop=True)

# Show results
raw_predicted_complete

Unnamed: 0,word1,word2,similarity_score_1,similarity_score_2,similarity_score_3,similarity_score_4,similarity_score_5,similarity_score_6,similarity_score_7,similarity_score_8,similarity_score_9,similarity_score_10,similarity_score_11,similarity_score_12,similarity_score_13,similarity_score_14,similarity_score_15
0,old,new,0.00,0.00,0.75,2.00,0.00,0.0,0.10,0.05,1.00,0.00,0.0,0.00,0.10,0.00,0.00
1,smart,intelligent,9.00,8.75,9.50,9.50,9.50,9.5,9.20,8.50,9.50,9.75,9.5,9.00,8.50,9.00,9.00
2,hard,difficult,9.00,9.50,9.75,8.75,9.50,9.0,9.50,9.75,8.75,9.20,9.8,9.50,9.00,9.50,9.50
3,happy,cheerful,8.75,9.00,9.20,8.00,9.00,9.5,9.30,9.80,9.00,8.75,8.5,9.45,8.50,9.00,8.50
4,hard,easy,0.00,0.00,0.00,0.05,0.00,0.0,0.00,0.00,0.10,0.00,0.1,0.00,0.00,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
994,join,acquire,4.00,2.75,4.50,6.50,4.00,4.5,6.25,3.50,3.50,4.00,6.5,4.00,2.50,2.50,7.50
995,send,attend,1.00,0.15,1.30,0.10,0.75,1.5,2.50,0.20,1.50,1.00,2.0,0.05,0.05,0.00,2.50
996,gather,attend,5.50,6.00,7.50,5.00,5.00,4.5,4.50,7.50,7.00,6.25,5.5,7.20,7.50,6.50,6.50
997,absorb,withdraw,2.50,0.20,0.10,0.20,0.05,0.0,1.50,1.50,0.10,0.10,0.1,1.00,0.00,0.50,1.00


### **5. Export Data**

In [77]:
# Destination of the completed predictions
file_path = '../../../data/gpt-oss-20b/processed/en/f9-post.csv'

# Create the parent directory when it is not there yet
os.makedirs(os.path.dirname(file_path), exist_ok=True)

# Never overwrite an earlier export: write only when nothing exists at file_path
if os.path.exists(file_path):
    print("File already exists. DataFrame was not saved to prevent overwriting.")
else:
    raw_predicted_complete.to_csv(file_path, index=False)
    print("File saved successfully.")


File saved successfully.
