### **0. Set-up**

In [1]:
# Import libraries and utils
%run '../../utils.ipynb'

In [2]:
# Load environment settings (load_dotenv is brought in by the utils notebook)
load_dotenv()

# Look up the OpenAI key in the process environment (None when it is not set)
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')

# Create the API client; note that no key is passed explicitly here
client = OpenAI()

In [3]:
# Read the cleaned SimLex-999 word pairs
en_simlex = pd.read_csv("../../../data/dataset/cleaned-en-simlex-999.csv")

# Optional: uncomment to work on a smaller subset while iterating
# en_simlex = en_simlex.head(150)

# Turn the two word columns into a list of plain (word1, word2) tuples
tuples_list = list(en_simlex[['word1', 'word2']].itertuples(index=False, name=None))

In [4]:
# Show results
# NOTE(review): this renders the complete list of word pairs; a slice such as
# tuples_list[:10] would keep the stored output short.
tuples_list

[('old', 'new'),
 ('smart', 'intelligent'),
 ('hard', 'difficult'),
 ('happy', 'cheerful'),
 ('hard', 'easy'),
 ('fast', 'rapid'),
 ('happy', 'glad'),
 ('short', 'long'),
 ('stupid', 'dumb'),
 ('weird', 'strange'),
 ('wide', 'narrow'),
 ('bad', 'awful'),
 ('easy', 'difficult'),
 ('bad', 'terrible'),
 ('hard', 'simple'),
 ('smart', 'dumb'),
 ('insane', 'crazy'),
 ('happy', 'mad'),
 ('large', 'huge'),
 ('hard', 'tough'),
 ('new', 'fresh'),
 ('sharp', 'dull'),
 ('quick', 'rapid'),
 ('dumb', 'foolish'),
 ('wonderful', 'terrific'),
 ('strange', 'odd'),
 ('happy', 'angry'),
 ('narrow', 'broad'),
 ('simple', 'easy'),
 ('old', 'fresh'),
 ('apparent', 'obvious'),
 ('inexpensive', 'cheap'),
 ('nice', 'generous'),
 ('weird', 'normal'),
 ('weird', 'odd'),
 ('bad', 'immoral'),
 ('sad', 'funny'),
 ('wonderful', 'great'),
 ('guilty', 'ashamed'),
 ('beautiful', 'wonderful'),
 ('confident', 'sure'),
 ('dumb', 'dense'),
 ('large', 'big'),
 ('nice', 'cruel'),
 ('impatient', 'anxious'),
 ('big', 'broad'),

### **1. Define and Evaluate Parameters**

In [21]:
# Define prompt
# The instruction is written sentence by sentence and joined with single spaces.
prompt = " ".join([
    "Classify the semantic similarity of each word pair in the hierarchical categories:",
    "'very dissimilar', 'dissimilar', 'neutral', 'similar', and 'very similar'.",
    "The response should strictly adhere to the structure:",
    "[('word1', 'word2', <classification>), ('word3', 'word4', <classification>), ...].",
    "Do not provide additional explanations or context.",
])

In [22]:
# Model identifier used for the API requests
model = "gpt-3.5-turbo-0125"

# Number of sublists the word-pair list is divided into
n_sublists = 20

# Sample size (presumably the number of classifications gathered per word pair — confirm)
sample_size = 15

# Pause between individual API calls (presumably in seconds — confirm against get_responses)
delay = 15.0

In [23]:
# Split list
# split_into_n_lists is not defined in this notebook (presumably provided by
# utils.ipynb); it receives the word pairs and the requested number of sublists.
chunks = split_into_n_lists(tuples_list, n_sublists)

# Count number of lists
# Printed so the result can be compared with n_sublists by eye.
print(len(chunks))

20


In [25]:
# Print prompts for each chunk
# print_prompts is not defined in this notebook (presumably provided by
# utils.ipynb); it is given every chunk together with the shared instruction.
print_prompts(chunks, prompt)

Classify the semantic similarity of each word pair in the hierarchical categories: 'very dissimilar', 'dissimilar', 'neutral', 'similar', and 'very similar'. The response should strictly adhere to the structure: [('word1', 'word2', <classification>), ('word3', 'word4', <classification>), ...]. Do not provide additional explanations or context. --- ["('old', 'new'), ('smart', 'intelligent'), ('hard', 'difficult'), ('happy', 'cheerful'), ('hard', 'easy'), ('fast', 'rapid'), ('happy', 'glad'), ('short', 'long'), ('stupid', 'dumb'), ('weird', 'strange'), ('wide', 'narrow'), ('bad', 'awful'), ('easy', 'difficult'), ('bad', 'terrible'), ('hard', 'simple'), ('smart', 'dumb'), ('insane', 'crazy'), ('happy', 'mad'), ('large', 'huge'), ('hard', 'tough'), ('new', 'fresh'), ('sharp', 'dull'), ('quick', 'rapid'), ('dumb', 'foolish'), ('wonderful', 'terrific'), ('strange', 'odd'), ('happy', 'angry'), ('narrow', 'broad'), ('simple', 'easy'), ('old', 'fresh'), ('apparent', 'obvious'), ('inexpensive', 

In [26]:
# Load encoding
# NOTE(review): `encoding` is not passed to the helper below; presumably
# count_tokens_with_tiktoken reads it from the shared namespace — confirm.
encoding = tiktoken.get_encoding("cl100k_base")

# Count tokens per chunk
# The helper receives the chunks and the shared instruction text.
token_counts = count_tokens_with_tiktoken(chunks, prompt)

# Show results
print("Token counts for each formatted prompt:", token_counts)

Token counts for each formatted prompt: [436, 443, 424, 427, 418, 416, 431, 438, 437, 423, 423, 430, 433, 420, 426, 422, 413, 428, 434, 404]


### **2. Extract and Process Data**

In [27]:
# Get results from API
# `response` is required by the cells below. The recorded run of get_responses
# took roughly two hours (see the cell output), so a previously saved response
# file is reused when it exists and the API is only queried when it does not.
response_path = '../../../data/response/en/gpt-3.5-turbo-0125/f5.json'

if os.path.exists(response_path):
    # NOTE(review): assumes the object returned by get_responses survives a JSON
    # round trip unchanged (e.g. no tuples or non-string keys) — confirm.
    with open(response_path) as f:
        response = json.load(f)
else:
    response = get_responses(chunks, prompt, model, sample_size, delay)

Processing:   0%|          | 0/300 [00:00<?, ?chunk/s]

Processing: 100%|██████████| 300/300 [1:59:27<00:00, 23.89s/chunk]

Total time taken: 7168.01 seconds





In [38]:
# Target location of the raw API responses
file_path = '../../../data/response/en/gpt-3.5-turbo-0125/f5.json'

# Never overwrite an earlier export: write only when nothing is stored there yet
if os.path.exists(file_path):
    print("File already exists. JSON was not saved to prevent overwriting.")
else:
    with open(file_path, 'w') as f:
        json.dump(response, f)
        print("File saved successfully.")

File saved successfully.


In [28]:
# Turn the raw responses into a dictionary
data_dict = process_responses_categorical(response)

# Collect the entries whose number of values differs from the sample size
higher_lower_samples = {
    pair: labels
    for pair, labels in data_dict.items()
    if len(labels) != sample_size
}

# Show results
print(higher_lower_samples)

{('bath', 'balloon'): ['dissimilar', 'dissimilar', 'dissimilar', 'very similar', 'very dissimilar', 'similar', 'dissimilar', 'dissimilar', 'dissimilar', 'dissimilar', 'dissimilar', 'very dissimilar', 'very dissimilar', 'very dissimilar'], ('bath', 'balloons'): ['dissimilar']}


In [29]:
# Print duplicate word pairs
# print_duplicate_word_pairs is not defined in this notebook (presumably
# provided by utils.ipynb); it is given the source frame and the parsed dictionary.
print_duplicate_word_pairs(en_simlex, data_dict)

Empty DataFrame
Columns: [Combined_Columns]
Index: []
Empty DataFrame
Columns: [Combined_Columns]
Index: []


In [30]:
# Convert dict to Pandas DataFrame
# create_dataframe is not defined in this notebook (presumably provided by
# utils.ipynb); it is given the parsed response dictionary.
df = create_dataframe(data_dict)

# Show results
df

Unnamed: 0,word1,word2,similarity_score_1,similarity_score_2,similarity_score_3,similarity_score_4,similarity_score_5,similarity_score_6,similarity_score_7,similarity_score_8,similarity_score_9,similarity_score_10,similarity_score_11,similarity_score_12,similarity_score_13,similarity_score_14,similarity_score_15
0,old,new,very dissimilar,very dissimilar,very dissimilar,very dissimilar,very dissimilar,very dissimilar,very dissimilar,very dissimilar,very dissimilar,very dissimilar,very dissimilar,very dissimilar,dissimilar,very dissimilar,very dissimilar
1,smart,intelligent,very similar,very similar,very similar,very similar,very similar,very similar,very similar,very similar,very similar,very similar,very similar,very similar,very similar,very similar,very similar
2,hard,difficult,similar,similar,similar,similar,similar,similar,similar,similar,similar,similar,similar,similar,similar,similar,similar
3,happy,cheerful,very similar,similar,very similar,similar,similar,very similar,similar,similar,similar,similar,similar,similar,similar,similar,similar
4,hard,easy,dissimilar,dissimilar,very dissimilar,very dissimilar,very dissimilar,dissimilar,dissimilar,very dissimilar,dissimilar,very dissimilar,very dissimilar,dissimilar,dissimilar,dissimilar,very dissimilar
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,join,acquire,similar,similar,similar,similar,similar,similar,similar,similar,similar,similar,similar,similar,similar,similar,similar
996,send,attend,similar,dissimilar,similar,similar,similar,similar,dissimilar,neutral,similar,dissimilar,similar,similar,dissimilar,neutral,similar
997,gather,attend,similar,similar,similar,similar,similar,similar,similar,similar,similar,similar,similar,similar,similar,similar,similar
998,absorb,withdraw,similar,dissimilar,dissimilar,dissimilar,dissimilar,similar,dissimilar,dissimilar,dissimilar,dissimilar,dissimilar,dissimilar,dissimilar,dissimilar,dissimilar


In [31]:
# Tally the missing entries of every column
count_null_values = df.isna().sum()

# Show results
print("Null value counts per column:", count_null_values)

Null value counts per column: word1                  0
word2                  0
similarity_score_1     0
similarity_score_2     1
similarity_score_3     1
similarity_score_4     1
similarity_score_5     1
similarity_score_6     1
similarity_score_7     1
similarity_score_8     1
similarity_score_9     1
similarity_score_10    1
similarity_score_11    1
similarity_score_12    1
similarity_score_13    1
similarity_score_14    1
similarity_score_15    2
dtype: int64


In [32]:
# Flag every row in which at least one column is missing
has_null = df.isna().any(axis="columns")

# Keep only the flagged rows
rows_with_null = df[has_null]

# Show results
rows_with_null

Unnamed: 0,word1,word2,similarity_score_1,similarity_score_2,similarity_score_3,similarity_score_4,similarity_score_5,similarity_score_6,similarity_score_7,similarity_score_8,similarity_score_9,similarity_score_10,similarity_score_11,similarity_score_12,similarity_score_13,similarity_score_14,similarity_score_15
772,bath,balloon,dissimilar,dissimilar,dissimilar,very similar,very dissimilar,similar,dissimilar,dissimilar,dissimilar,dissimilar,dissimilar,very dissimilar,very dissimilar,very dissimilar,
800,bath,balloons,dissimilar,,,,,,,,,,,,,,


In [33]:
# Custom prompt
# Single-pair variant of the instruction, used to obtain one more classification
# for ('bath', 'balloon'). The pair is written as a complete list literal so it
# matches the response structure requested further down in the prompt.
custom_prompt = ("Classify the semantic similarity of the word pair [('bath', 'balloon')] in the hierarchical categories: "
                 "'very dissimilar', 'dissimilar', 'neutral', 'similar', and 'very similar'. "
                 "The response should strictly adhere to the structure: "
                 "[('word1', 'word2', <classification>)]. "
                 "Do not provide additional explanations or context.")

# Make API call
# The request has to run so that `completion` exists for the print below.
# NOTE(review): the reply is not guaranteed to be identical on every run — compare
# it with the value that is filled in manually afterwards.
messages = [{"role": "user", "content": custom_prompt}]
completion = client.chat.completions.create(
    model=model,
    messages=messages,
    n=1)

# Show results
print(completion.choices[0].message.content)

[('bath', 'balloon', dissimilar)]


In [34]:
# Manually fix inconsistencies
# Build the selector for the ('bath', 'balloon') row once and reuse it
is_bath_balloon = (df['word1'] == 'bath') & (df['word2'] == 'balloon')

# Fill the 15th score of that row with 'dissimilar'
df.loc[is_bath_balloon, 'similarity_score_15'] = 'dissimilar'

# Show results
df.loc[is_bath_balloon]

Unnamed: 0,word1,word2,similarity_score_1,similarity_score_2,similarity_score_3,similarity_score_4,similarity_score_5,similarity_score_6,similarity_score_7,similarity_score_8,similarity_score_9,similarity_score_10,similarity_score_11,similarity_score_12,similarity_score_13,similarity_score_14,similarity_score_15
772,bath,balloon,dissimilar,dissimilar,dissimilar,very similar,very dissimilar,similar,dissimilar,dissimilar,dissimilar,dissimilar,dissimilar,very dissimilar,very dissimilar,very dissimilar,dissimilar


In [35]:
# Drop faulty row
# Keep every row that is not the ('bath', 'balloons') pair
not_faulty = (df['word1'] != 'bath') | (df['word2'] != 'balloons')
df = df[not_faulty]

# Show results
df

Unnamed: 0,word1,word2,similarity_score_1,similarity_score_2,similarity_score_3,similarity_score_4,similarity_score_5,similarity_score_6,similarity_score_7,similarity_score_8,similarity_score_9,similarity_score_10,similarity_score_11,similarity_score_12,similarity_score_13,similarity_score_14,similarity_score_15
0,old,new,very dissimilar,very dissimilar,very dissimilar,very dissimilar,very dissimilar,very dissimilar,very dissimilar,very dissimilar,very dissimilar,very dissimilar,very dissimilar,very dissimilar,dissimilar,very dissimilar,very dissimilar
1,smart,intelligent,very similar,very similar,very similar,very similar,very similar,very similar,very similar,very similar,very similar,very similar,very similar,very similar,very similar,very similar,very similar
2,hard,difficult,similar,similar,similar,similar,similar,similar,similar,similar,similar,similar,similar,similar,similar,similar,similar
3,happy,cheerful,very similar,similar,very similar,similar,similar,very similar,similar,similar,similar,similar,similar,similar,similar,similar,similar
4,hard,easy,dissimilar,dissimilar,very dissimilar,very dissimilar,very dissimilar,dissimilar,dissimilar,very dissimilar,dissimilar,very dissimilar,very dissimilar,dissimilar,dissimilar,dissimilar,very dissimilar
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,join,acquire,similar,similar,similar,similar,similar,similar,similar,similar,similar,similar,similar,similar,similar,similar,similar
996,send,attend,similar,dissimilar,similar,similar,similar,similar,dissimilar,neutral,similar,dissimilar,similar,similar,dissimilar,neutral,similar
997,gather,attend,similar,similar,similar,similar,similar,similar,similar,similar,similar,similar,similar,similar,similar,similar,similar
998,absorb,withdraw,similar,dissimilar,dissimilar,dissimilar,dissimilar,similar,dissimilar,dissimilar,dissimilar,dissimilar,dissimilar,dissimilar,dissimilar,dissimilar,dissimilar


In [36]:
# Check for rows with at least one null value
rows_with_null = df[df.isnull().any(axis=1)]

# Stop here if the manual cleanup left gaps, so that incomplete rows never reach
# the CSV written in the next step; the message names the affected word pairs.
assert rows_with_null.empty, (
    "Rows with missing scores remain: "
    f"{list(zip(rows_with_null['word1'], rows_with_null['word2']))}"
)

# Show results
rows_with_null

Unnamed: 0,word1,word2,similarity_score_1,similarity_score_2,similarity_score_3,similarity_score_4,similarity_score_5,similarity_score_6,similarity_score_7,similarity_score_8,similarity_score_9,similarity_score_10,similarity_score_11,similarity_score_12,similarity_score_13,similarity_score_14,similarity_score_15


In [37]:
# Target location of the cleaned classification table
file_path = '../../../data/prompt/en/gpt-3.5-turbo-0125/f5.csv'

# Never overwrite an earlier export: write only when nothing is stored there yet
if os.path.exists(file_path):
    print("File already exists. Dataframe was not saved to prevent overwriting.")
else:
    df.to_csv(file_path, index=False)
    print("File saved successfully.")

File saved successfully.
