### **0. Set-up**

In [1]:
# Import libraries and utils
# Executes the shared utilities notebook in this kernel. No other import
# statement appears in the cells of this notebook, so names such as `pd`, `os`,
# `json`, `OpenAI`, `load_dotenv`, `tiktoken` and helpers like `get_responses`
# are presumably all defined by this run — TODO confirm against utils.ipynb.
%run '../../utils.ipynb'

In [2]:
# Get api key
# load_dotenv() is presumably python-dotenv's loader, which copies variables
# from a local .env file into the process environment — confirm in utils.ipynb.
load_dotenv()
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

# Set client
# Pass the key explicitly so the value read above is the one actually used
# (it was previously assigned but never referenced). When the variable is
# unset this is `None`, in which case the client falls back to its own
# environment lookup, so existing behaviour is unchanged.
client = OpenAI(api_key=OPENAI_API_KEY)

In [3]:
# Read the cleaned SimLex-999 word pairs into a DataFrame
en_simlex = pd.read_csv("../../../data/dataset/cleaned-en-simlex-999.csv")

# For a quick trial run, restrict the frame to its first rows here, e.g.:
# en_simlex = en_simlex.head(150)

# Build the list of (word1, word2) tuples, preserving row order
tuples_list = list(
    en_simlex[['word1', 'word2']].itertuples(index=False, name=None)
)

In [4]:
# Show results
# NOTE: this displays the complete list of word pairs, not a preview.
tuples_list

[('old', 'new'),
 ('smart', 'intelligent'),
 ('hard', 'difficult'),
 ('happy', 'cheerful'),
 ('hard', 'easy'),
 ('fast', 'rapid'),
 ('happy', 'glad'),
 ('short', 'long'),
 ('stupid', 'dumb'),
 ('weird', 'strange'),
 ('wide', 'narrow'),
 ('bad', 'awful'),
 ('easy', 'difficult'),
 ('bad', 'terrible'),
 ('hard', 'simple'),
 ('smart', 'dumb'),
 ('insane', 'crazy'),
 ('happy', 'mad'),
 ('large', 'huge'),
 ('hard', 'tough'),
 ('new', 'fresh'),
 ('sharp', 'dull'),
 ('quick', 'rapid'),
 ('dumb', 'foolish'),
 ('wonderful', 'terrific'),
 ('strange', 'odd'),
 ('happy', 'angry'),
 ('narrow', 'broad'),
 ('simple', 'easy'),
 ('old', 'fresh'),
 ('apparent', 'obvious'),
 ('inexpensive', 'cheap'),
 ('nice', 'generous'),
 ('weird', 'normal'),
 ('weird', 'odd'),
 ('bad', 'immoral'),
 ('sad', 'funny'),
 ('wonderful', 'great'),
 ('guilty', 'ashamed'),
 ('beautiful', 'wonderful'),
 ('confident', 'sure'),
 ('dumb', 'dense'),
 ('large', 'big'),
 ('nice', 'cruel'),
 ('impatient', 'anxious'),
 ('big', 'broad'),

### **1. Define and Evaluate Parameters**

In [71]:
# Instruction text that is combined with each chunk of word pairs. The
# sentences are kept as separate fragments and joined with single spaces,
# yielding one continuous line of text.
prompt = " ".join([
    "Rate the semantic similarity of each word pair on a scale from 0 to 5,",
    "where 0 represents no semantic similarity, and 5 represents perfect semantic similarity.",
    "Use two decimals. The response should strictly adhere to the structure:",
    "[('word1', 'word2', <score>), ('word3', 'word4', <score>), ...].",
    "Do not provide additional explanations or context.",
])

In [72]:
# --- Run configuration ---

# Number of sublists the word-pair list is split into
n_sublists = 20

# Expected number of similarity scores per word pair; later cells compare the
# number of collected scores against this value
sample_size = 15

# OpenAI chat model that produces the ratings
model = "gpt-3.5-turbo-0125"

# Delay between individual API calls (presumably seconds — confirm in get_responses)
delay = 15.0

In [73]:
# Split the list
# Partitions the word pairs into sublists via the project helper; the sublists
# are what the prompt-printing, token-counting and API cells operate on.
chunks = split_into_n_lists(tuples_list, n_sublists)

# Count the number of lists
# Sanity check: the printed count is expected to match `n_sublists`.
print(len(chunks))

20


In [75]:
# Print the prompts for each chunk
# Visual check of the text built from the prompt and each chunk; the output
# shows the instruction, a `---` separator, then the word pairs.
print_prompts(chunks, prompt)

Rate the semantic similarity of each word pair on a scale from 0 to 5, where 0 represents no semantic similarity, and 5 represents perfect semantic similarity. Use two decimals. The response should strictly adhere to the structure: [('word1', 'word2', <score>), ('word3', 'word4', <score>), ...]. Do not provide additional explanations or context. --- ["('old', 'new'), ('smart', 'intelligent'), ('hard', 'difficult'), ('happy', 'cheerful'), ('hard', 'easy'), ('fast', 'rapid'), ('happy', 'glad'), ('short', 'long'), ('stupid', 'dumb'), ('weird', 'strange'), ('wide', 'narrow'), ('bad', 'awful'), ('easy', 'difficult'), ('bad', 'terrible'), ('hard', 'simple'), ('smart', 'dumb'), ('insane', 'crazy'), ('happy', 'mad'), ('large', 'huge'), ('hard', 'tough'), ('new', 'fresh'), ('sharp', 'dull'), ('quick', 'rapid'), ('dumb', 'foolish'), ('wonderful', 'terrific'), ('strange', 'odd'), ('happy', 'angry'), ('narrow', 'broad'), ('simple', 'easy'), ('old', 'fresh'), ('apparent', 'obvious'), ('inexpensive'

In [77]:
# Tokenizer encoding used for the token estimate.
# NOTE(review): `encoding` is not passed to the helper below; presumably the
# helper reads it from the notebook namespace — confirm in utils.ipynb.
encoding = tiktoken.get_encoding("cl100k_base")

# Report how many tokens each chunk's formatted prompt occupies
token_counts = count_tokens_with_tiktoken(chunks, prompt)
print(f"Token counts for each formatted prompt: {token_counts}")

Token counts for each formatted prompt: [439, 446, 427, 430, 421, 419, 434, 441, 440, 426, 426, 433, 436, 423, 429, 425, 416, 431, 437, 407]


### **2. Extract and Process Data**

In [78]:
# Process each chunk and get results using the OpenAI API.
# The full run took roughly two hours (see the recorded output), so the API is
# only called when no saved copy of the raw responses exists; otherwise the
# saved JSON is loaded. This keeps `response` defined for the following cells
# on a fresh kernel (the call had been commented out after the first run,
# which left `response` undefined on Restart & Run All).
# NOTE(review): assumes the saved JSON round-trips to the same structure that
# get_responses returns (e.g. a list of strings) — confirm.
response_cache_path = '../../../data/response/en/gpt-3.5-turbo-0125/f4.json'

if os.path.exists(response_cache_path):
    with open(response_cache_path) as f:
        response = json.load(f)
else:
    response = get_responses(chunks, prompt, model, sample_size, delay)

Processing: 100%|██████████| 300/300 [2:02:10<00:00, 24.44s/chunk]  

Total time taken: 7330.82 seconds





In [79]:
# Define filepath
file_path = '../../../data/response/en/gpt-3.5-turbo-0125/f4.json'

# Serialise before touching the file system, so a serialisation failure cannot
# leave a truncated file behind (which would then make every later save
# attempt report "already exists").
serialized_response = json.dumps(response)

# Mode 'x' creates the file only if it does not exist yet, so the existence
# check and the creation happen in a single step and an existing file is never
# overwritten.
try:
    with open(file_path, 'x') as f:
        f.write(serialized_response)
except FileExistsError:
    print("File already exists. JSON was not saved to prevent overwriting.")
else:
    # Report success only once the file has been written and closed
    print("File saved successfully.")

File saved successfully.


In [80]:
# Parse the raw responses into a dictionary keyed by word pair
data_dict = process_responses(response)

# Collect every word pair whose number of scores differs from the sample size
# (either fewer or more than expected)
higher_lower_samples = {
    pair: scores
    for pair, scores in data_dict.items()
    if len(scores) != sample_size
}
print(higher_lower_samples)

{('organ', 'liver'): [4.0, 3.0, 3.5, 3.75, 3.75, 3.0, 3.5, 3.5, 3.0, 3.75, 3.0, 3.5, 3.5, 3.75], ('strength', 'might'): [3.5, 3.0, 3.5, 3.75, 3.25, 4.0, 4.0, 4.0, 4.2, 3.75, 4.0, 3.5, 4.0, 4.0], ('phrase', 'word'): [3.5, 3.5, 2.5, 3.5, 3.5, 3.0, 2.5, 3.5, 3.7, 3.75, 2.0, 2.0, 2.75, 3.5], ('band', 'parade'): [2.0, 2.0, 3.0, 2.5, 2.0, 3.0, 2.0, 3.0, 2.2, 2.25, 2.5, 2.0, 3.0, 2.5], ('stomach', 'waist'): [3.0, 2.5, 3.0, 3.75, 3.0, 3.5, 3.5, 3.5, 3.4, 3.5, 4.0, 2.5, 3.75, 3.5], ('cloud', 'storm'): [2.5, 2.5, 3.0, 3.25, 2.0, 3.0, 2.5, 3.0, 3.9, 3.0, 3.5, 2.5, 3.5, 3.0], ('joy', 'pride'): [3.5, 3.0, 3.0, 2.5, 2.75, 3.5, 3.5, 3.5, 3.7, 3.25, 3.0, 2.5, 3.5, 3.25], ('noise', 'rattle'): [2.0, 2.0, 3.0, 2.75, 1.5, 2.0, 2.0, 3.0, 2.1, 2.5, 2.5, 1.5, 2.75, 2.75], ('rain', 'mist'): [2.0, 3.0, 3.0, 3.25, 2.5, 3.0, 2.5, 3.0, 3.4, 3.0, 3.0, 2.5, 2.5, 2.25], ('beer', 'beverage'): [4.0, 3.5, 3.5, 3.25, 3.5, 3.0, 2.5, 4.0, 3.9, 3.5, 4.0, 2.5, 4.0, 3.75], ('man', 'uncle'): [2.0, 2.0, 2.0, 2.25, 2.5, 2.0, 2.

In [81]:
# Process data and print duplicate word pairs
# Passes both the source dataset and the parsed responses to the project
# helper; in the recorded run it printed two empty DataFrames.
print_duplicate_word_pairs(en_simlex, data_dict)

Empty DataFrame
Columns: [Combined_Columns]
Index: []
Empty DataFrame
Columns: [Combined_Columns]
Index: []


In [82]:
# Convert dict to Pandas DataFrame
# Judging by the displayed output, the frame has one row per word pair and one
# `similarity_score_<k>` column per collected sample.
# NOTE(review): the displayed rows mix values such as 4.50 and 0.95 for the
# same pair, which suggests some responses used a 0-1 scale instead of the
# requested 0-5 — confirm before aggregating the scores.
df = create_dataframe(data_dict)

# Show results
df

Unnamed: 0,word1,word2,similarity_score_1,similarity_score_2,similarity_score_3,similarity_score_4,similarity_score_5,similarity_score_6,similarity_score_7,similarity_score_8,similarity_score_9,similarity_score_10,similarity_score_11,similarity_score_12,similarity_score_13,similarity_score_14,similarity_score_15
0,old,new,0.05,2.0,0.05,0.01,0.19,0.08,1.00,0.08,0.15,0.15,0.50,0.05,0.27,1.13,0.05
1,smart,intelligent,4.50,5.0,0.95,0.95,0.83,1.00,0.90,1.00,1.00,0.87,0.95,0.95,1.00,4.00,0.95
2,hard,difficult,4.50,4.0,0.85,0.80,0.70,0.95,0.90,0.96,0.85,0.78,0.90,0.95,0.82,3.00,0.95
3,happy,cheerful,4.00,4.0,0.85,0.80,0.84,0.95,0.80,0.75,0.70,0.85,0.85,0.80,0.90,3.00,0.85
4,hard,easy,1.00,1.0,0.15,0.03,0.26,0.08,0.10,0.23,0.25,0.23,0.20,0.10,0.37,1.00,0.10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
994,join,acquire,0.40,0.5,0.60,0.33,0.32,0.40,0.53,0.70,0.55,0.50,0.40,0.55,0.50,0.41,0.40
995,send,attend,0.20,0.3,0.20,0.17,0.32,0.20,0.33,0.40,0.35,0.30,0.40,0.25,0.25,0.36,0.00
996,gather,attend,0.40,0.3,0.30,0.17,0.32,0.20,0.42,0.50,0.45,0.40,0.50,0.35,0.50,0.37,0.25
997,absorb,withdraw,0.20,0.1,0.20,0.17,0.16,0.20,0.09,0.10,0.10,0.10,0.25,0.25,0.50,0.33,0.00


In [83]:
# Tally the missing values in every column
count_null_values = df.isna().sum()
print(f"Null value counts per column: {count_null_values}")

Null value counts per column: word1                   0
word2                   0
similarity_score_1      0
similarity_score_2      0
similarity_score_3      0
similarity_score_4      0
similarity_score_5      0
similarity_score_6      0
similarity_score_7      0
similarity_score_8      0
similarity_score_9      0
similarity_score_10     0
similarity_score_11     0
similarity_score_12     0
similarity_score_13     0
similarity_score_14     0
similarity_score_15    43
dtype: int64


In [84]:
# Keep only the rows in which at least one column is missing
rows_with_null = df.loc[df.isna().any(axis="columns")]
rows_with_null

Unnamed: 0,word1,word2,similarity_score_1,similarity_score_2,similarity_score_3,similarity_score_4,similarity_score_5,similarity_score_6,similarity_score_7,similarity_score_8,similarity_score_9,similarity_score_10,similarity_score_11,similarity_score_12,similarity_score_13,similarity_score_14,similarity_score_15
269,organ,liver,4.0,3.0,3.5,3.75,3.75,3.0,3.5,3.5,3.0,3.75,3.0,3.5,3.5,3.75,
270,strength,might,3.5,3.0,3.5,3.75,3.25,4.0,4.0,4.0,4.2,3.75,4.0,3.5,4.0,4.0,
271,phrase,word,3.5,3.5,2.5,3.5,3.5,3.0,2.5,3.5,3.7,3.75,2.0,2.0,2.75,3.5,
272,band,parade,2.0,2.0,3.0,2.5,2.0,3.0,2.0,3.0,2.2,2.25,2.5,2.0,3.0,2.5,
273,stomach,waist,3.0,2.5,3.0,3.75,3.0,3.5,3.5,3.5,3.4,3.5,4.0,2.5,3.75,3.5,
274,cloud,storm,2.5,2.5,3.0,3.25,2.0,3.0,2.5,3.0,3.9,3.0,3.5,2.5,3.5,3.0,
275,joy,pride,3.5,3.0,3.0,2.5,2.75,3.5,3.5,3.5,3.7,3.25,3.0,2.5,3.5,3.25,
276,noise,rattle,2.0,2.0,3.0,2.75,1.5,2.0,2.0,3.0,2.1,2.5,2.5,1.5,2.75,2.75,
277,rain,mist,2.0,3.0,3.0,3.25,2.5,3.0,2.5,3.0,3.4,3.0,3.0,2.5,2.5,2.25,
278,beer,beverage,4.0,3.5,3.5,3.25,3.5,3.0,2.5,4.0,3.9,3.5,4.0,2.5,4.0,3.75,


In [85]:
# Collect the (word1, word2) pairs of the incomplete rows, in row order
missing_word_pair_list = list(
    rows_with_null[['word1', 'word2']].itertuples(index=False, name=None)
)
missing_word_pair_list

[('organ', 'liver'),
 ('strength', 'might'),
 ('phrase', 'word'),
 ('band', 'parade'),
 ('stomach', 'waist'),
 ('cloud', 'storm'),
 ('joy', 'pride'),
 ('noise', 'rattle'),
 ('rain', 'mist'),
 ('beer', 'beverage'),
 ('man', 'uncle'),
 ('apple', 'juice'),
 ('intelligence', 'logic'),
 ('communication', 'language'),
 ('mink', 'fur'),
 ('mob', 'crowd'),
 ('shore', 'coast'),
 ('wire', 'cord'),
 ('bird', 'turkey'),
 ('bed', 'crib'),
 ('competence', 'ability'),
 ('cloud', 'haze'),
 ('supper', 'meal'),
 ('bar', 'cage'),
 ('water', 'salt'),
 ('sense', 'intuition'),
 ('situation', 'condition'),
 ('crime', 'theft'),
 ('style', 'fashion'),
 ('boundary', 'border'),
 ('arm', 'body'),
 ('proclaim', 'announce'),
 ('acquire', 'obtain'),
 ('conclude', 'decide'),
 ('please', 'plead'),
 ('argue', 'prove'),
 ('ask', 'plead'),
 ('find', 'disappear'),
 ('inspect', 'examine'),
 ('verify', 'justify'),
 ('assume', 'predict'),
 ('learn', 'evaluate'),
 ('argue', 'justify')]

In [87]:
# Build one prompt holding every incomplete word pair and wrap it as a single
# user message
formatted_prompt = format_prompt(missing_word_pair_list, prompt)
messages = [dict(role="user", content=formatted_prompt)]

# Request exactly one additional completion for those pairs
completion = client.chat.completions.create(
    messages=messages,
    model=model,
    n=1,
)

# Keep the reply text in a one-element list, the form handed to
# process_responses afterwards
missing_word_pairs_response = [completion.choices[0].message.content]

In [88]:
# Extract data with regular expressions into dictionary
# As the displayed output shows, the result maps each (word1, word2) pair to a
# one-element list holding the score from the extra API call.
missing_word_pairs_dict = process_responses(missing_word_pairs_response)
missing_word_pairs_dict

{('organ', 'liver'): [4.75],
 ('strength', 'might'): [1.5],
 ('phrase', 'word'): [4.5],
 ('band', 'parade'): [1.25],
 ('stomach', 'waist'): [2.25],
 ('cloud', 'storm'): [2.75],
 ('joy', 'pride'): [2.75],
 ('noise', 'rattle'): [1.5],
 ('rain', 'mist'): [2.5],
 ('beer', 'beverage'): [4.0],
 ('man', 'uncle'): [2.25],
 ('apple', 'juice'): [3.5],
 ('intelligence', 'logic'): [4.0],
 ('communication', 'language'): [4.75],
 ('mink', 'fur'): [4.25],
 ('mob', 'crowd'): [1.75],
 ('shore', 'coast'): [4.5],
 ('wire', 'cord'): [4.5],
 ('bird', 'turkey'): [1.0],
 ('bed', 'crib'): [3.25],
 ('competence', 'ability'): [4.5],
 ('cloud', 'haze'): [3.0],
 ('supper', 'meal'): [3.75],
 ('bar', 'cage'): [2.0],
 ('water', 'salt'): [2.25],
 ('sense', 'intuition'): [3.75],
 ('situation', 'condition'): [4.5],
 ('crime', 'theft'): [2.75],
 ('style', 'fashion'): [4.25],
 ('boundary', 'border'): [4.75],
 ('arm', 'body'): [3.75],
 ('proclaim', 'announce'): [4.5],
 ('acquire', 'obtain'): [4.0],
 ('conclude', 'decide')

In [89]:
# Fill each missing 15th score with the value obtained from the extra API call.
# The labels, words and current scores are snapshotted into a list first, so
# the loop never reads values it has just written.
for index, first_word, second_word, current_score in list(
    zip(df.index, df['word1'], df['word2'], df['similarity_score_15'])
):
    word_pair = (first_word, second_word)
    # Only NaN cells with a replacement available are touched; the first
    # element of the replacement list is the scalar score
    if pd.isna(current_score) and word_pair in missing_word_pairs_dict:
        df.at[index, 'similarity_score_15'] = missing_word_pairs_dict[word_pair][0]

# Print the rows (if any) that still lack a value in 'similarity_score_15'
print(df[df['similarity_score_15'].isna()])

Empty DataFrame
Columns: [word1, word2, similarity_score_1, similarity_score_2, similarity_score_3, similarity_score_4, similarity_score_5, similarity_score_6, similarity_score_7, similarity_score_8, similarity_score_9, similarity_score_10, similarity_score_11, similarity_score_12, similarity_score_13, similarity_score_14, similarity_score_15]
Index: []


In [90]:
# Keep the rows whose word pair was part of the extra API call
filtered_df = df[
    pd.Series(
        [pair in missing_word_pair_list for pair in zip(df['word1'], df['word2'])],
        index=df.index,
        dtype=bool,
    )
]

# Narrow the view to the pair and its 15th score
result = filtered_df[['word1', 'word2', 'similarity_score_15']]

# Show the refilled scores
print(result)

             word1      word2  similarity_score_15
269          organ      liver                 4.75
270       strength      might                 1.50
271         phrase       word                 4.50
272           band     parade                 1.25
273        stomach      waist                 2.25
274          cloud      storm                 2.75
275            joy      pride                 2.75
276          noise     rattle                 1.50
277           rain       mist                 2.50
278           beer   beverage                 4.00
279            man      uncle                 2.25
280          apple      juice                 3.50
281   intelligence      logic                 4.00
282  communication   language                 4.75
283           mink        fur                 4.25
284            mob      crowd                 1.75
285          shore      coast                 4.50
286           wire       cord                 4.50
287           bird     turkey  

In [91]:
# Re-check after filling: list any row that still has a missing value
rows_with_null = df.loc[df.isna().any(axis="columns")]
rows_with_null

Unnamed: 0,word1,word2,similarity_score_1,similarity_score_2,similarity_score_3,similarity_score_4,similarity_score_5,similarity_score_6,similarity_score_7,similarity_score_8,similarity_score_9,similarity_score_10,similarity_score_11,similarity_score_12,similarity_score_13,similarity_score_14,similarity_score_15


In [92]:
# Display the DataFrame after the missing scores were filled in
df

Unnamed: 0,word1,word2,similarity_score_1,similarity_score_2,similarity_score_3,similarity_score_4,similarity_score_5,similarity_score_6,similarity_score_7,similarity_score_8,similarity_score_9,similarity_score_10,similarity_score_11,similarity_score_12,similarity_score_13,similarity_score_14,similarity_score_15
0,old,new,0.05,2.0,0.05,0.01,0.19,0.08,1.00,0.08,0.15,0.15,0.50,0.05,0.27,1.13,0.05
1,smart,intelligent,4.50,5.0,0.95,0.95,0.83,1.00,0.90,1.00,1.00,0.87,0.95,0.95,1.00,4.00,0.95
2,hard,difficult,4.50,4.0,0.85,0.80,0.70,0.95,0.90,0.96,0.85,0.78,0.90,0.95,0.82,3.00,0.95
3,happy,cheerful,4.00,4.0,0.85,0.80,0.84,0.95,0.80,0.75,0.70,0.85,0.85,0.80,0.90,3.00,0.85
4,hard,easy,1.00,1.0,0.15,0.03,0.26,0.08,0.10,0.23,0.25,0.23,0.20,0.10,0.37,1.00,0.10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
994,join,acquire,0.40,0.5,0.60,0.33,0.32,0.40,0.53,0.70,0.55,0.50,0.40,0.55,0.50,0.41,0.40
995,send,attend,0.20,0.3,0.20,0.17,0.32,0.20,0.33,0.40,0.35,0.30,0.40,0.25,0.25,0.36,0.00
996,gather,attend,0.40,0.3,0.30,0.17,0.32,0.20,0.42,0.50,0.45,0.40,0.50,0.35,0.50,0.37,0.25
997,absorb,withdraw,0.20,0.1,0.20,0.17,0.16,0.20,0.09,0.10,0.10,0.10,0.25,0.25,0.50,0.33,0.00


In [93]:
# Destination of the processed similarity scores
file_path = '../../../data/prompt/en/gpt-3.5-turbo-0125/f4.csv'

# Never overwrite an earlier export: write only when the path is still free
if os.path.exists(file_path):
    print("File already exists. Dataframe was not saved to prevent overwriting.")
else:
    df.to_csv(file_path, index=False)
    print("File saved successfully.")

File saved successfully.
