### **0. Set-up**

In [1]:
# Import libraries and utils
%run '../../utils.ipynb'

In [2]:
# Read variables from a local .env file (if any) and pick up the API key
load_dotenv()
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')

# Build the client with the key loaded above
# (when the key is None the client falls back to the OPENAI_API_KEY environment variable)
client = OpenAI(api_key=OPENAI_API_KEY)

In [3]:
# Read the cleaned word-pair file and skip its first three rows in one step
# (presumably the three pairs reused as the few-shot example in the prompt — verify against the CSV)
en_simlex = pd.read_csv("../../../data/dataset/cleaned-en-simlex-999.csv").iloc[3:]

# Optional: work on a small subset for quick trial runs
# en_simlex = en_simlex.head(70)

# Pair the two word columns into a list of (word1, word2) tuples
tuples_list = [
    (first_word, second_word)
    for first_word, second_word in zip(en_simlex['word1'], en_simlex['word2'])
]

In [4]:
# Show results: the (word1, word2) tuples built from the en_simlex columns
tuples_list

[('happy', 'cheerful'),
 ('hard', 'easy'),
 ('fast', 'rapid'),
 ('happy', 'glad'),
 ('short', 'long'),
 ('stupid', 'dumb'),
 ('weird', 'strange'),
 ('wide', 'narrow'),
 ('bad', 'awful'),
 ('easy', 'difficult'),
 ('bad', 'terrible'),
 ('hard', 'simple'),
 ('smart', 'dumb'),
 ('insane', 'crazy'),
 ('happy', 'mad'),
 ('large', 'huge'),
 ('hard', 'tough'),
 ('new', 'fresh'),
 ('sharp', 'dull'),
 ('quick', 'rapid'),
 ('dumb', 'foolish'),
 ('wonderful', 'terrific'),
 ('strange', 'odd'),
 ('happy', 'angry'),
 ('narrow', 'broad'),
 ('simple', 'easy'),
 ('old', 'fresh'),
 ('apparent', 'obvious'),
 ('inexpensive', 'cheap'),
 ('nice', 'generous'),
 ('weird', 'normal'),
 ('weird', 'odd'),
 ('bad', 'immoral'),
 ('sad', 'funny'),
 ('wonderful', 'great'),
 ('guilty', 'ashamed'),
 ('beautiful', 'wonderful'),
 ('confident', 'sure'),
 ('dumb', 'dense'),
 ('large', 'big'),
 ('nice', 'cruel'),
 ('impatient', 'anxious'),
 ('big', 'broad'),
 ('strong', 'proud'),
 ('unnecessary', 'necessary'),
 ('restless', 

### **1. Define and Evaluate Parameters**

In [5]:
# Define prompt
# System instruction that describes the rating task
prompt = "Rate the semantic similarity of each word pair on a scale from 0 to 10, where 0 represents no semantic similarity, and 10 represents perfect semantic similarity."

# Few-shot example: a sample user message and the assistant reply expected for it.
# The closing quote of 'difficult' now sits before the parenthesis, so the example
# is a well-formed list of tuples like the assistant reply below.
user_content = "[('old', 'new'), ('smart', 'intelligent'), ('hard', 'difficult')]"
assistant_content = "[('old', 'new', 1.58), ('smart', 'intelligent', 9.20), ('hard', 'difficult', 8.77)]"

In [6]:
# Model used for the requests
model = "gpt-3.5-turbo-0125"

# Number of sublists the word pairs are split into
n_sublists = 20

# Number of ratings expected per word pair
sample_size = 15

# Pause between individual API calls (presumably in seconds — confirm in the helper)
delay = 3.0

In [7]:
# Split list of word pairs into n_sublists chunks
# (split_into_n_lists is presumably defined in utils.ipynb — its exact splitting rule is not visible here)
chunks = split_into_n_lists(tuples_list, n_sublists)

# Count number of lists (the recorded output is 20, matching n_sublists)
print(len(chunks))

20


In [8]:
# Print prompts for each chunk
# (the recorded output shows the instruction text followed by the chunk's word pairs)
print_prompts(chunks, prompt)

Rate the semantic similarity of each word pair on a scale from 0 to 10, where 0 represents no semantic similarity, and 10 represents perfect semantic similarity. --- ["('happy', 'cheerful'), ('hard', 'easy'), ('fast', 'rapid'), ('happy', 'glad'), ('short', 'long'), ('stupid', 'dumb'), ('weird', 'strange'), ('wide', 'narrow'), ('bad', 'awful'), ('easy', 'difficult'), ('bad', 'terrible'), ('hard', 'simple'), ('smart', 'dumb'), ('insane', 'crazy'), ('happy', 'mad'), ('large', 'huge'), ('hard', 'tough'), ('new', 'fresh'), ('sharp', 'dull'), ('quick', 'rapid'), ('dumb', 'foolish'), ('wonderful', 'terrific'), ('strange', 'odd'), ('happy', 'angry'), ('narrow', 'broad'), ('simple', 'easy'), ('old', 'fresh'), ('apparent', 'obvious'), ('inexpensive', 'cheap'), ('nice', 'generous'), ('weird', 'normal'), ('weird', 'odd'), ('bad', 'immoral'), ('sad', 'funny'), ('wonderful', 'great'), ('guilty', 'ashamed'), ('beautiful', 'wonderful'), ('confident', 'sure'), ('dumb', 'dense'), ('large', 'big'), ('nic

In [9]:
# Load encoding
# NOTE(review): `encoding` is not passed to the helper below; presumably
# count_tokens_with_tiktoken reads it as a global — confirm in utils.ipynb.
encoding = tiktoken.get_encoding("cl100k_base")

# Count tokens per chunk (one count per formatted prompt, see the recorded output)
token_counts = count_tokens_with_tiktoken(chunks, prompt)

# Show results
print("Token counts for each formatted prompt:", token_counts)

Token counts for each formatted prompt: [391, 403, 380, 380, 372, 376, 387, 392, 394, 379, 377, 387, 390, 375, 380, 379, 363, 378, 383, 360]


### **2. Extract and Process Data**

In [11]:
# Get results from API
# The full run is expensive (the recorded run below took about two hours), so the
# saved responses are reused when they exist and the API is only called otherwise.
# Either way `response` is defined, which the following cells require on a fresh kernel.
saved_response_path = '../../../data/response/en/gpt-3.5-turbo-0125/f8.json'

if os.path.exists(saved_response_path):
    # Restore the responses written by an earlier run
    # NOTE(review): assumes the JSON round-trip preserves the structure process_responses expects — confirm
    with open(saved_response_path) as f:
        response = json.load(f)
else:
    response = get_responses_conversational(chunks, prompt, user_content, assistant_content, model, sample_size, delay)

Processing: 100%|██████████| 300/300 [2:09:29<00:00, 25.90s/chunk]  

Total time taken: 7769.52 seconds





In [12]:
# Target location for the raw API responses
file_path = '../../../data/response/en/gpt-3.5-turbo-0125/f8.json'

# Only write when nothing is stored at the path yet, so earlier results are never overwritten
if os.path.exists(file_path):
    print("File already exists. JSON was not saved to prevent overwriting.")
else:
    with open(file_path, 'w') as f:
        json.dump(response, f)
        print("File saved successfully.")

File saved successfully.


In [22]:
# Parse the raw responses into a dictionary keyed by word pair
data_dict = process_responses(response)

# Collect the word pairs whose number of ratings differs from the sample size
higher_lower_samples = {
    pair: scores
    for pair, scores in data_dict.items()
    if len(scores) != sample_size
}

# Show results
print(higher_lower_samples)

{('bad', 'guilty'): [3.0, 2.55, 3.0, 2.0, 4.0, 2.76, 2.1, 2.0, 0.58, 1.0, 0.0, 0.93, 3.21], ('leg', 'arm'): [8.33, 4.88, 7.9, 4.1, 0.71, 8.96, 8.66, 6.18, 7.07, 8.0, 8.74, 5.0, 0.03, 6.71], ('plane', 'jet'): [6.13, 3.63, 5.18, 2.2, 0.58, 5.47, 5.82, 4.07, 5.19, 6.0, 5.61, 6.5, 0.07, 4.0], ('woman', 'man'): [9.73, 6.97, 9.52, 7.88, 0.77, 9.8, 8.77, 7.38, 8.87, 9.0, 9.74, 9.0, 0.08, 7.93], ('horse', 'colt'): [7.95, 6.45, 6.4, 3.7, 0.69, 6.63, 7.73, 5.62, 6.79, 2.0, 6.17, 7.0, 0.05, 3.21], ('actress', 'actor'): [9.53, 8.9, 9.45, 7.88, 0.82, 9.6, 9.68, 8.47, 8.62, 9.0, 9.25, 9.0, 0.08, 7.67], ('teacher', 'instructor'): [5.73, 7.38, 7.53, 6.25, 0.79, 8.53, 9.68, 8.72, 7.31, 9.0, 9.61, 7.0, 0.05, 6.0], ('movie', 'film'): [8.05, 8.03, 8.73, 7.5, 0.78, 9.47, 9.2, 6.46, 8.07, 8.0, 9.69, 5.0, 0.1, 6.79], ('sheep', 'lamb'): [8.22, 5.95, 7.6, 6.0, 0.82, 9.1, 7.16, 6.19, 6.56, 2.0, 7.74, 8.0, 0.05, 3.43], ('lady', 'gentleman'): [8.9, 7.22, 8.8, 7.75, 0.79, 9.57, 9.2, 7.27, 7.94, 9.0, 0.79, 9.0, 0.1

In [23]:
# Print duplicate word pairs
# (helper presumably defined in utils.ipynb; the recorded output shows two empty DataFrames)
print_duplicate_word_pairs(en_simlex, data_dict)

Empty DataFrame
Columns: [Combined_Columns]
Index: []
Empty DataFrame
Columns: [Combined_Columns]
Index: []


In [24]:
# Convert dict to Pandas DataFrame
# (the recorded output shows one row per word pair with word1, word2 and similarity_score_1..15 columns)
df = create_dataframe(data_dict)

# Show results
df

Unnamed: 0,word1,word2,similarity_score_1,similarity_score_2,similarity_score_3,similarity_score_4,similarity_score_5,similarity_score_6,similarity_score_7,similarity_score_8,similarity_score_9,similarity_score_10,similarity_score_11,similarity_score_12,similarity_score_13,similarity_score_14,similarity_score_15
0,happy,cheerful,8.75,9.20,7.76,5.26,9.50,9.60,8.66,8.50,9.15,8.57,8.90,8.20,9.80,8.90,8.89
1,hard,easy,4.10,1.73,0.81,3.00,1.67,3.50,4.52,1.14,1.98,2.14,5.65,3.20,6.00,2.33,5.00
2,fast,rapid,9.50,9.60,9.34,9.75,9.80,8.50,9.68,9.21,8.90,9.29,9.50,9.80,9.70,8.20,9.17
3,happy,glad,8.50,8.60,8.47,7.50,9.40,8.80,7.90,8.09,8.00,8.57,7.80,7.60,9.70,8.50,8.89
4,short,long,7.25,7.50,0.71,5.50,9.60,5.50,6.42,7.80,7.34,8.57,8.20,8.80,9.20,7.50,6.33
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
991,join,acquire,7.35,6.60,4.00,3.00,5.12,0.25,6.27,4.38,6.23,6.12,3.93,4.30,3.44,4.00,4.00
992,send,attend,5.30,6.60,2.40,1.00,3.44,0.25,4.44,3.33,4.28,4.65,4.44,4.35,2.09,2.17,3.00
993,gather,attend,6.97,6.00,3.00,1.00,4.20,0.25,5.36,3.33,4.56,4.94,4.21,3.90,2.53,3.52,2.50
994,absorb,withdraw,3.41,3.20,1.80,1.00,2.60,0.25,3.37,2.08,2.67,3.12,2.19,3.40,1.36,5.75,3.00


In [25]:
# Tally the missing values in every column
count_null_values = df.isna().sum()

# Show results
print("Null value counts per column:", count_null_values)

Null value counts per column: word1                   0
word2                   0
similarity_score_1      0
similarity_score_2      0
similarity_score_3      0
similarity_score_4      0
similarity_score_5      0
similarity_score_6      0
similarity_score_7      0
similarity_score_8      0
similarity_score_9      0
similarity_score_10     0
similarity_score_11     0
similarity_score_12     0
similarity_score_13     0
similarity_score_14     4
similarity_score_15    94
dtype: int64


In [26]:
# Keep only the rows in which any column is missing
rows_with_null = df[df.isna().any(axis='columns')]

# Show results
rows_with_null

Unnamed: 0,word1,word2,similarity_score_1,similarity_score_2,similarity_score_3,similarity_score_4,similarity_score_5,similarity_score_6,similarity_score_7,similarity_score_8,similarity_score_9,similarity_score_10,similarity_score_11,similarity_score_12,similarity_score_13,similarity_score_14,similarity_score_15
99,bad,guilty,3.00,2.55,3.00,2.00,4.00,2.76,2.10,2.00,0.58,1.00,0.00,0.93,3.21,,
120,leg,arm,8.33,4.88,7.90,4.10,0.71,8.96,8.66,6.18,7.07,8.00,8.74,5.00,0.03,6.71,
121,plane,jet,6.13,3.63,5.18,2.20,0.58,5.47,5.82,4.07,5.19,6.00,5.61,6.50,0.07,4.00,
122,woman,man,9.73,6.97,9.52,7.88,0.77,9.80,8.77,7.38,8.87,9.00,9.74,9.00,0.08,7.93,
123,horse,colt,7.95,6.45,6.40,3.70,0.69,6.63,7.73,5.62,6.79,2.00,6.17,7.00,0.05,3.21,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
647,bone,teeth,7.50,5.11,2.05,4.68,4.24,4.95,3.88,2.88,4.90,5.44,4.11,0.50,2.64,3.90,
648,bone,elbow,4.00,1.67,1.90,2.47,1.76,2.33,0.42,1.78,2.25,2.61,2.22,0.20,2.31,1.60,
649,bacon,bean,2.78,1.36,1.85,3.82,2.21,4.02,0.49,1.43,4.47,3.10,3.11,0.25,0.50,1.80,
713,winner,goal,3.00,3.81,6.91,4.50,5.30,5.60,5.00,7.00,5.30,5.28,5.35,4.00,7.66,7.65,


In [27]:
# List the word pairs that have no 14th rating
df.loc[df['similarity_score_14'].isna(), ['word1', 'word2']]

Unnamed: 0,word1,word2
99,bad,guilty
272,joy,pride
621,book,article
642,box,cigar


In [29]:
# System instruction
prompt = "Rate the semantic similarity of each word pair on a scale from 0 to 10, where 0 represents no semantic similarity, and 10 represents perfect semantic similarity."

# Few-shot example turn: a user request and the assistant reply expected for it
user_content_1 = "[(old, new), (smart, intelligent), (hard, difficult)]"
assistant_content = "[(old, new, 1.58), (smart, intelligent, 9.20), (hard, difficult, 8.77)]"

# Actual request: the four word pairs listed above as missing their 14th rating
user_content_2 = "[('bad', 'guilty'), ('joy', 'pride'), ('book', 'article'), ('box', 'cigar')]"

# Assemble the conversation in order: system, example user/assistant turn, real request
messages = [
    {"role": role, "content": content}
    for role, content in (
        ("system", prompt),
        ("user", user_content_1),
        ("assistant", assistant_content),
        ("user", user_content_2),
    )
]

# Call the API
# completion = client.chat.completions.create(
#     model=model,
#     messages=messages,
#     n=1,
#     stop=None)

# Show results
# print(completion.choices[0].message.content)

In [30]:
# Fill in the missing 14th rating for ('bad', 'guilty') by hand
# (value presumably taken from the follow-up API request — confirm)
pair_mask = (df['word1'] == 'bad') & (df['word2'] == 'guilty')
df.loc[pair_mask, 'similarity_score_14'] = 2.60

# Show results
df.loc[pair_mask]

Unnamed: 0,word1,word2,similarity_score_1,similarity_score_2,similarity_score_3,similarity_score_4,similarity_score_5,similarity_score_6,similarity_score_7,similarity_score_8,similarity_score_9,similarity_score_10,similarity_score_11,similarity_score_12,similarity_score_13,similarity_score_14,similarity_score_15
99,bad,guilty,3.0,2.55,3.0,2.0,4.0,2.76,2.1,2.0,0.58,1.0,0.0,0.93,3.21,2.6,


In [31]:
# Fill in the missing 14th rating for ('joy', 'pride') by hand
# (value presumably taken from the follow-up API request — confirm)
pair_mask = (df['word1'] == 'joy') & (df['word2'] == 'pride')
df.loc[pair_mask, 'similarity_score_14'] = 6.50

# Show results
df.loc[pair_mask]

Unnamed: 0,word1,word2,similarity_score_1,similarity_score_2,similarity_score_3,similarity_score_4,similarity_score_5,similarity_score_6,similarity_score_7,similarity_score_8,similarity_score_9,similarity_score_10,similarity_score_11,similarity_score_12,similarity_score_13,similarity_score_14,similarity_score_15
272,joy,pride,7.5,4.5,6.6,4.0,6.88,8.0,7.0,8.75,8.34,5.95,8.1,6.52,6.0,6.5,


In [32]:
# Fill in the missing 14th rating for ('book', 'article') by hand
# (value presumably taken from the follow-up API request — confirm)
pair_mask = (df['word1'] == 'book') & (df['word2'] == 'article')
df.loc[pair_mask, 'similarity_score_14'] = 2.50

# Show results
df.loc[pair_mask]

Unnamed: 0,word1,word2,similarity_score_1,similarity_score_2,similarity_score_3,similarity_score_4,similarity_score_5,similarity_score_6,similarity_score_7,similarity_score_8,similarity_score_9,similarity_score_10,similarity_score_11,similarity_score_12,similarity_score_13,similarity_score_14,similarity_score_15
621,book,article,2.5,1.67,1.7,6.73,1.13,0.14,1.94,1.82,3.75,3.89,0.15,0.41,2.7,2.5,


In [33]:
# Fill in the missing 14th rating for ('box', 'cigar') by hand
# (value presumably taken from the follow-up API request — confirm)
pair_mask = (df['word1'] == 'box') & (df['word2'] == 'cigar')
df.loc[pair_mask, 'similarity_score_14'] = 2.00

# Show results
df.loc[pair_mask]

Unnamed: 0,word1,word2,similarity_score_1,similarity_score_2,similarity_score_3,similarity_score_4,similarity_score_5,similarity_score_6,similarity_score_7,similarity_score_8,similarity_score_9,similarity_score_10,similarity_score_11,similarity_score_12,similarity_score_13,similarity_score_14,similarity_score_15
642,box,cigar,3.33,0.0,2.0,2.38,1.13,4.39,0.12,1.33,1.88,2.75,1.44,0.05,1.9,2.0,


In [34]:
# Re-select the rows that still have a missing value after the manual fixes
has_missing_value = df.isna().any(axis='columns')
rows_with_null = df[has_missing_value]

# Show results
rows_with_null

Unnamed: 0,word1,word2,similarity_score_1,similarity_score_2,similarity_score_3,similarity_score_4,similarity_score_5,similarity_score_6,similarity_score_7,similarity_score_8,similarity_score_9,similarity_score_10,similarity_score_11,similarity_score_12,similarity_score_13,similarity_score_14,similarity_score_15
99,bad,guilty,3.00,2.55,3.00,2.00,4.00,2.76,2.10,2.00,0.58,1.00,0.00,0.93,3.21,2.60,
120,leg,arm,8.33,4.88,7.90,4.10,0.71,8.96,8.66,6.18,7.07,8.00,8.74,5.00,0.03,6.71,
121,plane,jet,6.13,3.63,5.18,2.20,0.58,5.47,5.82,4.07,5.19,6.00,5.61,6.50,0.07,4.00,
122,woman,man,9.73,6.97,9.52,7.88,0.77,9.80,8.77,7.38,8.87,9.00,9.74,9.00,0.08,7.93,
123,horse,colt,7.95,6.45,6.40,3.70,0.69,6.63,7.73,5.62,6.79,2.00,6.17,7.00,0.05,3.21,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
647,bone,teeth,7.50,5.11,2.05,4.68,4.24,4.95,3.88,2.88,4.90,5.44,4.11,0.50,2.64,3.90,
648,bone,elbow,4.00,1.67,1.90,2.47,1.76,2.33,0.42,1.78,2.25,2.61,2.22,0.20,2.31,1.60,
649,bacon,bean,2.78,1.36,1.85,3.82,2.21,4.02,0.49,1.43,4.47,3.10,3.11,0.25,0.50,1.80,
713,winner,goal,3.00,3.81,6.91,4.50,5.30,5.60,5.00,7.00,5.30,5.28,5.35,4.00,7.66,7.65,


In [35]:
# Recount the missing values per column after the manual fixes
count_null_values = df.isna().sum()

# Show results
print("Null value counts per column:", count_null_values)

Null value counts per column: word1                   0
word2                   0
similarity_score_1      0
similarity_score_2      0
similarity_score_3      0
similarity_score_4      0
similarity_score_5      0
similarity_score_6      0
similarity_score_7      0
similarity_score_8      0
similarity_score_9      0
similarity_score_10     0
similarity_score_11     0
similarity_score_12     0
similarity_score_13     0
similarity_score_14     0
similarity_score_15    94
dtype: int64


In [36]:
# Collect the (word1, word2) tuples of every row that still has a missing value
missing_word_pair_list = list(
    rows_with_null[['word1', 'word2']].itertuples(index=False, name=None)
)

# Show results
missing_word_pair_list

[('bad', 'guilty'),
 ('leg', 'arm'),
 ('plane', 'jet'),
 ('woman', 'man'),
 ('horse', 'colt'),
 ('actress', 'actor'),
 ('teacher', 'instructor'),
 ('movie', 'film'),
 ('sheep', 'lamb'),
 ('lady', 'gentleman'),
 ('stomach', 'waist'),
 ('cloud', 'storm'),
 ('joy', 'pride'),
 ('noise', 'rattle'),
 ('rain', 'mist'),
 ('beer', 'beverage'),
 ('man', 'uncle'),
 ('apple', 'juice'),
 ('intelligence', 'logic'),
 ('communication', 'language'),
 ('mink', 'fur'),
 ('mob', 'crowd'),
 ('shore', 'coast'),
 ('wire', 'cord'),
 ('bird', 'turkey'),
 ('bed', 'crib'),
 ('competence', 'ability'),
 ('cloud', 'haze'),
 ('supper', 'meal'),
 ('bar', 'cage'),
 ('water', 'salt'),
 ('sense', 'intuition'),
 ('situation', 'condition'),
 ('crime', 'theft'),
 ('style', 'fashion'),
 ('boundary', 'border'),
 ('arm', 'body'),
 ('boat', 'car'),
 ('sandwich', 'lunch'),
 ('bride', 'princess'),
 ('doctor', 'professor'),
 ('arm', 'vein'),
 ('adult', 'guardian'),
 ('newspaper', 'information'),
 ('communication', 'television'),


In [23]:
# System instruction
prompt = "Rate the semantic similarity of each word pair on a scale from 0 to 10, where 0 represents no semantic similarity, and 10 represents perfect semantic similarity."

# Few-shot example turn: a user request and the assistant reply expected for it
user_content_1 = "[(old, new), (smart, intelligent), (hard, difficult)]"
assistant_content = "[(old, new, 1.58), (smart, intelligent, 9.20), (hard, difficult, 8.77)]"

# Actual request: the list of missing word pairs, rendered as text and wrapped in double quotes
user_content_2 = '"' + str(missing_word_pair_list) + '"'

In [27]:
# Define message
# Conversation order: system instruction, one example user/assistant turn, then the real request
messages = [
    {"role": "system", "content": prompt},
    {"role": "user", "content": user_content_1},
    {"role": "assistant", "content": assistant_content},
    {"role": "user", "content": user_content_2},
    ]

# Make API call
# The call has to run: `completion` is read just below, and with the request
# commented out this cell raises a NameError on a fresh kernel.
completion = client.chat.completions.create(
    model=model,
    messages=messages,
    n=1,
    stop=None)

# Store response content
# Wrapped in a one-element list before it is handed to process_responses in the next cell
missing_word_pairs_response = [completion.choices[0].message.content]

In [28]:
# Extract data with regular expressions into dictionary
# (recorded output: each word pair maps to a one-element list holding the new rating)
missing_word_pairs_dict = process_responses(missing_word_pairs_response)

# Show results
missing_word_pairs_dict

{('bad', 'guilty'): [2.83],
 ('leg', 'arm'): [7.9],
 ('plane', 'jet'): [7.6],
 ('woman', 'man'): [8.77],
 ('horse', 'colt'): [7.0],
 ('actress', 'actor'): [9.3],
 ('teacher', 'instructor'): [8.0],
 ('movie', 'film'): [9.86],
 ('sheep', 'lamb'): [6.5],
 ('lady', 'gentleman'): [8.7],
 ('stomach', 'waist'): [7.7],
 ('cloud', 'storm'): [7.69],
 ('joy', 'pride'): [6.1],
 ('noise', 'rattle'): [4.4],
 ('rain', 'mist'): [6.5],
 ('beer', 'beverage'): [4.3],
 ('man', 'uncle'): [6.2],
 ('apple', 'juice'): [7.8],
 ('intelligence', 'logic'): [7.8],
 ('communication', 'language'): [7.3],
 ('mink', 'fur'): [4.6],
 ('mob', 'crowd'): [7.1],
 ('shore', 'coast'): [7.8],
 ('wire', 'cord'): [7.0],
 ('bird', 'turkey'): [5.4],
 ('bed', 'crib'): [7.9],
 ('competence', 'ability'): [7.8],
 ('cloud', 'haze'): [5.7],
 ('supper', 'meal'): [8.3],
 ('bar', 'cage'): [4.3],
 ('water', 'salt'): [4.4],
 ('sense', 'intuition'): [8.1],
 ('situation', 'condition'): [7.9],
 ('crime', 'theft'): [6.2],
 ('style', 'fashion'): 

In [29]:
# Replace the missing 15th ratings with the values from the follow-up response.
# Done with an index-aligned fill instead of an iterrows() loop that writes back
# into the frame it is iterating over.

# Rows that still lack a 15th rating
needs_fill = df['similarity_score_15'].isna()

# For each of those rows take the first rating returned for its (word1, word2) pair;
# pairs that are absent from the follow-up response stay NaN
new_scores = [
    missing_word_pairs_dict[pair][0] if pair in missing_word_pairs_dict else float('nan')
    for pair in zip(df.loc[needs_fill, 'word1'], df.loc[needs_fill, 'word2'])
]
replacement_scores = pd.Series(new_scores, index=df.index[needs_fill], dtype=float)

# fillna aligns on the index and only touches cells that are currently NaN
df['similarity_score_15'] = df['similarity_score_15'].fillna(replacement_scores)

# Check if any NaN values left
print(df[df['similarity_score_15'].isna()])

Empty DataFrame
Columns: [word1, word2, similarity_score_1, similarity_score_2, similarity_score_3, similarity_score_4, similarity_score_5, similarity_score_6, similarity_score_7, similarity_score_8, similarity_score_9, similarity_score_10, similarity_score_11, similarity_score_12, similarity_score_13, similarity_score_14, similarity_score_15]
Index: []


In [30]:
# Flag the rows whose (word1, word2) pair was part of the follow-up request
in_follow_up = [
    pair in missing_word_pair_list
    for pair in zip(df['word1'], df['word2'])
]
filtered_df = df[in_follow_up]

# Keep the pair columns together with the 15th rating
result = filtered_df[['word1', 'word2', 'similarity_score_15']]

# Show results
print(result)

      word1   word2  similarity_score_15
99      bad  guilty                 2.83
120     leg     arm                 7.90
121   plane     jet                 7.60
122   woman     man                 8.77
123   horse    colt                 7.00
..      ...     ...                  ...
647    bone   teeth                 7.10
648    bone   elbow                 5.30
649   bacon    bean                 4.50
713  winner    goal                 5.90
845      go    send                 6.70

[94 rows x 3 columns]


In [31]:
# Final check: select any row that still has a missing value
rows_with_null = df.loc[df.isna().any(axis='columns')]

# Show results
rows_with_null

Unnamed: 0,word1,word2,similarity_score_1,similarity_score_2,similarity_score_3,similarity_score_4,similarity_score_5,similarity_score_6,similarity_score_7,similarity_score_8,similarity_score_9,similarity_score_10,similarity_score_11,similarity_score_12,similarity_score_13,similarity_score_14,similarity_score_15


In [32]:
# Final tally of missing values per column
count_null_values = df.isna().sum()

# Show results
print("Null value counts per column:", count_null_values)

Null value counts per column: word1                  0
word2                  0
similarity_score_1     0
similarity_score_2     0
similarity_score_3     0
similarity_score_4     0
similarity_score_5     0
similarity_score_6     0
similarity_score_7     0
similarity_score_8     0
similarity_score_9     0
similarity_score_10    0
similarity_score_11    0
similarity_score_12    0
similarity_score_13    0
similarity_score_14    0
similarity_score_15    0
dtype: int64


In [33]:
# Show results: the completed DataFrame after all missing ratings were filled in
df

Unnamed: 0,word1,word2,similarity_score_1,similarity_score_2,similarity_score_3,similarity_score_4,similarity_score_5,similarity_score_6,similarity_score_7,similarity_score_8,similarity_score_9,similarity_score_10,similarity_score_11,similarity_score_12,similarity_score_13,similarity_score_14,similarity_score_15
0,happy,cheerful,8.75,9.20,7.76,5.26,9.50,9.60,8.66,8.50,9.15,8.57,8.90,8.20,9.80,8.90,8.89
1,hard,easy,4.10,1.73,0.81,3.00,1.67,3.50,4.52,1.14,1.98,2.14,5.65,3.20,6.00,2.33,5.00
2,fast,rapid,9.50,9.60,9.34,9.75,9.80,8.50,9.68,9.21,8.90,9.29,9.50,9.80,9.70,8.20,9.17
3,happy,glad,8.50,8.60,8.47,7.50,9.40,8.80,7.90,8.09,8.00,8.57,7.80,7.60,9.70,8.50,8.89
4,short,long,7.25,7.50,0.71,5.50,9.60,5.50,6.42,7.80,7.34,8.57,8.20,8.80,9.20,7.50,6.33
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
991,join,acquire,7.35,6.60,4.00,3.00,5.12,0.25,6.27,4.38,6.23,6.12,3.93,4.30,3.44,4.00,4.00
992,send,attend,5.30,6.60,2.40,1.00,3.44,0.25,4.44,3.33,4.28,4.65,4.44,4.35,2.09,2.17,3.00
993,gather,attend,6.97,6.00,3.00,1.00,4.20,0.25,5.36,3.33,4.56,4.94,4.21,3.90,2.53,3.52,2.50
994,absorb,withdraw,3.41,3.20,1.80,1.00,2.60,0.25,3.37,2.08,2.67,3.12,2.19,3.40,1.36,5.75,3.00


In [35]:
# Target location for the processed ratings
file_path = '../../../data/gpt-3.5-turbo-0125/processed/en/f8.csv'

# Only write when nothing is stored at the path yet, so earlier results are never overwritten
if os.path.exists(file_path):
    print("File already exists. Dataframe was not saved to prevent overwriting.")
else:
    df.to_csv(file_path, index=False)
    print("File saved successfully.")

File saved successfully.
