### 0. Set-up

In [2]:
# Import libraries and utils
%run '../../data_processing/simulate/utils.ipynb'

In [None]:
### simulation - human like

def generate_word_pairs(n):
    words = [
        "oud", "nieuw", "slim", "intelligent", "hard", "moeilijk", "gelukkig", "vrolijk", 
        "stoer", "snel", "razendsnel", "blij", "kort", "lang", "dom", "stom", "breed", 
        "smal", "slecht", "vreselijk", "makkelijk", "moeilijk", "krijgen", "kopen", 
        "verzamelen", "opslaan", "vervangen", "herstellen", "samenvoegen", "toevoegen", 
        "trouwen", "accepteren", "bezorgen", "bijvoegen", "plaatsen", "ophangen", "gaan", 
        "binnenkomen"
        ]
    return [(random.choice(words), random.choice(words)) for _ in range(n)]

def random_format(word_pair, idx, score):
    formats = [
        f"{idx}. ('{word_pair[0]}', '{word_pair[1]}') - {score}\n   - 'Oud' means 'old', and 'nieuw' means new, which are antonyms, representing opposite concepts.",
        f"{idx}. ('{word_pair[0]}', '{word_pair[1]}'): {score} - 'Slim' translates to smart, and 'intelligent' is directly equivalent to intelligent in English. These words are very closely related in meaning.",
        f"{idx}. ('{word_pair[0]}', '{word_pair[1]}'): {score} - 'Random description.'",
        f"{idx}. {word_pair[0]} and {word_pair[1]} are rated at {score} on a scale from 0 to 10."
    ]
    return random.choice(formats)

def create_simulated_text(word_pairs):
    simulated_text = "Ratings of semantic similarity for each word pair on a scale from 0 to 10, where 0 means no similarity and 10 means perfect similarity:\n\n"
    for idx, word_pair in enumerate(word_pairs, start=1):
        score = round(random.uniform(0, 10), 2)
        simulated_text += random_format(word_pair, idx, score) + "\n"
    return simulated_text

def split_text(text, max_chunk_size=1000):
    chunks = []
    current_chunk = ""
    for line in text.split('\n'):
        if len(current_chunk) + len(line) + 1 > max_chunk_size:
            chunks.append(current_chunk)
            current_chunk = ""
        else:
            current_chunk += line + '\n'
    if current_chunk:
        chunks.append(current_chunk)
    return chunks

def convert_output_to_dataframe(text, iteration):
    pattern = r"(\d+)\. \('(\w+)', '(\w+)'\)\s*[:\-]\s*([0-9\.]+)"
    matches = re.findall(pattern, text)
    df = pd.DataFrame(matches, columns=['idx', 'word1', 'word2', 'similarity_score'])
    df['similarity_score'] = df['similarity_score'].astype(float)
    df.drop(['idx'], axis=1, inplace=True)
    df.columns = ['word1', 'word2', f'similarity_score_{iteration}']
    return df

def process_multiple_iterations(word_pairs, num_iterations=5):
    base_df = None
    
    for i in range(1, num_iterations + 1):
        text = create_simulated_text(word_pairs)
        chunks = split_text(text)
        iteration_data = []
        
        for chunk in chunks:
            df_iteration = convert_output_to_dataframe(chunk, i)
            iteration_data.append(df_iteration)
        
        df_iteration_combined = pd.concat(iteration_data, ignore_index=True).drop_duplicates()
        
        if base_df is None:
            base_df = df_iteration_combined
        else:
            base_df = pd.merge(base_df, df_iteration_combined, on=['word1', 'word2'], how='outer')

    return base_df


### 1. Test with human-like prompts

In [16]:
# Generate unique word pars
word_pairs = generate_word_pairs(n=999)

# Process data
final_df = process_multiple_iterations(word_pairs)

# Show results
print(final_df.info())
final_df

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4810 entries, 0 to 4809
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   word1               4810 non-null   object 
 1   word2               4810 non-null   object 
 2   similarity_score_1  4584 non-null   float64
 3   similarity_score_2  4563 non-null   float64
 4   similarity_score_3  4560 non-null   float64
 5   similarity_score_4  4572 non-null   float64
 6   similarity_score_5  4563 non-null   float64
dtypes: float64(5), object(2)
memory usage: 300.6+ KB
None


Unnamed: 0,word1,word2,similarity_score_1,similarity_score_2,similarity_score_3,similarity_score_4,similarity_score_5
0,toevoegen,accepteren,1.14,7.30,8.58,8.07,
1,vrolijk,ophangen,1.49,9.46,7.28,,1.76
2,dom,oud,0.23,8.26,2.14,4.92,2.17
3,dom,oud,0.23,8.26,2.14,4.92,8.98
4,dom,oud,0.23,8.26,2.14,3.60,2.17
...,...,...,...,...,...,...,...
4805,samenvoegen,vervangen,,,,7.14,1.31
4806,moeilijk,bijvoegen,,,,7.80,
4807,krijgen,stoer,,,,,4.92
4808,vreselijk,smal,,,,,7.12


In [22]:
# Simulate a long unstructured output by repeating a smaller block of text multiple times
example_text = """
Hier is de beoordeling van de semantische gelijkenis voor elk opgegeven woordpaar op een schaal van 0 tot 10, waarbij 0 geen overeenkomst betekent en 10 volledige overeenkomst:

1. ('oud', 'nieuw') - 0.50
   - Ondanks dat ze tegengesteld zijn, zijn ze gerelateerd in het concept van tijd en leeftijd.

2. ('slim', 'intelligent') - 9.50
   - Deze twee woorden zijn bijna synoniemen.

3. ('hard', 'moeilijk') - 7.50
   - "Hard" kan betrekking hebben op moeilijkheid, hoewel het ook andere betekenissen heeft.

4. ('gelukkig', 'vrolijk') - 8.00
   - Beide woorden beschrijven een positieve emotionele staat, met lichte nuances.

5. ('hard', 'stoer') - 4.00
   - Beide kunnen kracht of vastberadenheid impliceren, maar worden in verschillende contexten gebruikt.

6. ('snel', 'razendsnel') - 9.00
   - "Razendsnel" is een intensivering van "snel"; beide beschrijven hoge snelheid.

7. ('gelukkig', 'blij') - 8.50
   - Deze woorden zijn sterk gerelateerd en worden vaak als synoniemen beschouwd.

8. ('kort', 'lang') - 0.50
   - Tegengestelden, maar gerelateerd door het concept van lengte.

9. ('dom', 'stom') - 8.00
   - Beide woorden hebben een vergelijkbare betekenis die een gebrek aan intelligentie of wijsheid uitdrukt.

Deze scores zijn gebaseerd op de context en de relaties tussen de woorden zoals algemeen begrepen in de Nederlandse taal.
""" * 111 # Repeat to simulate a large dataset

# Run the process for 20 iterations
final_df = process_multiple_iterations(example_text)

# Show results
print(final_df.info())
final_df

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9 entries, 0 to 8
Data columns (total 22 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   word1                9 non-null      object 
 1   word2                9 non-null      object 
 2   similarity_score_1   9 non-null      float64
 3   similarity_score_2   9 non-null      float64
 4   similarity_score_3   9 non-null      float64
 5   similarity_score_4   9 non-null      float64
 6   similarity_score_5   9 non-null      float64
 7   similarity_score_6   9 non-null      float64
 8   similarity_score_7   9 non-null      float64
 9   similarity_score_8   9 non-null      float64
 10  similarity_score_9   9 non-null      float64
 11  similarity_score_10  9 non-null      float64
 12  similarity_score_11  9 non-null      float64
 13  similarity_score_12  9 non-null      float64
 14  similarity_score_13  9 non-null      float64
 15  similarity_score_14  9 non-null      float64

Unnamed: 0,word1,word2,similarity_score_1,similarity_score_2,similarity_score_3,similarity_score_4,similarity_score_5,similarity_score_6,similarity_score_7,similarity_score_8,...,similarity_score_11,similarity_score_12,similarity_score_13,similarity_score_14,similarity_score_15,similarity_score_16,similarity_score_17,similarity_score_18,similarity_score_19,similarity_score_20
0,oud,nieuw,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,...,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5
1,kort,lang,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,...,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5
2,hard,stoer,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,...,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0
3,gelukkig,vrolijk,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,...,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0
4,slim,intelligent,9.5,9.5,9.5,9.5,9.5,9.5,9.5,9.5,...,9.5,9.5,9.5,9.5,9.5,9.5,9.5,9.5,9.5,9.5
5,dom,stom,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,...,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0
6,gelukkig,blij,8.5,8.5,8.5,8.5,8.5,8.5,8.5,8.5,...,8.5,8.5,8.5,8.5,8.5,8.5,8.5,8.5,8.5,8.5
7,hard,moeilijk,7.5,7.5,7.5,7.5,7.5,7.5,7.5,7.5,...,7.5,7.5,7.5,7.5,7.5,7.5,7.5,7.5,7.5,7.5
8,snel,razendsnel,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,...,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0


In [14]:
print(final_df)

      word1        word2  similarity_score_1  similarity_score_2  \
0       oud        nieuw                 0.5                 0.5   
1      slim  intelligent                 9.5                 9.5   
2      hard     moeilijk                 7.5                 7.5   
3  gelukkig      vrolijk                 8.0                 8.0   
4      hard        stoer                 4.0                 4.0   
5      snel   razendsnel                 9.0                 9.0   
6  gelukkig         blij                 8.5                 8.5   
7       dom         stom                 8.0                 8.0   
8      kort         lang                 0.5                 0.5   

   similarity_score_3  similarity_score_4  similarity_score_5  \
0                 0.5                 0.5                 0.5   
1                 9.5                 9.5                 9.5   
2                 7.5                 7.5                 7.5   
3                 8.0                 8.0                 8

In [5]:
# Simulate a long unstructured output by repeating a smaller block of text multiple times
example_text = """
To rate the semantic similarity of each Dutch word pair from 0 to 10, where 0 represents no similarity and 10 represents perfect similarity, I'll consider how closely related the meanings of the words are in each pair. Here are the ratings with explanations:

1.  ('oud', 'nieuw') - 0.00: "Oud" means old, and "nieuw" means new, which are antonyms, representing opposite concepts.
2.  ('slim', 'intelligent') - 9.50: "Slim" translates to smart, and "intelligent" is directly equivalent to intelligent in English. These words are very closely related in meaning.
3.  ('hard', 'moeilijk') - 6.00: "Hard" can mean hard or difficult, and "moeilijk" directly translates to difficult. They share a common meaning in one of the senses of "hard."
4.  ('gelukkig', 'vrolijk') - 7.50: "Gelukkig" means happy, and "vrolijk" means cheerful. Both suggest positive emotional states, though "vrolijk" is more about being upbeat and lively.
5.  ('hard', 'stoer') - 2.00: "Hard" can mean hard or tough, and "stoer" means tough or cool. They overlap in the sense of toughness but differ in additional connotations.
6.  ('snel', 'razendsnel') - 9.00: "Snel" means fast, and "razendsnel" translates to very fast. "Razendsnel" is an intensified version of "snel," indicating a high degree of similarity.
7.  ('gelukkig', 'blij') - 8.50: "Gelukkig" means happy, and "blij" also means happy or glad. These words are synonymous but may be used in slightly different contexts.
8.  ('kort', 'lang') - 0.00: "Kort" means short, and "lang" means long, which are antonyms, representing opposite lengths.
9. ('dom', 'stom') - 8.00: Beide woorden hebben een vergelijkbare betekenis die een gebrek aan intelligentie of wijsheid uitdrukt.
These ratings are based on the primary meanings of the words in contemporary Dutch.
""" * 111 # Repeat to simulate a large dataset

# Run the process for 20 iterations
final_df = process_multiple_iterations(example_text)

# Display part of the final dataframe and check number of entries
print(f"\nTotal entries processed: {len(final_df)}")
final_df


Total entries processed: 9


Unnamed: 0,word1,word2,similarity_score_1_x,similarity_score_1_y,similarity_score_1_x.1,similarity_score_1_y.1,similarity_score_1_x.2,similarity_score_1_y.2,similarity_score_1_x.3,similarity_score_1_y.3,...,similarity_score_20_x,similarity_score_20_y,similarity_score_20_x.1,similarity_score_20_y.1,similarity_score_20_x.2,similarity_score_20_y.2,similarity_score_20_x.3,similarity_score_20_y.3,similarity_score_20_x.4,similarity_score_20_y.4
0,oud,nieuw,0.0,,,0.0,,0.0,,,...,,,0.0,,0.0,,,0.0,,
1,slim,intelligent,9.5,,9.5,9.5,,9.5,,9.5,...,,9.5,9.5,,9.5,,9.5,9.5,,
2,hard,moeilijk,6.0,,6.0,,,6.0,,6.0,...,,6.0,,,6.0,,6.0,,,
3,gelukkig,vrolijk,7.5,,7.5,,7.5,7.5,,7.5,...,,7.5,,7.5,7.5,,7.5,,7.5,
4,snel,razendsnel,,9.0,9.0,,9.0,,9.0,9.0,...,9.0,9.0,,9.0,,9.0,9.0,,9.0,
5,gelukkig,blij,,8.5,,,8.5,,8.5,,...,8.5,,,8.5,,8.5,,,8.5,
6,kort,lang,,0.0,,0.0,0.0,,0.0,,...,0.0,,0.0,0.0,,0.0,,0.0,0.0,
7,dom,stom,,8.0,,8.0,8.0,,8.0,,...,8.0,,8.0,8.0,,8.0,,8.0,8.0,
8,hard,stoer,,,2.0,,2.0,,,2.0,...,,2.0,,2.0,,,2.0,,2.0,


In [None]:
# Simulate a long unstructured output by repeating a smaller block of text multiple times
example_text = """
Hier zijn de beoordelingen van semantische gelijkenis voor elk woordpaar:
1. ('oud', 'nieuw'): 1.00
2. ('slim', 'intelligent'): 0.90
3. ('hard', 'moeilijk'): 0.20
4. ('gelukkig', 'vrolijk'): 0.70
5. ('hard', 'stoer'): 0.60
6. ('snel', 'razendsnel'): 1.00
7. ('gelukkig', 'blij'): 0.80
8. ('kort', 'lang'): 0.10
9. ('dom', 'stom'): 8.00
Dit zijn subjectieve beoordelingen op basis van de mate van overeenkomst in betekenis tussen de woordparen
""" * 111  # Repeat to simulate a large dataset

# Run the process for 20 iterations
final_df = process_multiple_iterations(example_text)

# Display part of the final dataframe and check number of entries
print(f"\nTotal entries processed: {len(final_df)}")
final_df