In [4]:
import pandas as pd
import nltk
from nltk.corpus import wordnet, stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
import random

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/yushi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/yushi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/yushi/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /home/yushi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
# Get the list of English stop words
stop_words = set(stopwords.words('english'))

def get_pos_tag(word):
    return nltk.pos_tag([word])[0][1]

def get_synonyms(word, pos_tag):
    synonyms = []
    if word.lower() in stop_words:
        return []  # Return an empty list if the word is a stop word
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            lemma_name = lemma.name()
            if lemma_name.lower() != word.lower() and lemma_name not in stop_words:  # Additional check for case-insensitive match
                synonyms.append(lemma_name)
    return list(set(synonyms))  # Return unique synonyms

def replace_with_synonym(word, synonyms):
    if synonyms:
        return random.choice(synonyms)
    return word

def augment_with_synonyms(text, target_replacements):
    sentences = sent_tokenize(text)
    augmented_sentences = []
    replacements = 0
    words_processed = []

    for sentence in sentences:
        tokenized = word_tokenize(sentence)
        replaceable_tokens = [token for token in tokenized if get_synonyms(token, get_pos_tag(token))]

        words_to_replace = random.sample(replaceable_tokens, min(target_replacements - replacements, len(replaceable_tokens)))

        augmented_tokens = []

        for token in tokenized:
            if replacements >= target_replacements:
                augmented_tokens.append(token)
                continue

            if token in words_to_replace and token not in words_processed and token.lower() not in stop_words:
                pos_tag = get_pos_tag(token)
                synonyms = get_synonyms(token, pos_tag)
                original_token = token
                token = replace_with_synonym(token, synonyms)

                if token != original_token:
                    replacements += 1
                    words_processed.append(original_token)

            augmented_tokens.append(token)

        augmented_sentence = ' '.join(augmented_tokens)
        augmented_sentences.append(augmented_sentence)

        if replacements >= target_replacements:
            break

    augmented_text = ' '.join(augmented_sentences)
    return augmented_text



# Example usage
original_text = "Within what time in the period of infectivity will an immunocompetent child display chickenpox signs after contacting an infectious patient (with chickenpox)?"
target_replacements = 5  # Maximum number of words to be replaced

print("Original Text:", original_text)
augmented_text = augment_with_synonyms(original_text, target_replacements=target_replacements)
print(f"Augmented Text:", augmented_text)

Original Text: Within what time in the period of infectivity will an immunocompetent child display chickenpox signs after contacting an infectious patient (with chickenpox)?
Augmented Text: Within what time in the period of infectivity will an immunocompetent nestling display varicella subscribe after contact an infectious patient_role ( with chickenpox ) ?


In [3]:
# Load the dataset
df = pd.read_csv('/code/llm-fine-tuning/lek_training_sets/lek_augmented_50_three_quarters.csv')

# Apply the augmentation function to 'augmented' rows and remove the original 'augmented' rows
df.loc[df['Data Type'] == 'augmented', 'Prompt'] = df[df['Data Type'] == 'augmented']['Prompt'].apply(lambda x: augment_with_synonyms(x, 5))

# Save the modified dataframe
output_file_path = '/code/llm-fine-tuning/lek_augmented_50_syn_replaced_three_quarters.csv'
df.to_csv(output_file_path, index=False)

output_file_path

'/code/llm-fine-tuning/lek_augmented_150_syn_replaced_three_quarters.csv'

In [4]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize

# Make sure to download the 'punkt' package with the following line:
# nltk.download('punkt')

# Load your dataset
df = pd.read_csv('/code/llm-fine-tuning/lek_training_sets/lek_full_random.csv')

# Calculate the average number of words in the 'Prompt' column
df['word_count'] = df['Prompt'].apply(lambda x: len(word_tokenize(x)))
average_word_count = df['word_count'].mean()

print("Average number of words in 'Prompt':", average_word_count)


Average number of words in 'Prompt': 31.834096109839816


In [3]:
import pandas as pd

# Load your dataset
df = pd.read_csv('/code/llm-fine-tuning/lek_training_sets/lek_three_quarters_random.csv')

# Define a function to modify the 'Answer' column
def modify_answer(row):
    answer_letter = row['Answer']
    answer_text = row[answer_letter]
    return f"{answer_letter}) {answer_text}"

# Apply the function to each row
df['Answer'] = df.apply(modify_answer, axis=1)

# Save the modified dataset to a new file
df.to_csv('/code/llm-fine-tuning/lek_training_sets/lek_three_quarters_random_long_answer.csv', index=False)
