In [1]:
%pip install nltk



In [6]:
import pandas as pd
import nltk
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
import random

# Download necessary NLTK data
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
# get_synonyms queries WordNet with lang='fra'; non-English lemmas come from
# the Open Multilingual Wordnet, a separate data package from 'wordnet'.
nltk.download('omw-1.4')
nltk.download('punkt')

def get_synonyms(word):
    """Return the French WordNet synonyms of ``word``.

    Every synset matching ``word`` (looked up with ``lang='fra'``) contributes
    its French lemma names. Compared with the raw lemma names:

    * the queried word itself is dropped (case-insensitively), so the result
      only holds genuine alternatives;
    * underscores, which WordNet uses to join multi-word lemmas, are turned
      into spaces so a synonym can be inserted into running text;
    * the list is sorted, so its order does not depend on set iteration order.

    Args:
        word: The word to look up.

    Returns:
        A sorted list of distinct synonym strings; empty when nothing other
        than ``word`` itself is found.
    """
    target = word.replace('_', ' ').lower()
    synonyms = set()
    for syn in wordnet.synsets(word, lang='fra'):
        for lemma in syn.lemmas(lang='fra'):
            candidate = lemma.name().replace('_', ' ')
            # Skip the word itself: replacing a word by itself is a no-op.
            if candidate.lower() != target:
                synonyms.add(candidate)
    return sorted(synonyms)

def synonym_replacement(sentence, n=2, rng=None):
    """Replace up to ``n`` distinct words of ``sentence`` with a synonym.

    The sentence is tokenized with ``word_tokenize``; purely alphabetic tokens
    are the replacement candidates. Candidates are visited in random order and
    each one that has at least one synonym different from itself is swapped,
    at every position where it occurs, for one randomly chosen synonym.

    Args:
        sentence: Text to augment. Non-string values (e.g. a missing cell
            read as NaN) are returned unchanged.
        n: Maximum number of distinct words to replace. Values <= 0 mean no
            replacement.
        rng: Optional ``random.Random``-like object providing ``shuffle`` and
            ``choice``. Defaults to the module-level ``random`` generator, so
            ``random.seed(...)`` controls the result.

    Returns:
        The tokens re-joined with single spaces. Because of that joining,
        punctuation tokens end up separated from words by a space.
    """
    if not isinstance(sentence, str):
        return sentence
    if rng is None:
        rng = random

    words = word_tokenize(sentence)
    # dict.fromkeys de-duplicates while keeping first-seen order; a plain set
    # would make the pre-shuffle order (and hence any seeded result) depend on
    # string hashing.
    random_word_list = [word for word in dict.fromkeys(words) if word.isalpha()]
    rng.shuffle(random_word_list)

    replacements = {}
    for random_word in random_word_list:
        # Check the budget first so that n <= 0 replaces nothing.
        if len(replacements) >= n:
            break
        # Ignore "synonyms" equal to the word itself; sort for a stable order.
        candidates = sorted(
            {syn for syn in get_synonyms(random_word) if syn.lower() != random_word.lower()}
        )
        if candidates:
            replacements[random_word] = rng.choice(candidates)

    # Apply all substitutions to the ORIGINAL tokens in one pass, so a synonym
    # that was just inserted is never itself replaced by a later substitution.
    new_words = [replacements.get(word, word) for word in words]

    # Reconstruct the sentence
    return ' '.join(new_words)

# Load your dataset
file_path = './training_data.csv'  # Replace with the path to your CSV file
data = pd.read_csv(file_path)

# Seed Python's RNG as well: random_state below only fixes WHICH rows are
# sampled, while synonym_replacement draws from the `random` module.
random.seed(2)

# Sample up to 2400 sentences for augmentation. Sampling is without
# replacement, so asking for more rows than the dataset holds would raise.
augmentation_sample = data.sample(n=min(2400, len(data)), random_state=2)
augmentation_sample['sentence'] = augmentation_sample['sentence'].apply(synonym_replacement)

# Give the augmented rows ids of their own so they do not collide with the
# ids of the original rows they were derived from.
if 'id' in augmentation_sample.columns:
    augmentation_sample['id'] = augmentation_sample['id'] + data['id'].max() + 1

# Combine the original and augmented data
augmented_dataset = pd.concat([data, augmentation_sample]).reset_index(drop=True)

# Save the augmented dataset to a new CSV file
# NOTE: the In [10] cell below writes to this same path and overwrites it.
augmented_dataset.to_csv('Data-Augmentation/augmented_training_data.csv', index=False)

print("Data augmentation complete. Augmented dataset saved.")


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/dimitriroulin/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/dimitriroulin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/dimitriroulin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Data augmentation complete. Augmented dataset saved.


In [10]:
# Load your dataset
file_path = 'training_data.csv'  # Replace with the path to your CSV file
data = pd.read_csv(file_path)

# Get the last ID of the original sentences
last_id = data['id'].max()

# Seed Python's RNG as well: random_state below only fixes WHICH rows are
# sampled, while synonym_replacement draws from the `random` module.
random.seed(1)

# Sample sentences for augmentation (up to 1200; sampling is without
# replacement, so the request is capped at the dataset size)
augmentation_sample = data.sample(n=min(1200, len(data)), random_state=1)
augmentation_sample['sentence'] = augmentation_sample['sentence'].apply(synonym_replacement)

# Offset the IDs of the new sentences past the original range
# (vectorized; same values as adding last_id + 1 row by row)
augmentation_sample['id'] = augmentation_sample['id'] + last_id + 1

# Combine the original and augmented data
augmented_dataset = pd.concat([data, augmentation_sample]).reset_index(drop=True)

# Save the augmented dataset to a new CSV file
augmented_dataset.to_csv('Data-Augmentation/augmented_training_data.csv', index=False)

# Create a new CSV file containing only the new sentences
new_sentences = augmentation_sample[['id', 'sentence']]
new_sentences.to_csv('Data-Augmentation/new_sentences.csv', index=False)

print("Data augmentation complete. Augmented dataset saved.")
print("New sentences saved to new_sentences.csv.")


Data augmentation complete. Augmented dataset saved.
New sentences saved to new_sentences.csv.


In [18]:
import pandas as pd

# Read the first file
df1 = pd.read_csv('./combined_data.csv')

# Read the second file
df2 = pd.read_csv('./Data-Augmentation/gpt_augmented_test_1.csv')

# Normalize the misspelled 'dfficulty' header of df2
df2 = df2.rename(columns={'dfficulty': 'difficulty'})

# This cell reads combined_data.csv and then overwrites it, so re-running it
# would append the same df2 rows again. Keep only the sentences that are not
# already present in df1 to make the cell safe to re-run.
if 'sentence' in df1.columns and 'sentence' in df2.columns:
    df2 = df2[~df2['sentence'].isin(df1['sentence'])].reset_index(drop=True)

# Check if 'id' column exists in df2
if 'id' not in df2.columns:
    # If 'id' column does not exist, number the new rows consecutively,
    # starting right after the largest id in df1
    first_new_id = int(df1['id'].max()) + 1
    df2['id'] = range(first_new_id, first_new_id + len(df2))

# Concatenate df1 and df2
combined_df = pd.concat([df1, df2], ignore_index=True)

# Save the combined DataFrame to a new CSV file
combined_df.to_csv('combined_data.csv', index=False)

print("Data combination complete. Combined dataset saved.")


Data combination complete. Combined dataset saved.
