## Image data augmentation 

In [1]:
import os

def list_directory_contents(dir_path):
    """Print the directory tree rooted at ``dir_path``, one folder per line.

    Only directories are listed; files are not printed. Each folder name is
    followed by ``/`` and indented four spaces per level below ``dir_path``.

    Args:
        dir_path: Directory to walk. A trailing path separator is tolerated.
    """
    for root, _dirs, _files in os.walk(dir_path):
        # Measure depth from the path relative to the starting directory.
        # Stripping ``dir_path`` textually mis-counts when it ends with a
        # separator or when its text re-occurs deeper in the tree.
        relative = os.path.relpath(root, dir_path)
        level = 0 if relative == os.curdir else relative.count(os.sep) + 1
        indent = ' ' * 4 * level
        # normpath drops a trailing separator so the root folder is printed
        # with its real name instead of an empty string.
        print(f"{indent}{os.path.basename(os.path.normpath(root))}/")


In [2]:

# Print the folder tree of the dataset root; the path is relative to the
# notebook's working directory.
directory_path = 'raw-data'
list_directory_contents(directory_path)

raw-data/
    AbstractScenes_v1.1/
        AbstractScenes_v1.1/
            Pngs/
            RenderedScenes/
            RenderedSeedScenes/
            SemanticClassesRender/
            SimpleSentences/
                tuples/
            VisualFeatures/
            WordFeatures/
    CoDraw/
        asset/
        css/
        dataset/
        images/
        imgs/
        js/
        output/
        script/
            .mypy_cache/
                3.6/
                    collections/
                    json/
                    os/
    GloVe/
    i-CLEVR/
        images/
            .ipynb_checkpoints/
        scenes/
            .ipynb_checkpoints/
        text/


In [8]:
from PIL import Image, ImageOps
import os
import random

def augment_images_in_folder(folder_path, max_shift=0.1):
    """Apply random geometric augmentations to every ``.png`` in a folder.

    Each step (rotation, scaling, translation, horizontal flip) is applied
    independently with probability 1/2. The result is saved over the original
    file, so the source images are not preserved and re-running the function
    augments already-augmented images.

    Rotation uses ``expand=True`` and scaling resizes the image, so the saved
    files may have different dimensions from the originals.

    Args:
        folder_path: Directory whose ``.png`` files (exact lower-case
            extension) are augmented in place.
        max_shift: Largest translation, as a fraction of the image width and
            height, that the translation step may apply in either direction.
    """
    for filename in os.listdir(folder_path):
        if not filename.endswith('.png'):
            continue
        img_path = os.path.join(folder_path, filename)
        with Image.open(img_path) as img:
            # Random rotation between -30 and 30 degrees; the canvas grows to
            # fit the rotated image.
            if random.choice([True, False]):
                img = img.rotate(random.randint(-30, 30), expand=True)

            # Random scaling between 80% and 120%.
            if random.choice([True, False]):
                scale = random.uniform(0.8, 1.2)
                width, height = img.size
                # Keep at least one pixel per side so tiny images do not
                # produce an invalid zero-sized resize.
                img = img.resize((max(1, int(width * scale)),
                                  max(1, int(height * scale))))

            # Random translation. The previous implementation called
            # ImageOps.exif_transpose here, which only applies an EXIF
            # orientation tag and never shifts the image.
            if random.choice([True, False]):
                width, height = img.size
                max_dx = int(width * max_shift)
                max_dy = int(height * max_shift)
                dx = random.randint(-max_dx, max_dx)
                dy = random.randint(-max_dy, max_dy)
                # AFFINE coefficients (a, b, c, d, e, f) map each output pixel
                # (x, y) to the input pixel (a*x + b*y + c, d*x + e*y + f), so
                # moving the content by (dx, dy) means sampling at
                # (x - dx, y - dy). Uncovered areas get Pillow's default fill.
                img = img.transform(img.size, Image.AFFINE, (1, 0, -dx, 0, 1, -dy))

            # Random horizontal flip.
            if random.choice([True, False]):
                img = img.transpose(Image.FLIP_LEFT_RIGHT)

            # Overwrite the source file with the augmented image.
            img.save(img_path)

In [9]:
# Example usage:
# Caution: every .png in this folder is overwritten in place with its augmented
# version, so re-running this cell augments already-augmented images.
augment_images_in_folder('raw-data/AbstractScenes_v1.1/AbstractScenes_v1.1/Pngs')

In [10]:
import os
import random
import numpy as np
from PIL import Image, ImageEnhance

def random_noise(image, amplitude=30):
    """Return a new image with uniform random noise added to the pixel values.

    Args:
        image: PIL image to perturb. Palette (``'P'``) images are converted to
            ``'RGB'`` and bilevel (``'1'``) images to ``'L'`` first, because
            their raw values are palette indices / booleans rather than
            intensities.
        amplitude: Noise is drawn per value from the integers in
            ``[-amplitude, amplitude]`` (inclusive on both ends).

    Returns:
        A new 8-bit PIL image. For ``'LA'``/``'RGBA'`` inputs the alpha
        channel is copied through unchanged.
    """
    convert_to = {'P': 'RGB', '1': 'L'}.get(image.mode)
    if convert_to is not None:
        image = image.convert(convert_to)

    # Work in int16 so the sum can leave the 0..255 range before clipping
    # instead of wrapping around in uint8.
    pixels = np.array(image).astype(np.int16)
    # randint's upper bound is exclusive; +1 keeps the noise symmetric.
    noise = np.random.randint(-amplitude, amplitude + 1, pixels.shape, dtype=np.int16)
    if image.mode in ('LA', 'RGBA'):
        # Leave transparency untouched; only colour channels get noise.
        noise[..., -1] = 0
    noisy = np.clip(pixels + noise, 0, 255)  # Ensure pixel values are valid
    return Image.fromarray(noisy.astype(np.uint8))

def adjust_colors(image):
    """Randomly adjust the brightness, contrast, and color saturation of an image.

    Each adjustment uses an independent factor drawn uniformly from
    [0.5, 1.5] and is applied to the result of the previous one.

    Args:
        image: PIL image to adjust.

    Returns:
        The adjusted image.
    """
    # Build each enhancer from the *current* image. Creating all enhancers
    # from the original image up front made every step restart from the
    # unmodified input, so only the last (color) adjustment survived.
    for enhancer_cls in (ImageEnhance.Brightness, ImageEnhance.Contrast, ImageEnhance.Color):
        factor = random.uniform(0.5, 1.5)  # Random factor between 0.5 and 1.5
        image = enhancer_cls(image).enhance(factor)
    return image

def augment_images_in_folder(folder_path):
    """Randomly add noise and/or colour jitter to every ``.png`` in a folder.

    For each image, ``random_noise`` and then ``adjust_colors`` are each
    applied with probability 1/2. The result is written back to the same
    path, replacing the original file.

    Args:
        folder_path: Directory whose ``.png`` files are augmented in place.
    """
    for entry in os.listdir(folder_path):
        if not entry.endswith('.png'):
            continue
        target = os.path.join(folder_path, entry)
        with Image.open(target) as picture:
            # Noise first, colour adjustments second; one coin flip per step.
            for step in (random_noise, adjust_colors):
                if random.choice([True, False]):
                    picture = step(picture)

            # Overwrite the source file with the augmented image.
            picture.save(target)

In [11]:
# Example usage:
# This call resolves to the most recently defined augment_images_in_folder
# (the noise / colour version, assuming cells are run top to bottom).
# The PNG files in the folder are overwritten in place.
augment_images_in_folder('raw-data/CoDraw/images')

In [12]:
import os
import random
import numpy as np
from PIL import Image, ImageFilter

def apply_noise(image, amplitude=20):
    """Return a new image with uniform random noise added to the pixel values.

    Args:
        image: PIL image to perturb. Palette (``'P'``) images are converted to
            ``'RGB'`` and bilevel (``'1'``) images to ``'L'`` first, because
            their raw values are palette indices / booleans rather than
            intensities.
        amplitude: Noise is drawn per value from the integers in
            ``[-amplitude, amplitude]`` (inclusive on both ends).

    Returns:
        A new 8-bit PIL image. For ``'LA'``/``'RGBA'`` inputs the alpha
        channel is copied through unchanged.
    """
    convert_to = {'P': 'RGB', '1': 'L'}.get(image.mode)
    if convert_to is not None:
        image = image.convert(convert_to)

    # int16 gives headroom so the sum can leave 0..255 before being clipped.
    img_array = np.asarray(image).astype(np.int16)
    # randint's upper bound is exclusive; +1 keeps the noise symmetric.
    noise = np.random.randint(-amplitude, amplitude + 1, img_array.shape, dtype=np.int16)
    if image.mode in ('LA', 'RGBA'):
        # Leave transparency untouched; only colour channels get noise.
        noise[..., -1] = 0
    noisy_img = np.clip(img_array + noise, 0, 255)
    return Image.fromarray(noisy_img.astype(np.uint8))

def apply_blur(image):
    """Blur ``image`` with a Gaussian filter whose radius is drawn uniformly from [0.5, 2.0]."""
    blur_radius = random.uniform(0.5, 2.0)
    gaussian = ImageFilter.GaussianBlur(radius=blur_radius)
    return image.filter(gaussian)

def apply_sharpen(image):
    """Sharpen ``image`` with an unsharp mask (radius 1, strength 150%)."""
    unsharp = ImageFilter.UnsharpMask(radius=1, percent=150)
    sharpened = image.filter(unsharp)
    return sharpened

def augment_images_in_folder(folder_path):
    """Apply a random subset of noise, blur and sharpen to every ``.png`` in a folder.

    For each image the three operations are put in a random order and each
    one is then applied with probability 1/2. The result is saved over the
    original file.

    Args:
        folder_path: Directory whose ``.png`` files are augmented in place.
    """
    for entry in os.listdir(folder_path):
        if not entry.endswith('.png'):
            continue
        png_path = os.path.join(folder_path, entry)
        with Image.open(png_path) as picture:
            # A fresh, freshly shuffled pipeline per image.
            pipeline = [apply_noise, apply_blur, apply_sharpen]
            random.shuffle(pipeline)
            for step in pipeline:
                keep_step = random.choice([True, False])
                if keep_step:
                    picture = step(picture)
            # Overwrite the source file with the augmented image.
            picture.save(png_path)

In [13]:
# This call resolves to the most recently defined augment_images_in_folder
# (the noise / blur / sharpen version, assuming cells are run top to bottom).
# The PNG files in the folder are overwritten in place.
augment_images_in_folder('raw-data/i-CLEVR/images')


## Textual data augmentation

In [1]:
import random
import nltk
from nltk.corpus import wordnet
# Fetch the NLTK POS-tagger model and the WordNet corpus; as the cell output
# shows, the download is skipped when the package is already up to date.
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Rania\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Rania\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [20]:
import nltk
from nltk.corpus import wordnet, stopwords
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer

# Fetch the NLTK resources used in this cell: the POS-tagger model, WordNet,
# the Punkt tokenizer data and the stop-word lists.
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')

# Words that are never replaced by a synonym: English stop words plus the
# listed forms of "be" and "have".
stop_words = set(stopwords.words('english'))
avoid_verbs = {"is", "am", "are", "was", "were", "be", "being", "been", "have", "has", "had"}

def get_wordnet_pos(tag):
    """Translate a POS tag into the matching WordNet part-of-speech constant.

    Only the first character of ``tag`` is inspected (case-insensitively):
    ``J`` -> adjective, ``N`` -> noun, ``V`` -> verb, ``R`` -> adverb.
    Any other tag falls back to noun.
    """
    initial = tag[0].upper()
    if initial == 'J':
        return wordnet.ADJ
    if initial == 'V':
        return wordnet.VERB
    if initial == 'R':
        return wordnet.ADV
    # 'N' and every unrecognised tag are treated as nouns.
    return wordnet.NOUN

def most_similar_synonym(word, pos):
    """Return a close WordNet synonym for ``word``, or ``word`` itself.

    Synsets of ``word`` for the given part of speech are compared with the
    first synset (taken as the most common sense). A synset qualifies only if
    its path similarity to that first synset exceeds 0.9; the first lemma of
    the best qualifying synset that is not the word itself is returned, with
    underscores replaced by spaces.

    Args:
        word: Token to replace.
        pos: WordNet part-of-speech constant passed to ``wordnet.synsets``.

    Returns:
        The chosen synonym, or ``word`` unchanged when it is a stop word, one
        of the ``avoid_verbs``, unknown to WordNet, or has no qualifying lemma.
    """
    lowered = word.lower()
    # Compare case-insensitively so capitalised stop words are skipped too.
    if lowered in stop_words or lowered in avoid_verbs:
        return word  # Skip stopwords and specific verbs
    synsets = wordnet.synsets(word, pos=pos)
    if not synsets:
        return word

    best_synonym = word
    best_similarity = 0.0
    original_synset = synsets[0]  # Assuming the first synset is the most common usage
    for synset in synsets:
        # The similarity depends only on the synset, so compute it once per
        # synset rather than once per lemma. It may be None.
        similarity = original_synset.path_similarity(synset)
        if not similarity or similarity <= 0.9 or similarity <= best_similarity:
            continue
        for lemma in synset.lemmas():
            # Ignore the word itself regardless of case; otherwise "Dog"
            # would be "replaced" by its own lemma "dog".
            if lemma.name().lower() == lowered:
                continue
            best_similarity = similarity
            best_synonym = lemma.name().replace('_', ' ')
            break  # later lemmas of this synset cannot beat the same score
    # best_synonym is only overwritten by a lemma that passed the threshold.
    return best_synonym

def synonym_replacement(sentence):
    """Replace eligible words of ``sentence`` with WordNet synonyms.

    The sentence is tokenized and POS-tagged. Proper nouns (tags starting
    with ``NNP``), stop words and the ``avoid_verbs`` are left alone; every
    other token is passed to ``most_similar_synonym``.

    Args:
        sentence: Text to augment.

    Returns:
        The augmented sentence, detokenized so that punctuation and
        contractions are re-attached instead of being left as separate,
        space-delimited tokens.
    """
    words = word_tokenize(sentence)
    tagged_words = pos_tag(words)
    new_words = words.copy()
    for i, (word, tag) in enumerate(tagged_words):
        if tag.startswith('NNP') or word.lower() in stop_words or word.lower() in avoid_verbs:  # Skip proper nouns and specific words
            continue
        wn_tag = get_wordnet_pos(tag)
        new_words[i] = most_similar_synonym(word, wn_tag)
    # ' '.join(tokens) would leave tokenizer artifacts in the output, such as
    # "ball ." or "do n't"; the detokenizer undoes the tokenizer's splitting.
    return TreebankWordDetokenizer().detokenize(new_words)

def process_file(file_path):
    """Rewrite a text file in place with synonym-augmented sentences.

    Every line is stripped of surrounding whitespace, passed through
    ``synonym_replacement`` and written back followed by a newline. The
    original content of the file is replaced.

    Args:
        file_path: Path of the UTF-8 text file to augment, one sentence per line.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        sentences = file.readlines()

    # Transform everything before reopening the file for writing. Opening with
    # 'w' truncates immediately, so a failure part-way through the sentences
    # would otherwise leave the file with only part of its content.
    modified_lines = [
        synonym_replacement(sentence.strip()) + '\n' for sentence in sentences
    ]

    with open(file_path, 'w', encoding='utf-8') as file:
        file.writelines(modified_lines)




[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Rania\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Rania\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Rania\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Rania\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [22]:
# Augment the sentence file in place: process_file overwrites it with the
# synonym-replaced text, so keep a copy if the original is still needed.
file_path = 'raw-data/AbstractScenes_v1.1/AbstractScenes_v1.1/Sentences_1002.txt'
process_file(file_path)

## Preparing the word embedding model 

In [3]:
from gensim.models import KeyedVectors

# Load the binary Word2Vec model
# NOTE(review): the file name indicates GoogleNews Word2Vec vectors even though
# the file is stored under the GloVe/ folder; confirm that this is intended.
model = KeyedVectors.load_word2vec_format('raw-data/GloVe/GoogleNews-vectors-negative300.bin', binary=True)

# Save the model in text format
# (written next to the .bin file, same base name with a .txt extension)
model.save_word2vec_format('raw-data/GloVe/GoogleNews-vectors-negative300.txt', binary=False)