In [1]:
import os
from pathlib import Path
import numpy as np
import cv2
import librosa
import pickle
import random
import string
import pandas as pd
import torch
from torchvision.transforms import v2
import soundfile as sf

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Lookup tables mapping 1-based integer codes to human-readable labels.
# Presumably these decode the AgeGroup / Gender / Ethnicity columns of the
# label CSV -- verify against the dataset documentation.

# Age brackets, written as '[low, high]' strings.
age_trans = dict(enumerate(
    [
        '[07, 13]',
        '[14, 18]',
        '[19, 24]',
        '[25, 32]',
        '[33, 45]',
        '[46, 60]',
        '[61, inf]',
    ],
    start=1,
))

# Gender labels.
gender_trans = dict(enumerate(['Male', 'Female'], start=1))

# Ethnicity labels.
ethnic_trans = dict(enumerate(['Asian', 'Caucasian', 'African-American'], start=1))

In [3]:
# NOTE: pandas is already imported in the first cell; this repeated import is
# harmless but redundant.
import pandas as pd

# Load the training label table into a DataFrame. The paths are relative, so
# the notebook must be run from the directory that contains `data/`.
train_csv = pd.read_csv('data/train_set_age_labels.csv')

In [4]:
# Bare expression as the last line of the cell: renders the label table
# (pandas truncates the middle rows of long frames in the display).
train_csv

Unnamed: 0,VideoName,UserID,AgeGroup,Gender,Ethnicity
0,YFm-8VGsUGg.004.mp4,YFm-8VGsUGg,1,2,2
1,-2qsCrkXdWs.001.mp4,-2qsCrkXdWs,1,1,2
2,rW7b48Dy_80.000.mp4,rW7b48Dy_80,1,2,2
3,1zpucNpIDk8.000.mp4,1zpucNpIDk8,1,2,3
4,2KSBoJZMcMU.000.mp4,2KSBoJZMcMU,1,1,2
...,...,...,...,...,...
6001,f9GeYKAXgAQ.002.mp4,f9GeYKAXgAQ,7,2,2
6002,f9GeYKAXgAQ.003.mp4,f9GeYKAXgAQ,7,2,2
6003,f9GeYKAXgAQ.005.mp4,f9GeYKAXgAQ,7,2,2
6004,f9GeYKAXgAQ.004.mp4,f9GeYKAXgAQ,7,2,2


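We only augment samples from age groups 1, 2, 3, 5, 6 and 7 (every group except 4) that belong to the Asian or African-American ethnicity groups (codes 1 and 3), matching the filter in the next cell.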

In [5]:
# Select the rows to augment: ethnicity code 1 or 3, and any age group other
# than 4. Both conditions are evaluated on the full table and combined into a
# single boolean mask, so the original row order and index are preserved.
is_target_ethnicity = train_csv['Ethnicity'].isin([1, 3])
is_target_age_group = train_csv['AgeGroup'].isin([1, 2, 3, 5, 6, 7])
filtered_df = train_csv[is_target_ethnicity & is_target_age_group]
filtered_df

Unnamed: 0,VideoName,UserID,AgeGroup,Gender,Ethnicity
3,1zpucNpIDk8.000.mp4,1zpucNpIDk8,1,2,3
13,4lyKh2_xCDQ.003.mp4,4lyKh2_xCDQ,2,2,1
14,4lyKh2_xCDQ.004.mp4,4lyKh2_xCDQ,2,2,1
18,0a5FOYBAIcc.001.mp4,0a5FOYBAIcc,2,2,3
19,6wHQsN5g2RM.000.mp4,6wHQsN5g2RM,2,1,1
...,...,...,...,...,...
5939,IMCEXoAkZv4.000.mp4,IMCEXoAkZv4,6,1,3
5940,IMCEXoAkZv4.002.mp4,IMCEXoAkZv4,6,1,3
5941,IMCEXoAkZv4.003.mp4,IMCEXoAkZv4,6,1,3
5942,IMCEXoAkZv4.004.mp4,IMCEXoAkZv4,6,1,3


In [6]:
def add_noise(text, noise_level=0.001):
    """Return a copy of ``text`` corrupted by random character-level edits.

    ``int(len(text) * noise_level)`` edits are applied one after another. Each
    edit picks a random position in the current text and then, with equal
    probability, inserts a random ASCII letter/digit before it, deletes the
    character there, or replaces it with a random ASCII letter/digit.

    Args:
        text: The string to corrupt.
        noise_level: Number of edits, expressed as a fraction of ``len(text)``.
            Zero or negative values leave the text unchanged. Values above 1
            are accepted (more edits than characters).

    Returns:
        The corrupted string. Its length may differ from the input because of
        insertions and deletions.
    """
    alphabet = string.ascii_letters + string.digits

    # Number of edits is fixed up front from the ORIGINAL length.
    num_noise_chars = int(len(text) * noise_level)
    text_chars = list(text)

    for _ in range(num_noise_chars):
        if not text_chars:
            # Earlier deletions emptied the buffer (only reachable when
            # noise_level > 1). random.randint(0, -1) would raise ValueError,
            # and insertion is the only edit that is defined on an empty
            # string, so insert and move on.
            text_chars.append(random.choice(alphabet))
            continue

        # Position to edit, then the kind of edit. The order of these draws is
        # kept so seeded runs behave as before for non-empty buffers.
        index = random.randint(0, len(text_chars) - 1)
        modification_type = random.choice(['insert', 'delete', 'replace'])

        if modification_type == 'insert':
            text_chars.insert(index, random.choice(alphabet))
        elif modification_type == 'delete':
            del text_chars[index]
        else:  # 'replace'
            text_chars[index] = random.choice(alphabet)

    return ''.join(text_chars)

def augment_audio(audio):
    """Return ``audio`` with additive Gaussian noise, clipped to [-1, 1].

    One noise mean is drawn uniformly from [-0.1, 0.1] and one standard
    deviation uniformly from [0, 0.1] per call; the noise samples themselves
    come from ``np.random.normal`` with those parameters.

    Args:
        audio: Array-like of samples. Any shape is accepted (e.g. mono
            ``(n,)`` or multi-channel ``(channels, n)``).

    Returns:
        A NumPy array with the same shape as ``audio`` whose values lie in
        [-1, 1].
    """
    mean = random.uniform(-0.1, 0.1)
    std = random.uniform(0, 0.1)

    # Draw noise in the full shape of the input. Using len(audio) only covers
    # the first axis and fails to broadcast for multi-channel arrays.
    noise = np.random.normal(mean, std, np.shape(audio))

    # Keep the result inside the [-1, 1] range after adding the noise.
    return np.clip(audio + noise, -1, 1)

def augment_image(image):
    """Run ``image`` through a randomly parameterised torchvision v2 pipeline.

    Every call draws fresh jitter/affine settings from ``random`` and builds a
    new pipeline: ToTensor -> RandomResizedCrop(224x224) -> ColorJitter ->
    RandomAffine -> ToDtype(float32) -> Normalize.

    Args:
        image: Input picture; presumably an HxWx3 uint8 array as returned by
            ``cv2.imread`` in the calling cell -- TODO confirm.

    Returns:
        The output of the composed pipeline. The caller treats it as a
        channel-first tensor (it calls ``.numpy()`` and transposes with
        ``(1, 2, 0)``). Because Normalize is the last step, values are
        standardised per channel rather than limited to [0, 1].
    """
    # Two uniform draws; the smaller becomes the lower bound of the affine
    # scale range and the larger the upper bound.
    first_draw = random.uniform(0, 1)
    second_draw = random.uniform(0, 1)
    scale_range = (min(first_draw, second_draw), max(first_draw, second_draw))

    # Colour-jitter strengths.
    # NOTE(review): the original author marked these ranges as guesses
    # ("i dont know what values to put"); they have not been tuned.
    brightness = random.uniform(0, 0.5)
    contrast = random.uniform(0, 0.5)
    saturation = random.uniform(0, 0.5)
    hue = random.uniform(0, 0.5)

    # Affine settings: rotation bound, translation fractions and shear bound.
    # NOTE(review): the scale lower bound can be arbitrarily close to 0, which
    # shrinks the picture to almost nothing -- confirm this is intended.
    degrees = random.uniform(0, 0.5)
    shift_x = random.uniform(0, 0.5)
    shift_y = random.uniform(0, 0.5)
    shear = random.uniform(0, 0.5)

    pipeline = v2.Compose([
        v2.ToTensor(),
        v2.RandomResizedCrop(size=(224, 224), antialias=True),
        v2.ColorJitter(brightness=brightness, contrast=contrast, saturation=saturation, hue=hue),
        v2.RandomAffine(degrees=degrees, translate=(shift_x, shift_y), scale=scale_range, shear=shear),
        v2.ToDtype(torch.float32, scale=True),
        # Per-channel mean/std; these match the widely used ImageNet statistics.
        v2.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

    return pipeline(image)

In [7]:
import matplotlib.pyplot as plt
# Generate N_AUGMENTATIONS noisy copies (image, audio, transcript) of every
# sample in filtered_df, write them next to the originals, and save an
# extended label CSV that lists the original rows plus the new samples.
root = 'data/train/'
N_AUGMENTATIONS = 10

# Seed every RNG the augmentation helpers draw from (random, NumPy, and torch
# for the torchvision transforms) so a re-run reproduces the same files.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Must mirror the mean/std passed to v2.Normalize inside augment_image; they
# are used to undo the normalisation before an image is written to disk.
NORM_MEAN = np.array([0.485, 0.456, 0.406], dtype=np.float32)
NORM_STD = np.array([0.229, 0.224, 0.225], dtype=np.float32)

added = []

for _, row in filtered_df.iterrows():
    # 'abc.004.mp4' -> 'abc.004': only the final extension is dropped.
    video_name = Path(row['VideoName']).stem
    userID = row['UserID']
    age_group = row['AgeGroup']
    gender = row['Gender']
    ethnicity = row['Ethnicity']

    # Samples live in data/train/<age_group>/<video_name>.{jpg,wav,pkl}.
    base_path = f'{root}{age_group}/{video_name}'
    noise_path = base_path + '_noise'

    img_path = base_path + '.jpg'
    aud_path = base_path + '.wav'
    txt_path = base_path + '.pkl'

    # cv2.imread signals failure by returning None instead of raising.
    image = cv2.imread(img_path)
    if image is None:
        raise FileNotFoundError(f'Could not read image: {img_path}')

    # sr=None keeps the file's native sample rate.
    audio, sample_rate = librosa.load(aud_path, sr=None)

    # NOTE: pickle.load can execute arbitrary code; only use it on files from
    # a trusted source.
    with open(txt_path, 'rb') as pkl_file:
        text = pickle.load(pkl_file)

    for i in range(N_AUGMENTATIONS):
        au_audio = augment_audio(audio)
        au_img = augment_image(image).numpy()
        au_text = add_noise(text, random.uniform(0.1, 0.7))

        sample_stem = noise_path + str(i)
        noise_img = sample_stem + '.jpg'
        noise_aud = sample_stem + '.wav'
        noise_txt = sample_stem + '.pkl'

        # augment_image returns a normalised channel-first float tensor, so
        # multiplying it by 255 directly yields mostly saturated pixels.
        # Convert to HWC, undo the normalisation, then clip to [0, 1] and
        # scale to 8-bit before writing. The channel order is untouched, so
        # the BGR layout read by cv2.imread is what cv2.imwrite receives.
        img_hwc = au_img.transpose(1, 2, 0) * NORM_STD + NORM_MEAN
        img_u8 = (np.clip(img_hwc, 0.0, 1.0) * 255).round().astype(np.uint8)
        if not cv2.imwrite(noise_img, np.ascontiguousarray(img_u8)):
            raise IOError(f'Could not write image: {noise_img}')

        # save text
        with open(noise_txt, 'wb') as f:
            pickle.dump(au_text, f)

        # save audio
        sf.write(noise_aud, au_audio, sample_rate)

        # Label row for the new sample; it inherits the source sample's labels.
        noisy_video_name = video_name + '_noise' + str(i) + '.mp4'
        added.append({
            'VideoName': noisy_video_name,
            'UserID': userID,
            'AgeGroup': age_group,
            'Gender': gender,
            'Ethnicity': ethnicity
        })

        print(noisy_video_name)
        print(au_text)

df_to_append = pd.DataFrame(added)

# Appending the new DataFrame to the original DataFrame
result_df = pd.concat([train_csv, df_to_append], ignore_index=True)

result_df.to_csv('data/train_set_age_labels_noise.csv', index=False)




1zpucNpIDk8.000_noise0.mp4
-ind ecting PfRpy. I love a gad bfuogcer anR ce3iv. OB6rxerxfCnd cdQhipA2r,Gb2urraGndPWci1xs, ut0geMTan cGps evrwhee.t utYf Put1 waWA oCvav3..I9doHt eat L allQW tSe ime, btitS'2smy favr4i, fZ II-
1zpucNpIDk8.000_noise1.mp4
-and eting fry. I ove a gZo urgePrPanHd Lhips.Burger and chipEe burgeet anE cips, bruer and chips evewywhehe.bBut I j5st want to eat ... I don' eaKfit all t6eU time, but it's m nrfXavoritedif I-
1zpucNpIDk8.000_noise2.mp4
-a9nt etnNg fy I lov7 a otd burger and cRhipHs. BWrgEHm Tan chiJ, burgerWGhnd Echips burgwe wd chiA IevTqghL8ere Bt I jst waA to eat ..j.  doqn't eat it allWC tze Utime, butit6s umy 4aEPArte3 if I-
1zpucNpIDk8.000_noise3.mp4
-ndCea4ing4 mfrVVyAY l loie a gcomd bunrcr an ccqais. B3ur72ger andl hGps, bYuTurgcrgQn cxipvs,Eurer a9d3Yi3q4s everywW But I jusWt wan RJ0at8s.M I ddnTt e2t it alhthe tme3J,butcit's my aoZrite, if II-
1zpucNpIDk8.000_noise4.mp4
-8arnd eatingjfry. I love a good burer and chips. Burge and chips, 0burger

In [None]:
import pickle

# The file was written with DataFrame.to_csv, so it is plain CSV text rather
# than a pickle; pickle.load on it raises UnpicklingError. Read it back with
# pandas instead.
data = pd.read_csv('data/train_set_age_labels_noise.csv')