# Data Augmentation

By now it's a known problem that our positive cases are seriously imbalanced.<br>
Which is why we're going to add augmented data to the dataset.<br>
<br>
The augmentation will be simple. We're going to translate our sentences into German and then back into English.<br>
<br>
We use machine translation models, and because this process is time-consuming, we have chosen not to make it part of our data processing pipeline, but rather to run it once as a pre-processing step.<br>
Also, as the 'multilabel' data is the only data we actually use, it is the only data processed here.

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Load-all-data" data-toc-modified-id="Load-all-data-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Load all data</a></span></li><li><span><a href="#Prepare-Translation-Models" data-toc-modified-id="Prepare-Translation-Models-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Prepare Translation Models</a></span></li><li><span><a href="#Translate" data-toc-modified-id="Translate-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Translate</a></span></li><li><span><a href="#Erase-Unnamed:-0" data-toc-modified-id="Erase-Unnamed:-0-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Erase Unnamed: 0</a></span></li></ul></div>

In [1]:
import os
import pickle
import pandas as pd
import numpy as np

from datetime import date, datetime

# Machine Translation
import torch
import fairseq

arXiv:2004.10972
sets both sampling temperatures to 0.8; we follow suit.

In [None]:
# Sampling temperature passed to translate() for each direction of the
# round trip; both use 0.8, the value taken from arXiv:2004.10972.
en2de_temperature = de2en_temperature = 0.8

## Load all data

In [None]:
# Unpickle the saved `topic_to_id` object (presumably a topic -> id mapping,
# judging by its name) from the shared data directory.
topic_to_id_path = "../data/20200405-topic_to_id.pickle"
with open(topic_to_id_path, "rb") as pickle_file:
    topic_to_id = pickle.load(pickle_file)

In [None]:
# Load every multilabel CSV into `dfs`, keeping `file_names` index-aligned with it
# (dfs[i] was read from file_names[i]).
#
# The other cells of this notebook address the data directory as '../data/',
# so the same prefix is used here (it was './data/').
data_folder = '../data/extra_data/multilabel/'

# os.listdir returns entries in arbitrary order; sort them so that dfs[i] refers
# to the same file on every run. Hidden entries and sub-directories (for example
# Jupyter's .ipynb_checkpoints) are skipped because pd.read_csv cannot parse them.
file_names = sorted(
    entry for entry in os.listdir(data_folder)
    if not entry.startswith('.')
    and os.path.isfile(os.path.join(data_folder, entry))
)

dfs = []
for f in file_names:
    dfs.append(pd.read_csv(os.path.join(data_folder, f)))

## Prepare Translation Models

In [None]:
# Fetch the WMT19 German->English and English->German transformer models
# through torch.hub from the 'pytorch/fairseq' repository.
# Both directions are loaded with identical options: four checkpoint files
# joined by ':' (presumably used together as an ensemble - confirm against the
# fairseq hub docs), Moses tokenisation and fastBPE sub-word encoding.
_wmt19_hub_options = {
    'checkpoint_file': 'model1.pt:model2.pt:model3.pt:model4.pt',
    'tokenizer': 'moses',
    'bpe': 'fastbpe',
}
de2en = torch.hub.load('pytorch/fairseq', 'transformer.wmt19.de-en', **_wmt19_hub_options)
en2de = torch.hub.load('pytorch/fairseq', 'transformer.wmt19.en-de', **_wmt19_hub_options)

In [None]:
# Put both translation models on the GPU (calls .cuda() on each, so a CUDA
# device is required from this cell onwards).
de2en, en2de = de2en.cuda(), en2de.cuda()

## Translate

In [None]:
# Folder the augmented (back-translated) CSV files are written to.
out_folder = '../data/extra_data/aug/'

In [None]:
def read_timedelta(td):
    """Split a ``datetime.timedelta`` into days, hours, minutes and seconds.

    Args:
        td: The ``timedelta`` to break down.

    Returns:
        A ``(days, hours, minutes, seconds)`` tuple of ints, where hours,
        minutes and seconds are the remainders within the day, hour and
        minute respectively. Microseconds are discarded.
    """
    # td.seconds is the sub-day remainder; peel hours off first, then split
    # what is left into minutes and seconds so minutes never exceed 59.
    hours, remainder = divmod(td.seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    return td.days, hours, minutes, seconds

In [None]:
# Back-translate (English -> German -> English) the 'text' column of one
# loaded file and save the frame, with the result in a new 'aug' column.
df = dfs[1]
filename = file_names[1]

# Fail fast: make sure the output folder exists before the slow translation,
# rather than discovering it is missing when to_csv runs at the very end.
os.makedirs(out_folder, exist_ok=True)

start = datetime.now()
print(start.strftime("%B %d, %Y %I:%M%p"))
df['aug'] = df['text'].apply(
    lambda s: de2en.translate(
        en2de.translate(s, sampling=True, temperature=en2de_temperature),
        sampling=True,
        temperature=de2en_temperature,
    )
)
# Derive the output name from THIS file's name. The stale loop variable `f`
# (whatever file an earlier loop visited last) was used before, which made
# every translation cell write to the same output file.
filename = filename.replace('Multi', 'Aug')
# Replace the first 8 characters (presumably a YYYYMMDD stamp) with today's date.
filename = date.today().strftime("%Y%m%d") + filename[8:]
df.to_csv(out_folder + filename, index=False)
end = datetime.now()
print(f"Done: {filename}")
# read_timedelta returns a 4-tuple; unpack it so each placeholder gets a value.
print('{} days, {}H {}m {}s'.format(*read_timedelta(end - start)))

In [None]:
# Back-translate (English -> German -> English) the 'text' column of one
# loaded file and save the frame, with the result in a new 'aug' column.
df = dfs[2]
filename = file_names[2]

# Fail fast: make sure the output folder exists before the slow translation,
# rather than discovering it is missing when to_csv runs at the very end.
os.makedirs(out_folder, exist_ok=True)

start = datetime.now()
print(start.strftime("%B %d, %Y %I:%M%p"))
df['aug'] = df['text'].apply(
    lambda s: de2en.translate(
        en2de.translate(s, sampling=True, temperature=en2de_temperature),
        sampling=True,
        temperature=de2en_temperature,
    )
)
# Derive the output name from THIS file's name. The stale loop variable `f`
# (whatever file an earlier loop visited last) was used before, which made
# every translation cell write to the same output file.
filename = filename.replace('Multi', 'Aug')
# Replace the first 8 characters (presumably a YYYYMMDD stamp) with today's date.
filename = date.today().strftime("%Y%m%d") + filename[8:]
df.to_csv(out_folder + filename, index=False)
end = datetime.now()
print(f"Done: {filename}")
# read_timedelta returns a 4-tuple; unpack it so each placeholder gets a value.
print('{} days, {}H {}m {}s'.format(*read_timedelta(end - start)))

In [None]:
# Back-translate (English -> German -> English) the 'text' column of one
# loaded file and save the frame, with the result in a new 'aug' column.
df = dfs[3]
filename = file_names[3]

# Fail fast: make sure the output folder exists before the slow translation,
# rather than discovering it is missing when to_csv runs at the very end.
os.makedirs(out_folder, exist_ok=True)

start = datetime.now()
print(start.strftime("%B %d, %Y %I:%M%p"))
df['aug'] = df['text'].apply(
    lambda s: de2en.translate(
        en2de.translate(s, sampling=True, temperature=en2de_temperature),
        sampling=True,
        temperature=de2en_temperature,
    )
)
# Derive the output name from THIS file's name. The stale loop variable `f`
# (whatever file an earlier loop visited last) was used before, which made
# every translation cell write to the same output file.
filename = filename.replace('Multi', 'Aug')
# Replace the first 8 characters (presumably a YYYYMMDD stamp) with today's date.
filename = date.today().strftime("%Y%m%d") + filename[8:]
df.to_csv(out_folder + filename, index=False)
end = datetime.now()
print(f"Done: {filename}")
# read_timedelta returns a 4-tuple; unpack it so each placeholder gets a value.
print('{} days, {}H {}m {}s'.format(*read_timedelta(end - start)))

In [None]:
# Back-translate (English -> German -> English) the 'text' column of one
# loaded file and save the frame, with the result in a new 'aug' column.
df = dfs[4]
filename = file_names[4]

# Fail fast: make sure the output folder exists before the slow translation,
# rather than discovering it is missing when to_csv runs at the very end.
os.makedirs(out_folder, exist_ok=True)

start = datetime.now()
print(start.strftime("%B %d, %Y %I:%M%p"))
df['aug'] = df['text'].apply(
    lambda s: de2en.translate(
        en2de.translate(s, sampling=True, temperature=en2de_temperature),
        sampling=True,
        temperature=de2en_temperature,
    )
)
# Derive the output name from THIS file's name. The stale loop variable `f`
# (whatever file an earlier loop visited last) was used before, which made
# every translation cell write to the same output file.
filename = filename.replace('Multi', 'Aug')
# Replace the first 8 characters (presumably a YYYYMMDD stamp) with today's date.
filename = date.today().strftime("%Y%m%d") + filename[8:]
df.to_csv(out_folder + filename, index=False)
end = datetime.now()
print(f"Done: {filename}")
# read_timedelta returns a 4-tuple; unpack it so each placeholder gets a value.
print('{} days, {}H {}m {}s'.format(*read_timedelta(end - start)))

## Erase Unnamed: 0

In [3]:
import os
import pandas as pd

In [5]:
# Remove the stray 'Unnamed: 0' column from every CSV in the augmented-data
# folder, rewriting each affected file in place.
data_folder = '../data/extra_data/aug/'
file_names = os.listdir(data_folder)
for f in file_names:
    csv_path = os.path.join(data_folder, f)
    # Skip sub-directories (e.g. .ipynb_checkpoints) that pd.read_csv cannot open.
    if not os.path.isfile(csv_path):
        continue
    frame = pd.read_csv(csv_path)
    # Only rewrite files that still carry the column, so re-running this cell
    # is a no-op instead of raising KeyError from drop().
    if 'Unnamed: 0' in frame.columns:
        frame.drop(columns=['Unnamed: 0']).to_csv(csv_path, index=False)