In [None]:
import pandas as pd
import numpy as np
import torch
import pickle
from tqdm import tqdm_notebook as tqdm
import os
import re
import pickle

In [None]:
# Expose only GPU index 4 to this process; set before any torch.cuda query below.
os.environ["CUDA_VISIBLE_DEVICES"] = "4"
n_gpu = torch.cuda.device_count()
# Fall back to the CPU when torch reports no usable CUDA device.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("gpu num: ", n_gpu)

## Load pre-trained translation models from fairseq
You can substitute another middle (pivot) language or different translation models.

In [None]:
# Fetch the WMT19 single-model transformers for the English <-> Russian round trip
# through torch.hub (English -> Russian first, then Russian -> English).
en2ru = torch.hub.load(
    'pytorch/fairseq',
    'transformer.wmt19.en-ru.single_model',
    bpe='fastbpe',
    tokenizer='moses',
)
ru2en = torch.hub.load(
    'pytorch/fairseq',
    'transformer.wmt19.ru-en.single_model',
    bpe='fastbpe',
    tokenizer='moses',
)

In [None]:
# Fetch the WMT19 single-model transformers for the English <-> German round trip
# through torch.hub (English -> German first, then German -> English).
en2de = torch.hub.load(
    'pytorch/fairseq',
    'transformer.wmt19.en-de.single_model',
    bpe='fastbpe',
    tokenizer='moses',
)
de2en = torch.hub.load(
    'pytorch/fairseq',
    'transformer.wmt19.de-en.single_model',
    bpe='fastbpe',
    tokenizer='moses',
)

In [None]:
# Move both Russian-pivot models to the `device` chosen in the setup cell
# (CUDA when available, otherwise CPU). Calling .cuda() unconditionally
# raises on a machine without a usable GPU and ignores that fallback.
en2ru.to(device)
ru2en.to(device)

In [None]:
# Move both German-pivot models to the `device` chosen in the setup cell
# (CUDA when available, otherwise CPU). Calling .cuda() unconditionally
# raises on a machine without a usable GPU and ignores that fallback.
en2de.to(device)
de2en.to(device)

## Load data

In [None]:
# Read the training CSV from the data directory; header=None means the first
# row is treated as data and the columns are addressed by integer position.
path = './'
train_csv = path + 'train.csv'
train_df = pd.read_csv(train_csv, header=None)
train_df.head()

In [None]:
# Column 2 supplies the text to be back-translated.
train_text = list(train_df[2])
# Column 0 supplies the class ids, shifted down by one
# (presumably 1-based in the CSV -- confirm against the dataset).
train_labels = [label - 1 for label in train_df[0]]

In [None]:
# Sanity check: show the first text taken from column 2 of the CSV.
train_text[0]

In [None]:
# Sanity check: number of training texts loaded.
len(train_text)

In [None]:
# Largest label after the -1 shift; train_val_split iterates classes
# 0 .. n_labels - 1, so n_labels should be this value + 1.
max(train_labels)

In [None]:
# split and get our unlabeled training data
def train_val_split(labels, n_labeled_per_class, n_labels, seed = 0,
                    n_unlabeled_per_class = 10000, n_val_per_class = 3000):
    """Split example indices into labeled, unlabeled and validation pools.

    For every class ``0 .. n_labels - 1`` the indices of that class are
    shuffled; the first ``n_labeled_per_class`` become labeled examples, the
    following ``n_unlabeled_per_class`` become unlabeled examples and the last
    ``n_val_per_class`` become validation examples. Each pool is shuffled
    once more before it is returned.

    Args:
        labels: sequence of integer class ids, one per example.
        n_labeled_per_class: number of labeled examples drawn per class.
        n_labels: number of classes; class ids are expected in
            ``range(n_labels)``.
        seed: value passed to ``np.random.seed``. Note that this reseeds
            NumPy's *global* random state.
        n_unlabeled_per_class: number of unlabeled examples drawn per class
            (defaults to the previously hard-coded 10000).
        n_val_per_class: number of validation examples drawn per class from
            the tail of the shuffled indices (defaults to the previously
            hard-coded 3000).

    Returns:
        Tuple ``(train_labeled_idxs, train_unlabeled_idxs, val_idxs)`` of
        lists holding indices into ``labels``.

    Warns:
        UserWarning: if a class has fewer than ``n_labeled_per_class +
            n_unlabeled_per_class + n_val_per_class`` examples. In that case
            the validation slice overlaps the training slices for that class.
    """
    import warnings

    # The order of RNG calls below is kept fixed so that a given seed keeps
    # producing the same indices.
    np.random.seed(seed)
    labels = np.array(labels)
    train_labeled_idxs = []
    train_unlabeled_idxs = []
    val_idxs = []

    unlabeled_end = n_labeled_per_class + n_unlabeled_per_class
    needed_per_class = unlabeled_end + n_val_per_class
    short_classes = []

    for i in range(n_labels):
        idxs = np.where(labels == i)[0]
        np.random.shuffle(idxs)
        if len(idxs) < needed_per_class:
            short_classes.append(i)
        train_labeled_idxs.extend(idxs[:n_labeled_per_class])
        train_unlabeled_idxs.extend(idxs[n_labeled_per_class:unlabeled_end])
        # Explicit start index instead of idxs[-n:], because idxs[-0:] would
        # return the whole class when no validation examples are requested.
        val_start = max(len(idxs) - n_val_per_class, 0)
        val_idxs.extend(idxs[val_start:])

    if short_classes:
        warnings.warn(
            "classes {} have fewer than {} examples; their validation indices "
            "overlap the labeled/unlabeled training indices".format(
                short_classes, needed_per_class))

    np.random.shuffle(train_labeled_idxs)
    np.random.shuffle(train_unlabeled_idxs)
    np.random.shuffle(val_idxs)
    return train_labeled_idxs, train_unlabeled_idxs, val_idxs

In [None]:
# 500 labeled examples for each of the 10 classes; seed left at its default.
train_labeled_idxs, train_unlabeled_idxs, val_idxs = train_val_split(
    train_labels,
    n_labeled_per_class=500,
    n_labels=10,
)

In [None]:
# Sanity check: size of the unlabeled pool that will be back-translated.
len(train_unlabeled_idxs)

In [None]:
# `idxs` is the global that translate_ru / translate_de read: position p in
# this list maps to the row index idxs[p] in train_text.
idxs = train_unlabeled_idxs

In [None]:
# Sanity check: first row index in the unlabeled pool.
idxs[0]

## Back-translation process
You can tune the sampling temperature used in both translation passes (0.9 below) to control how diverse the back-translated text is.

In [None]:
# back translate using Russian as middle language
def translate_ru(start, end, file_name, temperature = 0.9, save_every = 500):
    """Back-translate texts English -> Russian -> English and pickle the result.

    Reads the notebook globals ``idxs`` (row indices into ``train_text``),
    ``train_text``, ``en2ru`` and ``ru2en``.

    Args:
        start: first position in ``idxs`` to translate (inclusive).
        end: position in ``idxs`` at which to stop (exclusive).
        file_name: path of the pickle file; it receives a dict mapping each
            row index ``idxs[pos]`` to its back-translated text.
        temperature: sampling temperature passed to both translate calls
            (defaults to the previously hard-coded 0.9).
        save_every: a checkpoint is written whenever the current position is
            a multiple of this value (defaults to the previously hard-coded
            500); pass 0 or None to write only once at the end.

    Returns:
        None. The result is only written to ``file_name``.
    """
    trans_result = {}

    def _save():
        # Dump to a sibling temp file and swap it into place, so an
        # interruption while pickling cannot leave a truncated checkpoint.
        tmp_name = os.fspath(file_name) + '.tmp'
        with open(tmp_name, 'wb') as f:
            pickle.dump(trans_result, f)
        os.replace(tmp_name, file_name)

    # `pos` rather than `id`, to avoid shadowing the builtin.
    for pos in tqdm(range(start, end)):
        text_idx = idxs[pos]
        russian = en2ru.translate(train_text[text_idx], sampling = True, temperature = temperature)
        trans_result[text_idx] = ru2en.translate(russian, sampling = True, temperature = temperature)
        if save_every and pos % save_every == 0:
            _save()
    _save()

In [None]:
# back translate using German as middle language
def translate_de(start, end, file_name, temperature = 0.9, save_every = 500):
    """Back-translate texts English -> German -> English and pickle the result.

    Reads the notebook globals ``idxs`` (row indices into ``train_text``),
    ``train_text``, ``en2de`` and ``de2en``.

    Args:
        start: first position in ``idxs`` to translate (inclusive).
        end: position in ``idxs`` at which to stop (exclusive).
        file_name: path of the pickle file; it receives a dict mapping each
            row index ``idxs[pos]`` to its back-translated text.
        temperature: sampling temperature passed to both translate calls
            (defaults to the previously hard-coded 0.9).
        save_every: a checkpoint is written whenever the current position is
            a multiple of this value (defaults to the previously hard-coded
            500); pass 0 or None to write only once at the end.

    Returns:
        None. The result is only written to ``file_name``.
    """
    trans_result = {}

    def _save():
        # Dump to a sibling temp file and swap it into place, so an
        # interruption while pickling cannot leave a truncated checkpoint.
        tmp_name = os.fspath(file_name) + '.tmp'
        with open(tmp_name, 'wb') as f:
            pickle.dump(trans_result, f)
        os.replace(tmp_name, file_name)

    # `pos` rather than `id`, to avoid shadowing the builtin.
    for pos in tqdm(range(start, end)):
        text_idx = idxs[pos]
        german = en2de.translate(train_text[text_idx], sampling = True, temperature = temperature)
        trans_result[text_idx] = de2en.translate(german, sampling = True, temperature = temperature)
        if save_every and pos % save_every == 0:
            _save()
    _save()

In [None]:
# Translate the whole unlabeled pool. len(idxs) replaces the hard-coded
# 100000, which raises IndexError part-way through the run whenever a class
# contributed fewer than 10000 unlabeled examples.
translate_de(0, len(idxs), 'de_1.pkl')