# Dataset Preprocessing: Translation

In [None]:
import os
import json
from getpass import getpass
import multiprocessing

import pandas as pd
import openai
from tqdm import tqdm

from src.data.translate_openai import translate_openai

In [None]:
# Read the OpenAI API key interactively via getpass() rather than hard-coding
# it in the notebook source.
openai.api_key = getpass()

In [None]:
# Relative paths to the raw input JSON files, the intermediate translation
# output, and the final CSV files.
data_folder = "../../data/raw/unarxive_citrec"
interim_folder = "../../data/interim"
processed_folder = "../../data/processed"

# Dataset splits handled by every cell below, in processing order.
splits = ["train", "valid", "test"]

In [None]:
# Serializes appends to the per-split output files across worker processes.
lock = multiprocessing.Lock()


# Define the worker function
def worker_function(worker_id, task_queue):
    """Translate queued items and append each result to its split's output file.

    Tasks are taken from ``task_queue`` until a ``None`` sentinel is received.
    Every other task is a ``(s, i, split_name)`` tuple: ``s`` is passed to
    ``translate_openai`` and the result is appended to
    ``<interim_folder>/<split_name>.out`` as a newline followed by
    ``###<i>@@ <translation>``.

    A failure while translating or writing one task is printed and the worker
    moves on to the next task; the failed index simply has no record in the
    output file.

    Args:
        worker_id: Identifier used only in the progress message.
        task_queue: Queue-like object whose ``get()`` returns the next task.
    """
    while True:
        task = task_queue.get()  # Get a task from the queue
        if task is None:
            break  # Exit the loop when None is received as a task
        s, i, split_name = task
        print(f"Worker {worker_id} is processing: {s} at {i}")
        try:
            tmp = translate_openai(s)
            out_path = os.path.join(interim_folder, split_name + '.out')
            # Context managers release the lock and close the file even when
            # open()/write() raises. With manual acquire()/release() a failed
            # write left the lock held forever, blocking every other worker.
            with lock, open(out_path, 'a', encoding='utf-8') as out:
                out.write(f"\n###{i}@@ {tmp}")
        except Exception as e:
            # Best effort: report the failing position and keep the worker alive.
            print(f"Fail!! pos={i}")
            print(e)

In [None]:
num_workers = 8
# First index to enqueue for each entry of `splits`, in the same order.
# NOTE(review): the non-zero offset for the first split presumably resumes an
# earlier, partially completed run — confirm before re-running from scratch.
start_offsets = [1094, 0, 0]

# Create and start worker processes
task_queue = multiprocessing.Queue()
workers = []
for worker_id in range(num_workers):
    worker = multiprocessing.Process(target=worker_function, args=(worker_id, task_queue))
    worker.start()
    workers.append(worker)

for split_name, st in zip(splits, start_offsets):
    # Parse inside a `with` block so the JSON file handle is closed right away
    # instead of being leaked until garbage collection.
    with open(os.path.join(data_folder, split_name + '.json'), 'r', encoding='utf-8') as f:
        arr = json.load(f)
    print(f"Working on {split_name}...")

    # Each task carries the item, its index in the split, and the split name.
    for i in range(st, len(arr)):
        task_queue.put((arr[i], i, split_name))

# Add None to the queue for each worker to signal them to exit
for _ in range(num_workers):
    task_queue.put(None)

# Wait for all workers to finish
for worker in workers:
    worker.join()

print("All workers have finished")

## Dataset Preprocessing: Build

In [None]:
for split_name in splits:
    # Load the source items; `with` closes the handle as soon as parsing is done.
    with open(os.path.join(data_folder, split_name + '.json'), 'r', encoding='utf-8') as f:
        src = json.load(f)

    with open(os.path.join(interim_folder, split_name + '.out'), 'r', encoding='utf-8') as f:
        # Read the file verbatim. Joining readlines() with '\n' doubles every
        # newline (each line keeps its own terminator), which inserted blank
        # lines into multi-line translations.
        text = f.read()

    # The file is parsed as records of the form "\n###<index>@@ <translation>".
    # The chunk before the first marker holds no record and is skipped.
    targets = [None] * len(src)
    for record in text.split('\n###')[1:]:
        # partition() keeps a record without the '@@ ' separator intact, so a
        # malformed record fails in int() instead of being silently mis-sliced.
        index_text, sep, translation = record.partition('@@ ')
        targets[int(index_text)] = translation.strip()
    df = pd.DataFrame({'source': src, 'target': targets})

    # fix missing values by trying to translate again
    print(f"Fixing missing values in {split_name}...")
    # Compute the mask once so the printed indices, the re-translated sources
    # and the assignment target all refer to the same rows.
    missing = df['target'].isna()
    print(list(df.index[missing]))
    df.loc[missing, 'target'] = [translate_openai(s) for s in tqdm(df.loc[missing, 'source'])]

    df.to_csv(os.path.join(processed_folder, split_name + '.csv'), index=False)

The results are saved to the `data/processed` folder and have been uploaded to Hugging Face Datasets as [unarxive-en2ru](https://huggingface.co/datasets/waleko/unarxive-en2ru).