In [None]:
# Clone the seamless_communication repository and pip-install it from the checkout.
# NOTE(review): the clone is not pinned to a tag or commit, so the import path and the
# predict() call used in later cells depend on whichever revision is current when this
# runs -- consider checking out a fixed revision for reproducibility.
!git clone https://github.com/facebookresearch/seamless_communication.git && cd seamless_communication && pip install .

In [None]:
import ast
import os
import re

import pandas as pd
from tqdm import tqdm

In [None]:
import torch
from seamless_communication.models.inference import Translator

# Language codes handed to translator.predict() by translate_m4t().
SRC_LANG = "eng"
TGT_LANG = "urd"

# Model / vocoder identifiers and the torch device used to build the Translator.
MODEL = "seamlessM4T_large"
VOCODER = "vocoder_36langs"
DEVICE = "cuda"

# Single shared Translator instance reused by every translation call below.
translator = Translator(
    MODEL,
    vocoder_name_or_card=VOCODER,
    device=torch.device(DEVICE),
)

In [None]:
def translate_m4t(translator, text):
    """Run one text-to-text ("t2tt") translation and return the result as a str.

    Args:
        translator: object exposing ``predict(text, task, tgt_lang, src_lang=...)``.
        text: source text in ``SRC_LANG``.

    Returns:
        ``str()`` of the first element of the 3-tuple returned by ``predict``.

    Raises:
        ValueError: if ``predict`` does not return exactly three values.
    """
    prediction = translator.predict(text, "t2tt", TGT_LANG, src_lang=SRC_LANG)
    # Only the first element is used; the other two are discarded.
    text_out, _, _ = prediction
    return str(text_out)

def _translate_context(translator, sentences):
    """Translate context sentences one by one, carrying the "••" markers across.

    A sentence containing "••" has its markers (together with an adjacent
    apostrophe, i.e. "••'" or "'••") replaced by a plain double quote before
    translation. If the translation comes back with exactly two double quotes,
    they are turned back into "••" and the row no longer needs review.
    Sentences without markers are translated untouched, so naturally occurring
    quotes are never mistaken for markers.

    Args:
        translator: passed through to ``translate_m4t``.
        sentences: list of source sentences.

    Returns:
        tuple ``(text, review)``: the translated sentences joined by single
        spaces, and ``True`` unless at least one marked sentence kept its
        marker pair through translation.
    """
    translated = []
    review = True
    for sentence in sentences:
        if "••" not in sentence:
            translated.append(translate_m4t(translator, sentence))
            continue

        # Same three substitutions, in the same order, as the original regex calls.
        quoted = sentence.replace("••'", "\"").replace("'••", "\"").replace("••", "\"")
        result = translate_m4t(translator, quoted)
        if result.count("\"") == 2:
            result = result.replace("\"", "••")
            review = False
        # The sentence is kept even when the markers were lost; `review` stays
        # True in that case. Re-translating the same input would add nothing.
        translated.append(result)
    return " ".join(translated), review


def translate(translator, SQuAD, data, out_dir="SQuAD-UR/train-v2.0"):
    """Translate SQuAD rows shard by shard and write one CSV per shard.

    For every value ``i`` in ``data`` the rows with ``data_num == i`` are
    selected, their ``title``, ``question`` and ``context`` are translated,
    and the result is written to ``<out_dir>/<i>.csv``.

    Args:
        translator: passed through to ``translate_m4t``.
        SQuAD: DataFrame with columns ``data_num``, ``paragraph_num``, ``id``,
            ``title``, ``context`` (string holding a Python list literal of
            sentences), ``question`` and ``is_impossible``.
        data: list of ``data_num`` values to process.
        out_dir: directory receiving the CSV files; created if missing. The
            default keeps the previously hard-coded location, so pass a
            different directory for other splits to avoid overwriting it.

    Returns:
        None. Output is written to disk.
    """
    os.makedirs(out_dir, exist_ok=True)

    # Progress is reported by position in `data`, not by the shard id itself,
    # so the counter stays correct for non-contiguous or non-zero-based ids.
    for position, i in enumerate(data, start=1):
        print(f"Translating data: {position}/{len(data)}")
        df = SQuAD[SQuAD["data_num"] == i]
        rows = []
        for j in tqdm(range(len(df))):
            record = df.iloc[j]
            title_ = translate_m4t(translator, record["title"])
            question_ = translate_m4t(translator, record["question"])
            # `context` is stored as the repr of a list; literal_eval parses it
            # without executing arbitrary code.
            context_, review = _translate_context(
                translator, ast.literal_eval(record["context"])
            )
            rows.append((
                record["data_num"],
                record["paragraph_num"],
                record["id"],
                title_,
                context_,
                question_,
                record["is_impossible"],
                review,
            ))

        df_translated = pd.DataFrame(rows, columns=["data_num", "paragraph_num", "id", "title", "context",
                                                    "question", "is_impossible", "review"])
        df_translated.to_csv(os.path.join(out_dir, f"{i}.csv"))

In [None]:
# Load the cleaned train-v2.0 CSV; translate() reads its data_num, paragraph_num, id,
# title, context, question and is_impossible columns.
# NOTE(review): the path is absolute from the filesystem root -- confirm it matches the
# runtime's data mount.
SQuAD_train = pd.read_csv("/SQuAD/train-v2.0-clean.csv")

In [None]:
# Translate train shards data_num 0..441, writing one CSV per shard.
translate(translator, SQuAD_train, list(range(442)))

In [None]:
# Load the cleaned dev-v2.0 CSV; it is passed to translate() in the same way as the
# train frame.
# NOTE(review): the path is absolute from the filesystem root -- confirm it matches the
# runtime's data mount.
SQuAD_dev = pd.read_csv("/SQuAD/dev-v2.0-clean.csv")

In [None]:
# Translate dev shards data_num 0..34.
# NOTE(review): this call does not choose an output location, so translate() writes to
# its SQuAD-UR/train-v2.0 directory and dev shards 0-34 overwrite the train shards with
# the same numbers -- direct the dev output elsewhere before running both cells.
translate(translator, SQuAD_dev, list(range(35)))