<a href="https://colab.research.google.com/github/RtjShreyD/Mandrin2Eng_native_trans/blob/master/Preprocessing_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# defaultdict backs the sentence-id -> translation-ids mapping built in read_links().
from collections import defaultdict
# Colab-only module; provides the Google Drive mount helper used on the next line.
from google.colab import drive
# Mount Drive at /content/drive so the dataset paths used further down resolve.
drive.mount('/content/drive')
# NOTE(review): csv is not referenced by any code visible in this notebook export.
import csv

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
# Download sentences.tar.bz2 and links.tar.bz2 from https://tatoeba.org/eng/downloads
# and extract them to obtain the sentences.csv and links.csv files read below.

def read_sentences(filename, target_language, src_language, sentences_with_audio=None):
    """
    Read sentences.csv and return a dict containing sentence information.

    Each non-blank line must hold three tab-separated fields:
    sentence id, language code, sentence text. Blank lines are skipped.

    Parameters:
        filename (str): filename of 'sentences.csv' (read as UTF-8)
        target_language (str): target language
        src_language (str): src language
        sentences_with_audio (set of int): set of sentence ids with audio.
            If not None, target-language sentences whose id is not in this
            set are dropped; source-language sentences are never filtered.
    Returns:
        dict from sentence id (int) to sentence information, where
        sentence information is a dict with 'sent_id', 'lang', and 'text' keys.
        The dict only contains sentences in target_language or src_language.
    Raises:
        ValueError: if a non-blank line has fewer than three fields, or if the
            id of a sentence in one of the two requested languages is not an integer.
    """
    sentences = {}
    # `with` guarantees the handle is closed; the explicit encoding keeps the
    # non-ASCII sentence text readable regardless of the platform's default locale.
    with open(filename, encoding='utf-8') as sentence_file:
        for line in sentence_file:
            line = line.rstrip()
            if not line:
                # Tolerate blank lines (e.g. a trailing empty line).
                continue
            # maxsplit=2 keeps any tab characters inside the sentence text
            # instead of failing the three-way unpack.
            sent_id, lang, text = line.split('\t', 2)
            if lang != src_language and lang != target_language:
                continue
            sent_id = int(sent_id)
            if (sentences_with_audio is not None
                    and lang == target_language
                    and sent_id not in sentences_with_audio):
                continue
            sentences[sent_id] = {'sent_id': sent_id, 'lang': lang, 'text': text}
    return sentences


def read_sentences_with_audio(filename):
    """
    Read sentences_with_audio.csv and return a set of all the sentence IDs with audio.

    The sentence id is taken from the first tab-separated field of each line,
    so both a bare list of ids (one per line) and a multi-column file whose
    first column is the sentence id are accepted. Blank lines are skipped.

    Parameters:
        filename (str): filename of 'sentences_with_audio.csv' file (read as UTF-8).
    Returns:
        set of all the sentence ids (int).
    Raises:
        ValueError: if the first field of a non-blank line is not an integer.
    """
    sentences_with_audio = set()
    # `with` guarantees the file handle is closed once reading finishes.
    with open(filename, encoding='utf-8') as audio_file:
        for line in audio_file:
            # Only the leading column matters; any further columns are ignored.
            first_field = line.split('\t', 1)[0].strip()
            if not first_field:
                continue
            sentences_with_audio.add(int(first_field))
    return sentences_with_audio


def read_links(filename):
    """
    Read links.csv and return a dict containing links information.

    Each non-blank line must hold two tab-separated integer fields:
    a sentence id and the id of one of its translations. Blank lines are skipped.

    Args:
        filename (str): filename of 'links.csv'
    Returns:
        defaultdict mapping a sentence id (int) to the set of all its
        translation sentence ids (int).
    Raises:
        ValueError: if a non-blank line does not hold exactly two integer fields.
    """
    links = defaultdict(set)
    # `with` guarantees the file handle is closed once reading finishes.
    with open(filename, encoding='utf-8') as links_file:
        for line in links_file:
            line = line.rstrip()
            if not line:
                # Tolerate blank lines (e.g. a trailing empty line).
                continue
            sent_id, trans_id = line.split('\t')
            links[int(sent_id)].add(int(trans_id))
    return links


def generate_translation_pairs(sentences, links, target_language, src_language):
    """
    Pair every target-language sentence with each of its source-language translations.
    Parameters:
        sentences: dict of sentence information (returned by read_sentences())
        links: dict of links information (returned by read_links())
        target_language (str): target language
        src_language (str): src language
    Returns:
        list of (target sentence info, source sentence info) tuples, in the
        iteration order of `links` and of each sentence's translation ids.
    """
    pairs = []
    for target_id, candidate_ids in links.items():
        # links.csv lists every link in both directions, so scanning only from
        # the target-language side still finds every pair.
        if target_id not in sentences:
            continue
        target_info = sentences[target_id]
        if target_info['lang'] != target_language:
            continue
        pairs.extend(
            (target_info, sentences[candidate_id])
            for candidate_id in candidate_ids
            if candidate_id in sentences
            and sentences[candidate_id]['lang'] == src_language
        )
    return pairs


def write_tsv(translations,
              output_dir="/content/drive/My Drive/realworldnlp-master/data/mt",
              *, verbose=True):
    """
    Write translation pairs as TSV files, split into train/dev/test sets.

    Four UTF-8 files are (over)written inside `output_dir`:
        tatoeba.eng_cmn.tsv        every pair
        tatoeba.eng_cmn.test.tsv   pairs whose 1-based position p has p % 10 == 1
        tatoeba.eng_cmn.dev.tsv    pairs whose 1-based position p has p % 10 == 2
        tatoeba.eng_cmn.train.tsv  all remaining pairs
    Each row is "<first sentence text>\t<second sentence text>".

    Parameters:
        translations (list): list of sentence pairs returned by generate_translation_pairs()
        output_dir (str): directory that receives the four files; it must already exist.
        verbose (bool): if True, echo every row and the split it went to on stdout.
    """
    import os  # local import: this block is the only user of os

    out_file = os.path.join(output_dir, "tatoeba.eng_cmn.tsv")
    dev_file = os.path.join(output_dir, "tatoeba.eng_cmn.dev.tsv")
    test_file = os.path.join(output_dir, "tatoeba.eng_cmn.test.tsv")
    train_file = os.path.join(output_dir, "tatoeba.eng_cmn.train.tsv")

    # Explicit UTF-8 so the non-ASCII sentence text is written correctly
    # regardless of the platform's default locale encoding.
    with open(out_file, "w", encoding="utf-8") as out, \
            open(dev_file, "w", encoding="utf-8") as dev, \
            open(test_file, "w", encoding="utf-8") as test, \
            open(train_file, "w", encoding="utf-8") as train:
        for x, (sent1, sent2) in enumerate(translations, start=1):
            row = "%s\t%s" % (sent1['text'], sent2['text'])
            if verbose:
                print(row)
            out.write(row + "\n")

            # Deterministic 10% / 10% / 80% split keyed on the row position.
            if x % 10 == 1:
                split_file, label = test, "Test"
            elif x % 10 == 2:
                split_file, label = dev, "dev"
            else:
                split_file, label = train, "train"
            split_file.write(row + "\n")
            if verbose:
                print(label + " written at x", x)

def main():
    """
    Build the English/Mandarin translation pairs and write them out as TSV.

    Reads sentences.csv and links.csv from the mounted Drive folder, keeps the
    "eng" (target) and "cmn" (source) sentences, pairs them through the links
    table and hands the pairs to write_tsv(). No audio filtering is applied.
    """
    src_language = "cmn"
    target_language = "eng"

    sentence_table = read_sentences(
        "/content/drive/My Drive/realworldnlp-master/data/mt/sentences.csv",
        target_language,
        src_language,
        None,
    )
    link_table = read_links("/content/drive/My Drive/realworldnlp-master/data/mt/links.csv")

    write_tsv(
        generate_translation_pairs(sentence_table, link_table, target_language, src_language)
    )

# Run the whole preprocessing pipeline when this code is executed directly
# rather than imported as a module.
if __name__ == '__main__':
    main()

Let's try something.	我們試試看！
Test written at x 1
I have to go to sleep.	我该去睡觉了。
dev written at x 2
Today is June 18th and it is Muiriel's birthday!	今天是６月１８号，也是Muiriel的生日！
train written at x 3
Muiriel is 20 now.	Muiriel现在20岁了。
train written at x 4
The password is "Muiriel".	密码是"Muiriel"。
train written at x 5
The password is "Muiriel".	密碼是「Muiriel」。
train written at x 6
I will be back soon.	我很快就會回來。
train written at x 7
I'm at a loss for words.	我不知道應該說什麼才好。
train written at x 8
This is never going to end.	這個永遠完不了了。
train written at x 9
This is never going to end.	这将永远继续下去。
train written at x 10
I just don't know what to say.	我只是不知道應該說什麼而已……
Test written at x 11
I just don't know what to say.	我就是不知道說些什麼。
dev written at x 12
That was an evil bunny.	那是一隻有惡意的兔子。
train written at x 13
I was in the mountains.	我以前在山里。
train written at x 14
Is it a recent picture?	那是一张近照吗？
train written at x 15
I don't know if I have the time.	我不知道我有沒有時間。
train written at x 16
Education in this world disappoints 