# Tatoeba sentence dataset

Dataset location: `dataset/tatoeba/sentences.csv`
* [Download](https://downloads.tatoeba.org/exports/sentences.tar.bz2)
* [Source](https://tatoeba.org/en/downloads)

In [None]:
# Prepare for import
import os, csv
import stanza
from django.conf import settings
from datacore.models import Document, Phrase, Template, PhraseCollection
from datacore.functions.stanza import get_stanza

# Build the NLP pipeline used further down; only the tokenizer is requested.
nlp = get_stanza(processors='tokenize', lang="en")

# Record the provenance of the data: two references attached to one data source.
# NOTE(review): `Reference` and `DataSource` are not among the names imported
# at the top of this cell - presumably the notebook environment provides them;
# confirm, or import them explicitly.
Reference1, created = Reference.objects.get_or_create(
    title="Tatoeba - official homepage",
    url="https://tatoeba.org/en",
    description="Tatoeba is a collection of sentences and translations.It's collaborative, open, free and even addictive.",
)
Reference2, created = Reference.objects.get_or_create(
    title="Tatoeba - Dataset",
    url="https://tatoeba.org/en/downloads",
    description="Tatoeba dataset.",
)
data_source, created = DataSource.objects.get_or_create(
    title="Tatoeba",
)
data_source.references.add(Reference1, Reference2)

# The collection that every imported phrase is added to, linked to its source.
tatoeba, created = PhraseCollection.objects.get_or_create(
    title="Tatoeba Phrases",
)
tatoeba.data_sources.add(data_source)

In [None]:
# Download the dataset if it doesn't exist yet.
#
# The URL points at a bzip2-compressed tarball, while the import step reads
# `sentences.csv` as tab-separated UTF-8 text. The archive is therefore saved
# under its own name and `sentences.csv` is unpacked from it, instead of
# storing the compressed archive under the `.csv` name.
url = 'https://downloads.tatoeba.org/exports/sentences.tar.bz2'
dir_path = os.path.join(settings.BASE_DIR, '../dataset/tatoeba/')
path = os.path.join(dir_path, 'sentences.csv')
if not os.path.exists(path):
    import shutil
    import tarfile
    from datacore.functions.utils import download, get_or_create_dir
    get_or_create_dir(dir_path)
    archive_path = os.path.join(dir_path, 'sentences.tar.bz2')
    download(url, archive_path)
    if tarfile.is_tarfile(archive_path):
        # Unpack into a temporary name first so that an interrupted run never
        # leaves a truncated `sentences.csv` that later runs would accept.
        partial_path = path + '.part'
        with tarfile.open(archive_path, 'r:*') as archive:
            # Match on the base name only; the member is copied by hand so
            # that no path stored inside the archive is ever written to disk.
            member = next(
                (m for m in archive.getmembers()
                 if m.isfile() and os.path.basename(m.name) == 'sentences.csv'),
                None,
            )
            if member is None:
                raise FileNotFoundError(
                    f"'sentences.csv' not found inside '{archive_path}'"
                )
            with archive.extractfile(member) as src, open(partial_path, 'wb') as dst:
                shutil.copyfileobj(src, dst)
        os.replace(partial_path, path)
        os.remove(archive_path)
    else:
        # NOTE(review): reached only if `download` already delivers unpacked
        # data - confirm against the helper's implementation.
        os.replace(archive_path, path)

In [None]:
# Read the file and import the phrases.
#
# Each row is expected to carry the language code in column 1 and the sentence
# text in column 2. Every matching row is split into sentences by the NLP
# pipeline and each sentence is stored as a Phrase in the Tatoeba collection.
rows_counted = 0
rows_imported = 0
# `newline=''` is what the csv module asks for when reading files.
# `QUOTE_NONE` keeps double quotes as ordinary characters: with the default
# quoting a sentence that starts with '"' is treated as a quoted field, which
# strips the quote and can swallow the following lines into the same field.
# NOTE(review): assumes the export is plain, unquoted tab-separated text.
with open(path, "r", encoding='utf-8', newline='') as file:
    csv_reader = csv.reader(file, delimiter='\t', quoting=csv.QUOTE_NONE)
    for rows_counted, row in enumerate(csv_reader, start=1):
        try:
            # Import phrases in all languages or in a specific language using alpha3 code in row[1]
            if row[1] == 'eng':
                doc = nlp(row[2])
                for sentence in doc.sentences:
                    phrase, created = Phrase.objects.get_or_create(text=sentence.text)
                    tatoeba.phrases.add(phrase)
                rows_imported += 1
        except Exception as e:
            # A malformed row may have fewer than three columns; indexing it
            # here again would raise inside the handler and abort the import.
            raw_text = row[2] if len(row) > 2 else '\t'.join(row)
            print(f"Error in importing row '{rows_counted}' with text: '{raw_text}'\nError: {e}\n")
print(f"\nImported {rows_imported} from {rows_counted} rows")