# Tatoeba sentence dataset

Dataset location: `dataset/tatoeba/sentences.csv`
* [Download](https://downloads.tatoeba.org/exports/sentences.tar.bz2)
* [Source](https://tatoeba.org/en/downloads)

I got a memory error running this code on one of my lower-end servers, because `pandas` loads the entire dataset into RAM, so I've created another notebook that uses `csv.reader`:
> `notebooks/Import - Export - Scrapping/Import Tatoeba(csv.reader).ipynb`

In [None]:
# imports
import os
import pandas as pd
from tqdm.auto import tqdm

from django.conf import settings
from datacore.models import Document, Phrase, Template, PhraseCollection
from datacore.functions.stanza import get_stanza

In [None]:
# Set up the NLP pipeline. Only the tokenizer is requested; the import loop
# uses it to split each dataset row into individual sentences.
nlp = get_stanza(lang="en", processors='tokenize')

# `Reference` and `DataSource` are not among the notebook's top-level imports,
# so without this line the cell stops with a NameError.
# NOTE(review): assumes both models live in `datacore.models` next to
# `PhraseCollection` -- confirm the module path.
from datacore.models import DataSource, Reference

# Provenance records for the dataset. `get_or_create` looks each record up
# first and only inserts it when missing, so the cell can be re-run.
Reference1, created  = Reference.objects.get_or_create(title="Tatoeba - official homepage", url="https://tatoeba.org/en", description="Tatoeba is a collection of sentences and translations.It's collaborative, open, free and even addictive.")
Reference2, created  = Reference.objects.get_or_create(title="Tatoeba - Dataset", url="https://tatoeba.org/en/downloads", description="Tatoeba dataset.")
data_source, created = DataSource.objects.get_or_create(title="Tatoeba")
data_source.references.add(Reference1, Reference2)

# Collection that every imported phrase is attached to.
# NOTE(review): the "xxx" suffix in the title looks like a testing leftover --
# confirm the intended collection title before a production import.
tatoeba, created = PhraseCollection.objects.get_or_create(title="Tatoeba Phrases xxx")
tatoeba.data_sources.add(data_source)

In [None]:
# Fetch the Tatoeba export on first use; when the target file is already on
# disk nothing is downloaded.
dir_path = os.path.join(settings.BASE_DIR, '../dataset/tatoeba/')
path = os.path.join(dir_path, 'sentences.csv')
url = 'https://downloads.tatoeba.org/exports/sentences.tar.bz2'

if not os.path.exists(path):
    # The helpers are only needed on this branch, hence the local import.
    from datacore.functions.utils import download, get_or_create_dir

    # NOTE(review): the URL points at a .tar.bz2 archive while the target is
    # named sentences.csv -- presumably `download` unpacks it; confirm.
    get_or_create_dir(dir_path)
    download(url, path)

In [None]:
import csv

print("Loading dataset...")
# The file is read as headerless, tab-separated text; only columns 1 and 2
# are kept and named `language` and `sentence`.
df = pd.read_csv(
    path,
    encoding='utf-8',
    sep='\t',
    header=None,
    usecols=[1, 2],
    names=['language', 'sentence'],
    # Keep every value as text so the NLP pipeline always receives a str.
    dtype=str,
    # Treat '"' as an ordinary character: with the default quoting, a sentence
    # that starts with a double quote loses its quotes or swallows the
    # following fields/lines.
    quoting=csv.QUOTE_NONE,
    # Without this, sentences such as "None", "NA" or "null" are silently
    # converted to NaN.
    na_filter=False,
)
df.info()
df.head()

In [None]:
# OPTIONAL: restrict the import to English sentences.
# Rows are kept only when their `language` value equals 'eng'; skip this cell
# to import every language.
df = df.loc[df['language'] == 'eng']

In [None]:
# Import phrases.
# Every row still present in `df` is processed (all languages, or only the
# ones kept by an earlier filter on the `language` column). The pipeline
# splits the row text into sentences; each sentence is fetched or created as
# a Phrase and attached to the Tatoeba collection.
# `iterrows()` yields (label, Series) pairs, so the pair is unpacked here and
# the label can be reported when a row fails.
for row_label, row in tqdm(df.iterrows(), total=len(df)):
    try:
        doc = nlp(row['sentence'])
        for sentence in doc.sentences:
            phrase, created = Phrase.objects.get_or_create(text=sentence.text)
            tatoeba.phrases.add(phrase)
    except Exception as e:
        # Best-effort import: report the failing row and continue with the
        # next one. The previous `row.index()` call raised a TypeError inside
        # this handler and aborted the whole import on the first bad row.
        print(f"error in importing row #{row_label} with data: {row}\nError: {e}\n")