In [1]:
import logging
from pathlib import Path

import pandas as pd
from pangres import upsert
from sqlalchemy import create_engine, text

# Config

In [2]:
# Tunable parameters for the sampling run.
NB_SAMPLES = 1000  # number of clips to draw (capped later at the number of available clips)
SEED = 123         # random_state passed to DataFrame.sample for reproducibility
LANGUAGE = 'it'    # language code; also used as the suffix of the output table name

# Dataset directory, derived from LANGUAGE so the data folder and the table name
# cannot drift apart. Resolved against the current working directory.
# NOTE(review): assumes the dataset folder is named after the language code — confirm.
BASE_PATH = Path(LANGUAGE).resolve()

# SQLite database file (relative to the working directory) and its engine.
DB_PATH = 'cv.sqlite3'
engine = create_engine(f'sqlite:///{DB_PATH}')

# Get table containing clip paths

In [3]:
# Read only the columns needed downstream from the validated-clips TSV.
validated_tsv = BASE_PATH / 'validated.tsv'
df_commons = pd.read_csv(validated_tsv, sep='\t', usecols=['path', 'sentence_id', 'sentence'])
df_commons = df_commons.convert_dtypes()

# The clip path becomes the index, so it must not contain duplicates.
assert df_commons['path'].is_unique
df_commons = df_commons.set_index('path')
df_commons.head()

Unnamed: 0_level_0,sentence_id,sentence
path,Unnamed: 1_level_1,Unnamed: 2_level_1
common_voice_it_23606167.mp3,14e6f7d4a8eaf63cb2c1e91d712fe636aec623a466d411...,Il libro ha suscitato molte polemiche a causa ...
common_voice_it_20045040.mp3,0743b8daf1ffe9887c5ba462b5cd81a35f679ea0c2e2e3...,Fin dall'inizio la sede episcopale è stata imm...
common_voice_it_26970935.mp3,302c15d1ad4f39b8a961e3d2cee6732ed6dbf2c5af0730...,"Fu il fondatore di molti chiostri, ospedali e ..."
common_voice_it_17544185.mp3,9cb2b99c1a6d21b8c4280f85ae051b8532401392e7b3f1...,Il vuoto assoluto?
common_voice_it_20042813.mp3,0734e569c8853e509186ad2e91c511d46d9283df8dd715...,"Dopo alcuni anni, egli decise di tornare in In..."


# Draw a random sample of `NB_SAMPLES` clips

In [4]:
# Logger for this notebook; without this definition the fallback branch below
# would raise NameError instead of emitting the warning.
logger = logging.getLogger(__name__)

# DataFrame.sample raises if asked for more rows than exist (without replacement),
# so cap the requested sample size at the number of available clips.
nb_clips = len(df_commons)
if nb_clips < NB_SAMPLES:
    logger.warning(f'NB_SAMPLES ({NB_SAMPLES}) > number of clips ({nb_clips}) | We will use all clips instead')
    NB_SAMPLES = nb_clips

# Seeded draw so the same clips are selected on every run.
df_commons_sample = df_commons.sample(NB_SAMPLES, random_state=SEED)
df_commons_sample.head()

Unnamed: 0_level_0,sentence_id,sentence
path,Unnamed: 1_level_1,Unnamed: 2_level_1
common_voice_it_32671878.mp3,34d62b82cce0334fc156e6754e9320e8b4d6d09152eb5c...,Beth accetta.
common_voice_it_19983792.mp3,0234ccc4d8569b1d653055cb5884924d53b6b13692175f...,Il pezzo ha avuto moltissimo successo nel mondo.
common_voice_it_23989089.mp3,190f431e0c7145d70702b4e7b8e582dbfb8adc5adb5edc...,La camera funebre ha volta piatta con i lati r...
common_voice_it_21262721.mp3,1075be9d8faa8faed772694349f4f432b4593f5f5c70da...,Dopo questo album la band si sciolse.
common_voice_it_20003259.mp3,04468068bfee66d8f8075cc871fc2ed5653485a33d56da...,Egli introdusse un nuovo sistema di scommesse ...


# Save the sample to the database

In [5]:
table_name = f'samples_{LANGUAGE}'

# Quote the identifier with the dialect's own rules so that language codes
# containing characters such as '-' (e.g. 'zh-CN') still yield valid SQL.
# Plain names like 'samples_it' are left unquoted.
quoted_table_name = engine.dialect.identifier_preparer.quote(table_name)

# Start from a clean table. engine.begin() commits when the block exits without
# error; a bare engine.connect() would need an explicit commit() under
# SQLAlchemy 2.0-style connections.
with engine.begin() as connection:
    connection.execute(text(f'DROP TABLE IF EXISTS {quoted_table_name};'))

# Write the sampled rows to the table; the DataFrame index ('path') serves as the key.
upsert(con=engine, df=df_commons_sample,
       table_name=table_name, if_row_exists='update')