# Building Vocabulary

This notebook was used to define the words selected as target vocabulary.

In [None]:
import os
import pandas as pd

# Step one level up from the directory the kernel started in (assumed to be a
# sub-folder of the project root). NOTE: running this cell again moves up one
# more level, so run it only once per kernel session.
current_dir = os.getcwd()
os.chdir(os.path.join(current_dir, os.pardir))
os.getcwd()


In [2]:
def text_normalization(text):
    """Turn a label into a lowercase, underscore-separated key.

    The text is lowercased and stripped of surrounding whitespace; afterwards
    every remaining newline, carriage return, tab, hyphen and space is replaced
    by a single underscore (runs of separators are not collapsed).
    """
    separators_to_underscore = str.maketrans(
        {separator: "_" for separator in ("\n", "\r", "\t", "-", " ")}
    )
    return text.lower().strip().translate(separators_to_underscore)

In [3]:
# Load the manually selected video variants and discard exact duplicate rows
vocab_raw = pd.read_json("data/raw/selected_video_variants.json")
vocab_df = vocab_raw.drop_duplicates()
# Sanity check shown as the cell output: number of duplicated rows still present
vocab_df.duplicated().sum()

0

In [4]:
# Per word: "nunique" = number of distinct datasets, "count" = number of rows
# (selected video options) in vocab_df; most frequent words first.
datasets_per_word = vocab_df.groupby("word")["dataset"]
count_data = datasets_per_word.agg(["nunique", "count"]).sort_values(
    ["count", "nunique"], ascending=False
)

# How many words appear in exactly 3 datasets, broken down by number of options
in_three_datasets = count_data["nunique"] == 3
for n_options in range(3, 7):
    print(f"Words with {n_options} options and 3 datasets:")
    matching_words = count_data[in_three_datasets & (count_data["count"] == n_options)]
    display(matching_words.shape[0])

Words with 3 options and 3 datasets:


28

Words with 4 options and 3 datasets:


55

Words with 5 options and 3 datasets:


218

Words with 6 options and 3 datasets:


20

In [5]:
# Candidate words: found in exactly 3 datasets and backed by at least 5 rows,
# returned as an alphabetically sorted index.
has_three_datasets = count_data["nunique"] == 3
has_enough_options = count_data["count"] >= 5
candidates = count_data[has_three_datasets & has_enough_options].sort_index().index

In [6]:
# Export the candidate words, one per line
with open("data/interim/word_candidates.txt", "w") as f:
    f.writelines(word + "\n" for word in candidates)

In [7]:
# Read the curated candidate file, one whitespace-stripped entry per line
best_candidates = []
with open("data/interim/best_candidates.txt", "r") as f:
    for line in f:
        best_candidates.append(line.strip())

In [8]:
# Keep only the text before the first ":" of every entry
best_candidates = [entry.partition(":")[0] for entry in best_candidates]
print(f"Number of best candidates: {len(best_candidates)}")

Number of best candidates: 66


In [9]:
BASE_DIR = os.getcwd() + "/data/raw/"

# Set paths to the raw data files
ne_path = BASE_DIR + "INES/"
sb_path = BASE_DIR + "SignBank/"
uf_path = BASE_DIR + "UFV/"
vl_path = BASE_DIR + "V-Librasil/"


def _read_metadata(dataset_dir, dataset_name):
    """Load ``metadata.csv`` from ``dataset_dir``.

    Args:
        dataset_dir: directory path ending with a path separator.
        dataset_name: human-readable name used in the "not found" notice.

    Returns:
        The metadata as a DataFrame, or ``None`` (after printing a notice)
        when the file does not exist.
    """
    try:
        return pd.read_csv(dataset_dir + "metadata.csv")
    except FileNotFoundError:
        print(f"{dataset_name} metadata not found")
        return None


# Every name is always (re)bound, so a missing file can never leave a frame
# from an earlier run silently in place.
ne_raw_df = _read_metadata(ne_path, "INES")
sb_raw_df = _read_metadata(sb_path, "SignBank")
uf_raw_df = _read_metadata(uf_path, "UFV")
vl_raw_df = _read_metadata(vl_path, "V-Librasil")

In [10]:
def _dedupe_and_normalize(raw_df):
    """Return a new frame without exact duplicate rows and with normalized labels.

    ``assign`` builds a fresh DataFrame, so the caller can add further columns
    without writing into a selection of the raw frame.
    """
    deduped = raw_df.drop_duplicates(keep="first")
    return deduped.assign(label=deduped["label"].apply(text_normalization))


# INES: only rows flagged by "file_exists" are kept before cleaning
ne_df = _dedupe_and_normalize(ne_raw_df[ne_raw_df["file_exists"]])
# First run of digits found in "scraped_label"; 0 when the label has no digits
ne_df["label_number"] = (
    ne_df["scraped_label"]
    .str.extract(r"(\d+)", expand=False)
    .astype(float)
    .fillna(0)
    .astype(int)
)

sb_df = _dedupe_and_normalize(sb_raw_df)

vl_df = _dedupe_and_normalize(vl_raw_df)

uf_df = _dedupe_and_normalize(uf_raw_df)

In [11]:
# Value distribution of the two UFV flag columns: the first is rendered with
# display(), the second is the cell's own output.
display(uf_df["url_available"].value_counts())
uf_df["word_in_url"].value_counts()

url_available
True    919
Name: count, dtype: int64

word_in_url
True    919
Name: count, dtype: int64

In [12]:
# Inspect the de-duplicated selection table (pandas truncates the middle rows)
vocab_df

Unnamed: 0,dataset,word,chosen_video
0,INES,abacaxi,0
1,SIGNBANK,abacaxi,1
2,INES,abanar,0
3,SIGNBANK,abanar,1
4,INES,abandonar,1
...,...,...,...
2058,SIGNBANK,zíper,1
2059,INES,zíper,0
2060,V-LIBRASIL,zíper,2
2061,V-LIBRASIL,zíper,1


In [13]:
# Two-letter codes used to match vocab_df["dataset"] against the "data_source"
# column of the metadata frames.
source_map = dict(
    [
        ("INES", "ne"),
        ("SIGNBANK", "sb"),
        ("UFV", "uf"),
        ("V-LIBRASIL", "vl"),
    ]
)
dataset_codes = vocab_df["dataset"].map(source_map)
normalized_words = vocab_df["word"].apply(text_normalization)
vocab_df["source"] = dataset_codes
vocab_df["word"] = normalized_words
vocab_df.head()

Unnamed: 0,dataset,word,chosen_video,source
0,INES,abacaxi,0,ne
1,SIGNBANK,abacaxi,1,sb
2,INES,abanar,0,ne
3,SIGNBANK,abanar,1,sb
4,INES,abandonar,1,ne


In [14]:
# Compare the columns available in the INES, SignBank and V-Librasil frames
ne_df.columns, sb_df.columns, vl_df.columns

(Index(['label', 'video_url', 'signer_number', 'data_source', 'scraped_label',
        'number_in_label', 'file_exists', 'letter', 'assuntos', 'acepção',
        'exemplo', 'exemplo libras', 'classe gramatical', 'origem',
        'label_number'],
       dtype='object'),
 Index(['label', 'video_url', 'signer_number', 'data_source', 'scraped_label',
        'scraped_video_url', 'sign_variant', 'signer_number.1',
        'video_url_root', 'video_url_ext', 'number_in_label'],
       dtype='object'),
 Index(['label', 'video_url', 'signer_number', 'data_source', 'sign_url',
        'signer_order'],
       dtype='object'))

In [15]:
# Each frame is built in one chain (select -> rename/assign) so that no column
# is written into a selection of the cleaned frames and nothing is mutated in place.

# INES: "label_number" serves as sign_id; signer_number is set to 1 for every row
selected_ne_df = (
    ne_df.loc[:, ["label", "label_number", "data_source", "video_url"]]
    .rename(columns={"label_number": "sign_id"})
    .assign(signer_number=1)
)

# SignBank: "sign_variant" serves as sign_id
selected_sb_df = sb_df.loc[
    :, ["label", "data_source", "video_url", "sign_variant", "signer_number"]
].rename(columns={"sign_variant": "sign_id"})

# temporarily use signer_number as sign_id, since selected_video_variants.json uses it that way
selected_vl_df = vl_df.loc[
    :, ["label", "data_source", "video_url", "signer_number"]
].assign(sign_id=lambda frame: frame["signer_number"])

In [16]:
# Stack the three per-dataset frames with a shared column layout
column_order = ["label", "data_source", "video_url", "signer_number", "sign_id"]
frames_to_stack = [
    frame[column_order]
    for frame in (selected_ne_df, selected_sb_df, selected_vl_df)
]
combined_df = pd.concat(frames_to_stack, axis=0)
combined_df.head()

Unnamed: 0,label,data_source,video_url,signer_number,sign_id
0,a,ne,https://www.ines.gov.br/dicionario-de-libras/p...,1,0
1,abacate,ne,https://www.ines.gov.br/dicionario-de-libras/p...,1,0
2,abacaxi,ne,https://www.ines.gov.br/dicionario-de-libras/p...,1,0
3,abafar,ne,https://www.ines.gov.br/dicionario-de-libras/p...,1,0
4,abaixo,ne,https://www.ines.gov.br/dicionario-de-libras/p...,1,0


In [17]:
# Keep only the videos that were picked in vocab_df: match on
# (word, dataset code, chosen video id), then order rows deterministically.
metadata_keys = ["label", "data_source", "sign_id"]
selection_keys = ["word", "source", "chosen_video"]
full_data = (
    pd.merge(
        combined_df,
        vocab_df,
        how="inner",
        left_on=metadata_keys,
        right_on=selection_keys,
    )
    .sort_values(["label", "data_source", "sign_id"])
    .reset_index(drop=True)
)
display(full_data.head())
print(full_data.shape)

Unnamed: 0,label,data_source,video_url,signer_number,sign_id,dataset,word,chosen_video,source
0,abacaxi,ne,https://www.ines.gov.br/dicionario-de-libras/p...,1,0,INES,abacaxi,0,ne
1,abacaxi,sb,https://videos.nals.cce.ufsc.br/SignBank/Vídeo...,2,1,SIGNBANK,abacaxi,1,sb
2,abacaxi,vl,https://libras.cin.ufpe.br/storage/videos/2021...,1,1,V-LIBRASIL,abacaxi,1,vl
3,abacaxi,vl,https://libras.cin.ufpe.br/storage/videos/2021...,2,2,V-LIBRASIL,abacaxi,2,vl
4,abacaxi,vl,https://libras.cin.ufpe.br/storage/videos/2021...,3,3,V-LIBRASIL,abacaxi,3,vl


(2047, 9)


In [18]:
# One label per line of the curated file: the stripped text before the first ":"
with open("data/interim/best_candidates.txt", "r") as f:
    selected_labels = [line.strip().split(":")[0] for line in f]
len(selected_labels)

66

In [19]:
# Restrict to the curated labels and drop the columns that came from vocab_df
# during the merge ("word" was previously listed twice in this drop list).
is_selected_label = full_data["label"].isin(selected_labels)
selected_data = full_data[is_selected_label].drop(
    columns=["dataset", "word", "source", "chosen_video"]
)
display(selected_data.head())
print(selected_data.shape)

Unnamed: 0,label,data_source,video_url,signer_number,sign_id
0,abacaxi,ne,https://www.ines.gov.br/dicionario-de-libras/p...,1,0
1,abacaxi,sb,https://videos.nals.cce.ufsc.br/SignBank/Vídeo...,2,1
2,abacaxi,vl,https://libras.cin.ufpe.br/storage/videos/2021...,1,1
3,abacaxi,vl,https://libras.cin.ufpe.br/storage/videos/2021...,2,2
4,abacaxi,vl,https://libras.cin.ufpe.br/storage/videos/2021...,3,3


(333, 5)


## Compare UFV to rest of datasets

This section combines the data from all 4 datasets and produces the final target selection.

In [20]:
# UFV rows whose label is among the selected words, sorted by label and
# stripped of the columns not used in the comparison.
word_candidates = selected_data["label"].unique()
ufv_unused_columns = [
    "Unnamed: 0",
    "url_available",
    "word_in_url",
    "hand_gesture",
    "example_libras",
    "category",
]
uf_subset_df = (
    uf_df[uf_df["label"].isin(word_candidates)]
    .sort_values("label")
    .reset_index(drop=True)
    .drop(columns=ufv_unused_columns)
)
uf_subset_df

Unnamed: 0,label,example_pt,video_url,decoded_word
0,abacaxi,Eu vendo abacaxi.,https://sistemas.cead.ufv.br/capes/dicionario/...,abacaxi
1,adulto,Na idade de 21 anos já é adulto.,https://sistemas.cead.ufv.br/capes/dicionario/...,adulto
2,ajudar,Posso ajudar você a fazer o doce.,https://sistemas.cead.ufv.br/capes/dicionario/...,ajudar
3,animal,No zoológico há vários animais.,https://sistemas.cead.ufv.br/capes/dicionario/...,animal
4,aniversário,O aniversário dela é amanhã.,https://sistemas.cead.ufv.br/capes/dicionario/...,aniversario
5,ano,O ano tem doze meses.,https://sistemas.cead.ufv.br/capes/dicionario/...,ano
6,banana,Não gosto de vitamina de banana.,https://sistemas.cead.ufv.br/capes/dicionario/...,banana
7,banheiro,Meu apartamento tem três banheiros grandes.,https://sistemas.cead.ufv.br/capes/dicionario/...,banheiro
8,bebê,"Adoro bebês, eles são muito fofos.",https://sistemas.cead.ufv.br/capes/dicionario/...,bebe
9,cabeça,Eu sempre tenho dores de cabeça.,https://sistemas.cead.ufv.br/capes/dicionario/...,cabeca


In [21]:
# Distinct labels present in the UFV subset
subset_words = uf_subset_df["label"].unique()

In [21]:
import json
import ipywidgets as widgets
from IPython.display import display, HTML
from ipywidgets import interact
import re


# Alphabetically ordered list of the words offered for review
selected_vocab = sorted(subset_words)

# JSON file path
json_file = "data/raw/combined/words_in_all_datasets.json"

# Resume from previously saved selections; a missing file or one that is not
# valid JSON (e.g. empty) means starting with an empty list.
selected_words = []
if os.path.exists(json_file):
    with open(json_file, "r") as f:
        try:
            selected_words = json.load(f)
        except json.JSONDecodeError:
            selected_words = []


def save_selection(dataset, word, match):
    """Record a selection and persist every selection made so far.

    Args:
        dataset: name of the dataset the selection refers to.
        word: the word that was selected.
        match: value stored under the "video_match" key.

    The record is added to the module-level ``selected_words`` list unless an
    identical record is already present (e.g. the same button was clicked
    twice). ``json_file`` is then rewritten with the complete list.
    """
    entry = {"dataset": dataset, "word": word, "video_match": match}

    # Skip exact repeats so repeated clicks do not pile up duplicate records
    if entry not in selected_words:
        selected_words.append(entry)

    # The file always holds the full list, so it is rewritten from scratch
    with open(json_file, "w") as f:
        json.dump(selected_words, f, indent=4)


def _video_html(video_url, playback_rate):
    """Return the ``<video>`` markup for one clip.

    The URL is HTML-escaped before it is placed in the ``src`` attribute so that
    characters such as ``&`` or ``"`` cannot break the markup.
    """
    import html  # local import: the notebook's import cells do not provide it

    src = html.escape(str(video_url), quote=True)
    return (
        f'<video width=400 controls onloadedmetadata="this.playbackRate={playback_rate}">'
        f'<source src="{src}" type="video/mp4"></video>'
    )


def _show_source_videos(word, data_source, heading, caption, playback_rate):
    """Show a heading and every ``selected_data`` clip of ``word`` for one dataset code."""
    display(HTML(f"<h4>{heading}</h4>"))
    matches = selected_data[
        (selected_data["label"] == word) & (selected_data["data_source"] == data_source)
    ]
    for _, row in matches.iterrows():
        print(caption, row["sign_id"])
        display(HTML(_video_html(row["video_url"], playback_rate)))


def display_video(word):
    """Display the clips of ``word`` from all four datasets for manual review.

    V-Librasil, INES and SignBank clips come from ``selected_data``; UFV clips
    come from ``uf_subset_df`` and each is followed by a button that calls
    ``save_selection("UFV", word, True)`` when clicked.

    Args:
        word: a label contained in ``selected_vocab``.

    Raises:
        ValueError: if ``word`` is not in ``selected_vocab``.
    """
    import html  # local import: the notebook's import cells do not provide it

    display(HTML(f"<h3>Selected Word: {html.escape(str(word))}</h3>"))

    # Progress indicator: 1-based position of the word in the sorted vocabulary
    position = sorted(selected_vocab).index(word) + 1
    total = len(selected_vocab)
    display(HTML(f"<h4>Word {position}/{total} ({position / total:.2%})</h4>"))

    # V-LIBRASIL videos
    _show_source_videos(word, "vl", "V-Librasil", "Signer: ", 2)

    # INES videos
    print("-" * 10)
    _show_source_videos(word, "ne", "INES", "Definition ", 1.5)

    # SIGNBANK videos
    print("-" * 10)
    _show_source_videos(word, "sb", "SignBank", "Sign variant: ", 1.5)

    # UFV videos
    print("-" * 10)
    display(HTML("<h4>UFV</h4>"))
    for _, row in uf_subset_df[uf_subset_df["label"] == word].iterrows():
        display(HTML(_video_html(row["video_url"], 1.5)))
        btn = widgets.Button(description=f"Choose {word}")
        # Default arguments bind the current word to this button's callback
        btn.on_click(
            lambda b, dataset="UFV", w=word, m=True: save_selection(dataset, w, m)
        )
        display(btn)


# Interactive dropdown for word selection: display_video is called with the
# chosen word; the trailing ";" suppresses the cell's return value.
interact(display_video, word=sorted(selected_vocab));


interactive(children=(Dropdown(description='word', options=('abacaxi', 'adulto', 'ajudar', 'animal', 'aniversá…

In [22]:
# Give the UFV rows the same columns as selected_data. errors="ignore" keeps the
# cell re-runnable: on a second run the two columns are already gone.
uf_subset_df = uf_subset_df.drop(
    columns=["example_pt", "decoded_word"], errors="ignore"
).assign(data_source="uf", sign_id=0, signer_number=1)

In [23]:
# Append the UFV rows to the videos selected from the other datasets
combined_metadata = pd.concat([selected_data, uf_subset_df], axis=0)
# Set sign_id to 0 for V-Librasil videos to reflect them all having the same sign
is_vlibrasil = combined_metadata["data_source"] == "vl"
combined_metadata.loc[is_vlibrasil, "sign_id"] = 0
combined_metadata

Unnamed: 0,label,data_source,video_url,signer_number,sign_id
0,abacaxi,ne,https://www.ines.gov.br/dicionario-de-libras/p...,1,0
1,abacaxi,sb,https://videos.nals.cce.ufsc.br/SignBank/Vídeo...,2,1
2,abacaxi,vl,https://libras.cin.ufpe.br/storage/videos/2021...,1,0
3,abacaxi,vl,https://libras.cin.ufpe.br/storage/videos/2021...,2,0
4,abacaxi,vl,https://libras.cin.ufpe.br/storage/videos/2021...,3,0
...,...,...,...,...,...
24,pai,uf,https://sistemas.cead.ufv.br/capes/dicionario/...,1,0
25,pessoa,uf,https://sistemas.cead.ufv.br/capes/dicionario/...,1,0
26,sopa,uf,https://sistemas.cead.ufv.br/capes/dicionario/...,1,0
27,sorvete,uf,https://sistemas.cead.ufv.br/capes/dicionario/...,1,0


In [24]:
# Keep only the words recorded in the selections JSON file, in a stable order
reviewed_words = pd.read_json("data/raw/combined/words_in_all_datasets.json")
was_reviewed = combined_metadata["label"].isin(reviewed_words["word"])
combined_metadata = (
    combined_metadata[was_reviewed]
    .sort_values(["label", "data_source", "sign_id"])
    .reset_index(drop=True)
)
combined_metadata

Unnamed: 0,label,data_source,video_url,signer_number,sign_id
0,ajudar,ne,https://www.ines.gov.br/dicionario-de-libras/p...,1,2
1,ajudar,sb,https://videos.nals.cce.ufsc.br/SignBank/V%C3%...,1,1
2,ajudar,uf,https://sistemas.cead.ufv.br/capes/dicionario/...,1,0
3,ajudar,vl,https://libras.cin.ufpe.br/storage/videos/2021...,1,0
4,ajudar,vl,https://libras.cin.ufpe.br/storage/videos/2021...,2,0
...,...,...,...,...,...
145,vagina,sb,https://videos.nals.cce.ufsc.br/SignBank/V%C3%...,1,1
146,vagina,uf,https://sistemas.cead.ufv.br/capes/dicionario/...,1,0
147,vagina,vl,https://libras.cin.ufpe.br/storage/videos/2021...,1,0
148,vagina,vl,https://libras.cin.ufpe.br/storage/videos/2021...,2,0


In [25]:
# Write the combined metadata to CSV without the DataFrame index
combined_metadata.to_csv("data/raw/combined/metadata_combined.csv", index=False)

In [None]:
# data/interim/target_words.txt has since been annotated by hand with English
# meanings, so the export below is kept commented out to avoid overwriting it.
# ---
#  with open("data/interim/target_words.txt", "w") as f:
#     for word in sorted(reviewed_words["word"].unique()):
#         f.write(word + "\n")