In [15]:
import pandas as pd
import json

def load_json(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    return data

def json_to_dataframe(data):
    flattened_data = []
    for entry in data:
        for key, value in entry.items():
            flat_entry = {'Gloss ID': key}
            flat_entry.update(value)
            flattened_data.append(flat_entry)
    return pd.DataFrame(flattened_data)

json_file_path = 'glosses_meta.json'
data = load_json(json_file_path)
df = json_to_dataframe(data)

df.head()

Unnamed: 0,Gloss ID,Lemma ID Gloss: Dutch,Annotation ID Gloss: Dutch,Annotation ID Gloss: English,Senses: Dutch,Annotation Instructions,Handedness,Strong Hand,Strong Hand Letter,In The Web Dictionary,...,Iconic Image,Named Entity,Weak Prop,Mouth Gesture,Weak Hand Letter,Simultaneous Morphology,NME Videos,Concepticon Concept Set,Blend Morphology,Mouthing
0,3808,#A,#A,#A,{'1': 'a...'},"Staat voor diverse concepten, waarvan de Neder...",1,A,True,True,...,,,,,,,,,,
1,3809,#B,#B,#B,{'1': 'b...'},"Staat voor diverse concepten, waarvan de Neder...",1,B,True,True,...,,,,,,,,,,
2,3810,#C,#C,#C,{'1': 'c...'},"Staat voor diverse concepten, waarvan de Neder...",1,C,True,True,...,,,,,,,,,,
3,3811,#D,#D,#D,{'1': 'd...'},"Staat voor diverse concepten, waarvan de Neder...",1,1,True,True,...,,,,,,,,,,
4,3812,#E,#E,#E,{'1': 'e...'},"Staat voor diverse concepten, waarvan de Neder...",1,E,True,True,...,,,,,,,,,,


## Get Set of Data from Directory 

In [16]:
# Evaluate if entry exists in available data

# Check if datapoint is available 
import os

# Define the subdirectory where your .hamer files are stored
subdirectory = '../../../../signbank_videos/segmented_videos/output'

# Function to check if the file exists for a given Lemma ID Gloss
def file_exists_normalized(row):
    filename = f"normalized_{row['Lemma ID Gloss: Dutch']}_segment.json"
    file_path = os.path.join(subdirectory, filename)
    
    return os.path.isfile(file_path)

# Filter the DataFrame based on whether the file exists
n_df = df[df.apply(file_exists_normalized, axis=1)]

# Define the subdirectory where your .hamer files are stored
subdirectory = '../../../../signbank_videos/segmented_videos'

# Function to check if the file exists for a given Lemma ID Gloss
def file_exists_segmented(row):
    filename = f"{row['Lemma ID Gloss: Dutch']}_segment.hamer"
    file_path = os.path.join(subdirectory, filename)

    return os.path.isfile(file_path)

# Filter the DataFrame based on whether the file exists
s_df = df[df.apply(file_exists_segmented, axis=1)]

print('normalized:', n_df.shape)
print('segmented:', s_df.shape)

print('The minimum set is n_df')
common_df = n_df

normalized: (5257, 50)
segmented: (5286, 50)
The minimum set is n_df


In [17]:
print(common_df.keys())

Index(['Gloss ID', 'Lemma ID Gloss: Dutch', 'Annotation ID Gloss: Dutch',
       'Annotation ID Gloss: English', 'Senses: Dutch',
       'Annotation Instructions', 'Handedness', 'Strong Hand',
       'Strong Hand Letter', 'In The Web Dictionary',
       'Is This A Proposed New Sign?', 'Exclude From Ecv', 'Repeated Movement',
       'Alternating Movement', 'Link', 'Video', 'Affiliation',
       'Senses: English', 'Weak Hand', 'Relative Orientation: Movement',
       'Movement Direction', 'Tags', 'Movement Shape', 'Orientation Change',
       'Location', 'Lemma ID Gloss: English', 'Virtual Object',
       'Relative Orientation: Location', 'Strong Hand Number',
       'Handshape Change', 'Weak Hand Number', 'Phonetic Variation', 'Notes',
       'Contact Type', 'Sequential Morphology', 'Semantic Field', 'Word Class',
       'Weak Drop', 'Relation Between Articulators', 'Phonology Other',
       'Iconic Image', 'Named Entity', 'Weak Prop', 'Mouth Gesture',
       'Weak Hand Letter', 'Simult

## Properties of the dataset

In [18]:
print(common_df.value_counts('Strong Hand'))
print(common_df.value_counts('Weak Hand'))

Strong Hand
B        708
1        455
S        360
5        348
Money    239
        ... 
M > V      1
B > S      1
C > R      1
N > i      1
Y > R      1
Name: count, Length: 92, dtype: int64
Weak Hand
B                   650
S                   240
5                   237
1                   169
C                   142
C_spread            107
T                    87
Money                86
V                    61
A                    57
Beak_open            50
N                    48
Baby_C               47
O                    40
B_bent               36
B_curved             27
L                    25
1_curved             24
V_curved             23
Beak                 21
Y                    20
4                    20
Baby_beak_open       19
W                    19
Baby_O               13
Variable              8
Beak_spread           8
5_claw                8
K                     8
I                     7
Baby_beak             7
Beak_open_spread      7
3                     6
5m_cl

In [19]:
print(common_df.value_counts('Handedness'))

Handedness
1      2179
2s     1604
2a      756
2t       42
N/A       3
Name: count, dtype: int64


In [20]:
# Get only classes with more than x training examples
def select_num_classes(df, num_classes = 35):
    strong_hand_counts = df['Strong Hand'].value_counts()
    print("\nValue counts for 'Strong Hand':")
    print(strong_hand_counts[0:num_classes])
    print(sum(strong_hand_counts[0:num_classes]))

    top_values = strong_hand_counts.head(num_classes).index
    # Filter the DataFrame to include only rows where 'Strong Hand' is in the top values
    return df[df['Strong Hand'].isin(top_values)].copy()

common_df = select_num_classes(common_df)

print(len(common_df['Strong Hand'].value_counts() ))



Value counts for 'Strong Hand':
Strong Hand
B                   708
1                   455
S                   360
5                   348
Money               239
C_spread            211
T                   199
V                   187
A                   169
C                   155
Baby_C              130
N                   125
1_curved            104
B_bent               86
Beak_open            77
L                    77
W                    74
O                    74
V_curved             64
Beak                 61
Y                    59
Baby_O               47
Baby_beak_open       46
B_curved             44
4                    42
Baby_beak            28
Other                27
Beak_spread          26
K                    23
I                    23
Beak_open_spread     23
M                    23
5m                   22
3                    20
5r                   13
Name: count, dtype: int64
4369
35


In [21]:
def randomize_and_reset_index(df):
    return df.sample(frac=1, random_state=1).reset_index(drop=True)

common_df = randomize_and_reset_index(common_df)

In [22]:
def filter_on_handedness(df, handedness_labels):
    # Only keep signs that match the specified handedness labels (can be a list)
    df = df[df['Handedness'].isin(handedness_labels)]
    print(df['Handedness'].value_counts())  # Print counts of filtered handedness values
    return df

# Filter with a list of handedness labels
filtered_df = filter_on_handedness(common_df, ['1', '2s', '2a'])
print('Total number of datapoints available:', filtered_df.shape[0])


Handedness
1     2057
2s    1535
Name: count, dtype: int64
Total number of datapoints available: 3592


In [23]:
def drop_handshapechanges(df):
    # Drop all Handshape Changes
    df = df[df['Handshape Change'].isna()]
    print(df['Handshape Change'].value_counts(dropna=False))
    return df

filtered_df = drop_handshapechanges(filtered_df)
print('Total number of datapoints available:', filtered_df.shape[0])

Handshape Change
NaN    3003
Name: count, dtype: int64
Total number of datapoints available: 3003


In [24]:
# Replace '.' with '-' in the 'Lemma ID Gloss: Dutch' column
filtered_df['Annotation ID Gloss: Dutch'] = filtered_df['Annotation ID Gloss: Dutch'].str.replace('.', '-')
filtered_df.reset_index(drop=True, inplace=True)

# Add handedness RL Label

In [12]:
# Read handshape results

def read_txt_to_dict(file_path):
    result_dict = {}
    with open(file_path, 'r') as file:
        for line in file:
            # Strip any surrounding whitespace and split by comma
            key_value = line.strip().split(',')
            # Remove spaces from both the key and value
            key = key_value[0].replace(' ', '')
            value = key_value[1].replace(' ', '')
            result_dict[key] = value
    return result_dict

# Example usage:
file_path = '../../results/handedness.txt'  # Replace with your file path
h1_dict = read_txt_to_dict(file_path)
print(h1_dict)


{'TOCH-B': 'R', 'SLAAN-B': 'R', 'BIJVOORBEELD-A': 'R', 'STRING-A': 'R', 'LIANNE-WESTENBERG': 'R', '1000-C': 'R', 'VERZOEKEN': 'R', 'STATION-A': 'R', 'PRAXIS': 'R', 'BEZOEKEN-A:1': 'R', 'GRAPPIG-D': 'R', 'VERDOMME-A': 'R', 'IRAK-B': 'L', 'SINAASAPPEL': 'R', 'HALLO-A': 'R', 'IETS-IN-DE-HAND-SCHUDDEN': 'R', 'ZON-A': 'R', 'IDENTITEIT': 'R', 'STEM-B': 'R', 'DUITSLAND-B': 'R', 'DRONE': 'R', 'WEER': 'R', 'POLEN-C': 'R', 'GLAS-C': 'R', 'BRAZILIE-A': 'R', 'VAN-D': 'R', 'KLOPT-B': 'R', 'NAAM-C': 'R', 'TEKENEN-B': 'R', 'PERSOON-A.PL': 'R', 'GELOVEN-B': 'R', '4-A': 'R', 'ABONNEMENT-B': 'R', 'DOORGEVEN': 'L', 'ZEGGEN': 'R', 'CUBA-A': 'R', 'PERSOONLIJK': 'R', 'EERLIJK-A': 'R', 'CENTRUM': 'R', 'SLECHT-A': 'R', 'STOUT-B': 'R', 'MEE-EENS-A': 'R', 'OVERGEVEN-A': 'R', 'WASSEN-C': 'R', 'HELSINKI': 'R', 'MUIS-D': 'R', 'BEEN-C': 'R', 'GOD': 'R', 'DING': 'R', 'SIMPEL': 'R', 'VRIJDAG-B': 'R', 'AANKLIKKEN': 'R', 'GERAAKT-B': 'R', 'BARENDRECHT': 'R', 'MEISJE-B': 'R', 'LEKKER-C': 'R', 'VERDOMME-B': 'R', 'WEL-B':

In [13]:
def process_glosses(df, h1_dict):
    # Create a list to store processed rows
    processed_rows = []

    # Iterate over each row in the DataFrame
    for _, row in df.iterrows():
        gloss = row['Annotation ID Gloss: Dutch']
        handedness = row['Handedness']

        # If handedness is '1', append handedness from the dictionary
        if handedness == '1':
            try:
                new_gloss = gloss + '-' + h1_dict[gloss]
                row['Annotation ID Gloss: Dutch'] = new_gloss
                processed_rows.append(row)
            except KeyError:
                # If the gloss is not found in the dictionary, discard the row
                continue

        # If handedness is '2s', create two rows with '_R' and '_L'
        elif handedness == '2s':
            # Create a copy of the row for '_R'
            row_r = row.copy()
            row_r['Annotation ID Gloss: Dutch'] = gloss + '-R'
            processed_rows.append(row_r)

            # Create another copy of the row for '_L'
            row_l = row.copy()
            row_l['Annotation ID Gloss: Dutch'] = gloss + '-L'
            processed_rows.append(row_l)

        # If handedness is '2a', do nothing (rows are ignored)
        elif handedness == '2a':
            row_u = row.copy()
            row_u['Annotation ID Gloss: Dutch'] = gloss + '-U'
            processed_rows.append(row_u)

    # Convert the list of processed rows back to a DataFrame
    processed_df = pd.DataFrame(processed_rows)

    return processed_df

# Example usage
processed_df = process_glosses(filtered_df, h1_dict)
processed_df


Unnamed: 0,Gloss ID,Lemma ID Gloss: Dutch,Annotation ID Gloss: Dutch,Annotation ID Gloss: English,Senses: Dutch,Annotation Instructions,Handedness,Strong Hand,Strong Hand Letter,In The Web Dictionary,...,Iconic Image,Named Entity,Weak Prop,Mouth Gesture,Weak Hand Letter,Simultaneous Morphology,NME Videos,Concepticon Concept Set,Blend Morphology,Mouthing
0,2571,LIMONADE,LIMONADE-R,LEMONADE,"{'1': 'limonade, drinken, alcohol, bier, zuipe...",,1,L,True,True,...,,,,,,,,,,
1,480,GEWOON-A,GEWOON-A-R,NORMAL-A,"{'1': 'normaal, regulier, gewoonte, simpel, ee...",,1,1,,True,...,,,,,,,,,,
2,636,DOKTER-B,DOKTER-B-R,DOCTOR-B,"{'1': 'arts, medisch, dokter'}",,1,T,,True,...,,,,,,,,,,
3,4598,OEKRAINE-A,OEKRAINE-A-R,UKRAINE-A,{'1': 'Oekraïne'},,1,Money,,True,...,,Country,,,,,,,,
4,1952,TELEVISIE-C,TELEVISIE-C-R,TELEVISION-C,"{'1': 'TV, beeld, televisie, scherm, beeldsche...",,2s,1,,True,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2998,36911,STAAN-D,STAAN-D-L,STAND-D,{'1': 'staan'},,2s,B,,False,...,,,,,,,,,,
2999,2102,WILLEN-B,WILLEN-B-R,WANT-B,"{'1': 'behoefte, willen, graag, liever'}",,1,B,,True,...,,,,,,,,,,
3000,2866,GRAPPIG-C,GRAPPIG-C-R,FUNNY-C,"{'1': 'humor, grap, grappig, mop, nep, fop, al...",,1,1,,True,...,,,,,,,,,,
3001,48001,EXPRES-A,EXPRES-A-R,ON-PURPOSE-A,{'1': 'expres'},,1,1,,True,...,,,,,,,,,,


# Add Split

In [14]:
# Assign splits
import numpy as np

def add_split(filtered_df):
    # Initialize a column for splits
    filtered_df['split'] = np.nan
    # Shuffle the DataFrame rows
    filtered_df = filtered_df.sample(frac=1, random_state=1).reset_index(drop=True)


    def assign_splits(df, class_label):
        class_df = df[df['Strong Hand'] == class_label]
        
        # Ensure at least 10 examples per class for test and validation if possible
        num_test = 10 
        num_validation = 10
        
        # Assign splits
        df.loc[class_df.index[:num_validation], 'split'] = 'val'
        df.loc[class_df.index[num_validation:num_validation + num_test], 'split'] = 'test'
        
        df.loc[class_df.index[num_test + num_validation:], 'split'] = 'train'



    # Apply splits to each class
    for class_label in filtered_df['Strong Hand'].unique():
        assign_splits(filtered_df, class_label)
    return filtered_df

filtered_df = add_split(processed_df)
filtered_df.head()

Unnamed: 0,Gloss ID,Lemma ID Gloss: Dutch,Annotation ID Gloss: Dutch,Annotation ID Gloss: English,Senses: Dutch,Annotation Instructions,Handedness,Strong Hand,Strong Hand Letter,In The Web Dictionary,...,Named Entity,Weak Prop,Mouth Gesture,Weak Hand Letter,Simultaneous Morphology,NME Videos,Concepticon Concept Set,Blend Morphology,Mouthing,split
0,2640,RAM-A,RAM-A-L,RAM-A,"{'1': 'ram, mannetjesschaap'}",,2s,1,,True,...,,,,,,,,,,val
1,46561,KNUTSELEN,KNUTSELEN-R,CRAFT,{'1': 'knutselen'},,2s,Beak,,False,...,,,,,,,,,,val
2,48217,66-A,66-A-R,66-A,"{'1': '66, zesenzestig'}",,2s,A,,True,...,,,,,,,,,,val
3,1933,KIKKER-B,KIKKER-B-R,FROG-B,{'1': 'kikker'},,1,Y,,True,...,,,,,,,,,,val
4,1214,SPITS-A,SPITS-A-R,SPITS-A,"{'1': 'spits (sport), Spits (dagblad)'}",,2s,B,,True,...,Newspaper,,,,,,,,,val


In [15]:
filtered_df['split'].value_counts()

split
train    3361
val       334
test      287
Name: count, dtype: int64

In [16]:
# Remove corrupted files 
# Path to the text file with video IDs
video_ids_file = 'corrupted.txt'

# Read the video IDs from the text file into a list
with open(video_ids_file, 'r') as file:
    corrupted_video_ids = [line.strip() for line in file.readlines()]

# Filter the DataFrame to only keep rows where 'Lemma ID Gloss: Dutch' is NOT in the list of video IDs
filtered_df = filtered_df[~filtered_df['Lemma ID Gloss: Dutch'].isin(corrupted_video_ids)]

print('Total number of datapoints available:', filtered_df.shape[0])

Total number of datapoints available: 3974


In [17]:
filtered_df.reset_index(drop=True, inplace=True)
filtered_df.value_counts('split')

split
train    3355
val       333
test      286
Name: count, dtype: int64

In [14]:
# Extract unique values from the 'Strong Hand' column
unique_values = filtered_df['Strong Hand'].unique()

# Step 2: Create a dictionary with unique identifiers starting from 1
value_to_id = {value: idx for idx, value in enumerate(unique_values, start=1)}
print(value_to_id)
# Extract unique values from the 'Strong Hand' column
unique_values = filtered_df['Strong Hand'].unique()

# Step 2: Create a dictionary with unique identifiers starting from 1
value_to_id = {value: idx for idx, value in enumerate(unique_values, start=1)}
print(value_to_id)
# Step 3: Add a new column 'letter_id' to filtered_df using the dictionary
filtered_df['letter_id'] = filtered_df['Strong Hand'].map(value_to_id)

# Display the updated DataFrame to verify
print(filtered_df.head(10))
# Step 3: Add a new column 'letter_id' to filtered_df using the dictionary
filtered_df['letter_id'] = filtered_df['Strong Hand'].map(value_to_id)

# Display the updated DataFrame to verify
print(filtered_df.head(10))

{'L': 1, '1': 2, 'T': 3, 'Money': 4, 'S': 5, 'B': 6, 'A': 7, 'Baby_C': 8, 'B_bent': 9, 'Y': 10, 'Beak': 11, 'W': 12, 'O': 13, 'C_spread': 14, 'Baby_beak_open': 15, 'Beak_open': 16, '1_curved': 17, 'K': 18, 'Baby_O': 19, '5': 20, 'V': 21, 'C': 22, 'B_curved': 23, 'N': 24, '5r': 25, 'Baby_beak': 26, '4': 27, 'M': 28, '3': 29, 'V_curved': 30, '5m': 31, 'Other': 32, 'I': 33, 'Beak_spread': 34, 'Beak_open_spread': 35}
  Gloss ID Lemma ID Gloss: Dutch Annotation ID Gloss: Dutch  \
0     2571              LIMONADE                   LIMONADE   
1      480              GEWOON-A                   GEWOON-A   
2      636              DOKTER-B                   DOKTER-B   
3     4598            OEKRAINE-A                 OEKRAINE-A   
4     1952           TELEVISIE-C                TELEVISIE-C   
5    46653                TAIWAN                     TAIWAN   
6      750             ZWEMMEN-B                  ZWEMMEN-B   
7     1873                TAXI-B                     TAXI-B   
8     2216      

# Produce SignClip MetaData

In [19]:
# Ensure letter_id is converted to string for concatenation
filtered_df['letter_id_str'] = filtered_df['letter_id'].astype(str)

# Update 'Lemma ID Gloss: Dutch' to the new format
filtered_df['Lemma ID Gloss: Dutch'] = '0_' + filtered_df['letter_id_str'] + '_0_' + filtered_df['Lemma ID Gloss: Dutch']

# Drop the temporary 'letter_id_str' column as it is no longer needed
filtered_df.drop('letter_id_str', axis=1, inplace=True)

# Display the updated DataFrame to verify
print(filtered_df[['Lemma ID Gloss: Dutch', 'Strong Hand', 'letter_id']].head())


  Lemma ID Gloss: Dutch Strong Hand  letter_id
0           0_1_0_RAM-A           1          1
1       0_2_0_KNUTSELEN        Beak          2
2            0_3_0_66-A           A          3
3        0_4_0_KIKKER-B           Y          4
4         0_5_0_SPITS-A           B          5


In [20]:
def save_to_txt(df, file_path):
    columns_to_save = ['Gloss ID', 'Lemma ID Gloss: Dutch', 'Strong Hand', 'split']
    df_to_save = df[columns_to_save]
    
    with open(file_path, 'w', encoding='utf-8') as file:
        for index, row in df_to_save.iterrows():
            line = ', '.join(row.astype(str))
            file.write(line + '\n')


# Save the filtered DataFrame to a text file
txt_file_path = 'metadata_1_2s.txt'
save_to_txt(filtered_df, txt_file_path)
print(f"\nFiltered data has been written to {txt_file_path}")


Filtered data has been written to metadata_1_2s.txt


In [23]:
def save_split_files(df):
    # Define file paths for each split
    split_files = {
        'test': 'test.txt',
        'validation': 'val.txt',
        'training': 'train.txt'
    }
    
    # Write the 'Lemma ID Gloss: Dutch' column for each split to corresponding files
    for split, file_path in split_files.items():
        # Filter the DataFrame for the current split
        split_df = df[df['split'] == split]
        print(split)
        print(split_df)
        
        # Write to the text file
        with open(file_path, 'w', encoding='utf-8') as file:
            for lemma in split_df['Lemma ID Gloss: Dutch']:
                file.write(lemma + '\n')

# Save the split files
save_split_files(filtered_df)

     Gloss ID Lemma ID Gloss: Dutch Annotation ID Gloss: Dutch  \
105      4349     0_16_0_STROOIEN-A               STROOIEN-A-R   
112     46334       0_16_0_VLECHTEN                 VLECHTEN-R   
113     47708         0_9_0_RUGBY-B                  RUGBY-B-R   
114      4026     0_5_0_DOORSNIJDEN              DOORSNIJDEN-R   
115      1458    0_16_0_AANKLEDEN-C              AANKLEDEN-C-R   
...       ...                   ...                        ...   
3774      630   0_22_0_ONTSPANNEN-D             ONTSPANNEN-D-L   
3808     1318     0_27_0_BEVRUCHTEN               BEVRUCHTEN-L   
3828    48078           0_24_0_33-B                     33-B-R   
3869     1539         0_32_0_ROZE-A                   ROZE-A-R   
3887    44134    0_31_0_VERKOUDEN-B              VERKOUDEN-B-R   

     Annotation ID Gloss: English  \
105                       STREW-A   
112                         BRAID   
113                       RUGBY-B   
114                   CUT-IN-HALF   
115                   

# Get json file

In [21]:
import json
import pandas as pd
import numpy as np

# Define the file paths
output_json = 'metadata_1_2s_2a.json'  # Path where the output JSON file will be saved


In [22]:
len(filtered_df['Strong Hand'].value_counts())

35

In [23]:
def get_instance(row, handshape):
    return {
            "bbox": [-1, -1, -1, -1],
            "fps": 25,
            "frame_end": -1,
            "frame_start": 1,
            "instance_id": -1,
            "signer_id": -1,
            "source": "SB",
            "split": row['split'],
            "url": "NA",
            "variation_id": -1,
            "video_id": row['Annotation ID Gloss: Dutch'],
            "camera_view": 1,
            "Minor Location": row['Relative Orientation: Location'],
            "Handshape": row['Strong Hand'],  # Handshape now stripped of spaces
            "Flexion": np.nan,
            "Spread": -1,
            "Sign Type": row['Handedness'],
            "Second Minor Location": np.nan,
            "Nondominant Handshape": row['Weak Hand'],
            "Sign Offset": -1,
            "Handshape Morpheme 2": np.nan,
            "Thumb Position": np.nan,
            "Major Location": row['Location'],
            "Path Movement": row['Movement Shape'],
            "Repeated Movement": row['Repeated Movement'],
            "Spread Change": -1,
            "Wrist Twist": -1,
            "Thumb Contact": -1,
            "Sign Onset": -1,
            "Contact": -1,
            "Selected Fingers": np.nan,
            "gloss": str(handshape)
        }

In [24]:
filtered_df['letter_id'].unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35])

In [25]:
# Initialize a list to hold the final metadata
metadata = []

# we use the id gloss and not handshape as this is how it is commonly processed in current models
# Iterate over unique glosses
ul = 0
n_datapoints = 0
for handshape in filtered_df['letter_id'].unique():
    gloss_data = {
        "gloss": str(handshape),
        "instances": []
    }
    
    # Filter rows for the current gloss
    gloss_rows = filtered_df[filtered_df['letter_id'] == handshape]
    
    # Iterate over the rows for this gloss and populate instance information
    for _, row in gloss_rows.iterrows():
        instance_data = get_instance(row, handshape)
        gloss_data['instances'].append(instance_data)
        n_datapoints += 1
    
    # Append the gloss data to the metadata list
    metadata.append(gloss_data)

print('Unaccounted for Signs ', ul)
print('Number of datapoints ', n_datapoints)
# Save the metadata to a JSON file
with open(output_json, 'w') as json_file:
    json.dump(metadata, json_file, indent=4)

Unaccounted for Signs  0
Number of datapoints  4620
