In [240]:
##pip install spacy
##python -m spacy download en_core_web_sm

### Search: rules for glossing in ASL 
ChatGPT: what are the rules for glossing in ASL

### To gloss American Sign Language (ASL) using Python and SpaCy, you would need to set up a system to process the input text according to the rules of ASL glossing. Below are the basic rules for ASL glossing followed by a Python code example using SpaCy.
**ASL Glossing Rules:**
1. Uppercase Letters: Write each ASL sign in uppercase letters.\ DONE
2. Non-Manual Signals (NMS): Indicate non-manual signals such as facial expressions or body movements above the glossed sign.\
3. Fingerspelling: Represent fingerspelled words with dashes between each letter.\
4. Lexicalized Fingerspelling: Indicate lexicalized fingerspelling with a # symbol.\
5. Repetition: Show repeated signs with a plus sign (+) after the gloss.\
6. Role Shift: Indicate role shift with "rs" before the gloss.\
7. Indexing/Pointing: Use "ix" followed by a subscript letter or number for indexing.\
8. Directional Signs: Indicate the direction of the sign with arrows or other indicators.\
9. Classifiers: Use abbreviations for classifiers.\
10. Time Indicators: Place time indicators at the beginning of the sentence.\ DONE
11. Topic-Comment Structure: Indicate the topic followed by the comment.\
12. English Words/Concepts: Use English gloss in quotation marks for concepts without direct ASL equivalents.


In [241]:
import spacy
from spacy.scorer import Scorer
import pandas as pd

from sklearn.metrics import precision_score, recall_score, f1_score
from spacy.training.example import Example


### Read CSV Dataset

In [242]:
"""
file_path = 'Input/ASLG_PC12_train.csv/train.csv'
df = pd.read_csv(file_path)

print(f'df.shape: {df.shape}')

df.columns
"""

"\nfile_path = 'Input/ASLG_PC12_train.csv/train.csv'\ndf = pd.read_csv(file_path)\n\nprint(f'df.shape: {df.shape}')\n\ndf.columns\n"

### Load Spacy instance

In [243]:
# Load SpaCy model
## Note: run `python -m spacy download en_core_web_sm` first if the model is not installed
nlp = spacy.load("en_core_web_sm")

### Example sentences and their corresponding reference glosses

In [244]:
# Example sentences and their corresponding reference glosses
## list of tuples
"""
examples = [
    ("Yesterday, I saw a car and a person.", "YESTERDAY I IX_1 SAW CL:3 AND CL:1."),
    ("I went to the store.", "I IX_1 WENT STORE."),
]
"""

# Example sentences
# Each entry is a (source sentence, reference gloss) tuple; evaluate_glossing
# unpacks the pairs in exactly this order.
examples = [
    ("the high costs of patents in europe , you say , might possibly explain this",
     "DESC-HIGH COST PATENT IN EUROPE , X-YOU SAY , MIGHT DESC-POSSIBLY EXPLAIN THIS"),
    ("i should like to congratulate you once again , commissioner , on a fascinating publication",
     "X-I SHOULD LIKE TO CONGRATULATE X-YOU DESC-ONCE DESC-AGAIN , COMMISSIONER , ON DESC-FASCINATING PUBLICATION"),
    ("i will also create the post of commissioner for internal affairs and migration , including security",
     "X-I WILL DESC-ALSO CREATE POST COMMISSIONER FOR DESC-INTERNAL AFFAIR AND MIGRATION , INCLUDE SECURITY"),
    ("the sitting was suspended at 3.25 p.m. and resumed at 6.00 p.m.",
     "SIT BE SUSPEND AT 3.25 DESC-P.M. AND RESUME AT 6.00 DESC-P.M."),
    ("Where is the bathroom?",
     "wh WHERE BATHROOM"),
    ("pl mr president , I would like to thank the rapporteur for her work",
     "PL MR PRESIDENT , X-I WOULD LIKE TO THANK RAPPORTEUR FOR X-SHE WORK"),
    ("Do you like coffee ?",
     "q YOU LIKE COFFEE"),
    ("I will go to the store tomorrow.",
     "TOMORROW STORE I GO")
]

In [245]:
# Define a list of question adverbs
# question_type compares only the lower-cased text of the first token against
# this list, so the multi-word entries ("how much", ...) cannot match a single
# token on their own; such sentences are still caught through "how".
opened_question_adverbs = ["how", "when", "where", "why", "how much", "how many", "how often", "how long", "what", "which", "who", "whose", "whom"]

# Lower-case time words; process_sentence moves the first matching gloss to the
# front of the glossed sentence.
time_words = ["yesterday", "today", "tomorrow"]

### ASL Gloss standard functions

In [246]:
# ASL glossing rules implemented in functions
def gloss_word(word):
    """Return the gloss for a single word: its text converted to upper case."""
    gloss = word.upper()
    return gloss

def handle_fingerspelling(word):
    """Return the fingerspelled gloss: upper-case letters separated by dashes."""
    letters = word.upper()
    # str.join iterates the string character by character.
    return "-".join(letters)

def handle_lexicalized_fingerspelling(word):
    """Return the lexicalized-fingerspelling gloss: '#' followed by the upper-cased word."""
    return "#" + word.upper()

def handle_repetition(word, count):
    """Return the upper-cased word followed by one '+' per repetition beyond the first.

    A count of 1 or less yields the plain upper-cased word.
    """
    if count > 1:
        base = word.upper()
        return base + "+" * (count - 1)
    return word.upper()

def handle_role_shift(sentence):
    """Prefix the given gloss with the role-shift marker 'rs'."""
    return "rs {}".format(sentence)

def handle_indexing(token, index):
    """Return an indexing gloss of the form 'ix_<index> <TOKEN>' (token upper-cased)."""
    return "ix_{} {}".format(index, token.upper())

def gloss_sentence(doc):
    """Gloss every token of ``doc`` with gloss_word and join the results with spaces.

    ``doc`` is iterated token by token and each token's ``.text`` is glossed.
    """
    return " ".join([gloss_word(token.text) for token in doc])

"""
def add_time_indicator(doc):
    glossed_sentence = gloss_sentence(doc)
    for word in doc:
        if word.text.lower() in time_words:
            return f"{word.text.upper()} {glossed_sentence.replace(word.text.upper(), '').strip()}"
    return glossed_sentence
"""

def add_time_indicator(gloss_sentence_, time_indicators=None):
    """Move the first time indicator of a glossed sentence to its beginning.

    Parameters
    ----------
    gloss_sentence_ : str or iterable of tokens
        Either a whitespace-separated gloss string, or an iterable of tokens
        exposing a ``.text`` attribute (plain strings are accepted as tokens
        too).
    time_indicators : iterable of str, optional
        Words treated as time indicators (compared case-insensitively).
        Defaults to the module-level ``time_words`` list.

    Returns
    -------
    str
        The gloss with the first matching time indicator upper-cased and
        placed first, the remaining tokens following in their original order
        and joined by single spaces. When no token matches, the input is
        returned unchanged.

    Notes
    -----
    Matching is done on whole tokens, so a time word glued to punctuation
    (e.g. ``"TOMORROW."``) is not recognised.
    """
    if time_indicators is None:
        time_indicators = time_words
    wanted = {indicator.lower() for indicator in time_indicators}

    if isinstance(gloss_sentence_, str):
        tokens = gloss_sentence_.split()
    else:
        tokens = [getattr(token, "text", str(token)) for token in gloss_sentence_]

    for position, token in enumerate(tokens):
        if token.lower() in wanted:
            remainder = tokens[:position] + tokens[position + 1:]
            return " ".join([token.upper()] + remainder)
    return gloss_sentence_

## skip stop_words
def skip_stop_words(word):
    """Return '' for the articles 'the' and 'a' (any casing); otherwise return the word unchanged."""
    lowered = word.lower()
    return '' if lowered in ('the', 'a') else word

## doc is a sequence of tokens (e.g. a spaCy Doc)
def question_type(doc):
    """Classify a tokenized sentence as a wh-question, a yes/no question, or neither.

    Parameters
    ----------
    doc : sequence of tokens
        Indexable sequence whose items expose ``.text``.

    Returns
    -------
    str or None
        ``"wh-question"`` when the last token is ``'?'`` and the first token
        (lower-cased) is listed in ``opened_question_adverbs``;
        ``"yes-no-question"`` when the last token is ``'?'`` otherwise;
        ``None`` for an empty document or one that does not end with ``'?'``.
    """
    # An empty document has no last token: doc[-1] would raise IndexError.
    if len(doc) == 0:
        return None
    if doc[-1].text != '?':
        return None
    if doc[0].text.lower() in opened_question_adverbs:
        return "wh-question"
    return "yes-no-question"

# add question id as a prefix
def process_sentence(doc):
    """Convert a parsed sentence into an ASL gloss string using the rule functions.

    Steps, in order:

    1. Each token's lemma is lower-cased and glossed: first/second person
       pronouns become indexed glosses (``ix_1 I`` / ``ix_2 YOU``), words in
       the classifier table become their classifier, everything else is
       upper-cased. Articles removed by ``skip_stop_words`` are left out.
    2. The first gloss found in ``time_words`` is moved to the front.
    3. If ``question_type`` recognises a question, the matching non-manual
       marker (``wh-q`` or ``y/n-q``) is prepended.

    Parameters
    ----------
    doc : sequence of tokens
        Tokens exposing ``.lemma_`` and ``.text`` (e.g. a spaCy ``Doc``).

    Returns
    -------
    str
        The glosses joined by single spaces; ``""`` for an empty document.
    """
    nms = {
        "wh-question": "wh-q",
        "yes-no-question": "y/n-q"
    }

    classifiers = {
        "car": "CL:3",
        "person": "CL:1"
    }

    # Nothing to gloss; return early so question detection never has to index
    # into an empty document.
    if len(doc) == 0:
        return ""

    glossed_sentence = []
    for token in doc:
        # Use the lemma rather than the surface form so inflected words map to
        # their base gloss.
        word = token.lemma_.lower()

        if word in ("i", "me"):
            glossed_word = handle_indexing("I", 1)
        elif word == "you":
            glossed_word = handle_indexing("YOU", 2)
        elif word in classifiers:
            glossed_word = classifiers[word]
        else:
            glossed_word = gloss_word(word)

        glossed_word = skip_stop_words(glossed_word)
        # skip_stop_words returns '' for dropped words; keeping those would put
        # doubled spaces into the joined gloss.
        if glossed_word:
            glossed_sentence.append(glossed_word)

    # Time indicators go first: move only the first one found.
    for position, gloss in enumerate(glossed_sentence):
        if gloss.lower() in time_words:
            glossed_sentence.insert(0, glossed_sentence.pop(position))
            break

    # Prefix the non-manual question marker, if any.
    type_doc = question_type(doc)
    if type_doc is not None:
        glossed_sentence.insert(0, nms[type_doc])

    return " ".join(glossed_sentence)


### Evaluation

NB: the "model" evaluated here is deterministic: it is simply the set of rule-based ASL-gloss functions defined above.

In [247]:
# Evaluation function
def evaluate_glossing(examples):
    """Gloss every example sentence and pair it with its reference gloss.

    ``examples`` is an iterable of ``(sentence, reference_gloss)`` pairs. Each
    sentence is parsed with ``nlp`` and glossed by ``process_sentence`` (the
    deterministic, rule-based "model"); both glosses are then split on
    whitespace.

    Returns ``(y_true, y_pred)``: two lists holding one token list per example,
    reference tokens and generated tokens respectively.
    """
    references = []
    predictions = []

    for sentence, reference_gloss in examples:
        generated_gloss = process_sentence(nlp(sentence))

        # Whitespace tokenization of both glosses for comparison.
        references.append(reference_gloss.split())
        predictions.append(generated_gloss.split())

    return references, predictions


### Flatten the lists for sklearn

### Compute Metrics

In [248]:
# Calculate metrics
def compute_metrics(y_true_, y_pred_):
    """Compute, print and return weighted precision, recall and F1.

    Both arguments are handed unchanged to sklearn's ``precision_score``,
    ``recall_score`` and ``f1_score`` with ``average='weighted'`` and
    ``zero_division=1``.

    NOTE(review): evaluate_glossing returns one token list per sentence; these
    sklearn scorers presumably need flat, equal-length label sequences, so the
    nested lists likely have to be flattened/aligned first. Confirm before use.

    Returns a dict with the keys ``"precision"``, ``"recall"`` and ``"f1_score"``.
    """
    scorer_options = {"average": 'weighted', "zero_division": 1}

    precision = precision_score(y_true_, y_pred_, **scorer_options)
    recall = recall_score(y_true_, y_pred_, **scorer_options)
    f1 = f1_score(y_true_, y_pred_, **scorer_options)

    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1 Score: {f1:.2f}")

    results = {"precision": precision, "recall": recall, "f1_score": f1}
    return results


### example

In [249]:
# Example usage
# Run the rule-based glosser over every example; y_true / y_pred each hold one
# token list per example.
y_true, y_pred = evaluate_glossing(examples)

print(type(y_true))

# Show each source sentence next to its reference and generated gloss.
for idx, sentence in enumerate(examples):
    print(f'sentence: {sentence[0]}')
    print(f'y_true sentence: {" ".join(y_true[idx])}')
    print(f'y_pred sentence: {" ".join(y_pred[idx])}')

# NOTE(review): the disabled call below passes y_pred twice and unpacks three
# values, whereas compute_metrics returns a single dict. Fix both before
# re-enabling.
###precision, recall, f1_score = compute_metrics(y_pred, y_pred)


gloss: TOMORROW
<class 'list'>
sentence: the high costs of patents in europe , you say , might possibly explain this
y_true sentence: DESC-HIGH COST PATENT IN EUROPE , X-YOU SAY , MIGHT DESC-POSSIBLY EXPLAIN THIS
y_pred sentence: HIGH COST OF PATENT IN EUROPE , ix_2 YOU SAY , MIGHT POSSIBLY EXPLAIN THIS
sentence: i should like to congratulate you once again , commissioner , on a fascinating publication
y_true sentence: X-I SHOULD LIKE TO CONGRATULATE X-YOU DESC-ONCE DESC-AGAIN , COMMISSIONER , ON DESC-FASCINATING PUBLICATION
y_pred sentence: ix_1 I SHOULD LIKE TO CONGRATULATE ix_2 YOU ONCE AGAIN , COMMISSIONER , ON FASCINATING PUBLICATION
sentence: i will also create the post of commissioner for internal affairs and migration , including security
y_true sentence: X-I WILL DESC-ALSO CREATE POST COMMISSIONER FOR DESC-INTERNAL AFFAIR AND MIGRATION , INCLUDE SECURITY
y_pred sentence: ix_1 I WILL ALSO CREATE POST OF COMMISSIONER FOR INTERNAL AFFAIR AND MIGRATION , INCLUDE SECURITY
sentence: