<a href="https://colab.research.google.com/github/Sourasky-DHLAB/LLMs/blob/main/pos_he_dicta_bert_il_tiny.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [111]:
# Import libraries
from transformers import AutoModel, AutoTokenizer
import pandas as pd
import json
from tqdm import tqdm

In [None]:
# Load the tokenizer and the model from the same checkpoint.
# NOTE: trust_remote_code=True allows Python code shipped in the model
# repository to run locally; only keep it for checkpoints you trust.
MODEL_NAME = 'dicta-il/dictabert-tiny-joint'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME, trust_remote_code=True)

In [None]:
# Put the model in evaluation (inference) mode before running predictions.
# NOTE(review): presumably a torch nn.Module, where eval() disables
# training-only behaviour such as dropout — confirm.
model.eval()

In [107]:
# Load the input text: one sentence per line, surrounding whitespace removed,
# blank lines skipped.
file_path = "/content/test1.txt"
sentences = []
with open(file_path, "r", encoding="utf-8") as file:
    for raw_line in file:
        stripped = raw_line.strip()
        if stripped:
            sentences.append(stripped)

In [None]:
# Show how many sentences were loaded and preview only the first few,
# instead of dumping the whole corpus into the cell output.
print(f"Loaded {len(sentences)} sentences")
sentences[:5]

In [109]:
# Holds the model prediction for each input sentence (filled by the inference cell)
outputs: list = []

In [None]:
# Run the model on every sentence and collect the JSON-style predictions,
# one predict() call per sentence so each entry of `outputs` stays a
# per-sentence result.
# predict() receives the raw sentence together with the tokenizer, so no
# separate tokenizer call is made here: its result was never used and its
# truncation settings never reached predict().
# The list is rebuilt from scratch, so re-running this cell does not append a
# second copy of every result.
outputs = [
    model.predict([sentence], tokenizer, output_style='json')
    for sentence in tqdm(sentences, desc="Processing sentences")
]

In [97]:
# Persist the raw predictions as indented JSON; ensure_ascii=False keeps
# non-ASCII characters readable instead of \u-escaping them.
with open('/content/model_outputs.json', 'w', encoding='utf-8') as json_file:
    serialized_outputs = json.dumps(outputs, ensure_ascii=False, indent=4)
    json_file.write(serialized_outputs)

In [98]:
# Rows for the CSV export; each row is a list that starts with the sentence text
csv_rows: list = []

In [99]:
# For every extracted feature, find the largest number of matching tokens in
# any single analysed sentence. These widths decide how many CSV columns each
# feature gets, so that shorter rows can be padded to a rectangular table.
def _max_token_count(matches):
    """Return the highest per-sentence count of tokens for which `matches(token)` is true.

    `matches` is called with each token dict of each analysed sentence in
    `outputs`. Returns 0 when `outputs` contains no sentences, where a bare
    `max()` over an empty sequence would raise ValueError.
    """
    return max(
        (
            sum(1 for token in sub_entry['tokens'] if matches(token))
            for entry in outputs
            for sub_entry in entry
        ),
        default=0,
    )


# Verbs/auxiliaries whose 'Tense' feature is 'Fut'
max_verbs = _max_token_count(
    lambda token: token['morph']['pos'] in ['VERB', 'AUX']
    and token['morph']['feats'].get('Tense') == 'Fut'
)
# Tokens whose dependency function is 'compound:smixut'
max_compounds = _max_token_count(lambda token: token['syntax']['dep_func'] == 'compound:smixut')
# Tokens whose morphological prefixes include 'DET'
max_dets = _max_token_count(lambda token: 'DET' in token['morph']['prefixes'])
# Tokens tagged NUM / ADP / ADJ
max_nums = _max_token_count(lambda token: token['morph']['pos'] == 'NUM')
max_adps = _max_token_count(lambda token: token['morph']['pos'] == 'ADP')
max_adjs = _max_token_count(lambda token: token['morph']['pos'] == 'ADJ')

In [100]:
# Build one CSV row per analysed sentence: the sentence text followed by a
# fixed-width group of cells for each extracted feature.
def _pad(values, width):
    """Return `values` right-padded with empty strings up to `width` cells."""
    return values + [''] * (width - len(values))


# Start from an empty list so re-running this cell does not append duplicates.
csv_rows = []
for entry in outputs:
    for sub_entry in entry:
        tokens = sub_entry['tokens']

        # Verbs/auxiliaries whose 'Tense' feature is 'Fut'
        verbs = [token['token'] for token in tokens
                 if token['morph']['pos'] in ['VERB', 'AUX']
                 and token['morph']['feats'].get('Tense') == 'Fut']

        # Tokens whose dependency function is 'compound:smixut'
        compounds = [token['token'] for token in tokens
                     if token['syntax']['dep_func'] == 'compound:smixut']

        # Tokens whose morphological prefixes include 'DET'
        dets = [token['token'] for token in tokens
                if 'DET' in token['morph']['prefixes']]

        # Tokens tagged NUM
        nums = [token['token'] for token in tokens
                if token['morph']['pos'] == 'NUM']

        # Tokens tagged ADP
        adps = [token['token'] for token in tokens
                if token['morph']['pos'] == 'ADP']

        # Gender of every ADJ token ('Not Specified' when the feature is absent)
        adj_genders = [token['morph']['feats'].get('Gender', 'Not Specified')
                       for token in tokens
                       if token['morph']['pos'] == 'ADJ']

        # Keep the group order in sync with the column headers. Every group,
        # including the trailing adjective genders, is padded to its column
        # count so all rows have the same length.
        csv_rows.append(
            [sub_entry['text']]
            + _pad(verbs, max_verbs)
            + _pad(compounds, max_compounds)
            + _pad(dets, max_dets)
            + _pad(nums, max_nums)
            + _pad(adps, max_adps)
            + _pad(adj_genders, max_adjs)
        )

In [101]:
# Deduplicate rows while keeping their first-seen order. Rows are converted to
# tuples because lists are not hashable. dict.fromkeys() drops repeats like a
# set would, but preserves insertion order, so the exported CSV lists the
# sentences in input order instead of in an arbitrary set order.
unique_rows = dict.fromkeys(tuple(row) for row in csv_rows)
csv_rows = [list(row) for row in unique_rows]

In [102]:
# Column names: 'Sentence' followed by one numbered column per slot of each
# feature group, in the same order the groups are laid out in every row.
# max_adjs comes from the cell that computes the other max_* widths, so it is
# not recomputed here.
feature_widths = [
    ('Verb_Fut', max_verbs),
    ('Compound_Smixut', max_compounds),
    ('Determiner', max_dets),
    ('Number', max_nums),
    ('Adposition', max_adps),
    ('Adj_Gender', max_adjs),
]

column_headers = ['Sentence'] + [
    f'{prefix}{i + 1}'
    for prefix, width in feature_widths
    for i in range(width)
]

In [103]:
# Turn the collected rows into a table and write it out as a UTF-8 CSV file
# (header row included, no index column).
csv_file_path = '/content/future_tense_verbs.csv'
csv_df = pd.DataFrame(data=csv_rows, columns=column_headers)
csv_df.to_csv(
    csv_file_path,
    index=False,
    header=True,
    encoding='utf-8',
)