<a href="https://colab.research.google.com/github/Sourasky-DHLAB/LLMs/blob/main/pos_he_dicta_bert_il_tiny.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import libraries
from transformers import AutoModel, AutoTokenizer
import pandas as pd
import json

In [None]:
# Load the tokenizer and the model from the same DictaBERT checkpoint.
# trust_remote_code=True is passed for the model only, so that the model
# implementation published with the checkpoint can be loaded.
checkpoint = 'dicta-il/dictabert-tiny-joint'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModel.from_pretrained(checkpoint, trust_remote_code=True)

In [None]:
# Set the model to evaluation mode: the notebook only runs predictions, it does
# not train. The call is the last expression of the cell, so the notebook
# displays whatever it returns.
model.eval()

In [4]:
# Load the input sentences from a UTF-8 text file, one sentence per line.
# Surrounding whitespace is removed and lines that end up empty are skipped.
file_path = "/content/sentences.txt"
sentences = []
with open(file_path, "r", encoding="utf-8") as file:
    for line in file:
        stripped = line.strip()
        if stripped:
            sentences.append(stripped)

In [5]:
# Start with an empty container for the per-sentence model predictions
outputs = list()

In [12]:
# Run the model on every sentence and collect the JSON-style predictions.
#
# `outputs` is rebuilt from scratch here instead of being appended to, so
# re-running this cell does not accumulate duplicate predictions for the same
# sentences. Each sentence is still passed to `model.predict` as its own
# one-element batch, so `outputs` holds one `predict` result per input
# sentence, in input order (the later cells iterate it as a list of lists).
#
# The former standalone `tokenizer(sentence, ...)` call was removed: its result
# was never used, because `model.predict` is given the raw sentence together
# with the tokenizer.
outputs = [
    model.predict([sentence], tokenizer, output_style='json')
    for sentence in sentences
]

In [6]:
# Persist the raw predictions as indented JSON. ensure_ascii=False writes
# non-ASCII characters as-is instead of \u-escaping them.
with open('/content/model_outputs.json', 'w', encoding='utf-8') as json_file:
    json_file.write(json.dumps(outputs, ensure_ascii=False, indent=4))

In [13]:
# Start with an empty container for the rows of the CSV table
csv_rows = list()

In [16]:
# Find the largest number of future-tense verbs and of compound:smixut tokens
# found in any single analysed sentence. These two counts decide how many
# verb / compound columns the CSV table needs.
#
# default=0 keeps the cell from raising ValueError (max() of an empty
# iterable) when `outputs` contains no analysed sentences.
max_verbs = max(
    (
        sum(
            1
            for token in sub_entry['tokens']
            # A future-tense verb: POS is VERB or AUX and the Tense feature is Fut.
            if token['morph']['pos'] in ['VERB', 'AUX']
            and token['morph']['feats'].get('Tense') == 'Fut'
        )
        for entry in outputs
        for sub_entry in entry
    ),
    default=0,
)
max_compounds = max(
    (
        sum(
            1
            for token in sub_entry['tokens']
            if token['syntax']['dep_func'] == 'compound:smixut'
        )
        for entry in outputs
        for sub_entry in entry
    ),
    default=0,
)

In [17]:
# Build one CSV row per analysed sentence, laid out as:
#   [sentence text, future-tense verbs..., compound:smixut tokens...]
#
# `csv_rows` is rebuilt from scratch so that re-running this cell does not
# append a second copy of every row. Both groups are padded with '' up to
# max_verbs / max_compounds, so every row has the same length and each value
# lands under its own group of columns (previously only the verb group was
# padded and shorter rows were left for pandas to fill in).
csv_rows = []
for entry in outputs:
    for sub_entry in entry:
        tokens = sub_entry['tokens']

        # Future-tense verbs: POS is VERB or AUX and the Tense feature is Fut.
        verbs = [
            token['token']
            for token in tokens
            if token['morph']['pos'] in ['VERB', 'AUX']
            and token['morph']['feats'].get('Tense') == 'Fut'
        ]
        # Tokens whose dependency function is compound:smixut.
        compounds = [
            token['token']
            for token in tokens
            if token['syntax']['dep_func'] == 'compound:smixut'
        ]

        verb_padding = [''] * (max_verbs - len(verbs))
        compound_padding = [''] * (max_compounds - len(compounds))
        csv_rows.append(
            [sub_entry['text']] + verbs + verb_padding + compounds + compound_padding
        )

In [19]:
# Deduplicate rows, keeping the first occurrence of each row in its original
# position. Rows are converted to tuples so they can serve as dict keys; dict
# keys preserve insertion order, whereas the set() used previously returned the
# rows in arbitrary order, so the CSV row order could change between runs.
unique_rows = list(dict.fromkeys(tuple(row) for row in csv_rows))
csv_rows = [list(row) for row in unique_rows]

In [20]:
# Build the CSV header: the sentence column, then one numbered column per
# possible future-tense verb, then one numbered column per possible
# compound:smixut token (numbering starts at 1).
verb_headers = [f'Verb_Fut{n}' for n in range(1, max_verbs + 1)]
compound_headers = [f'Compound_Smixut{n}' for n in range(1, max_compounds + 1)]
column_headers = ['Sentence', *verb_headers, *compound_headers]

In [21]:
# Turn the rows into a table and write it out as a UTF-8 CSV file
# (header row included, no index column).
csv_file_path = '/content/future_tense_verbs.csv'
csv_df = pd.DataFrame(data=csv_rows, columns=column_headers)
csv_df.to_csv(csv_file_path, index=False, encoding='utf-8')