<a href="https://colab.research.google.com/github/Sourasky-DHLAB/LLMs/blob/main/pos_he_dicta_bert_il_tiny_1st_person_pronoun.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [80]:
# Import libraries
from transformers import AutoModel, AutoTokenizer
import pandas as pd
import json
from tqdm import tqdm

In [81]:
# Fetch the tokenizer and the model from the same checkpoint
MODEL_NAME = 'dicta-il/dictabert-tiny-joint'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# trust_remote_code=True permits transformers to execute model code that is
# shipped inside the checkpoint's repository (presumably where the predict()
# helper used later in this notebook comes from -- confirm on the model card)
model = AutoModel.from_pretrained(MODEL_NAME, trust_remote_code=True)

In [None]:
# Switch the model to evaluation mode before running predictions
# (in PyTorch this turns off training-only behaviour such as dropout).
model.eval()

In [83]:
# Load the input text: one sentence per line, blank lines ignored
file_path = "/content/Serialized-100.txt"
sentences = []
with open(file_path, "r", encoding="utf-8") as src:
    for raw_line in src:
        cleaned = raw_line.strip()
        # Lines that are empty after stripping whitespace are skipped
        if cleaned:
            sentences.append(cleaned)

In [84]:
# Accumulator for the per-sentence model predictions (filled by the loop below)
outputs: list = []

In [85]:
# Run the model on every sentence and collect the JSON-style predictions.
# The list is rebuilt here so that re-running this cell does not append a
# second copy of every prediction.
outputs = []
for sentence in tqdm(sentences, desc="Processing sentences"):
    # predict() is handed the raw sentence together with the tokenizer, so no
    # separate tokenizer(...) call is needed in this loop.
    # NOTE(review): no explicit truncation is applied to the sentence here;
    # how predict() treats very long inputs is not visible in this notebook.
    # One sentence per call keeps one result list per input sentence, which is
    # the nesting the later cells iterate over.
    output = model.predict([sentence], tokenizer, output_style='json')
    outputs.append(output)

Processing sentences: 100%|██████████| 100/100 [00:22<00:00,  4.52it/s]


In [86]:
# Persist the raw predictions as indented UTF-8 JSON
# (ensure_ascii=False keeps non-ASCII characters readable instead of \u escapes)
with open('/content/model_outputs.json', 'w', encoding='utf-8') as sink:
    json.dump(outputs, sink, indent=4, ensure_ascii=False)

In [87]:
# Read the saved predictions back from disk into `outputs`
with open('/content/model_outputs.json', 'r', encoding='utf-8') as source:
    raw_json = source.read()
outputs = json.loads(raw_json)

In [88]:
# csv_data: rows destined for the CSV export
# max_verbs: largest number of first-person verbs seen in a single sentence
csv_data, max_verbs = [], 0

In [89]:
# Process each entry in the JSON data
for entry in outputs:
    for sub_entry in entry:
        # Initialize row with the sentence
        sentence = sub_entry['text']

        # Collect all first-person verbs
        first_person_verbs = [token['token'] for token in sub_entry['tokens']
                              if token['morph']['pos'] == 'VERB'
                              and 'Person' in token['morph']['feats']
                              and token['morph']['feats']['Person'] == '1']

        # Update maximum verb count if current is greater and there are first-person verbs
        if first_person_verbs:
            max_verbs = max(max_verbs, len(first_person_verbs))

            # Extend the row with the collected first-person verbs
            extended_row = [sentence] + first_person_verbs
            csv_data.append(extended_row)

In [90]:
# Export the collected rows to CSV, or report that there is nothing to export
if not csv_data:
    print("No sentences with first-person verbs were found.")
else:
    # Header: 'Sentence' followed by one numbered column per possible verb slot
    column_headers = ['Sentence', *(f'Verb_First_Person{i+1}' for i in range(max_verbs))]

    # Right-pad shorter rows in place with '' so every row is as wide as the header
    for row in csv_data:
        missing = len(column_headers) - len(row)
        row.extend([''] * missing)

    # Build the table from the padded rows
    df = pd.DataFrame(csv_data, columns=column_headers)

    # Write it out without the index column
    csv_file_path = '/content/first_person_verbs.csv'
    df.to_csv(csv_file_path, index=False, encoding='utf-8')

    print(f"Data successfully saved to {csv_file_path}")

Data successfully saved to /content/first_person_verbs.csv
