In [2]:
import json
import re

def load_corpus(filepath):
    sentences = []
    with open(filepath, 'r', encoding='utf-8') as file:
        for line in file:
            data = json.loads(line)
            # Extract the sentence text
            sentences.append(data['sentence_text'])
    return sentences


def preprocess_sentences(sentences):
    tokenized_sentences = []

    pattern = (
        r'\"[^\"]+\"|'          # Quoted text
        r'[א-ת]+\"[א-ת]+|'      # Hebrew abbreviations
        r'[א-ת]+\'[א-ת]+|'      # Words with apostrophes
        r'[א-ת]+'               # Regular Hebrew words
    )

    for sentence in sentences:
        # Tokenize the sentence based on the patterns above
        tokens = re.findall(pattern, sentence)
        # Remove surrounding quotes form quoted words
        clean_tokens = [token.strip('"') for token in tokens]
        # Removing tokens that are in the size of 1
        filtered_tokens = [token for token in clean_tokens if len(token) > 1]
        tokenized_sentences.append(filtered_tokens)
    return tokenized_sentences

knesset_corpus_path = "/content/drive/MyDrive/Colab Notebooks/knesset_corpus.jsonl"
raw_sentences = load_corpus(knesset_corpus_path)
tokenized_sentences = preprocess_sentences(raw_sentences)


print("Sample tokenized sentence:", tokenized_sentences[:10])  # Display first ten


Sample tokenized sentence: [['בוקר', 'טוב', 'אני', 'מתכבד', 'לפתוח', 'את', 'ישיבת', 'ועדת', 'הכספים'], ['אנחנו', 'מתחילים', 'בדיון', 'על', 'קריסתם', 'של', 'בתי', 'החולים', 'הציבוריים'], ['אני', 'שמח', 'לפי', 'מה', 'שאני', 'יודע', 'בסך', 'הכול', 'הכללי', 'שערב', 'לפני', 'קיום', 'הישיבה', 'הגיעו', 'להסכמה', 'ובתי', 'החולים', 'יוכלו', 'להמשיך', 'לתפקד'], ['אני', 'מבקש', 'רק', 'את', 'מנכ"ל', 'משרד', 'הבריאות', 'היות', 'שהוא', 'יכול', 'רק', 'עכשיו', 'בזום', 'אז', 'אני', 'מבקש', 'אם', 'אפשר', 'להעלות', 'אותו', 'הוא', 'יוכל', 'להגיד', 'כמה', 'מילים', 'בעניין', 'לפני', 'שאני', 'נותן', 'לחברי', 'הכנסת', 'שהעלו', 'את', 'הנושאים', 'הללו'], ['איך', 'אתה', 'בסדר'], ['תשתפר', 'כמה', 'שאפשר', 'יותר', 'מהר', 'ותחזור', 'לאיתנך'], ['בבקשה', 'פרופ', 'לוי'], ['אם', 'אתה', 'יכול', 'להגיד', 'לוועדה', 'הרי', 'ידענו', 'שיש', 'קורונה', 'לצערנו', 'הרב'], ['ידענו', 'גם', 'שלא', 'יהיה', 'תקציב', 'כדי', 'לממן', 'את', 'הפעילות', 'השוטפת', 'של', 'בתי', 'החולים', 'האלה'], ['מי', 'עמד', 'מהצד', 'ולא', 'עשה', 'כלום']]


In [3]:
# #this code creates the model based on the rquirements and saves it
# from gensim.models import Word2Vec

# # Train the Word2Vec model
# model = Word2Vec(sentences=tokenized_sentences, vector_size=100, window=5, min_count=1)

# # Save the model for later use
# model.save("/content/drive/MyDrive/Colab Notebooks/knesset_word2vec.model")

In [4]:
# word_vectors = model.wv
# print(word_vectors['ישראל'])

In [5]:
from gensim.models import Word2Vec
words_to_check = ["ישראל", "גברת", "ממשלה", "חבר", "בוקר", "מים", "אסור", "רשות", "זכויות"]
model_path = "/content/drive/MyDrive/Colab Notebooks/knesset_word2vec.model"
model = Word2Vec.load(model_path)
output_file = "/content/drive/MyDrive/Colab Notebooks/words_similar_knesset.txt"

with open(output_file, "w", encoding="utf-8") as file:
    # For each word in the list
    for word in words_to_check:
          # Find the 5 most similar words to the this word
          similar_words = model.wv.most_similar(word, topn=5)
          # Formating the results
          formatted_similar_words = ", ".join(
              f"({similar_word}, {similarity_score:.4f})"
              for similar_word, similarity_score in similar_words
          )
          file.write(f"{word}: {formatted_similar_words}\n")

print(f"Results saved to {output_file}")

Results saved to /content/drive/MyDrive/Colab Notebooks/words_similar_knesset.txt


In [6]:
import numpy as np
def compute_sentence_embeddings(sentences, model):
    sentence_embeddings = []

    for sentence in sentences:
        # Filter tokens that are in the model
        valid_tokens = [word for word in sentence if word in model.wv]
        if valid_tokens:
            # If its a valid token then we get word vectors
            word_vectors = np.array([model.wv[word] for word in valid_tokens])
            # Average of the word vectors
            sentence_vector = np.mean(word_vectors, axis=0)
        else:
            # If its not a valid token then we assign zero vector
            sentence_vector = np.zeros(model.vector_size)
        sentence_embeddings.append(sentence_vector)
    return sentence_embeddings

model_path = "/content/drive/MyDrive/Colab Notebooks/knesset_word2vec.model"
model = Word2Vec.load(model_path)

sentence_embeddings = compute_sentence_embeddings(tokenized_sentences, model)

print("Embedding for the first sentence:", sentence_embeddings[0])

Embedding for the first sentence: [-0.7756516   0.69469047  0.9895823   0.79107213  0.02913811 -1.3916577
  0.6016847   0.79829293 -0.4799482  -0.79569364 -0.34313476 -1.0601889
  0.74503666 -0.42906505  0.45159245 -0.535218    0.49791908  0.14715338
  0.3464399  -0.63412833 -0.22103806  0.25078434 -0.32450512 -0.27713835
  0.8217668  -0.25826088 -0.46972433 -0.28977263 -0.5030062   0.45479864
 -0.18374509  0.42788112  0.33447817 -0.6532954  -0.6538002   0.5502498
  0.4007246  -0.46860424 -0.21042082 -0.91058224  0.33447245 -1.3113021
 -0.6564729   0.45808172 -0.41776052 -1.0287533  -1.0273561  -0.04855783
  0.7324143   0.5463391   0.5521435  -0.6488103   0.002288   -0.38048887
 -0.05635273  1.4338484   0.6611271  -0.8470733  -0.74398124  0.03446085
 -0.24655014  0.01121269  0.35736907  0.16393842 -0.47288787  0.76587313
  0.11087058  1.1370144   0.353863    0.8502846   0.19439399  0.24148414
  0.80964565  0.2944702   0.5610873   0.7394935  -0.45991632 -0.5418068
 -0.14715311  0.677120

In [7]:
import numpy as np
from scipy.spatial.distance import cosine


def find_most_similar_sentences(raw_sentences, embeddings, chosen_indices):
    results = []

    for idx in chosen_indices:
        chosen_sentence = raw_sentences[idx]
        chosen_embedding = embeddings[idx]

        # Compute cosine similarity with all other sentences
        similarities = []
        for i, embedding in enumerate(embeddings):
            if i != idx:  # If its not the same sentence
                similarity = 1 - cosine(chosen_embedding, embedding)
                similarities.append((i, similarity))

        # Find the highest similarity score
        most_similar_idx, _ = max(similarities, key=lambda x: x[1])
        most_similar_sentence = raw_sentences[most_similar_idx]

        results.append((chosen_sentence, most_similar_sentence))

    return results


# We take only sentences with at least 4 valid tokens
chosen_indices = [i for i, sentence in enumerate(tokenized_sentences) if len(sentence) >= 4][:10]  # Select the first 10 sentences that meet the criteria
similar_sentences = find_most_similar_sentences(raw_sentences, sentence_embeddings, chosen_indices)

output_file = "/content/drive/MyDrive/Colab Notebooks/sentences_similar_knesset.txt"
with open(output_file, "w", encoding="utf-8") as file:
    for chosen_sentence, most_similar_sentence in similar_sentences:
        file.write(f"{chosen_sentence}: most similar sentence: {most_similar_sentence}\n")

print(f"Results saved to {output_file}")


  dist = 1.0 - uv / math.sqrt(uu * vv)


Results saved to /content/drive/MyDrive/Colab Notebooks/sentences_similar_knesset.txt


In [1]:
from gensim.models import Word2Vec

# Load the model
model_path = "/content/drive/MyDrive/Colab Notebooks/knesset_word2vec.model"
model = Word2Vec.load(model_path)

# Input sentences
sentences = [
    "בעוד מספר דקות נתחיל את הדיון בנושא השבת החטופים .",
    "בתור יושבת ראש הוועדה , אני מוכנה להאריך את ההסכם באותם תנאים .",
    "בוקר טוב , אני פותח את הישיבה .",
    "שלום , אנחנו שמחים להודיע שחברינו היקר קיבל קידום .",
    "אין מניעה להמשיך לעסוק בנושא ."
]

# Red words and their respective positive/negative contexts
red_words_context = {
    "דקות": {"positive": ["דקה", "רגעים", "זמן"], "negative":[]},
    "הדיון": {"positive": ["דיבור", "מפגש", "מועצה", "שיחה"], "negative": ["קונפליקט", "ויכוח", "סכסוך"]},
    "הוועדה": {"positive": ["ועידה","אסיפה","רשות"], "negative": []},
    "אני": {"positive": ["עצמי"], "negative": []},
    "ההסכם": {"positive": [], "negative": []},
    "בוקר": {"positive": ["צהריים", "יום", "אור"], "negative": []},
    "פותח": {"positive": ["עוצר"], "negative": []},
    "שלום": {"positive": ["נעים", "ברוכים","ברוך","וברכה","וסהלן"],"negative": ["מלחמה"]},
    "שמחים": {"positive": [], "negative": []},
    "היקר": {"positive": ["מוערך"], "negative": ["זול"]},
    "קידום": {"positive": ["עצום","שיפור","עלייה","תוספות"], "negative": []},
    "מניעה": {"positive": ["אישורים", "הרשאה", "אפשרות", "היתר"], "negative": []}
}

# Output file path
output_file = "/content/drive/MyDrive/Colab Notebooks/sentences_words_red.txt"

with open(output_file, "w", encoding="utf-8") as file:
    for i, sentence in enumerate(sentences, start=1):
        original_sentence = sentence
        modified_sentence = sentence
        replacements = []

        for word, context in red_words_context.items():
            if word in sentence:
                # Fetch positive and negative lists for the word
                positive = context["positive"]
                negative = context["negative"]

                # Use Word2Vec to find similar words with positive and negative contexts
                try:
                    similar_words = model.wv.most_similar(positive=positive + [word], negative=negative, topn=3)

                    # Select the most appropriate replacement
                    replacement_word = similar_words[0][0]

                    # Replace the word in the sentence
                    modified_sentence = modified_sentence.replace(word, replacement_word)
                    replacements.append((word, replacement_word))

                    # Print the top 3 replacements
                    print(f"Top 3 replacements for '{word}' in '{sentence}':")
                    for similar_word, score in similar_words:
                        print(f"  {similar_word} ({score:.4f})")
                except KeyError:
                    # If the word is not in the vocabulary, skip replacement
                    replacement_word = word

        # Write results to the file
        file.write(
            f"{i}: {original_sentence}: {modified_sentence}\n"
            f"replaced words: {', '.join(f'({orig}:{rep})' for orig, rep in replacements)}\n\n"
        )

print(f"Results saved to {output_file}")


Top 3 replacements for 'דקות' in 'בעוד מספר דקות נתחיל את הדיון בנושא השבת החטופים .':
  שניות (0.9275)
  ספורות (0.9069)
  הדקות (0.9059)
Top 3 replacements for 'הדיון' in 'בעוד מספר דקות נתחיל את הדיון בנושא השבת החטופים .':
  השרה (0.7616)
  הישיבה (0.7571)
  מכתבו (0.7402)
Top 3 replacements for 'הוועדה' in 'בתור יושבת ראש הוועדה , אני מוכנה להאריך את ההסכם באותם תנאים .':
  ההסתדרות (0.9076)
  העירייה (0.8857)
  המועצה (0.8761)
Top 3 replacements for 'אני' in 'בתור יושבת ראש הוועדה , אני מוכנה להאריך את ההסכם באותם תנאים .':
  לשמוע (0.8525)
  הייתי (0.8512)
  דברי (0.8439)
Top 3 replacements for 'ההסכם' in 'בתור יושבת ראש הוועדה , אני מוכנה להאריך את ההסכם באותם תנאים .':
  הדוח (0.9117)
  הוויכוח (0.9045)
  המסמך (0.8981)
Top 3 replacements for 'אני' in 'בוקר טוב , אני פותח את הישיבה .':
  לשמוע (0.8525)
  הייתי (0.8512)
  דברי (0.8439)
Top 3 replacements for 'בוקר' in 'בוקר טוב , אני פותח את הישיבה .':
  לילה (0.9236)
  וערב (0.9127)
  בסבלנות (0.9107)
Top 3 replacements for 'פ

In [20]:
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
import random
random.seed(42)
np.random.seed(42)

# Load the dataset and filter for the two most frequent speakers
def load_and_prepare_data(filepath):

    with open(filepath, 'r', encoding='utf-8') as file:
        data = [json.loads(line) for line in file]

    df = pd.DataFrame(data)
    speaker_counts = df['speaker_name'].value_counts() # Count occurrences of each speaker
    top_speakers = speaker_counts.index[:2]  # Top 2 speakers

    # Clean all the sentences that aren't from the two speakers
    df['class'] = df['speaker_name'].apply(lambda x: 'first' if x == top_speakers[0] else ('second' if x == top_speakers[1] else None))
    df = df[df['class'].notna()]

    # Downsample classes
    min_samples = df['class'].value_counts().min()
    df = df.groupby('class').apply(lambda x: x.sample(n=min_samples, random_state=42)).reset_index(drop=True)

    return df

def eval_knn(features, labels, n_neighbors=9, metric='cosine'):

    classifier = KNeighborsClassifier(n_neighbors=n_neighbors, metric=metric)
    # Encoding labels
    label_encoder = LabelEncoder()
    encoded_labels = label_encoder.fit_transform(labels)

    # 5-fold cross-validation
    predictions = cross_val_predict(classifier, features, encoded_labels, cv=5)
    scores = cross_val_score(classifier, features, encoded_labels, cv=5)
    report = classification_report(encoded_labels, predictions, target_names=label_encoder.classes_, zero_division=0)

    print(f"Mean Accuracy: {np.mean(scores):.4f}")
    print(f"Classification Report:\n{report}")
    return report

df = load_and_prepare_data(knesset_corpus_path)

# Reset the index to ensure alignment with embeddings
df = df.reset_index(drop=True)

# Align embeddings with the filtered dataset
filtered_embeddings = np.array(sentence_embeddings)[df.index.values]

# Evaluate the KNN classifier
classification_report = eval_knn(filtered_embeddings, df['class'])

# Save the classification report to a file
output_file = "classification_report_knn.txt"
with open(output_file, 'w', encoding='utf-8') as file:
    file.write(classification_report)

print(f"Classification report saved to {output_file}")


  df = df.groupby('class').apply(lambda x: x.sample(n=min_samples, random_state=42)).reset_index(drop=True)


Mean Accuracy: 0.5546
Classification Report:
              precision    recall  f1-score   support

       first       0.55      0.60      0.57      2016
      second       0.56      0.51      0.53      2016

    accuracy                           0.55      4032
   macro avg       0.56      0.55      0.55      4032
weighted avg       0.56      0.55      0.55      4032

Classification report saved to classification_report_knn.txt


In [10]:
from transformers import AutoTokenizer, AutoModelForMaskedLM
import torch

# Load the DictaBERT tokenizer and model and masked file
tokenizer = AutoTokenizer.from_pretrained("avichr/heBERT")
model = AutoModelForMaskedLM.from_pretrained("avichr/heBERT")
masked_sentences_file = "/content/drive/MyDrive/masked_sampled_sents.txt"
output_file = "/content/drive/MyDrive/dictabert_results.txt"

# Load masked sentences and replace from * to [MASK]
def load_masked_sentences(filepath):
    sentences = []
    with open(filepath, 'r', encoding='utf-8') as file:
        for line in file:
            original_sentence = line.strip()
            masked_sentence = original_sentence.replace("*", "[MASK]")
            sentences.append((original_sentence, masked_sentence))
    return sentences

# Predict masked tokens using DictaBERT
def predict_masked_tokens(masked_sentence):
    # Tokenize the masked sentences
    inputs = tokenizer(masked_sentence, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits

    # Identify the masked token positions
    mask_token_index = (inputs['input_ids'] == tokenizer.mask_token_id).nonzero(as_tuple=True)[1]

    # Predict the top token for each masked position
    predicted_tokens = []
    for mask_index in mask_token_index:
        top_token_id = logits[0, mask_index].argmax(dim=-1).item()
        predicted_tokens.append(tokenizer.decode([top_token_id]).strip())

    return predicted_tokens

# Process masked sentences and save results
def process_and_save_results(sentences, output_filepath):
    with open(output_filepath, 'w', encoding='utf-8') as file:
        for original_sentence, masked_sentence in sentences:
            # Predict tokens for the masked sentence
            predicted_tokens = predict_masked_tokens(masked_sentence)

            # Replace `[MASK]` tokens with predicted tokens
            dictaBERT_sentence = masked_sentence
            for token in predicted_tokens:
                dictaBERT_sentence = dictaBERT_sentence.replace("[MASK]", token, 1)

            # Write the results to the output file
            file.write(f"original_sentence: {original_sentence}\n")
            file.write(f"masked_sentence: {masked_sentence}\n")
            file.write(f"dictaBERT_sentence: {dictaBERT_sentence}\n")
            file.write(f"dictaBERT tokens: {', '.join(predicted_tokens)}\n")
            file.write("\n")  # Add a blank line between entries

# Main execution
if __name__ == '__main__':
    # Load the masked sentences
    sentences = load_masked_sentences(masked_sentences_file)

    # Process sentences and save results
    process_and_save_results(sentences, output_file)

    print(f"Results saved to {output_file}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/505 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/299k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.


Results saved to /content/drive/MyDrive/dictabert_results.txt
