# <span style="color: #4daafc">Legal Case Similarity Detection - Translate legal documents</span>
- [Load Data](#load-data)
- [Translate Hebrew to English](#translate-hebrew-to-english)
- [Save translated data](#save-translated-data)

# Prepare python environment

In [None]:
from utils.file_utils import load_file, save_file
from utils.df import df_shape
import tiktoken
import numpy as np
import pandas as pd
from langchain_ollama.llms import OllamaLLM
from langchain_community.document_loaders import DataFrameLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_ollama import OllamaEmbeddings
import faiss
from langchain_community.vectorstores import FAISS 
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from sklearn.preprocessing import MultiLabelBinarizer
import ast
import re

# Load Data

In [2]:
# Path of the pre-processed case file; the output path used when saving is derived from it
f_path = 'data/processed_data_ar_ap_100.xlsx'
# load_file is a project helper (utils.file_utils); its result is the DataFrame used by the rest of the notebook
df = load_file(file_name=f_path)

Successfully loaded DataFrame from data/processed_data_ar_ap_100.xlsx


In [3]:
# Report the frame's dimensions (df_shape is a project helper from utils.df)
df_shape(df)
# Preview the first 10 cases
display(df.head(10))

Data shape: 100 rows x 5 columns


Unnamed: 0,case_number,procedure_name,case_date,case_link,document_body
0,1108/97,"ע""א 1108/97 מרחיב אביב נ. מדינת ישראל",1997-05-11,https://supremedecisions.court.gov.il/Verdicts...,"בבית המשפט העליון בש""פ 97 / 1108 בפני: כבוד הר..."
1,4477/00,"ע""א 4477/00 לודמילה וורוביוב נ. היועצ המשפטי ל...",2000-07-06,https://supremedecisions.court.gov.il/Verdicts...,"בבית המשפט העליון בה""נ 4477/00 בפני כבוד נשיא ..."
2,1890/16,"ע""פ 1890/16",2017-03-09,https://supremedecisions.court.gov.il/Verdicts...,"פסק דין בתיק ע""פ 1890/16 בבית המשפט העליון בשב..."
3,7176/04,"ע""פ 7176/04 ירונ תלמי נ. מדינת ישראל",2006-02-02,https://supremedecisions.court.gov.il/Verdicts...,"פסק-דין בתיק ע""פ 7176/04 בבית המשפט העליון בשב..."
4,3766/12,"ע""א 3766/12",2012-06-17,https://supremedecisions.court.gov.il/Verdicts...,"החלטה בתיק ע""א 3766/12 בבית המשפט העליון בירוש..."
5,8178/12,"ע""א 8178/12 עו""ד צבי סלנט נ. יונתנ גוטליב",2014-11-12,https://supremedecisions.court.gov.il/Verdicts...,"החלטה בתיק ע""א 8178/12 בבית המשפט העליון בשבתו..."
6,3015/09,"ע""פ 3015/09 מדינת ישראל נ. פואד קדיח",2010-07-20,https://supremedecisions.court.gov.il/Verdicts...,"פסק דין בתיק ע""פ 3015/09 בבית המשפט העליון בשב..."
7,4272/05,"ע""פ 4272/05 אמיר חג'וג' נ. מדינת ישראל",2006-01-04,https://supremedecisions.court.gov.il/Verdicts...,"פסק-דין בתיק ע""פ 4272/05 בבית המשפט העליון בשב..."
8,10467/08,"ע""א 10467/08 עומר חג'אזי נ. אדיב עיסא דיאב",2010-11-03,https://supremedecisions.court.gov.il/Verdicts...,"פסק דין בתיק ע""א 10467/08 בבית המשפט העליון בש..."
9,3330/11,"ע""א 3330/11 אגד אגודה שיתופית לתחבורה בישראל ב...",2011-11-17,https://supremedecisions.court.gov.il/Verdicts...,"פסק דין בתיק ע""א 3330/11 בבית המשפט העליון בשב..."


# Translate Hebrew to English

In [4]:
# Address of the Ollama server that OllamaLLM connects to
base_url = "http://localhost:11434"
# Model tag passed to OllamaLLM for the Hebrew -> English translation
translation_model = 'dictalm2.0-instruct:f16'

### Estimate token counts with the GPT-4o-mini tokenizer (tiktoken)

In [5]:
# tiktoken encoding used by count_tokens to size the chunks.
# NOTE: this is the GPT-4o-mini tokenizer while translation is done by a different model
# (see `translation_model`), so the counts should be treated as an estimate only.
encoding = tiktoken.encoding_for_model("gpt-4o-mini")

### Define translation chain

In [6]:
# Prompt used for every chunk; {text} is filled with the Hebrew chunk.
# The template is a triple-quoted string, so the line breaks and the leading
# indentation of its lines are part of the prompt that is sent to the model.
translation_prompt = PromptTemplate(
    input_variables=["text"],
    template="""Translate the following Hebrew text to English. Provide a complete, word-for-word translation without summarizing. Respond only in English.\n\n
    Hebrew: {text}
    English: 
    """
)

# Ollama-served model: temperature 0.0 for repeatable output; num_predict and num_ctx are
# Ollama options (limit on generated tokens and context window size, per the Ollama docs)
translation_llm = OllamaLLM(base_url=base_url, model=translation_model, temperature=0.0, num_predict=3072, num_ctx=8192)

# Pipe the formatted prompt into the model; invoked with {"text": <chunk>} in translate_text
translation_chain = translation_prompt | translation_llm

In [None]:
def split_df_column(df, col, max_length=32767):
    """
    Split a long text column into numbered columns of at most `max_length` characters.

    Every string in `df[col]` is sliced into consecutive pieces which are written to
    new columns `<col>_1`, `<col>_2`, ... Rows that need fewer pieces than the longest
    row hold NaN in the remaining columns. Cuts are made purely by character count
    (a cut may fall inside a word); concatenating a row's pieces in order gives back
    the original text. The original `col` column is kept.

    Args:
        df: input DataFrame; it is not modified.
        col: name of the text column to split.
        max_length: maximum number of characters per piece. The default (32767)
            is the maximum number of characters a single Excel cell can hold.

    Returns:
        A copy of `df` with the additional `<col>_<n>` columns.

    Raises:
        ValueError: if `max_length` is smaller than 1.
    """
    if max_length < 1:
        raise ValueError(f"max_length must be a positive integer, got {max_length}")

    def split_long_text(text, max_length):
        """Slice `text` into consecutive pieces of at most `max_length` characters."""
        return [text[i:i + max_length] for i in range(0, len(text), max_length)]

    # Create a copy of the DataFrame to avoid modifying the original
    df_copy = df.copy()

    # Missing or non-string cells (e.g. NaN) have nothing to split: they yield no
    # pieces instead of crashing on len()
    pieces_per_row = [
        split_long_text(value, max_length) if isinstance(value, str) else []
        for value in df_copy[col]
    ]
    n_parts = max(map(len, pieces_per_row), default=0)

    # Build each new column positionally (one value per row) rather than cell by cell
    # through index labels, so frames with repeated index labels are filled correctly
    missing = float('nan')
    for part in range(n_parts):
        df_copy[f'{col}_{part + 1}'] = [
            pieces[part] if part < len(pieces) else missing
            for pieces in pieces_per_row
        ]

    return df_copy

def split_to_sentences(text):
    """
    Break the text into sentences.

    A sentence boundary is any run of whitespace that directly follows '.', '?'
    or '!'. The punctuation stays with its sentence; surrounding whitespace is
    trimmed and empty pieces are dropped.
    """
    # whitespace preceded by a sentence-ending character
    boundary = re.compile(r'(?<=[.?!])\s+')

    sentences = []
    for piece in boundary.split(text):
        trimmed = piece.strip()
        if trimmed:
            sentences.append(trimmed)
    return sentences

def split_to_paragraphs(text):
    """
    Cut the text at enumeration markers such as "1. ", "12. " or a Hebrew letter and a dot.

    A marker is one or two digits, or a single Hebrew letter, followed by a dot and
    one whitespace character; it only counts when it comes directly after ". ".
    The text before the first marker is returned as the first element, and every
    following element starts with its marker rebuilt as "<marker>. ".
    """
    marker = r'(?<=\. )([0-9]{1,2}|[א-ת])\.\s'

    # Because of the capturing group, re.split yields [head, marker, body, marker, body, ...]
    head, *tail = re.split(marker, text)

    # Glue every captured marker back onto the body that follows it
    return [head] + [f"{label}. {body}" for label, body in zip(tail[0::2], tail[1::2])]

def count_tokens(text):
    """Return how many tokens the notebook-level `encoding` produces for `text`."""
    token_ids = encoding.encode(text)
    return len(token_ids)

def split_text(text, max_tokens=512):
    """
    Split the text into chunks of at most `max_tokens` tokens for translation.

    The text is first cut into paragraphs with `split_to_paragraphs`. A paragraph
    that fits the budget becomes one chunk as-is. A larger paragraph is cut into
    sentences with `split_to_sentences`, and the sentences are packed greedily
    into chunks so that a sentence is never divided between two chunks.

    A single sentence that exceeds `max_tokens` on its own is still emitted as
    one oversized chunk, because sentences are never broken. Empty or
    whitespace-only chunks are never emitted.

    Args:
        text: the text to split.
        max_tokens: token budget per chunk, measured with `count_tokens`.

    Returns:
        List of chunk strings in their original order.
    """
    chunks = []

    for paragraph in split_to_paragraphs(text):
        # a blank paragraph carries nothing to translate
        if not paragraph.strip():
            continue

        # if the paragraph is within the budget, add it directly
        if count_tokens(paragraph) <= max_tokens:
            chunks.append(paragraph)
            continue

        # otherwise pack the paragraph's sentences into chunks
        current_chunk = ""
        for sentence in split_to_sentences(paragraph):
            # Close the running chunk when the next sentence would overflow it.
            # The `current_chunk` guard stops an empty chunk from being emitted
            # when the very first sentence is already over the budget.
            if current_chunk and count_tokens(current_chunk + sentence) > max_tokens:
                chunks.append(current_chunk.strip())
                current_chunk = ""
            current_chunk += sentence + " "

        # add the last chunk if it's not empty
        if current_chunk:
            chunks.append(current_chunk.strip())

    return chunks

def translate_text(i_text):
    """
    Translate the input text chunk by chunk with the translation chain.

    The text is cut with `split_text`, each chunk is sent through
    `translation_chain`, and the results are joined with single spaces.
    Surrounding whitespace is trimmed and every run of newlines is replaced
    by one space.
    """
    translated_parts = []
    for chunk in split_text(i_text):
        translated_parts.append(translation_chain.invoke({"text": chunk}))

    joined = ' '.join(translated_parts).strip()
    return re.sub(r'\n+', ' ', joined)

In [9]:
# apply the translation function to 'document_body' column
df_translated = df.copy()
df_translated['document_body_english'] = df_translated['document_body'].apply(translate_text)

In [14]:
# Character length of each translated document
df_translated['document_body_english'].apply(len)

0       502
1       944
2     10014
3     23063
4     12965
      ...  
95      576
96      863
97      910
98     4262
99      865
Name: document_body_english, Length: 100, dtype: int64

In [19]:
# Print the character length of every translated document, one line per document
for i, v in enumerate(df_translated['document_body_english'].apply(len)):
    print(f"doc#{i}, char length: {v}")

doc#0, char length: 502
doc#1, char length: 944
doc#2, char length: 10014
doc#3, char length: 23063
doc#4, char length: 12965
doc#5, char length: 1241
doc#6, char length: 6352
doc#7, char length: 932
doc#8, char length: 37109
doc#9, char length: 13205
doc#10, char length: 2055
doc#11, char length: 2202
doc#12, char length: 411
doc#13, char length: 2426
doc#14, char length: 1655
doc#15, char length: 430
doc#16, char length: 797
doc#17, char length: 5706
doc#18, char length: 11550
doc#19, char length: 816
doc#20, char length: 13622
doc#21, char length: 880
doc#22, char length: 1702
doc#23, char length: 1568
doc#24, char length: 1090
doc#25, char length: 1857
doc#26, char length: 719
doc#27, char length: 8808
doc#28, char length: 2430
doc#29, char length: 911
doc#30, char length: 2975
doc#31, char length: 7901
doc#32, char length: 2011
doc#33, char length: 3801
doc#34, char length: 627
doc#35, char length: 734
doc#36, char length: 1823
doc#37, char length: 1368
doc#38, char length: 1074
d

<div class="alert alert-block alert-info">  
<b>⚠️ Info:</b> English text tends to be longer than the equivalent Hebrew, so a translated document can exceed Excel's limit of 32,767 characters per cell. To avoid truncation when saving, the translated text is split across multiple columns.
</div>

In [48]:
# Spread the English text over numbered columns using split_df_column's default piece size
df_trans_split = split_df_column(df_translated, col='document_body_english')

Before split

In [49]:
# Rows 60-69 before the split: the whole translation sits in `document_body_english`
df_translated[60:70]

Unnamed: 0,case_number,procedure_name,case_date,case_link,document_body,document_body_english
60,8970/15,"ע""פ 8970/15 אחמד עיסא נ. מדינת ישראל",2016-11-07,https://supremedecisions.court.gov.il/Verdicts...,"פסק דין בתיק ע""פ 8970/15 בבית המשפט העליון בשב...",The case of Appellant Ahmed Issa v. State of I...
61,319/21,"ע""פ 319/21",2022-04-10,https://supremedecisions.court.gov.il/Verdicts...,"פסק דין בתיק ע""פ 319/21 בבית המשפט העליון בשבת...",In the case of Appeal No. 319/21 at the Suprem...
62,1494/07,"ע""פ 1494/07 ג'ואד אבו כפ נ. מדינת ישראל",2008-06-02,https://supremedecisions.court.gov.il/Verdicts...,"החלטה בתיק ע""פ 1494/07 בבית המשפט העליון בשבתו...",The decision in case No. 1494/07 at the Suprem...
63,953/14,"ע""א 953/14 מ.ע.א השקעות בע""מ נ. המועצה המקומית...",2014-05-18,https://supremedecisions.court.gov.il/Verdicts...,"החלטה בתיק ע""א 953/14 בבית המשפט העליון בירושל...",The decision in Case A953/14 at the Supreme Co...
64,8940/20,"ע""א 8940/20 עמותת ישיבת אהל יוספ נ. רשמ העמותות",2021-01-07,https://supremedecisions.court.gov.il/Verdicts...,"פסק דין בתיק ע""א 8940/20 בבית המשפט העליון ע""א...",The decision in Civil Appeal No. 8940/20 at th...
65,7209/22,"ע""פ 7209/22",2022-12-25,https://supremedecisions.court.gov.il/Verdicts...,"החלטה בתיק ע""פ 7107/22 בבית המשפט העליון ע""פ 7...",The decision in Case No. 7107/22 at the Suprem...
66,8361/17,"ע""פ 8361/17 חליל עומר נ. מדינת ישראל",2018-07-11,https://supremedecisions.court.gov.il/Verdicts...,"פסק דין בתיק ע""פ 8361/17 בבית המשפט העליון בשב...",In the case of Appeal No. 8361/17 at the Supre...
67,11628/05,"ע""א 11628/05 אליהו חברה לביטוח בע""מ נ. מאגר צי...",2007-12-27,https://supremedecisions.court.gov.il/Verdicts...,"['פסק דין בתיק ע""א 11628/05 בבית המשפט העליון ...",The case of Appeal No. 11628/05 at the Supreme...
68,7799/10,"ע""א 7799/10 אריה גולובנציק נ. מנהל מיסוי מקרקע...",2010-12-15,https://supremedecisions.court.gov.il/Verdicts...,"החלטה בתיק ע""א 7799/10 בבית המשפט העליון בירוש...",In the case of Appeal No. 7799/10 at the Supre...
69,2542/19,"ע""פ 2542/19 ג'רמי ריצ'רד טוויל נ. פרקליטות המדינה",2019-12-03,https://supremedecisions.court.gov.il/Verdicts...,"פסק דין בתיק ע""פ 2542/19 בבית המשפט העליון בשב...",The Israeli Supreme Court in Criminal Appeal N...


After split

In [50]:
# Same rows after the split: the text is spread over the numbered `document_body_english_<n>` columns
df_trans_split[60:70]

Unnamed: 0,case_number,procedure_name,case_date,case_link,document_body,document_body_english,document_body_english_1,document_body_english_2
60,8970/15,"ע""פ 8970/15 אחמד עיסא נ. מדינת ישראל",2016-11-07,https://supremedecisions.court.gov.il/Verdicts...,"פסק דין בתיק ע""פ 8970/15 בבית המשפט העליון בשב...",The case of Appellant Ahmed Issa v. State of I...,The case of Appellant Ahmed Issa v. State of I...,
61,319/21,"ע""פ 319/21",2022-04-10,https://supremedecisions.court.gov.il/Verdicts...,"פסק דין בתיק ע""פ 319/21 בבית המשפט העליון בשבת...",In the case of Appeal No. 319/21 at the Suprem...,In the case of Appeal No. 319/21 at the Suprem...,"court's intervention (see, among others, APP 7..."
62,1494/07,"ע""פ 1494/07 ג'ואד אבו כפ נ. מדינת ישראל",2008-06-02,https://supremedecisions.court.gov.il/Verdicts...,"החלטה בתיק ע""פ 1494/07 בבית המשפט העליון בשבתו...",The decision in case No. 1494/07 at the Suprem...,The decision in case No. 1494/07 at the Suprem...,
63,953/14,"ע""א 953/14 מ.ע.א השקעות בע""מ נ. המועצה המקומית...",2014-05-18,https://supremedecisions.court.gov.il/Verdicts...,"החלטה בתיק ע""א 953/14 בבית המשפט העליון בירושל...",The decision in Case A953/14 at the Supreme Co...,The decision in Case A953/14 at the Supreme Co...,
64,8940/20,"ע""א 8940/20 עמותת ישיבת אהל יוספ נ. רשמ העמותות",2021-01-07,https://supremedecisions.court.gov.il/Verdicts...,"פסק דין בתיק ע""א 8940/20 בבית המשפט העליון ע""א...",The decision in Civil Appeal No. 8940/20 at th...,The decision in Civil Appeal No. 8940/20 at th...,
65,7209/22,"ע""פ 7209/22",2022-12-25,https://supremedecisions.court.gov.il/Verdicts...,"החלטה בתיק ע""פ 7107/22 בבית המשפט העליון ע""פ 7...",The decision in Case No. 7107/22 at the Suprem...,The decision in Case No. 7107/22 at the Suprem...,
66,8361/17,"ע""פ 8361/17 חליל עומר נ. מדינת ישראל",2018-07-11,https://supremedecisions.court.gov.il/Verdicts...,"פסק דין בתיק ע""פ 8361/17 בבית המשפט העליון בשב...",In the case of Appeal No. 8361/17 at the Supre...,In the case of Appeal No. 8361/17 at the Supre...,
67,11628/05,"ע""א 11628/05 אליהו חברה לביטוח בע""מ נ. מאגר צי...",2007-12-27,https://supremedecisions.court.gov.il/Verdicts...,"['פסק דין בתיק ע""א 11628/05 בבית המשפט העליון ...",The case of Appeal No. 11628/05 at the Supreme...,The case of Appeal No. 11628/05 at the Supreme...,hould bear the full amount of the indemnificat...
68,7799/10,"ע""א 7799/10 אריה גולובנציק נ. מנהל מיסוי מקרקע...",2010-12-15,https://supremedecisions.court.gov.il/Verdicts...,"החלטה בתיק ע""א 7799/10 בבית המשפט העליון בירוש...",In the case of Appeal No. 7799/10 at the Supre...,In the case of Appeal No. 7799/10 at the Supre...,
69,2542/19,"ע""פ 2542/19 ג'רמי ריצ'רד טוויל נ. פרקליטות המדינה",2019-12-03,https://supremedecisions.court.gov.il/Verdicts...,"פסק דין בתיק ע""פ 2542/19 בבית המשפט העליון בשב...",The Israeli Supreme Court in Criminal Appeal N...,The Israeli Supreme Court in Criminal Appeal N...,


In [51]:
# The unsplit column repeats the text held in the numbered columns, so it is dropped
# before saving. errors='ignore' keeps this cell re-runnable: without it a second run
# raises KeyError because the column is already gone.
df_trans_split = df_trans_split.drop(columns=['document_body_english'], errors='ignore')

# Save translated data

In [52]:
# Derive the output path from the input path by swapping the "processed" marker
f_path_translated = f_path.replace("processed", "translated")
# If the input path has no "processed" marker, replace() changes nothing and the save
# below would overwrite the source file, so stop here instead
assert f_path_translated != f_path, "output path equals input path; refusing to overwrite the source file"
save_file(df_trans_split, f_path_translated)

DataFrame successfully saved to data/translated_data_ar_ap_100.xlsx
