## Imports & pip

In [None]:
# # virtual environment
# !pip install virtualenv
# !virtualenv myenv
# !source myenv/bin/activate

In [None]:
!pip install langchain
!pip install openai
!pip install openai==0.28
!pip  install -U farasapy
!pip install gensim
!pip install typing_extensions==4.7.1 --upgrade
!pip install gradio
!pip install python-dotenv
!pip install bert_score
!pip install sentence-transformers
!pip install chromadb

In [None]:
import pandas as pd
import re
import string
from langchain.output_parsers import StructuredOutputParser
from langchain.output_parsers import ResponseSchema
from langchain.prompts import ChatPromptTemplate
from langchain.chat_models import ChatOpenAI
import datetime
import os
import openai
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize, sent_tokenize
nltk.download('punkt')
import re
from nltk.stem import ARLSTem
from farasa.pos import FarasaPOSTagger
from farasa.ner import FarasaNamedEntityRecognizer
from farasa.diacratizer import FarasaDiacritizer
from farasa.segmenter import FarasaSegmenter
from farasa.stemmer import FarasaStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import normalize
import gradio as gr
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Data

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the court-cases dataset from the mounted Drive. Later cells read its
# 'text_for_simi' column (TF-IDF corpus) and 'cases_text' column (case text).
df = pd.read_csv("/content/drive/MyDrive/dataset/dataset_v3.csv")

In [None]:
df.isnull().sum()

Unnamed: 0.1         0
F1                 331
court                0
city                33
court_id             0
court_date           0
appeal_court       831
region             864
appeal_id          831
appeal_date        864
judgment_text        0
appeal_text        864
Unnamed: 0        1897
links             1897
appeal_data       2195
appleal_text      2195
judgment_regex       0
cases_text           0
text_for_simi        0
dtype: int64

## Similarity preprocessing for input text

In [None]:
## Stop words
# Start from NLTK's Arabic stop-word list and extend it with boilerplate
# terms (religious formulae, court/role titles, procedural vocabulary).
stop_words = set(stopwords.words('arabic'))
# Duplicates and entries with a leading space are harmless here: the set is
# joined with spaces and re-split on whitespace after stemming below.
to_remove = {"الحمد" ,"لله","والصلاة","السلام","رسول", "القاضي", "رئيسا", "عضوا", "الدائرة",
             "وبناءً" ,"القضية", "رئيس" ,"وصلى", "ﷲ"," وسلم"," نبينا", "محمد", "وآله"," وصحبه" ,"أجمعين", "وبﷲ", "والسلام"
             ,"التوفيق", "المدعي"," عليها", "هوية"," وطنية", "إلزام" , "منطوق" ,"الحكم","جلسة", "وكيل"," المدعية", "وكالة","حكم"
             ,"رقم", "حكمت", "العام", "موكلتي", "صحيفة", "الدعوى", "تاريخ","بموجب","وكالة", "حضر" ,"وكيل","المدعى","جلسة", "هوية","وطنية",
             "سجل","تجاري"}

# Union of the generic and the domain-specific stop words.
stop = stop_words | to_remove

# Farasa stemmer used only to stem the stop-word list itself.
stemmer = FarasaStemmer(interactive=True)

# Stem every stop word in one call, then split back into a set of stems.
# NOTE(review): the stems are not passed through Processor_simi.normalize_arabic,
# whereas similarity_tfidf normalizes its query before calling remove_stopwords —
# confirm the two forms actually match.
stop = set(stemmer.stem(' '.join(stop)).split())

def remove_stopwords(text):
    """Return `text` with every token found in the module-level `stop` set dropped.

    The text is split with NLTK's `word_tokenize`; surviving tokens are
    re-joined with single spaces.
    """
    kept_tokens = []
    for token in word_tokenize(text):
        if token in stop:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)



In [None]:
class Processor_simi:
    """Arabic text normalizer and stemmer used to prepare text for similarity search."""

    def __init__(self):
        """Build the punctuation set and start an interactive Farasa stemmer."""
        arabic_punctuations = '''`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ'''
        english_punctuations = string.punctuation
        # Set of single characters stripped by normalize_arabic (includes the tatweel 'ـ').
        self.all_punctuations = set(arabic_punctuations + english_punctuations)

        # Initialize Farasa stemmer for stemming
        self.stemmer = FarasaStemmer(interactive=True)

    def normalize_arabic(self, text):
        """Strip digits/punctuation and unify Arabic letter variants.

        Args:
            text: the string to normalize.

        Returns:
            The normalized string. Whitespace is not collapsed or trimmed.
        """
        # Remove digits and slashes. In a str pattern `\d` matches every Unicode
        # decimal digit, so Western and Arabic-Indic digits go in this single pass
        # (the former separate digit passes were redundant).
        text = re.sub(r'[\d/]+', '', text)

        # Remove every Arabic/English punctuation character, tatweel included.
        # The previous tatweel removal sat after `return` and was unreachable;
        # this pass already covers it.
        text = re.sub(f'[{re.escape("".join(self.all_punctuations))}]', '', text)

        # Normalize Arabic characters
        text = re.sub("[إأآاٱ]", "ا", text)  # alef variants -> bare alef
        text = re.sub("ى", "ي", text)
        text = re.sub("ؤ", "ء", text)
        text = re.sub("ئ", "ء", text)
        text = re.sub("ة", "ه", text)
        text = re.sub("گ", "ك", text)

        # Drop a leading "ال" when at least two word characters follow it.
        text = re.sub(r'\bال(\w\w+)', r'\1', text)
        # Split a leading "و" off a word that follows whitespace.
        text = re.sub(r'\sو(\w+)', r' و \1', text)
        return text

    def stem_and_process(self, text):
        """Return `text` stemmed by the Farasa stemmer."""
        return self.stemmer.stem(text)

# Shared Processor_simi instance; similarity_tfidf uses it to stem and
# normalize the query text.
text_processor_simi = Processor_simi()



## OpenAI API & LLM model

In [None]:
from dotenv import load_dotenv, find_dotenv
import os

# Load variables from the .env file located by find_dotenv() into the process
# environment; the return value is discarded.
_= load_dotenv(find_dotenv())
# NOTE(review): `myvar` is not referenced by any other cell visible in this
# notebook — presumably the OpenAI client picks the key up from the
# environment directly; confirm before removing.
myvar = os.getenv('OPENAI_API_KEY')

In [None]:
# Chat model shared by every summarization and Q&A call in this notebook.
llm_model = "gpt-3.5-turbo-16k-0613"
# temperature=0.0 is passed to the model wrapper for all calls.
chat = ChatOpenAI(temperature=0.0, model=llm_model)

## Prompt Template

In [None]:
# Value substituted for the {style} placeholder of every prompt template
# below (output language and tone).
style = """ Arabic \
in a calm and respectful tone
"""

In [None]:
# Summarization prompt used by summarize() in the Q&A flow.
# Placeholders: {style} (output language/tone) and {text} (case text).
# NOTE(review): the prompt speaks of "three cases", but summarize() is applied
# row by row, so each call formats a single row's 'cases_text' — confirm intent.
template_QASummrize = """You are an Arabic judicial summrizer:
summarize the key points from the three cases mentioned, into three sentences per paragraph, with each sentence not exceeding 30 words::\
Format the output into {style}\
cases: {text}
"""
QASummrize_template = ChatPromptTemplate.from_template(template_QASummrize)

In [None]:
# Structured summarization prompt used by similarity_summarization() for the
# similar-cases tab: facts, reasons, then ruling, with names removed.
# Placeholders: {style} (output language/tone) and {text} (case text).
template_Summrize = """ You are an Arabic judicial summarizer:
summarize the key points from the three given cases mentioned, and generate three sentences for each paragraph,
each case contains three paragraphs (الوقائع, الاسباب, منطوق الحكم). First summarize <الوقائع> and make it short, then summarize <الاسباب>, finally summarize <منطوق الحكم>.
Remove mentioned names and ensure each generated sentence is no more than 30 words:\
Format the output into {style}.\
cases:{text}
"""

Summrize_template = ChatPromptTemplate.from_template(template_Summrize)

In [None]:
# Q&A (chatbot) prompt: answers the user's question from the summarized cases.
# Placeholders: {cases} (joined summaries), {input_text} (user question)
# and {style} (output language/tone).
template_answer = """ You are an Arabic judicial assistant:
You can answer the questions based on the given summarized cases. the answer should not exceed 30 words.
If the question needs a number answer, your answer should be in the range of two numbers, for example, if the actual answer is 2000 you should respond (between 1000 and 3000).
cases: {cases}
Question: {input_text}
Format the output into {style} """

QA_template = ChatPromptTemplate.from_template(template_answer)

## VectorDB

In [None]:
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
# create the open-source embedding function
embedding_function = SentenceTransformerEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

In [None]:
# Open the Chroma collection stored under persist_directory on Drive,
# using the sentence-transformers embedding function defined above.
persist_directory = '/content/drive/MyDrive/dataset/docs/chroma'
from langchain.vectorstores import Chroma
vectordb = Chroma(
    embedding_function=embedding_function,
    persist_directory=persist_directory
)
# Sanity check: number of stored entries (reads the private `_collection` attribute).
print(vectordb._collection.count())

2228


## Functions

In [None]:
# Similarity function using TF-IDF -> return dataframe

def similarity_tfidf(input_text):
    """Return the three dataset cases most similar to `input_text` (TF-IDF + SVD).

    The query is stemmed, normalized and stripped of stop words, then compared
    by cosine similarity against `df['text_for_simi']` in a 100-component
    truncated-SVD space fitted on the TF-IDF matrix.

    Args:
        input_text: raw Arabic query text.

    Returns:
        A new DataFrame with columns ['cases_text', 'similarity'] holding the
        three highest-scoring rows, best first. The global `df` is not modified
        and the result is an independent frame, so callers may add columns to
        it without a SettingWithCopyWarning.
    """
    query = text_processor_simi.normalize_arabic(text_processor_simi.stem_and_process(input_text))
    query = remove_stopwords(query)

    # NOTE: the vectorizer and SVD are refitted on the whole corpus for every
    # query; the fitted models could be cached if latency becomes a problem.
    vectorizer = TfidfVectorizer(min_df=3, max_df=0.9)
    corpus_tfidf = vectorizer.fit_transform(df['text_for_simi'])

    # Explicit random_state makes the randomized SVD reproducible without
    # depending on the caller seeding NumPy's global RNG (callers in this
    # notebook seed it with 42, so results are unchanged).
    svd = TruncatedSVD(n_components=100, random_state=42)
    corpus_svd = normalize(svd.fit_transform(corpus_tfidf), norm='l2')
    query_svd = normalize(svd.transform(vectorizer.transform([query])), norm='l2')

    scores = cosine_similarity(query_svd, corpus_svd)[0]

    # Build a fresh frame instead of writing a 'similarity' column into the
    # shared global `df` on every call.
    ranked = df[['cases_text']].assign(similarity=scores)
    return ranked.sort_values(by='similarity', ascending=False).head(3)

In [None]:
# Similarity function using vectorDB -> return dataframe
def similarity_vector(input_text):
    """Return the three most relevant stored cases for `input_text` from the vector DB.

    Args:
        input_text: raw query text; it is embedded by the store's embedding function.

    Returns:
        DataFrame with columns ['cases_text', 'similarity'], where 'cases_text'
        is the stored document text and 'similarity' its relevance score.
    """
    matches = vectordb.similarity_search_with_relevance_scores(input_text, k=3)

    # Each match is a (Document, score) pair. Keep the document's text rather
    # than the Document object itself, otherwise the object's repr (including
    # metadata) is what gets formatted into the summarization prompt.
    rows = [(document.page_content, score) for document, score in matches]
    return pd.DataFrame(rows, columns=['cases_text', 'similarity'])

In [None]:
# Summarization function for Q/A
def summarize(df):
    """Summarize the 'cases_text' of `df` with the Q&A summarization prompt.

    Despite its name, `df` is whatever `DataFrame.apply(..., axis=1)` passes in
    from the callers in this notebook, i.e. a single row.

    Returns:
        The text content of the chat model's reply.
    """
    prompt_messages = QASummrize_template.format_messages(text=df['cases_text'], style=style)
    reply = chat(prompt_messages)
    return reply.content

In [None]:
# Summrize function for Tap2 (Similarity and Summarization)
def similarity_summarization(df):
    """Summarize the 'cases_text' of `df` with the structured (facts/reasons/ruling) prompt.

    Despite its name, `df` is whatever `DataFrame.apply(..., axis=1)` passes in
    from the caller in this notebook, i.e. a single row.

    Returns:
        The text content of the chat model's reply.
    """
    prompt_messages = Summrize_template.format_messages(text=df['cases_text'], style=style)
    reply = chat(prompt_messages)
    return reply.content

## Tab 1: ChatBot

In [None]:
# Tap1 :أسأل سؤالك - version 1

# using TF-IDF
def process_and_answer_tfidf(input_text, history):
    """Answer `input_text` from the three most similar cases found via TF-IDF.

    Args:
        input_text: the user's question.
        history: chat history supplied by the chat UI; not used.

    Returns:
        The model's answer with a newline inserted after every '.'.
    """
    np.random.seed(42)

    # Retrieve the closest cases, then summarize each one individually
    top_3_similar = similarity_tfidf(input_text)
    top_3_similar['summarized_text'] = top_3_similar.apply(summarize, axis=1)

    # Feed the joined summaries plus the question to the Q&A prompt
    joined_summaries = ".".join(top_3_similar['summarized_text'])
    prompt_messages = QA_template.format_messages(
        input_text=input_text,
        cases=joined_summaries,
        style=style,
    )
    answer = chat(prompt_messages).content

    return answer.replace(".", ".\n")

In [None]:
# Tap1 :أسأل سؤالك - version 2

# using VectorDB
def process_and_answer_vec(input_text, history):
    """Answer `input_text` from the three most relevant cases found in the vector DB.

    Args:
        input_text: the user's question.
        history: chat history supplied by the chat UI; not used.

    Returns:
        The model's answer with a newline inserted after every '.'.
    """
    np.random.seed(42)

    # Retrieve the closest cases, then summarize each one individually
    top_3_similar = similarity_vector(input_text)
    top_3_similar['summarized_text'] = top_3_similar.apply(summarize, axis=1)

    # Feed the joined summaries plus the question to the Q&A prompt
    joined_summaries = ".".join(top_3_similar['summarized_text'])
    prompt_messages = QA_template.format_messages(
        input_text=input_text,
        cases=joined_summaries,
        style=style,
    )
    answer = chat(prompt_messages).content

    return answer.replace(".", ".\n")

## Tab 2: Finding Similar Cases

In [None]:
# Tap 2: القضايا المتشابهة - بالتلخيص

def similarity_with_summrize(input_text, history=None):
    """Return summaries of the dataset cases similar to `input_text`.

    Args:
        input_text: the case text entered by the user.
        history: unused. Optional so the function works both as a chat-style
            callback (two arguments) and as a button handler wired to a single
            input component (one argument).

    Returns:
        A formatted string with the similarity percentage and summary of every
        top-3 case whose similarity exceeds 0.5, or a fixed "no similar cases"
        message when none qualifies.
    """
    np.random.seed(42)

    # Similarity
    top_3_similar = similarity_tfidf(input_text)

    # Filter BEFORE summarizing so no LLM call is spent on cases that would be
    # discarded anyway; .copy() gives an independent frame to add a column to.
    filtered_results = top_3_similar[top_3_similar['similarity'] > 0.5].copy()

    if filtered_results.empty:
        return "ليس هناك قضايا مشابهة لقضيتك في قاعدة البيانات "

    # Summarize only the retained cases
    filtered_results['summarized_text'] = filtered_results.apply(similarity_summarization, axis=1)

    # One formatted entry per retained case
    formatted_results = "\n".join(
        [f"نسبة التشابة: %{similarity * 100:.2f} \nالقضية:\n{cases_summary}\n" for similarity, cases_summary in
         zip(filtered_results['similarity'], filtered_results['summarized_text'])])

    return formatted_results

In [None]:
# عرض القضايا بدون تلخيص
def without_summrize(input_text, history=None):
    """Return the full text of the dataset cases similar to `input_text`.

    Args:
        input_text: the case text entered by the user.
        history: unused. Optional so the function works both as a chat-style
            callback (two arguments) and as a button handler wired to a single
            input component (one argument).

    Returns:
        A formatted string with the similarity percentage and text of every
        top-3 case whose similarity exceeds 0.5, or a fixed "no similar cases"
        message when none qualifies.
    """
    np.random.seed(42)

    # Similarity
    top_3_similar = similarity_tfidf(input_text)

    filtered_results = top_3_similar[top_3_similar['similarity'] > 0.5]

    if filtered_results.empty:
        return "ليس هناك قضايا مشابهة لقضيتك في قاعدة البيانات "

    # One formatted entry per retained case
    formatted_results = "\n".join(
        [f"نسبة التشابة: %{similarity * 100:.2f} \nالقضية:\n{cases_text}\n" for similarity, cases_text in
         zip(filtered_results['similarity'], filtered_results['cases_text'])])

    return formatted_results

## Tab 3: Named entities

In [None]:
# Maps the model's generic LABEL_<i> names to NER tags. Only ORG and PER
# tags are kept; every other label collapses to 'O' (not an entity).
label_mapping = {
    # LABEL_0 was missing, so tokens with that label kept the raw name
    # 'LABEL_0' and were treated as entities instead of being dropped.
    'LABEL_0': 'O',
    'LABEL_1': 'O',
    'LABEL_2': 'B-ORG',
    'LABEL_3': 'B-PER',
    'LABEL_4': 'O',
    'LABEL_5': 'O',
    'LABEL_6': 'I-ORG',
    'LABEL_7': 'I-PER',
    'LABEL_8': 'O'
         # Add mappings for other labels as needed
}

# Arabic display names for the entity tags shown in the UI.
entity_name_mapping = {
     'B-ORG' :'منظمة',
     'B-PER':  'شخص'

}

In [None]:
from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification
import gradio as gr

# Directory on Drive that holds the fine-tuned NER model checkpoints.
new_model_path = "/content/drive/MyDrive/ner-model"

device = "cuda:0"

# Tag set the model was configured with; the index in this list corresponds
# to the numeric suffix of the model's LABEL_<i> names.
custom_labels = ['B-DATE', 'B-NUM', 'B-ORG', 'B-PER', 'I-DATE', 'I-NUM', 'I-ORG', 'I-PER', 'O']

# Maps LABEL_<i> to the tag to display. Only ORG and PER tags are kept;
# DATE, NUM and O all collapse to 'O' (not an entity).
label_mapping = {
    # LABEL_0 (index 0 = 'B-DATE') was missing, so date tokens kept the raw
    # name 'LABEL_0' and were shown as entities instead of being dropped.
    'LABEL_0': 'O',
    'LABEL_1': 'O',
    'LABEL_2': 'B-ORG',
    'LABEL_3': 'B-PER',
    'LABEL_4': 'O',
    'LABEL_5': 'O',
    'LABEL_6': 'I-ORG',
    'LABEL_7': 'I-PER',
    'LABEL_8': 'O'
         # Add mappings for other labels as needed
}

# Arabic display names for the entity tags shown in the UI.
entity_name_mapping = {
     'B-ORG' :'منظمة',
     'B-PER':  'شخص'

}

# Checkpoint directory of the best fine-tuned model.
model_cp = f"{new_model_path}/best"

# Use the first GPU when one is available, otherwise fall back to CPU so the
# cell also runs on a CPU-only runtime. Set once, before the model is moved,
# so the model and the pipeline agree on the device.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(model_cp)
model = AutoModelForTokenClassification.from_pretrained(model_cp, num_labels=len(custom_labels)).to(device)

# Create NER pipeline
get_completion = pipeline(
    "ner",
    model=model,
    tokenizer=tokenizer,
    device=device
)

def map_labels(output):
    """Rewrite each prediction's 'entity' through `label_mapping`, in place.

    Labels without an entry in `label_mapping` are left unchanged.

    Returns:
        The same `output` list that was passed in.
    """
    for prediction in output:
        raw_label = prediction['entity']
        prediction['entity'] = label_mapping.get(raw_label, raw_label)
    return output

def merge_tokens(tokens):
    """Merge consecutive token predictions that belong to the same entity.

    Args:
        tokens: list of dicts with keys 'entity', 'word', 'end' and 'score',
            in text order. Tokens tagged 'O' are dropped.

    Returns:
        A new list of entity dicts. An 'I-<type>' token is merged into the
        entity currently being built only when that entity has the same type
        AND no 'O' token lies between them. The input dicts are not modified.
    """
    merged_tokens = []
    open_entity = None  # entity that the next I- token is allowed to extend

    for token in tokens:
        label = token['entity']

        if label == 'O':
            # A non-entity token closes the running entity. Previously a later
            # I- token could still be glued onto a distant earlier entity,
            # stretching its span across unrelated text.
            open_entity = None
            continue

        continues_open_entity = (
            open_entity is not None
            and label.startswith('I-')
            and open_entity['entity'].endswith(label[2:])
        )

        if continues_open_entity:
            piece = token['word']
            if piece.startswith('##'):
                # Word-piece continuation: attach directly to the previous piece
                open_entity['word'] += piece.replace('##', '')
            else:
                # A separate word of the same entity: keep the word boundary
                open_entity['word'] += ' ' + piece
            open_entity['end'] = token['end']
            # Running pairwise average, as before (not a true mean over pieces)
            open_entity['score'] = (open_entity['score'] + token['score']) / 2
        else:
            # Start a new entity on a copy so the caller's token is untouched
            open_entity = dict(token)
            merged_tokens.append(open_entity)

    return merged_tokens

def ner(input):
    """Run the NER pipeline on `input` and return data for `gr.HighlightedText`.

    Args:
        input: the text to analyse (the name shadows the builtin but is kept
            so existing callers are unaffected).

    Returns:
        dict with "text" (the original input) and "entities" (merged entity
        dicts whose 'entity' label is translated through `entity_name_mapping`
        when a translation exists).
    """
    output = get_completion(input)
    output = map_labels(output)
    merged_tokens = merge_tokens(output)

    # Replace tags with their Arabic display names; tags without an entry in
    # entity_name_mapping are left as they are.
    for token in merged_tokens:
        token['entity'] = entity_name_mapping.get(token['entity'], token['entity'])

    return {"text": input, "entities": merged_tokens}


## Gradio

In [None]:
# Gradio app with three tabs: chatbot Q&A, similar-case search, and NER highlighting.
with gr.Blocks(theme='JohnSmith9982/small_and_pretty') as demo:

    # Tab 1: chat interface answered by the vector-DB based pipeline.
    with gr.Tab("إسأل سؤالك"):
        gr.ChatInterface(process_and_answer_vec)

    # Tab 2: similar cases, either summarized by the LLM or shown in full.
    with gr.Tab("القضايا المتشابهة"):
        name = gr.Textbox(label="أدخل القضية", text_align= 'right')
        output = gr.Textbox(label="القضايا المتشابهة", text_align= 'right', lines=10)
        search_btn = gr.Button("تلخيص القضايا")
        btn = gr.Button("عرض القضايا بدون تلخيص ")
        # Both buttons pass only the textbox value to their handler (no chat
        # history) and write the result into the same output box.
        search_btn.click(fn=similarity_with_summrize, inputs=name, outputs=output, show_progress=False)
        btn.click(fn=without_summrize, inputs=name, outputs=output, show_progress=False)

    # Tab 3: highlights the entities returned by ner() in the entered text.
    with gr.Tab("ازالة الاسماء"):
        ner_interface = gr.Interface(
                    fn=ner,
                    inputs=[gr.Textbox(label="نص القضية", lines=2)],
                    outputs=[gr.HighlightedText(label=" تحديد الأسماء الخاصة")],
                    title="ازالة الأسماء العربية وأسماء الشركات",
                    allow_flagging="never",
                    examples = ['لحمد لله والصلاة والسلام على رسول ﷲ أما بعد:فلدى الدائرة التجارية الثامنة وبناءً على القضية رقم ٥٤٨٣ لعام ١٤٤٢ هـالمقامة من/ خالد علي محمد سواس سجل تجاري (...) ضد/ شركة ديبا العربية السعودية للمقاولات والديكور الداخلي المحدودة غير ذلك (...) القاضي خالد بن محمد الزهراني رئيسا(الوقائع)تتلخص وقائع هذه القضية بالقدر اللازم للحكم فيها في أنّه تقدّم وكيل المدعي بصحيفة دعوى إلى المحكمة التجارية بالرياض جاء فيها مطالبة المدعى عليها بسداد مبلغ وقدره (١٢٩.٤٥٧.٣) مائة وتسعة وعشرون ألف وأربعمائة وسبعة وخمسون ريالا وثلاثة هللات  ']
        )
# share=True asks Gradio for a public share link in addition to the local server.
demo.launch(share=True)



Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://c69d6f32683cf272c3.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


