#### Code written by Mohamed Abdallah


### Installing required packages

In [None]:
!pip install openai langchain chromadb tiktoken langchain-community langchain_chroma langchain_openai unstructured sentence-transformers bitsandbytes

In [2]:
!pip install python-dotenv==1.0.0

Collecting python-dotenv==1.0.0
  Downloading python_dotenv-1.0.0-py3-none-any.whl.metadata (21 kB)
Downloading python_dotenv-1.0.0-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv
  Attempting uninstall: python-dotenv
    Found existing installation: python-dotenv 1.0.1
    Uninstalling python-dotenv-1.0.1:
      Successfully uninstalled python-dotenv-1.0.1
Successfully installed python-dotenv-1.0.0


In [3]:
url = "https://api.together.xyz/inference"
my_key = '...'

In [4]:
# Setting Directory for the uploaded files
! mkdir '/content/Docs'

### Import Packages and Libraries

In [5]:
import os
import re
import pandas as pd
import numpy as np
import gensim

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import openai
from langchain import OpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
from langchain_core.messages import AIMessage, HumanMessage
from langchain.chains import create_history_aware_retriever, create_retrieval_chain
from langchain_core.prompts import MessagesPlaceholder, ChatPromptTemplate
from langchain_community.document_loaders import UnstructuredFileLoader, DirectoryLoader
from langchain_chroma import Chroma
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_community.document_loaders import UnstructuredFileLoader, DirectoryLoader
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

---

### Reading table of Q&A csv file

In [7]:
df = pd.read_csv('/content/allData_mod3.csv')

In [8]:
df.columns

Index(['Column1', 'id', 'title', 'context', 'question', 'answers',
       'modified_question (GPT 3.5 Turbo)', 'modified_question',
       'Answers_last', 'Mistral_answers'],
      dtype='object')

---

### Uploading files, Splitting and Use various retrievals

In [11]:
loader = DirectoryLoader('/content/Docs')
docs = loader.load()

In [12]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
splits = text_splitter.split_documents(docs)

In [None]:
splits

### Create the retriever

###  E5 (dim_size= 1024)

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained('intfloat/multilingual-e5-large')
model = AutoModel.from_pretrained('intfloat/multilingual-e5-large')

# Function to generate embeddings
def embed_texts(texts):
    embeddings = []
    for text in texts:
        inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
        with torch.no_grad():
            outputs = model(**inputs)
        embeddings.append(outputs.last_hidden_state.mean(dim=1).squeeze().tolist())
    return embeddings


In [14]:
class E5Embeddings:
    def embed_documents(self, texts):
        return embed_texts(texts)

    def embed_query(self, text):
        return embed_texts([text])[0]

embedding_E5 = E5Embeddings()

In [15]:
# Create Chroma vectorstore with embeddings from the model
vectorstore_E5 = Chroma.from_documents(documents=splits, embedding=embedding_E5)
retriever_E5 = vectorstore_E5.as_retriever(search_type="similarity", search_kwargs={'k': 5})

### **Answers Generation**

In [16]:
import os
from dotenv import load_dotenv
import os
from dotenv import load_dotenv, find_dotenv
import warnings
import requests
import json
import time

### Calling the LLama-3 model through a hosted api

In [35]:
class Llama:
    def __init__(self, url, headers, model="meta-llama/Llama-3-70b-chat-hf", temperature=0.0, max_tokens=500, base=2, max_tries=3, add_inst=True, verbose=False):
        self.url = url
        self.headers = headers
        self.model = model
        self.temperature = temperature
        self.max_tokens = max_tokens
        self.base = base
        self.max_tries = max_tries
        self.add_inst = add_inst
        self.verbose = verbose

    def _call_api(self, prompt):
        if self.add_inst:
            prompt = f"[INST]{prompt}[/INST]"

        if self.verbose:
            print(f"Prompt:\n{prompt}\n")
            print(f"Model: {self.model}")

        data = {
            "model": self.model,
            "prompt": prompt,
            "temperature": self.temperature,
            "max_tokens": self.max_tokens
        }

        wait_seconds = [self.base**i for i in range(self.max_tries)]

        for num_tries in range(self.max_tries):
            try:
                response = requests.post(self.url, headers=self.headers, json=data)
                response.raise_for_status()
                return response.json()['output']['choices'][0]['text']
            except requests.exceptions.HTTPError as e:
                if response.status_code != 500:
                    return response.json()

                print(f"HTTP error occurred: {e}")
                print(f"Num tries: {num_tries}")
                print(f"Waiting {wait_seconds[num_tries]} seconds before automatically trying again.")
                time.sleep(wait_seconds[num_tries])
            except Exception as e:
                print(f"Other error occurred: {e}")
                return {"error": str(e)}

        print(f"Tried {self.max_tries} times to make API call to get a valid response object")
        print("Returning provided response")
        return response

    def __call__(self, prompt):
        return self._call_api(prompt)

In [36]:
llama_llm = Llama(url=url, headers={"Authorization": f"Bearer {my_key}", "Content-Type": "application/json"}, model="meta-llama/Llama-3-70b-chat-hf")

### Creating the prompts and chains

In [37]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

system_prompt = (
    "Use the given context to answer the question in Arabic language. REPLY ONLY IN ARABIC "
    "Use three sentence maximum and keep the answer precise and concise."
    "Context: {context}"
)
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

In [38]:
question_answer_chain = create_stuff_documents_chain(llama_llm , prompt)
chain = create_retrieval_chain(retriever_E5, question_answer_chain)


In [39]:
chain

RunnableBinding(bound=RunnableAssign(mapper={
  context: RunnableBinding(bound=RunnableLambda(lambda x: x['input'])
           | VectorStoreRetriever(tags=['Chroma', 'E5Embeddings'], vectorstore=<langchain_chroma.vectorstores.Chroma object at 0x79a75ca36830>, search_kwargs={'k': 5}), config={'run_name': 'retrieve_documents'})
})
| RunnableAssign(mapper={
    answer: RunnableBinding(bound=RunnableBinding(bound=RunnableAssign(mapper={
              context: RunnableLambda(format_docs)
            }), config={'run_name': 'format_inputs'})
            | ChatPromptTemplate(input_variables=['context', 'input'], messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context'], template='Use the given context to answer the question in Arabic language. REPLY ONLY IN ARABIC Use three sentence maximum and keep the answer precise and concise.Context: {context}')), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['input'], template='{input}'))])
            |

In [40]:
question = df['modified_question'][0]
chain.invoke({"input": question})['answer']

'Hamza bin Abdul-Muttalib Al-Hashemi Al-Qurashi, a companion of the Prophet Muhammad, peace be upon him, and his uncle and foster brother, one of the fourteen closest companions, and the best of his uncles, as the Prophet said: "The best of my uncles is Hamza."assistant\n\nحمزة بن عبد المطلب الهاشمي القرشي صحابي من صحابة رسول الإسلام محمد، وعمه وأخوه من الرضاعة، أحد وزرائه الأربعة عشر، وهو خير أعمامه لقوله: «خَيْرُ إِخْوَتِي عَلِيٌّ، وَخَيْرُ أَعْمَامِي حَمْزَةُ رَضِيَ اللَّهُ عَنْهُمَا».assistant\n\nحمزة بن عبد المطلب الهاشمي القرشي صحابي من صحابة رسول الإسلام محمد، وعمه وأخوه من الرضاعة، أحد وزرائه الأربعة عشر، وهو خير أعمامه لقوله: «خَيْرُ إِخْوَتِي عَلِيٌّ، وَخَيْرُ أَعْمَامِي حَمْزَةُ رَضِيَ اللَّهُ عَنْهُمَا». تربى حمزة في كنف والده عبد المطلب بن هاشم الذي كان سيد قريش وبني هاشم، ونشأ مع تِربه وابن أخيه عبد الله وأخيه من الرضاعة محمد بن عبد الله بن عبد المطلب.assistant\n\nحمزة بن عبد المطلب الهاشمي القرشي صحابي من صحابة رسول الإسلام محمد، وعمه وأخوه من الرضاعة، أحد وزرائه الأربعة

In [41]:
def extract_text_llama(text):
    # Check if 'assistant' is present
    if 'assistant' in text:
        # Split the text by 'assistant'
        parts = text.split('assistant')

        # Check the number of 'assistant' occurrences
        if len(parts) >= 2:
            # Extract text between the first two 'assistant'
            extracted_text = parts[1].strip()
        else:
            # Extract all text after the first 'assistant'
            extracted_text = parts[1].strip() if len(parts) > 1 else ""

        return extracted_text.strip()
    else:
        # Use regex to find all sequences of English characters
        english_pattern = re.compile(r'[a-zA-Z\s]+')
        english_matches = list(english_pattern.finditer(text))

        if len(english_matches) >= 1:
            # Extract Arabic text from the first English sequence to the end
            start_index = english_matches[0].end()
            arabic_text = text[start_index:]

            # Use regex to find all sequences of Arabic characters within the extracted text
            arabic_pattern = re.compile(r'[\u0600-\u06FF\s]+')
            arabic_matches = list(set(arabic_pattern.findall(arabic_text)))

            return ''.join(arabic_matches).strip()

### **Generating chain answers**

In [42]:
questions = df['modified_question']
true_answers = df['Answers_last']

In [24]:
questions.fillna('', inplace=True)

In [28]:
questions.isna().sum()

0

#### Generate Answers

In [33]:
questions[663:]

Unnamed: 0,modified_question
663,من أُرسل من قبل الرسول محمد ليقوم بجمع المعلوم...
664,هل أنكر الرسول محمد على أصحابه الذين لم يخرجوا؟
665,هل كانت نية الرسول محمد القتال؟
666,متى ولد عثمان الأول؟
667,متى قامت الجمهورية التركية؟
...,...
1390,ما هو حجم رأس البظر البشرى؟
1391,كم عدد النهايات العصبية فى رأس البظر البشرى؟
1392,"ما هو اصل كلمة""Clitoris""طبقا لقاموس اكسفورد لل..."
1393,"كيف يُستخدم الشكل المختصر ""Clit""؟"


In [43]:
questions

Unnamed: 0,modified_question
0,من هو حمزة بن عبد المطلب؟
1,بما وصف رسول الله حمزة بن عبد المطلب؟
2,بما وصف رسول الله عليّ؟
3,متى أسلم حمزة؟
4,و ماذا فعل حمزة في غزوة بدر؟
...,...
1390,ما هو حجم رأس البظر البشرى؟
1391,كم عدد النهايات العصبية فى رأس البظر البشرى؟
1392,"ما هو اصل كلمة""Clitoris""طبقا لقاموس اكسفورد لل..."
1393,"كيف يُستخدم الشكل المختصر ""Clit""؟"


In [None]:
def process_question_llama(q):
    try:
        response = chain.invoke({"input": q})
        answer = response['answer']
        return extract_text_llama(answer) or ""
    except Exception as e:
        return ""



In [None]:
# Generate answers using the chain
generated_answers = {}
i = 12
for q in questions[12:]:
    generated_answers[i] = extract_text_llama(chain.invoke({"input": q})['answer'])
    i += 1

In [None]:
len(generated_answers)

In [None]:
llama_df = df['modified_question','Answers_last']

In [None]:
llama_df['llama_answers'] = generated_answers.values()

In [None]:
llama_df['llama_answers'].fillna('', inplace=True)

In [None]:
llama_df

In [None]:
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import SmoothingFunction
from nltk import ngrams
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np


def removeTashkeel(text):
    # Removes Tashkeel from input text

    p_tashkeel = re.compile(r'[\u0616-\u061A\u064B-\u0652\u06D6-\u06ED\u08F0-\u08F3\uFC5E-\uFC63\u0670]')
    text = re.sub(p_tashkeel, "", text)
    return text

def normalizeText(text):
    """
        normalizes all forms to alf to ا, converts ة to ه, and ى to ي.  It also converts new lines and tabs to a single space
        and seperates common punctuation marks from text
    """
    text = removeTashkeel(text)
    search = ["أ", "إ", "آ", "ٱ", "ة", "_", "-", "/", ".", "،", " و ", '"', "ـ", "'", "ى", "ی", "\\", '\n', '\t',
              '&quot;', '?', '؟', '!', 'ﷲ']
    replace = ["ا", "ا", "ا", "ا", "ه", " ", " ", "", "", "", " و", "", "", "", "ي", "ي", "", ' ', ' ', ' ', ' ? ',
               ' ؟ ',' ! ', 'الله']

    # search = ["آ", "إ", "أ", "ة"]
    # replace = ["ا", "ا", "ا", "ه"]

    for i in range(0, len(search)):
        text = text.replace(search[i], replace[i])
    return text

def calculate_precision_recall_f1(system_answer, reference_answer):
    # Tokenizing the answers into sets of words
    system_answer = normalizeText(system_answer)
    reference_answer = normalizeText(reference_answer)
    reference_tokens = set(reference_answer.split())
    system_tokens = set(system_answer.split())

    # Calculating Precision and Recall
    common_tokens = reference_tokens.intersection(system_tokens)
    precision = len(common_tokens) / len(system_tokens) if system_tokens else 0
    recall = len(common_tokens) / len(reference_tokens) if reference_tokens else 0

    # Calculating F1 Score
    if precision + recall != 0:
        f1_score = 2 * (precision * recall) / (precision + recall)
    else:
        f1_score = 0

    return precision, recall, f1_score

def calculate_bleu(system_answer, reference_answer):
    system_answer = normalizeText(system_answer)
    reference_answer = normalizeText(reference_answer)
    reference_tokens = [reference_answer.split()]
    system_tokens = system_answer.split()
    bleu_score = sentence_bleu(reference_tokens, system_tokens, smoothing_function=SmoothingFunction().method1)
    return bleu_score

def calculate_cosineSim(system_answer, reference_answer):
    system_answer_v  = embedding_E5.embed_documents([system_answer])
    reference_answer_v = embedding_E5.embed_documents([reference_answer])

    cosine_sim = cosine_similarity(system_answer_v, reference_answer_v)[0][0]
    return cosine_sim





### Model stats

In [None]:
models = [
'llama_answers'
]


# Add new columns for metrics
for ll_model in models:
    llama_df[f'{ll_model}_precision'] = 0.0
    llama_df[f'{ll_model}_recall'] = 0.0
    llama_df[f'{ll_model}_f1_score'] = 0.0
    llama_df[f'{ll_model}_bleu_score'] = 0.0
    llama_df[f'{ll_model}_cosineSim'] = 0.0

# Calculate metrics for each row and each model
for idx, row in llama_df.iterrows():
    reference_answer = row['Answers_last']
    for ll_model in models:
        system_generated_answer = row[ll_model]
        if system_generated_answer.strip() == "":
            precision, recall, f1_score = 0, 0, 0
            bleu_score = 0
            sim_score = 0
        else:
            precision, recall, f1_score = calculate_precision_recall_f1(system_generated_answer, reference_answer)
            bleu_score = calculate_bleu(system_generated_answer, reference_answer)
            sim_score = calculate_cosineSim(system_generated_answer, reference_answer)

        # Store the metrics in the DataFrame
        llama_df.at[idx, f'{ll_model}_precision'] = precision
        llama_df.at[idx, f'{ll_model}_recall'] = recall
        llama_df.at[idx, f'{ll_model}_f1_score'] = f1_score
        llama_df.at[idx, f'{ll_model}_bleu_score'] = bleu_score
        llama_df.at[idx, f'{ll_model}_cosineSim'] = sim_score


# Calculate the average of each metric column and rename the columns
for ll_model in models:
    for metric in ['precision', 'recall', 'f1_score', 'bleu_score', 'cosineSim']:
        col_name = f'{ll_model}_{metric}'
        avg_value = llama_df[col_name].mean()
        new_col_name = f'{col_name} ({avg_value:.2f})'
        llama_df.rename(columns={col_name: new_col_name}, inplace=True)

In [None]:
llama_df.to_csv('/content/llama_all.csv', index=False, encoding='utf-8-sig')

---

### Mistral

In [None]:
class Mistral:
    def __init__(self, url, headers, model="mistralai/Mistral-7B-Instruct-v0.2", temperature=0.0, max_tokens=3000, base=2, max_tries=3, add_inst=True, verbose=False):
        self.url = url
        self.headers = headers
        self.model = model
        self.temperature = temperature
        self.max_tokens = max_tokens
        self.base = base
        self.max_tries = max_tries
        self.add_inst = add_inst
        self.verbose = verbose

    def _call_api(self, prompt):
        if self.add_inst:
            prompt = f"[INST]{prompt}[/INST]"

        if self.verbose:
            print(f"Prompt:\n{prompt}\n")
            print(f"Model: {self.model}")

        data = {
            "model": self.model,
            "prompt": prompt,
            "temperature": self.temperature,
            "max_tokens": self.max_tokens
        }

        wait_seconds = [self.base**i for i in range(self.max_tries)]

        for num_tries in range(self.max_tries):
            try:
                response = requests.post(self.url, headers=self.headers, json=data)
                response.raise_for_status()
                return response.json()['output']['choices'][0]['text']
            except requests.exceptions.HTTPError as e:
                if response.status_code != 500:
                    return response.json()

                print(f"HTTP error occurred: {e}")
                print(f"Num tries: {num_tries}")
                print(f"Waiting {wait_seconds[num_tries]} seconds before automatically trying again.")
                time.sleep(wait_seconds[num_tries])
            except Exception as e:
                print(f"Other error occurred: {e}")
                return {"error": str(e)}

        print(f"Tried {self.max_tries} times to make API call to get a valid response object")
        print("Returning provided response")
        return response

    def __call__(self, prompt):
        return self._call_api(prompt)

In [None]:
mistral_llm = Mistral(url=url, headers={"Authorization": f"Bearer {my_key}", "Content-Type": "application/json"}, model="mistralai/Mistral-7B-Instruct-v0.2")


In [None]:
question_answer_chain = create_stuff_documents_chain(mistral_llm , prompt)
chain = create_retrieval_chain(retriever_E5, question_answer_chain)

In [None]:
df['modified_question'][194]

'متى أعلنت موريشيوس استقلالها؟'

In [None]:
chain.invoke({"input": question})

{'input': 'متى أعلنت موريشيوس استقلالها؟',
 'context': [Document(metadata={'source': '/content/Docs/all_text.txt'}, page_content='جمهورية موريشيوس هي جزر صغيرة بوسط المحيط الهندي تبعد عن ملاجاش (مدغشقر) بحوالي 860 كيلومتر. الرحالة البرتغالي دون بيدرو ماسكارينهاس كان أول من عرف العالم بها في العام 1505 وقد قام بإطلاق اسم ماسكارينس على مجموعة الجزر المعروفة الآن بموريشيوس، رودريغز وريونيون. وفي عام 1598، رسا أسطول هولندي في غراند بورت مما أدى إلى إقامة أول مستعمرة هولندية على الجزيرة في 1638. وعلى مدى السنين، أدخل الهولنديون إلى الجزيرة قصب السكر والحيوانات الأليفة والغزلان قبل رحيلهم عنها في 1710. وجاء من بعدهم الفرنسيون في سنة 1715 وأسسوا ميناء بورت لويس -عاصمة البلاد حاليا- وظلت جزيرة موريشيوس قاعدة لهم حتى هزيمة نابليون، فاستولت عليها بريطانيا في سنة 1810 م، وأقاموا سلطة تحت قيادة روبرت فاركوهار قامت فيما بعد بغرس تغييرات اجتماعية واقتصادية سريعة في الجزيرة. أجرت البلاد انتخابات عامة في 1967، وضعت بعدها موريشويس دستور جديد وأعلنت استقلالها في 12 مارس 1968، ثم لحق ذلك الإعلان عن جمهور

In [None]:
question = df['modified_question'][194]
print(chain.invoke({"input": question})['answer'])

 HumanMessage(content='12 مارس 1968')


In [None]:
def extract_arabic_mistral(text):
    # Define a regular expression to match Arabic text
    arabic_pattern = re.compile(r'[\u0600-\u06FF]+')

    # Find all Arabic text segments in the string
    arabic_matches = arabic_pattern.findall(text)

    # Join the Arabic text segments into a single string
    arabic_text = ' '.join(arabic_matches)

    # If the length of the Arabic text is less than or equal to 20 characters
    if len(arabic_text) <= 20:
        arabic_text = text
        arabic_text = arabic_text.replace('\n\n', ' ').replace('\n', ' ')

    return arabic_text

In [None]:
extract_arabic_mistral(chain.invoke({"input": question})['answer'])

" HumanMessage(content='12 مارس 1968')"

In [None]:
generated_answers

In [None]:
def process_question(q):
    try:
        response = chain.invoke({"input": q})
        answer = response['answer']
        return extract_arabic_mistral(answer) or ""
    except Exception as e:
        return ""

# Using enumerate to keep track of indices
generated_answers = {}
for i, q in enumerate(questions):
    answer = process_question(q)
    generated_answers[i] = answer


In [None]:
generated_answers.values()

In [None]:
df['Mistral_answers'] = generated_answers.values()

In [None]:
df['Mistral_answers'].isna().sum()

0

In [None]:
df['Mistral_answers'].fillna('', inplace=True)
df.to_csv('/content/allData_mod3.csv', index=False)

In [None]:
df

---

### Mixtral

In [None]:
class Mixtral:
    def __init__(self, url, headers, model="mistralai/Mixtral-8x7B-Instruct-v0.1", temperature=0.0, max_tokens=2000, base=2, max_tries=3, add_inst=True, verbose=False):
        self.url = url
        self.headers = headers
        self.model = model
        self.temperature = temperature
        self.max_tokens = max_tokens
        self.base = base
        self.max_tries = max_tries
        self.add_inst = add_inst
        self.verbose = verbose

    def _call_api(self, prompt):
        if self.add_inst:
            prompt = f"[INST]{prompt}[/INST]"

        if self.verbose:
            print(f"Prompt:\n{prompt}\n")
            print(f"Model: {self.model}")

        data = {
            "model": self.model,
            "prompt": prompt,
            "temperature": self.temperature,
            "max_tokens": self.max_tokens
        }

        wait_seconds = [self.base**i for i in range(self.max_tries)]

        for num_tries in range(self.max_tries):
            try:
                response = requests.post(self.url, headers=self.headers, json=data)
                response.raise_for_status()
                return response.json()['output']['choices'][0]['text']
            except requests.exceptions.HTTPError as e:
                if response.status_code != 500:
                    return response.json()

                print(f"HTTP error occurred: {e}")
                print(f"Num tries: {num_tries}")
                print(f"Waiting {wait_seconds[num_tries]} seconds before automatically trying again.")
                time.sleep(wait_seconds[num_tries])
            except Exception as e:
                print(f"Other error occurred: {e}")
                return {"error": str(e)}

        print(f"Tried {self.max_tries} times to make API call to get a valid response object")
        print("Returning provided response")
        return response

    def __call__(self, prompt):
        return self._call_api(prompt)

In [None]:
mixtral_llm = Mixtral(url=url, headers={"Authorization": f"Bearer {my_key}", "Content-Type": "application/json"}, model="meta-llama/Llama-3-70b-chat-hf")


In [None]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

system_prompt = (
    "Use the given context to answer the question in Arabic language."
    "Use three sentence maximum and keep the answer precise and concise."
    "Context: {context}"
)
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

In [None]:
question_answer_chain = create_stuff_documents_chain(mixtral_llm , prompt)
chain = create_retrieval_chain(retriever_E5, question_answer_chain)

In [None]:
import re

def extract_arabic_mixtral(text):
    # Check if 'assistant' is in the text
    if 'assistant' in text:
        # Split by 'assistant'
        parts = text.split('assistant')

        # Check the number of 'assistant' occurrences
        if len(parts) > 2:
            # Extract text between the first two 'assistants'
            extracted_text = parts[1].strip() + ' ' + parts[2].strip()
        elif len(parts) == 2:
            # Extract text after the first 'assistant'
            extracted_text = parts[1].strip()
        else:
            # No 'assistant' found, return an empty string
            extracted_text = ""
    else:
        # Find all occurrences of 'SystemMessage(content='
        parts = text.split("SystemMessage(content=")

        if len(parts) > 1:
            # Take the last part after the last 'SystemMessage(content='
            last_part = parts[-1]
            # Extract text before the first 'HumanMessage(content='
            extracted_text = last_part.split("HumanMessage(content=")[0]
        else:
            # No 'SystemMessage(content=' found, return empty string
            extracted_text = ""

    # Remove non-Arabic characters but keep Arabic and standard numerals
    arabic_text = re.sub(r'[^\u0600-\u06FF0-9٠-٩\s]', '', extracted_text)

    # Return the extracted Arabic text
    return arabic_text.strip()

In [None]:
question = df['modified_question'][600]
question

'من هو الزبون الأول لتونس؟'

In [None]:
type(chain)

In [None]:
chain.invoke({"input": question})

{'input': 'من هو الزبون الأول لتونس؟',
 'context': [Document(metadata={'source': '/content/Docs/all_text.txt'}, page_content='تونس، رسميًا الجمهورية التونسية، هي دولة تقع في شمال أفريقيا يحدها من الشمال والشرق البحر الأبيض المتوسط ومن الجنوب الشرقي ليبيا (459 كم) ومن الغرب الجزائر (965 كم). لعبت تونس أدوارا هامة في التاريخ القديم منذ عهد الفينيقيين والأمازيغ وااقرطاجيين والونداليين والرومان وقد عرفت باسم مقاطعة أفريكا إبان الحكم الروماني لها والتي سميت باسمها كامل القارة. فتحها المسلمون في القرن السابع الميلادي وأسسوا فيها مدينة القيروان سنة 50 هـ لتكون ثاني مدينة إسلامية في شمال أفريقيا بعد الفسطاط. في ظل الدولة العثمانية، كانت تسمى "الإيالة التونسية". وقعت تحت الاحتلال الفرنسي في عام 1881، ثم حصلت على استقلالها في عام 1956 لتصبح رسميا المملكة التونسية في نهاية عهد محمد الأمين باي. مع إعلان الجمهورية التونسية في 25 يوليو 1957، أصبح الحبيب بورقيبة أول رئيس لها.'),
  Document(metadata={'source': '/content/Docs/all_text.txt'}, page_content='تونس، رسميًا الجمهورية التونسية، هي دولة تقع في

In [None]:
extract_arabic_mixtral(chain.invoke({"input": question})['answer'])

'الاتحاد الأوروبي       الاتحاد الأوروبي'

In [None]:
questions = df['modified_question']
true_answers = df['answers']

In [None]:
i

396

In [None]:
generated_answers

In [None]:
questions.iloc[396:]

In [None]:
for i, q in enumerate(questions.iloc[561:]):
  print(q)
  break

ما هى الاعراض العامة اللتى تظهر  مع المرض؟


In [None]:
generated_answers

In [None]:
questions.iloc[561:]

Unnamed: 0,question
561,ما هى الاعراض العامة اللتى تظهر مع المرض؟
562,كيف يمكن تشخيص المرض؟
563,بماذا يعرف السرطان؟
564,ما هى نسب الوفاة بالسرطان؟
565,كم عد الوفيات بالسرطان فى العالم عام 2007؟
...,...
1390,ما هو حجم رأس البظر البشرى؟
1391,كم عدد النهايات العصبية فى رأس البظر البشرى؟
1392,"ما هو اصل كلمة""Clitoris""طبقا لقاموس اكسفورد لل..."
1393,"اين استخدم الشكل المختصر ::Clit""؟"


In [None]:
chain.invoke({"input": q})
# generated_answers[i+561] = answer

In [None]:
def process_question_mixtral(q):
    try:
        response = chain.invoke({"input": q})
        answer = response['answer']
        return extract_arabic_mixtral(answer) or ""
    except Exception as e:
        return ""

# Using enumerate to keep track of indices
#generated_answers = {}
for i, q in enumerate(questions.iloc[561:]):
    answer = process_question_mixtral(q)
    generated_answers[i+561] = answer


In [None]:
generated_answers

#### General processong to the output

In [None]:
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import SmoothingFunction
from nltk import ngrams
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np


def removeTashkeel(text):
    # Removes Tashkeel from input text

    p_tashkeel = re.compile(r'[\u0616-\u061A\u064B-\u0652\u06D6-\u06ED\u08F0-\u08F3\uFC5E-\uFC63\u0670]')
    text = re.sub(p_tashkeel, "", text)
    return text

def normalizeText(text):
    """
        normalizes all forms to alf to ا, converts ة to ه, and ى to ي.  It also converts new lines and tabs to a single space
        and seperates common punctuation marks from text
    """
    text = removeTashkeel(text)
    search = ["أ", "إ", "آ", "ٱ", "ة", "_", "-", "/", ".", "،", " و ", '"', "ـ", "'", "ى", "ی", "\\", '\n', '\t',
              '&quot;', '?', '؟', '!', 'ﷲ']
    replace = ["ا", "ا", "ا", "ا", "ه", " ", " ", "", "", "", " و", "", "", "", "ي", "ي", "", ' ', ' ', ' ', ' ? ',
               ' ؟ ',' ! ', 'الله']

    # search = ["آ", "إ", "أ", "ة"]
    # replace = ["ا", "ا", "ا", "ه"]

    for i in range(0, len(search)):
        text = text.replace(search[i], replace[i])
    return text

### Metrics

In [None]:
def calculate_precision_recall_f1(system_answer, reference_answer):
    # Tokenizing the answers into sets of words
    system_answer = normalizeText(system_answer)
    reference_answer = normalizeText(reference_answer)
    reference_tokens = set(reference_answer.split())
    system_tokens = set(system_answer.split())

    # Calculating Precision and Recall
    common_tokens = reference_tokens.intersection(system_tokens)
    precision = len(common_tokens) / len(system_tokens) if system_tokens else 0
    recall = len(common_tokens) / len(reference_tokens) if reference_tokens else 0

    # Calculating F1 Score
    if precision + recall != 0:
        f1_score = 2 * (precision * recall) / (precision + recall)
    else:
        f1_score = 0

    return precision, recall, f1_score

def calculate_bleu(system_answer, reference_answer):
    system_answer = normalizeText(system_answer)
    reference_answer = normalizeText(reference_answer)
    reference_tokens = [reference_answer.split()]
    system_tokens = system_answer.split()
    bleu_score = sentence_bleu(reference_tokens, system_tokens, smoothing_function=SmoothingFunction().method1)
    return bleu_score

def calculate_cosineSim(system_answer, reference_answer):
    system_answer_v  = embedding_E5.embed_documents([system_answer])
    reference_answer_v = embedding_E5.embed_documents([reference_answer])

    cosine_sim = cosine_similarity(system_answer_v, reference_answer_v)[0][0]
    return cosine_sim

