In [21]:
from torch import cuda
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
import os
import time
import pinecone
from datasets import load_dataset
from tqdm.auto import tqdm
from torch import cuda, bfloat16
import transformers
from langchain.llms import HuggingFacePipeline
from langchain.vectorstores import Pinecone
from langchain.chains import RetrievalQA
import pandas as pd
import re 
from maha.cleaners.functions import remove, normalize


# Checkpoint name handed to the embedding wrapper in a later cell.
embed_model_id = 'sentence-transformers/all-MiniLM-L6-v2'

# Use the currently selected CUDA device when one is available, otherwise the CPU.
if cuda.is_available():
    device = f'cuda:{cuda.current_device()}'
else:
    device = 'cpu'

print(f"Using device: {device}")

Using device: cuda:0


In [2]:
# Build the embedding wrapper; the same device string is passed for both
# model loading (model_kwargs) and encoding (encode_kwargs).
embed_model = HuggingFaceEmbeddings(
    model_name=embed_model_id,
    model_kwargs=dict(device=device),
    encode_kwargs=dict(device=device, batch_size=32),
)

# Sample Arabic sentences, embedded only to report the vector count and size.
docs = [
    'هذه هي الجملة الأولى',
    'هذه هي الجملة الثانية',
    'هذه هي الجملة الثالثة',
]

embeddings = embed_model.embed_documents(docs)

print(
    f"We have {len(embeddings)} embeddings, each with "
    f"a dimensionality of {len(embeddings[0])}"
)

We have 3 embeddings, each with a dimensionality of 384


In [4]:
# Credentials come from the environment so the real key is never saved in the
# notebook source or its outputs. PINECONE_ENVIRONMENT is optional and falls
# back to the environment this notebook was originally written against.
pinecone_api_key = os.environ.get('PINECONE_API_KEY')
if not pinecone_api_key:
    raise RuntimeError(
        "Set the PINECONE_API_KEY environment variable before running this cell."
    )

pinecone.init(
    api_key=pinecone_api_key,
    environment=os.environ.get('PINECONE_ENVIRONMENT', 'us-west4-gcp-free'),
)

index_name = 'arag-chat'

# Create the index only if it does not exist yet. Its dimension is taken from
# the length of the sample embedding computed in the previous cell.
if index_name not in pinecone.list_indexes():
    pinecone.create_index(
        index_name,
        dimension=len(embeddings[0]),
        metric='cosine',
    )

    # Poll once per second until the new index reports itself as ready.
    while not pinecone.describe_index(index_name).status['ready']:
        time.sleep(1)

index = pinecone.Index(index_name)

print(index.describe_index_stats())

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}


In [9]:
# Read the entire corpus file into one in-memory string.
with open('data/مجموع الفتاوى.txt', mode='r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

char_count = len(text)
print(f"Number of characters in the text: {char_count}")

Number of characters in the text: 25926305


In [10]:
# Split the corpus on the underscore-run separator; each piece is one paragraph.
paragraphs = text.split(sep="_________")

paragraph_count = len(paragraphs)
print(f"Number of paragraphs: {paragraph_count}")

Number of paragraphs: 565


In [12]:
# One row per paragraph, with a 1-based sequential ID.
data = dict(
    ID=range(1, len(paragraphs) + 1),
    Paragraph=paragraphs,
)
df = pd.DataFrame(data)

In [22]:
# Raw string literal: in a plain literal "\w" and "\d" are invalid escape
# sequences, which newer Python versions warn about. The pattern is unchanged.
hashtag_re = re.compile(r"#[\w\d]+")


def remove_hashtag(text: str) -> str:
    """Delete every hashtag (``#`` followed by one or more word characters).

    Args:
        text: Input string.

    Returns:
        ``text`` with each hashtag replaced by the empty string. Surrounding
        whitespace is left as-is.
    """
    return hashtag_re.sub("", text)

# Raw string literal: "\B" and "\w" are invalid escape sequences in a plain
# literal. \B requires the "@" not to follow a word character, so the "@" in
# an address such as user@example.com is not treated as a mention.
mention_re = re.compile(r"\B@\w+")


def remove_mention(text: str) -> str:
    """Delete every ``@name`` mention from ``text``.

    Args:
        text: Input string.

    Returns:
        ``text`` with each mention replaced by the empty string.
    """
    return mention_re.sub("", text)

# ASCII punctuation plus common Arabic/typographic marks. Compared with the
# previous class: the hyphen is escaped explicitly (it used to match only
# because ",-." happened to form a range), duplicated entries are dropped, and
# the closing guillemet "»" and Arabic semicolon "؛" are added so that both
# halves of «…» and all three of ، ؛ ؟ are handled alike.
punc_re = re.compile(r"""[!"#$%&'()*+,\-./:;<=>?@\[\\\]^_`{|}~،؛؟…«»“”]""")


def remove_punctation(text: str) -> str:
    """Delete punctuation characters from ``text``.

    The misspelled function name is kept for existing callers.

    Args:
        text: Input string.

    Returns:
        ``text`` with every character matched by ``punc_re`` removed.
    """
    return punc_re.sub("", text)

# http/https URLs: optional "www.", a host, a dot, a 1-6 character suffix,
# then any trailing path/query characters.
url_re = re.compile(r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)")


def remove_urls(text: str) -> str:
    """Return ``text`` with every match of ``url_re`` deleted."""
    without_urls = url_re.sub("", text)
    return without_urls

# Code points U+064B through U+0652 (the range this pattern strips).
diactircs_re = re.compile("[\u064B-\u0652]")


def remove_diactrics(text: str) -> str:
    """Return ``text`` with every character matched by ``diactircs_re`` deleted."""
    stripped = diactircs_re.sub("", text)
    return stripped

# Raw string literal: "\d" is an invalid escape sequence in a plain literal.
# On str patterns \d matches any Unicode decimal digit, so Arabic-Indic
# digits are removed as well as 0-9.
numbers_re = re.compile(r"\d")


def remove_numbers(text: str) -> str:
    """Delete every decimal digit from ``text``.

    Args:
        text: Input string.

    Returns:
        ``text`` with all digit characters removed.
    """
    return numbers_re.sub("", text)

# Any single ASCII letter, upper or lower case.
english_chars_re = re.compile("[A-Za-z]")


def remove_english_characters(text: str) -> str:
    """Return ``text`` with every ASCII letter deleted."""
    without_latin = english_chars_re.sub("", text)
    return without_latin

# Raw string literal: "\s" is an invalid escape sequence in a plain literal.
multiple_space_re = re.compile(r"\s{2,}")


def remove_multiple_whitespace(text: str) -> str:
    """Collapse each run of two or more whitespace characters into one space.

    A single whitespace character (including a lone newline) is left
    untouched; only runs of length >= 2 are replaced.

    Args:
        text: Input string.

    Returns:
        ``text`` with every such run replaced by a single ``" "``.
    """
    return multiple_space_re.sub(" ", text)


def clean_all(text: str) -> str:
    """Run the full cleaning pipeline over one piece of text.

    The local regex cleaners run first (hashtags, mentions, URLs, punctuation,
    diacritics, digits, ASCII letters, whitespace runs), the result is
    stripped, and then it is passed through maha's ``remove`` and
    ``normalize``.

    Args:
        text: Raw input string.

    Returns:
        The cleaned string.
    """
    text = remove_hashtag(text)
    text = remove_mention(text)
    # URLs must be removed before punctuation: once ":", "/" and "." are
    # stripped, url_re can no longer match and the URL step does nothing.
    text = remove_urls(text)
    text = remove_punctation(text)
    text = remove_diactrics(text)
    text = remove_numbers(text)
    text = remove_english_characters(text)
    text = remove_multiple_whitespace(text)
    text = text.strip()

    # Final pass through the maha cleaners imported at the top of the notebook.
    text = remove(text=text, all_harakat=True, tatweel=True, punctuations=True)
    text = normalize(text=text, all=True)
    return text

In [23]:
# Clean every raw paragraph and keep the result alongside the original text.
df['cleaned'] = df['Paragraph'].map(clean_all)

In [24]:
# Show the first five rows to compare the raw and cleaned columns.
df.head(n=5)

Unnamed: 0,ID,Paragraph,cleaned
0,1,الْجُزْءُ الْأَوَّلُ\nكِتَابُ تَوْحِيدِ الْأُل...,الجزء الاول\nكتاب توحيد الالوهيه\nقال شيخ الاس...
1,2,\n<s0>\nبياض بالأصل، والزيادة من الحاكم في الت...,بياض بالاصل والزياده من الحاكم في التفسير وقال...
2,3,\n<s0>\n(١) بياض بالأصل\nقال الشيخ ناصر بن حم...,بياض بالاصل\nقال الشيخ ناصر بن حمد الفهد ص وال...
3,4,\n<s0>\n(*) قال الشيخ ناصر بن حمد الفهد (ص ١٤...,قال الشيخ ناصر بن حمد الفهد ص قلت وهنا امران\n...
4,5,\n<s0>\n(١) هكذا بالأصلإلَيْهِ. وَالْخَلْقُ: ...,هكذا بالاصلاليه والخلق اهون ما يكون عليهم احوج...


In [25]:
# Write the frame to disk without the row index column.
cleaned_csv_path = 'data/مجموع الفتاوى_cleaned.csv'
df.to_csv(cleaned_csv_path, index=False)

In [30]:
batch_size = 64

# Embed and upsert the frame in slices of `batch_size` rows.
for i in tqdm(range(0, len(df), batch_size)):
    i_end = min(len(df), i + batch_size)
    batch = df.iloc[i:i_end]

    # Read each column once instead of walking the batch with iterrows()
    # three times; .tolist() yields plain Python ints and strings.
    row_ids = batch['ID'].tolist()
    raw_texts = batch['Paragraph'].tolist()
    # Deliberately not named `paragraphs`: that name holds the full list of
    # raw paragraphs from an earlier cell, and rebinding it here would make a
    # re-run of the DataFrame cell silently build a truncated frame.
    batch_texts = batch['cleaned'].tolist()

    ids = [f"{row_id}" for row_id in row_ids]
    embeds = embed_model.embed_documents(batch_texts)

    # Store the numeric ID and the first 200 characters of the raw paragraph.
    metadata = [
        {
            'id': row_id,
            'paragraph': raw_text[:200]
        }
        for row_id, raw_text in zip(row_ids, raw_texts)
    ]
    # Materialise the tuples into a list rather than passing a one-shot iterator.
    index.upsert(vectors=list(zip(ids, embeds, metadata)))
    print(f"Indexed {i_end} paragraphs")

print(index.describe_index_stats())

 11%|█         | 1/9 [00:02<00:18,  2.33s/it]

Indexed 64 paragraphs


 22%|██▏       | 2/9 [00:04<00:13,  1.95s/it]

Indexed 128 paragraphs


 33%|███▎      | 3/9 [00:05<00:10,  1.77s/it]

Indexed 192 paragraphs


 44%|████▍     | 4/9 [00:07<00:09,  1.83s/it]

Indexed 256 paragraphs


 56%|█████▌    | 5/9 [00:09<00:07,  2.00s/it]

Indexed 320 paragraphs


 67%|██████▋   | 6/9 [00:11<00:05,  1.95s/it]

Indexed 384 paragraphs


 78%|███████▊  | 7/9 [00:13<00:03,  1.85s/it]

Indexed 448 paragraphs


 89%|████████▉ | 8/9 [00:15<00:01,  1.92s/it]

Indexed 512 paragraphs


100%|██████████| 9/9 [00:16<00:00,  1.86s/it]

Indexed 565 paragraphs





{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 565}},
 'total_vector_count': 565}
