# Libererías necesarias

In [1]:
import pandas as pd
from sentence_transformers import SentenceTransformer, util

# Análisis del Quran en Árabe

Primero importamos el dataset que hemos limpiado con la función anteriormente creada

In [2]:
with open("../data/cleaned_data/cleaned_arab_quran.txt", encoding="utf-8") as f:
    lines = f.readlines()

df = pd.DataFrame(lines, columns=["text"])
df["text"] = df["text"].str.strip()

df.head()

Unnamed: 0,text
0,1|1|بسم الله الرحمن الرحيم
1,1|2|الحمد لله رب العالمين
2,1|3|الرحمن الرحيم
3,1|4|مالك يوم الدين
4,1|5|اياك نعبد واياك نستعين


Importamos el sentence-transformer que vamos a usar para ambos idiomas, ya que éste es multilingüe

In [None]:
#Y si usamos fastText? Proguntar a Miguel y a Unai porque ha decidido usar sentence transformers
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Y ahora creamos los embeddings

In [4]:
df["arab_embeddings"] = df["text"].apply(lambda x: model.encode(x, convert_to_tensor=True))

Now we can see the applied arab embeddings

In [5]:
df.head()

Unnamed: 0,text,arab_embeddings
0,1|1|بسم الله الرحمن الرحيم,"[tensor(-0.0018), tensor(0.0619), tensor(-0.03..."
1,1|2|الحمد لله رب العالمين,"[tensor(0.0052), tensor(0.0776), tensor(-0.006..."
2,1|3|الرحمن الرحيم,"[tensor(0.0033), tensor(0.0738), tensor(0.0118..."
3,1|4|مالك يوم الدين,"[tensor(0.0616), tensor(0.1301), tensor(0.0168..."
4,1|5|اياك نعبد واياك نستعين,"[tensor(-0.0151), tensor(0.0704), tensor(-0.01..."


Ahora probamos la búsqueda semántica por concepto, vamos a usar la similitud de coseno

In [6]:
concept = "Paradise" # por ejemplo
concept_emb = model.encode(concept.lower(), convert_to_tensor=True)

df["cos_similarity"] = df["arab_embeddings"].apply(lambda x: util.pytorch_cos_sim(x, concept_emb).item())
df_sorted = df.sort_values(by="cos_similarity", ascending=False)
print(pd.DataFrame(df_sorted[["text", "cos_similarity"]].head(10)))

                                                   text  cos_similarity
5600          76|10|انا نخاف من ربنا يوما عبوسا قمطريرا        0.218459
279   2|273|للفقراء الذين احصروا في سبيل الله لا يست...        0.217785
2535                21|53|قالوا وجدنا اباءنا لها عابدين        0.215674
6236                                                           0.213338
6237                                                           0.213338
5879             83|32|واذا راوهم قالوا ان هؤلاء لضالون        0.209789
3227  27|69|قل سيروا في الارض فانظروا كيف كان عاقبه ...        0.208556
3532               32|30|فاعرض عنهم وانتظر انهم منتظرون        0.207411
1548  11|76|يا ابراهيم اعرض عن هذا ۖ انه قد جاء امر ...        0.204135
5621  76|31|يدخل من يشاء في رحمته ۚ والظالمين اعد له...        0.202699


# Análisis de Quran en Inglés

In [None]:
with open("/home/unaiolaizolaosa/Dokumentuak/NLP-Group-Project/data/cleaned_data/cleaned_english_quran.txt", encoding="utf-8") as f:
    lines = f.readlines()

df = pd.DataFrame(lines, columns=["text"])
df["text"] = df["text"].str.strip()

df.head()

In [20]:
df["arab_embeddings"] = df["text"].apply(lambda x: model.encode(x, convert_to_tensor=True))

In [21]:
concept = "Paradise" # por ejemplo
concept_emb = model.encode(concept.lower(), convert_to_tensor=True)

df["cos_similarity"] = df["arab_embeddings"].apply(lambda x: util.pytorch_cos_sim(x, concept_emb).item())
df_sorted = df.sort_values(by="cos_similarity", ascending=False)
print(df_sorted[["text", "cos_similarity"]].head(10))

                                                   text  cos_similarity
6022                              and enter my paradise        0.638474
4394        enter paradise you and your kinds delighted        0.631074
5812                  and when paradise is brought near        0.614703
2878  the companions of paradise that day are in a b...        0.591378
4559  is the description of paradise which the right...        0.576164
5610  and when you look there in paradise you will s...        0.570232
4660  and paradise will be brought near to the right...        0.556255
4396  and that is paradise which you are made to inh...        0.549605
3021  and paradise will be brought near that day to ...        0.535889
615   paradise is not obtained by your wishful think...        0.533608
