Libraries

In [1]:
import pandas as pd
import re
import pickle
import string
import numpy as np
from scipy import sparse
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords, words;
from nltk.tokenize import word_tokenize
from collections import defaultdict
from nltk.stem import PorterStemmer
from nltk.corpus import wordnet
from nltk import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import mysql.connector
from nltk.stem.isri import ISRIStemmer
from camel_tools.disambig.mle import MLEDisambiguator
from camel_tools.tokenizers.morphological import MorphologicalTokenizer
from camel_tools.tokenizers.word import simple_word_tokenize as tokenizer
from camel_tools.utils.normalize import normalize_unicode, normalize_alef_maksura_ar, normalize_teh_marbuta_ar, normalize_alef_ar
from camel_tools.utils.dediac import dediac_ar

Loading Corpus, Queries, Qrels files

In [2]:
# Load the English corpus (CSV).
en_corpus = pd.read_csv('C:/Users/Ahmed/Desktop/data/en_corpus.csv')

# Load the English queries (JSON Lines: one JSON object per line).
en_queries = pd.read_json("C:/Users/Ahmed/Desktop/data/en_queries.jsonl", lines=True)

# Load the English relevance judgements (tab-separated).
en_qrels = pd.read_csv("C:/Users/Ahmed/Desktop/data/en_qrels.tsv", sep="\t")

# Load the Arabic corpus (CSV).
ar_corpus = pd.read_csv('C:/Users/Ahmed/Desktop/data/ar_corpus.csv')

# Load the Arabic queries.
# NOTE(review): the file has a .csv extension but is parsed as tab-delimited - confirm.
ar_queries = pd.read_csv("C:/Users/Ahmed/Desktop/data/ar_queries.csv", delimiter="\t")

# Load the Arabic relevance judgements (CSV).
ar_qrels = pd.read_csv("C:/Users/Ahmed/Desktop/data/ar_qrels.csv")


English Text Preprocessing

In [3]:
#English Text Preprocessing

# Extra stop words on top of NLTK's list: tokenizer fragments ("'ll", "n't", ...)
# and contractions as they look after punctuation has been stripped.
_EXTRA_STOP_WORDS = [
    "'d", "'ll", "'re", "'s", "'ve", 'could', 'might', 'must', "n't", 'need',
    'sha', 'wo', 'would', 'arent', 'couldnt', 'didnt', 'doesnt', 'dont',
    'hadnt', 'hasnt', 'havent', 'isnt', 'mightnt', 'mustnt', 'neednt', 'nt',
    'shant', 'shes', 'shouldnt', 'shouldve', 'thatll', 'wasnt', 'werent',
    'wont', 'wouldnt', 'youd', 'youll', 'youre', 'youve',
]

# NLTK file ids are lowercase; 'English' only resolves on case-insensitive
# filesystems.  dict.fromkeys() drops duplicates while keeping list order.
stopWords = list(dict.fromkeys(stopwords.words('english') + _EXTRA_STOP_WORDS))

# Domain terms that must survive remove_meaningless_word() even though they are
# not in NLTK's `words` corpus.
additonal_words = ['cigarettes', 'vaping', 'vape', 'privatize', 'palestinian', 'israeli']

# `words` is rebound from the corpus reader to a plain set below.  Importing the
# reader under a private alias keeps this cell re-runnable: a second run would
# otherwise call `.words()` on the set and fail.
from nltk.corpus import words as _words_corpus
words = set(_words_corpus.words())
words.update(additonal_words)

# Overrides the stdlib constant for the whole process; this value is the stdlib
# one minus the double quote.  The backslash is escaped explicitly ("[\]" is an
# invalid escape sequence).
string.punctuation = "!#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"
#"!#$%&'()*+,-./:;<=>?@[\]^_`{|}~≤…•¾–—‘’“”≡"
lemmatizerr = WordNetLemmatizer()

def get_wordnet_pos(tag_parameter):
    """Translate a POS tag into the WordNet POS constant used for lemmatizing.

    Only the first character of ``tag_parameter`` (upper-cased) is inspected:
    ``J`` -> adjective, ``V`` -> verb, ``R`` -> adverb.  ``N`` and every other
    initial map to the noun constant.
    """
    initial = tag_parameter[0].upper()
    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # "N" and any unrecognised tag both resolve to noun.
    return wordnet.NOUN

def tokenizer(text):
    """Split ``text`` into tokens with NLTK's ``word_tokenize``.

    NOTE(review): this definition rebinds the module-level name ``tokenizer``,
    which the import section also uses as an alias for camel_tools'
    ``simple_word_tokenize``.
    """
    tokens = word_tokenize(text)
    return tokens

def list_to_string(list):
    """Join the items of ``list`` into one space-separated string.

    Every item is converted with ``str()`` first, so non-string items are fine.
    """
    pieces = [str(item) for item in list]
    return ' '.join(pieces)

def remove_numbers(text):
    """Delete every run of digits from ``text``; nothing is put in its place."""
    digit_run = re.compile(r'\d+')
    return digit_run.sub('', text)

def normalization(text):
    """Return ``text`` converted to lower case."""
    lowered = text.lower()
    return lowered

def remove_links(text):
    """Strip URLs from ``text``.

    A URL is anything starting with ``http://``, ``https://`` or
    ``www`` + up to three digits + ``.``, up to the last word character before
    the next whitespace.  Trailing punctuation after the URL is left in place.
    """
    link = re.compile(r'\b(?:https?://|www\d{0,3}\.)\S+\b')
    return link.sub('', text)

def stop_words_remove(text):
    """Tokenize ``text`` and drop every token listed in ``stopWords``.

    The surviving tokens are returned joined by single spaces.  The comparison
    is exact (case-sensitive).
    """
    kept = [token for token in word_tokenize(text) if token not in stopWords]
    return list_to_string(kept)

def verb_adj_lemma(text):
    """Lemmatize ``text`` in three passes and return the space-joined result.

    Pass 1 lemmatizes every token as a verb, pass 2 as an adjective, and
    pass 3 uses the POS tag assigned by ``pos_tag`` (mapped through
    ``get_wordnet_pos``).  The text is re-tokenized between passes, exactly as
    before, so token boundaries are decided the same way.

    A single ``WordNetLemmatizer`` is created per call instead of one per
    token per pass.
    """
    wnl = WordNetLemmatizer()

    as_verbs = ' '.join(wnl.lemmatize(token, pos='v') for token in word_tokenize(text))
    as_adjectives = ' '.join(wnl.lemmatize(token, pos='a') for token in word_tokenize(as_verbs))

    tagged = pos_tag(word_tokenize(as_adjectives))
    return ' '.join(
        wnl.lemmatize(token, pos=get_wordnet_pos(tag)) for token, tag in tagged
    )

def lemmatizer(text):
    """POS-tag the tokens of ``text`` and lemmatize each one accordingly.

    Uses the shared ``lemmatizerr`` instance; the tag of each token is mapped
    to a WordNet POS via ``get_wordnet_pos``.  Returns the lemmas joined by
    single spaces.
    """
    tagged_tokens = pos_tag(word_tokenize(text))
    lemmas = []
    for token, tag in tagged_tokens:
        lemmas.append(lemmatizerr.lemmatize(token, pos=get_wordnet_pos(tag)))
    return list_to_string(lemmas)

def stemmer(text):
    """Porter-stem every token of ``text`` and return them space-joined."""
    tokens = word_tokenize(text)
    porter = PorterStemmer()
    stems = []
    for token in tokens:
        stems.append(porter.stem(token))
    return list_to_string(stems)

def punctuation_remove(text):
    """Replace each character of ``string.punctuation`` in ``text`` with a space.

    The string length is preserved; collapsing the resulting runs of spaces is
    left to the caller.
    """
    blank_out = {ord(symbol): ' ' for symbol in string.punctuation}
    return text.translate(blank_out)

def remove_meaningless_word(text):
    """Drop alphabetic tokens that are not in the ``words`` vocabulary.

    A token is kept when its lower-cased form is in ``words`` or when it is
    not purely alphabetic (such tokens are never filtered).  The kept tokens
    are returned joined by single spaces.
    """
    survivors = []
    for token in word_tokenize(text):
        if not token.isalpha() or token.lower() in words:
            survivors.append(token)
    return " ".join(survivors)

def remove_spaces(text):
    """Collapse every whitespace run to one space and trim both ends."""
    chunks = text.split()
    return " ".join(chunks)
    
def remove_non_alphanumeric(text):
    """Delete every character that is not an ASCII letter, a digit or whitespace."""
    unwanted = re.compile(r'[^a-zA-Z0-9\s]')
    return unwanted.sub('', text)

def remove_short_words(text):
    """Delete stand-alone words of one or two word characters.

    The surrounding whitespace is left untouched, so gaps may remain where a
    short word used to be.
    """
    short_word = re.compile(r'\b\w{1,2}\b')
    return short_word.sub('', text)

def preProcessor(text):
    """Run the full English preprocessing pipeline over ``text``.

    The steps run strictly in this order, each one receiving the previous
    step's output: lower-casing, link removal, digit removal, punctuation
    blanking, non-alphanumeric removal, whitespace collapsing, lemmatization,
    stop-word removal, short-word removal and out-of-vocabulary removal.
    """
    pipeline = (
        normalization,
        remove_links,
        remove_numbers,
        punctuation_remove,
        remove_non_alphanumeric,
        remove_spaces,
        verb_adj_lemma,
        stop_words_remove,
        remove_short_words,
        remove_meaningless_word,
    )
    for step in pipeline:
        text = step(text)
    return text



Arabic Preprocessing

In [7]:
#Arabic Text Preprocessing

# Remove non-Arabic characters
def remove_non_arabic(text):
    """Keep only characters in U+0600-U+06FF plus whitespace.

    ``text`` is converted with ``str()`` first, so non-string input is accepted.
    """
    non_arabic = re.compile(r'[^\u0600-\u06FF\s]')
    return non_arabic.sub('', str(text))
    
# Remove Arabic numbers
def remove_arabic_numbers(text):
    """Delete Arabic-Indic digits (U+0660-U+0669) from ``text``.

    ASCII digits are not affected.
    """
    arabic_digit = re.compile(r'[٠-٩]')
    return arabic_digit.sub('', text)

# Punctuation removal
# Characters treated as punctuation in Arabic text (Arabic and Latin marks).
arabic_punctuation = """/ːː،؛؟.٪؛،:«»::–()[]{}<>+=-%*/&:|~\\''``"""
# Translation table that deletes each of the characters above.
translator = {ord(mark): None for mark in arabic_punctuation}


def remove_punctuation(text):
    """Delete every ``arabic_punctuation`` character from ``text``.

    Non-string values (e.g. NaN, ``None``) are returned unchanged.
    """
    if not isinstance(text, str):
        return text
    return text.translate(translator)

# Initialize the disambiguator and the morphological tokenizer once at module
# level so they are shared by every call to the Arabic pipeline.
# 'calima-msa-r13' is the name of the pretrained camel_tools model to load.
mle_msa = MLEDisambiguator.pretrained('calima-msa-r13')
# Morphological tokenizer using the 'atbtok' scheme.
# NOTE(review): its output is assumed to join morphemes with '_' and '+'
# (that is what `pattern` below splits on) - confirm against camel_tools docs.
msa_atb_tokenizer = MorphologicalTokenizer(disambiguator=mle_msa, scheme='atbtok')

# Regex that matches maximal runs of characters other than '_' and '+';
# findall() with it therefore splits a string on those two separators.
pattern = re.compile(r'[^_+]+')


# Remove stopwords
# Lazily-built cache of the combined (NLTK + custom) Arabic stop-word set.
_ARABIC_STOP_WORDS = None


def _arabic_stop_words():
    """Return the combined Arabic stop-word set, building it on first use."""
    global _ARABIC_STOP_WORDS
    if _ARABIC_STOP_WORDS is None:
        custom_stopwords = ['لل', 'أليس', '', 'ك', 'س', 'ف', 'ب', 'أو', 'و', 'ما', 'لو', 'ال', 'لا', 'ّ', 'ٌ', ' ', 'ء', 'ئ', '‘', '؛', 'أ', 'إ', ',', '’', 'آ', '~', 'ًٍَُِْ', ' ', '  ', 'ؤ', ' ', ' ', ' ', ' ', 'وهي', 'او', 'و', 'بهذا', 'هذا',  'وايضا', 'ايضا', 'ومع', 'مع', 'ما', 'وما', 'والتي', 'اليها', 'الي','علي','على', 'كان', 'ان', 'في', 'التي', 'اذا','معظم','هي',  'متي','متى','الى','م','سم','لهذا','ال','حوالي', 'لأ', 'NOAN']
        # Entries are lower-cased because tokens are lower-cased before the
        # lookup; otherwise the upper-case 'NOAN' entry could never match.
        # Arabic entries are unaffected by lower().
        _ARABIC_STOP_WORDS = frozenset(
            entry.lower() for entry in (*stopwords.words('arabic'), *custom_stopwords)
        )
    return _ARABIC_STOP_WORDS


def remove_stopwords_arabic(text):
    """Remove Arabic stop words from ``text``.

    ``text`` is tokenized with ``word_tokenize``; tokens whose lower-cased
    form is in NLTK's Arabic stop-word list or in the custom list are dropped
    and the rest is returned joined by single spaces.  Missing values (as
    judged by ``pd.isna``) are returned unchanged.

    The stop-word set is built once and cached instead of being re-read from
    the NLTK corpus on every call.
    """
    if pd.isna(text):
        return text
    stop_words = _arabic_stop_words()
    return ' '.join(
        token for token in word_tokenize(text) if token.lower() not in stop_words
    )
    
# Normalization
def normalize_arabic_text(text):
    """Apply the camel_tools normalizers to ``text`` in a fixed order.

    Order: Unicode normalization, alef maksura, teh marbuta, alef variants,
    then diacritic removal.  Missing values (as judged by ``pd.isna``) are
    returned unchanged.
    """
    if pd.isna(text):
        return text
    steps = (
        normalize_unicode,
        normalize_alef_maksura_ar,
        normalize_teh_marbuta_ar,
        normalize_alef_ar,
        dediac_ar,
    )
    for step in steps:
        text = step(text)
    return text

# Stemming: one shared ISRI stemmer instance, reused by stem_arabic_text().
st = ISRIStemmer()
def stem_arabic_text(text):
    """Stem every token of ``text`` with the shared ISRI stemmer ``st``.

    Returns the stems joined by single spaces.  Missing values (as judged by
    ``pd.isna``) are returned unchanged.
    """
    if pd.isna(text):
        return text
    stems = []
    for token in word_tokenize(text):
        stems.append(st.stem(token))
    return ' '.join(stems)
# Explicitly remove NOAN
def remove_specific_word(text, word):
    """Drop every whitespace-separated token of ``text`` that equals ``word``.

    The match is exact and case-sensitive; the remaining tokens are returned
    joined by single spaces.
    """
    kept = filter(lambda token: token != word, text.split())
    return ' '.join(kept)

# All text processors
def text_proccessers(text):
    """Run the full Arabic preprocessing pipeline over ``text``.

    Steps, in order: strip non-Arabic characters, word-tokenize, morphological
    tokenization, split the result on ``_``/``+`` (via ``pattern``), stop-word
    removal, Arabic-Indic digit removal, punctuation removal, normalization,
    ISRI stemming, and finally removal of the literal token ``NOAN``.

    Returns the processed text as one space-separated string.

    NOTE: a TF-IDF index built with the previous word tokenizer should be
    rebuilt so documents and queries are preprocessed the same way.
    """
    # Imported locally under its real name: the module-level alias `tokenizer`
    # is rebound by the English `tokenizer()` helper defined in this file, so
    # calling `tokenizer(text)` here ran NLTK's tokenizer instead of the
    # camel_tools one.
    from camel_tools.tokenizers.word import simple_word_tokenize

    text = remove_non_arabic(text)
    tokens = simple_word_tokenize(text)
    morphemes = msa_atb_tokenizer.tokenize(tokens)
    # Split on the '_' / '+' separators by keeping only the runs between them.
    text = ' '.join(pattern.findall(' '.join(morphemes)))
    text = remove_stopwords_arabic(text)
    text = remove_arabic_numbers(text)
    text = remove_punctuation(text)
    text = normalize_arabic_text(text)
    text = stem_arabic_text(text)
    text = remove_specific_word(text, "NOAN")
    return text

Building the Index Using TfidfVectorizer

In [None]:
#Training Model & Indexing

def save_terms(path, vectorizer):
    """Write the vectorizer's vocabulary to ``path``, one term per line.

    Args:
        path: Destination text file; it is overwritten if it exists.
        vectorizer: Fitted object exposing ``get_feature_names_out()``.

    Returns:
        The status string ``"Saving Terms Done"``.
    """
    # UTF-8 is set explicitly: the locale default encoding (e.g. cp1252 on
    # Windows) cannot encode Arabic terms and would raise UnicodeEncodeError.
    with open(path, 'w', encoding='utf-8') as outfile:
        outfile.writelines(f"{term}\n" for term in vectorizer.get_feature_names_out())
    return "Saving Terms Done"

def save_model(path, vectorizer):
    """Pickle ``vectorizer`` to ``path`` using the highest pickle protocol.

    Returns the status string ``"Saving model Done"``.
    """
    with open(path, 'wb') as sink:
        pickler = pickle.Pickler(sink, pickle.HIGHEST_PROTOCOL)
        pickler.dump(vectorizer)
    return "Saving model Done"

def save_index(path, index):
    """Store the sparse matrix ``index`` at ``path`` in SciPy's ``.npz`` format.

    Returns the status string ``"Saving index Done"``.
    """
    sparse.save_npz(file=path, matrix=index)
    return "Saving index Done"

def traning_model(lang, docs: list, preprocessor, tokenizer,
                  output_dir='C:/Users/Ahmed/Desktop/ir_output'):
    """Fit a TF-IDF vectorizer on ``docs`` and persist its artifacts.

    Three files are written under ``output_dir``: ``{lang}_terms.txt`` (the
    vocabulary), ``{lang}_model.pkl`` (the pickled vectorizer) and
    ``{lang}_index.npz`` (the sparse document-term matrix).

    Args:
        lang: Prefix used in the output file names (e.g. ``"en"``, ``"ar"``).
        docs: Iterable of raw documents to index.
        preprocessor: Callable applied to each raw document before tokenizing.
        tokenizer: Callable that splits a preprocessed document into tokens.
        output_dir: Directory receiving the three files.  Defaults to the
            location that used to be hard-coded, so existing calls behave as
            before.

    Returns:
        The status string ``"Traning Model & indexing doen!"``.
    """
    vectorizer = TfidfVectorizer(tokenizer=tokenizer, preprocessor=preprocessor)
    index = vectorizer.fit_transform(docs)
    save_terms(f'{output_dir}/{lang}_terms.txt', vectorizer)
    save_model(f'{output_dir}/{lang}_model.pkl', vectorizer)
    save_index(f'{output_dir}/{lang}_index.npz', index)
    return "Traning Model & indexing doen!"

# Build and persist the English index from the corpus 'title' column.
# NOTE(review): `tokenizer` here is whatever the name is bound to when this
# runs (the file both imports and defines a `tokenizer`) - confirm it is the
# intended one for each language.
traning_model("en", en_corpus['title'], preProcessor, tokenizer)
# Build and persist the Arabic index from the corpus 'text' column.
traning_model("ar", ar_corpus['text'], text_proccessers, tokenizer)


Matching, Cosine Similarity

In [4]:
#Matching (Cosine Similarity)

#loading model object
# with open('C:/Users/Ahmed/Desktop/IR_output_en_prev/tfidfvectorizer_object.pkl', 'rb') as inp:
#     vectorizer = pickle.load(inp)

# #Loading index
# docs_vector = sparse.load_npz("C:/Users/Ahmed/Desktop/IR_output_en_prev/csr_martix.npz")

# #Cosine Similarity method 1, each query sorted and stored in csv file
# test = pd.DataFrame(0, index=["cosine"], columns=corpus["_id"], dtype=np.float64)
# for index, row in queries.iterrows():
#     query_vector = vectorizer.transform([row['text']])
#     result = cosine_similarity(docs_vector, query_vector).flatten()
#     test.loc['cosine'] = result
#     sorted_test = test.sort_values(by="cosine", axis=1, ascending=False)
#     sorted_test.to_csv(f"C:/Users/Ahmed/Desktop/cosine_similarity/{index}.csv", index=False)
# print("done cosine similarity")

# #calculate MAP
# map_result = np.full((49, 10), False)
# for index, row in queries.iterrows():
#     cos_similarity = pd.read_csv(f"C:/Users/Ahmed/Desktop/cosine_similarity/{index}.csv")
#     first_ten_values = cos_similarity.iloc[0, :10]
#     for k, (doc, cos) in enumerate(first_ten_values.items()):
#         for i, value in qrels.iterrows():
#             if(index == value["query-id"]-1):
#                 if(doc == value["corpus-id"]):
#                     map_result[index][k] = True
#             else: continue
# np.savetxt('C:/Users/Ahmed/Desktop/map.txt', map_result, fmt='%d')
# print("done calculating map")


#Matching Service
def load_model(path):
    """Unpickle and return the object stored at ``path``.

    NOTE: unpickling executes code embedded in the file; only load files that
    this project produced itself.
    """
    with open(path, 'rb') as source:
        model = pickle.Unpickler(source).load()
    return model
    
def load_index(path):
    """Load and return the sparse matrix stored at ``path`` (``.npz`` format)."""
    matrix = sparse.load_npz(file=path)
    return matrix

Calculate MAP (Mean Average Precision)

In [None]:
def claculate_map(self):
    """Return ``k * self.ap_q / Q`` for ``k = 10`` and ``Q = len(actual)``.

    Reads ``self.revelance_matrix()``, ``self.q`` and ``self.ap_q``.

    NOTE(review): this looks like an unfinished mean-average-precision
    routine.  ``precision_at_k``, ``rel_k`` and ``ap_num`` are computed but
    never used, and the same ``self.ap_q`` value is appended on every
    iteration.  The enclosing class is not visible here - confirm the
    intended formula before relying on the result.
    """
    # Relevance data, indexed by query; presumably the relevant doc ranks per
    # query - verify against revelance_matrix().
    actual = self.revelance_matrix()
    Q = len(actual)
    # Hard-coded ranks 1..10 stand in for the predicted ranking.
    predicted = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    k = 10
    ap = []
    ap_num = 0    # never updated or read below
    for x in range(k):        
        act_set = set(actual[self.q])        
        pred_set = set(predicted[:x+1])
        # Precision over the first x+1 predictions (unused afterwards).
        precision_at_k = len(act_set & pred_set) / (x+1)
        # 1 when the prediction at rank x is relevant, else 0 (unused afterwards).
        if predicted[x] in actual[self.q]:
            rel_k = 1        
        else:
            rel_k = 0    
        ap.append(self.ap_q)
    return sum(ap) / Q

Search Api Code

In [8]:
# Open one module-level MySQL connection that search_api() reuses.
# NOTE(review): host and credentials (root, empty password) are hard-coded -
# confirm this is only meant for local development.
mydb = mysql.connector.connect(
  host="localhost",
  user="root",
  password="",
  database="IR"
)

# Shared cursor used by search_api() to fetch document contents.
mycursor = mydb.cursor()
# Parameterized lookups (the %s placeholder is bound by the connector) that
# fetch a document's content by id from the English / Arabic corpus tables.
en_sql = "SELECT content FROM corpus WHERE doc_id = %s"
ar_sql = "SELECT content FROM ar_corpus WHERE doc_id = %s"

# # #query api function without database
# # api_result = []
# # en_query_vector = vectorizer.transform(["Should teachers get tenure?"])
# # result = cosine_similarity(docs_vector, en_query_vector).flatten()
# # en_test.loc['cosine'] = result
# # en_test.sort_values(by="cosine", axis=1, ascending=False ,inplace=True)
# # ans = en_test.iloc[0, :30]
# # for key, value in ans.items():
# #     if(value < 0.6):
# #         break
# #     for index, row in corpus.iterrows():
# #         if(row["_id"] == key):
# #             api_result.append(row["title"])
# # print(len(api_result))
# # api_result


# #query api function with database

def search_api(query, language):
    """Return the stored contents of the documents most similar to ``query``.

    The query is vectorized with the saved TF-IDF model for ``language`` and
    compared to every indexed document by cosine similarity.  The 30
    best-scoring documents are walked in descending score order; the walk
    stops at the first score below the language threshold (0.6 for English,
    0.3 otherwise).  Each remaining document's content is fetched from MySQL
    through the module-level ``mycursor``.

    Args:
        query: Raw query text; the saved vectorizer applies its own
            preprocessing.
        language: ``'en'`` selects the English resources; any other value
            selects the Arabic ones (unchanged from the previous behaviour).

    Returns:
        A list with one ``fetchall()`` result (a list of row tuples) per
        retrieved document, best match first.
    """
    if language == 'en':
        doc_ids = en_corpus["_id"]
        model_path = 'C:/Users/Ahmed/Desktop/ir_output_last/en_model.pkl'
        index_path = 'C:/Users/Ahmed/Desktop/ir_output_last/en_index.npz'
        sql, threshold = en_sql, 0.6
    else:
        doc_ids = ar_corpus["doc_id"]
        model_path = 'C:/Users/Ahmed/Desktop/ir_output_last/ar_model.pkl'
        index_path = 'C:/Users/Ahmed/Desktop/ir_output_last/ar_index.npz'
        sql, threshold = ar_sql, 0.3

    vectorizer = load_model(model_path)
    index = load_index(index_path)

    # Scoring runs for BOTH languages.  It used to be indented under the
    # Arabic branch, so English queries failed with an unbound `ans`.
    query_vector = vectorizer.transform([query])
    scores = cosine_similarity(index, query_vector).flatten()
    ranked = pd.Series(scores, index=pd.Index(doc_ids), dtype=np.float64)
    top_hits = ranked.sort_values(ascending=False).iloc[:30]

    api_result = []
    for doc_id, score in top_hits.items():
        # Scores are sorted descending, so nothing after this can qualify.
        if score < threshold:
            break
        mycursor.execute(sql, (doc_id,))
        api_result.append(mycursor.fetchall())
    return api_result

# Demo call: run one Arabic query through the search API (needs the saved
# Arabic model/index files and the MySQL connection opened in this cell).
search_api("624 ألاسكا () هي ولاية أمريكية تقع في أقصى الشمال الغربي لأمريكا الشمالية.", 'ar')


# api_result = []

# en_test = pd.DataFrame(0, index=["cosine"], columns=en_corpus["_id"], dtype=np.float64)
# en_vectorizer = load_model('C:/Users/Ahmed/Desktop/ir_output_last/en_model.pkl')
# en_index = load_index('C:/Users/Ahmed/Desktop/ir_output_last/en_index.npz')

# en_query_vector = en_vectorizer.transform(["Should teachers get tenure?"])
# en_result = cosine_similarity(en_index, en_query_vector).flatten()
# en_test.loc['cosine'] = en_result
# en_test.sort_values(by="cosine", axis=1, ascending=False ,inplace=True)
# ans = en_test.iloc[0, :30]

# ar_test = pd.DataFrame(0, index=["cosine"], columns=ar_corpus["doc_id"], dtype=np.float64)
# ar_vectorizer = load_model('C:/Users/Ahmed/Desktop/ir_output_last/ar_model.pkl')
# ar_index = load_index('C:/Users/Ahmed/Desktop/ir_output_last/ar_index.npz')


# ar_query_vector = ar_vectorizer.transform(["624 ألاسكا () هي ولاية أمريكية تقع في أقصى الشمال الغربي لأمريكا الشمالية."])
# ar_result = cosine_similarity(ar_index, ar_query_vector).flatten()
# ar_test.loc['cosine'] = ar_test
# ar_test.sort_values(by="cosine", axis=1, ascending=False ,inplace=True)
# ans = ar_test.iloc[0, :30]

# #Retreive Values
# for key, value in ans.items():
#   if(value < 0.6):
#     break
#   adr = (key, )
#   mycursor.execute(sql, adr)
#   myresult = mycursor.fetchall()
#   api_result.append(myresult)


# print(len(api_result))
# api_result


# query_ids = []
# for index, row in ar_queries.iterrows():
#     id = ""
#     for c in row["query"]:
#         if(not c.isdigit()):
#             query_ids.append(id)
#             break
#         id+=c

# print(len(query_ids))

# print(ar_queries.iloc[5000])
# print(query_ids[5000])



[[('الشمالية (توضيح) الشمالية ( توضيح ) - الشمالية ( ولاية ) - الشمالية ( عين العرب ) - الشمالية ( مدينة بحرينية ) - الشمالية ( محافظة ) - الحدود الشمالية ( منطقة )',)],
 [('أسكاون (توضيح) أسكاون ( توضيح ) قد يقصد من « أسكاون » : - أسكاون - أسكاون ( أداي ) - أسكاون ( إد نوح )',)],
 [('بشملة  البَشْمَلَة أو أو هو جنس نباتي يتبع الفصيلة الوردية من رتبة الورديات . والبشملة شجر دائم الخضرة، تعطي الشجرة ثمار صغيرة بيضاوية الشكل صفراء اللون عند نضجها . ويعرف في بعض اللهجات المحلية باسم الأكي دنيا و أسكيدنيا و ناسبولي، وتسمى أيضا بُوصَاع في تونس و عرنوط في بعض اللهجات المحلية الأخرى، و تعرف بالمغرب باسم مزاح . أصل الاسم أكي دنيا هو تركي ويعني الدنيا الجديدة . يوجد عدة أنواع من جنس البشملة : - بشملة يابانية - بشملة منحنية - بشملة كافاليرية - بشملة عطرية - بشملة هنرية - بشملة ماليبونية - بشملة بيضاوية - بشملة سنديانية - بشملة سالوينية - بشملة سيغوينية - بشملة مسننة - بشملة تنغشونغية - بشملة بنغالية - بشملة غامضة - بشملة إهليلجية - بشملة جرداء - بشملة هوكرية - بشملة عريضة الأوراق - بشملة طويلة ا