project 1

In [1]:
import pandas as pd
import pickle
import os


In [2]:
class Preprocessor():
    @staticmethod
    def process(doc_query):
       res = Preprocessor.tokenizeDocument(doc_query)
       return res

    @staticmethod
    def tokenizeDocument(sentence):
        return sentence.split(' ')

In [3]:


class IndexModel:

    def __init__(self, documents_df,index_file=None ,meta_file=None):
        
        self._index = {}
        self.create_new_index(documents_df)
        
        if index_file==None and meta_file==None :
           self.create_new_index
           
        else :
            self.read_index    
        

    def create_new_index(self, documents_df):
        
        termdoc = documents_df.to_dict('list')
        unique_terms = set()
        doc_ids = termdoc['id']

       
        for terms in termdoc['ntext']:
            unique_terms.update(terms)
        
        term_ids = sorted(list(unique_terms))  
        term_to_id = {term: idx for idx, term in enumerate(term_ids)}
        doc_to_id = {doc_id: idx for idx, doc_id in enumerate(doc_ids)}
        
        
        self._index_matrix = [[0 for _ in range(len(doc_ids))] for _ in range(len(term_ids))]
        
        
        for i, terms in enumerate(termdoc['ntext']):
            for term in terms:
                term_idx = term_to_id[term]
                doc_idx = doc_to_id[doc_ids[i]]
                self._index_matrix[term_idx][doc_idx] = 1

        self._term_to_id = term_to_id
        self._doc_to_id = doc_to_id

    def get_term_vector(self, term):  
        
        if term in self._term_to_id:
            term_idx = self._term_to_id[term]
            return self._index_matrix[term_idx]
        else:
            return [0 for _ in range(len(self._doc_to_id))]
        
    
    def read_index(self, index_file, meta_file):
        
        
        with open(index_file, 'rb') as f:
            self._index_matrix = pickle.load(f)
        
        
        with open(meta_file, 'rb') as f:
            meta_data = pickle.load(f)
        
        self._term_to_id = meta_data['term_to_id']
        self._doc_to_id = meta_data['doc_to_id']
        
        
    def save_index(self, index_file, meta_file):
        with open(index_file, 'wb') as f:
            pickle.dump(self._index_matrix, f)

        meta_data = {
            'term_to_id': self._term_to_id,
            'doc_to_id': self._doc_to_id
        }

        with open(meta_file, 'wb') as f:
            pickle.dump(meta_data, f)
            
            
    

       

 
                
        
          
        
    
        
                        


   

In [4]:

class Retriever:
    def __init__(self):
        self._terms_operator = ['&', '|', '~']

    def boolean_operator_processing(self, bop, prevV, nextV=None):
        if bop == "&":
            return [a & b for a, b in zip(prevV, nextV)] # zip بحيث كل عنصر من المجموعة الاولى بشوفها مع عناصر المصفوفة التانية ونتبهي بنية معطياتو تبل 
        elif bop=="|" :
            return [a | b for a, b in zip(prevV, nextV)]
        elif bop == "~":
            return [1-a for a in prevV]

    def retrieve(self, query_terms, index_model):
        ret_docs = []
        bitwiseop=""
        result=[]
        has_previous_term=False
        has_not_operation=False
        inc_vec_prev=[]
        inc_vec_next=[]
        for term in query_terms:
            if term not in self._terms_operator:
                if has_not_operation:
                    if has_previous_term:
                        inc_vec_next=self.boolean_operator_processing("~",index_model.term_incidence_vector(term),inc_vec_next)
                    else :
                        inc_vec_prev=self.boolean_operator_processing("~",index_model.term_incidence_vector(term),inc_vec_next)
                        result=inc_vec_prev
                    has_not_operation=False
                elif has_previous_term:
                    inc_vec_next=index_model.term_incidence_vector(term)
                else:
                    inc_vec_prev=index_model.term_incidence_vector(term)
                    result= inc_vec_prev
                    has_previous_term=True
            elif term =="~":
                has_not_operation=True
            else:
                bitwiseop=term

            #----------
            if len(inc_vec_next)!= 0  :
                result = self.boolean_operator_processing(bitwiseop,inc_vec_prev,inc_vec_next)
                inc_vec_prev=result
                has_previous_term=True
                inc_vec_next= []

        #-----
        for i,res in enumerate(result):
            if res == 1:
                ret_docs.append({'id':i, 'score':res})
        ret_docs = pd.DataFrame(ret_docs, columns=['id', 'score', 'content']).sort_values(by=['score'], ascending=False)
        return ret_docs



In [5]:
def loadDocuments(directory):
     documents = []
     for filename in os.listdir(directory):
        if filename.endswith(".txt"):
            with open(os.path.join(directory, filename), 'r', encoding='utf-8') as file:
                content = file.read()
                doc_id = os.path.splitext(filename)[0]  
                documents.append([doc_id, content])

     documents_df = pd.DataFrame(documents, columns=['id', 'text'])
     return documents_df
 
directory = 'Project1_datacoll'
documents_df = loadDocuments(directory)
print(documents_df)



        id                                               text
0  African  ﻿If you want to go fast, go alone. If you want...
1     Bill  ﻿A successful team is a group of many hands bu...
2    Funny  ﻿A team is like a pack of wolves—always hungry...
3    Helen  ﻿Alone, we can do so little; together, we can ...
4    Henry  ﻿Coming together is a beginning. Keeping toget...


In [6]:
# #ex :
# documents = {
#     'id': ["TheJungle", "FruitTypes", "HealthyLife"],
#     'ntext': [["tree", "garden", "juice"], ["tree", "apple"], ["apple", "fruit", "juice"]]
# }

# documents_df = pd.DataFrame(documents)
# index_model = IndexModel(documents_df)

# print("TermId:", index_model._term_to_id)
# print("DocId:", index_model._doc_to_id)
# print("Index Matrix:")
# for row in index_model._index_matrix:
#     print(row)

# # Testing term incidence vectors
# print("Incidence vector for 'tree':", index_model.term_incidence_vector("tree"))
# print("Incidence vector for 'juice':", index_model.term_incidence_vector("juice"))


In [7]:
# # ex2 : 

# data = {
#     'id': [1, 2, 3],
#     'ntext': [['hello', 'world'], ['example', 'text'], ['hello', 'example']]
# }


# documents_df = pd.DataFrame(data)

# index_model = IndexModel(documents_df)


# index_model.save_index('myindex.index', 'td_ids.meta')


# index_model.read_index('myindex.index', 'td_ids.meta')


# term_vector = index_model.get_term_vector('hello')
# print(term_vector) 

In [8]:
class SearchEngine:

    def __init__(self, preprocessor, retriever, documents):
        self.preprocessor = preprocessor
        self.retriever = retriever
        self.documents = None
        self.model = None
        self.rebuild(documents)

    # offline
    def rebuild(self, documents):
        self.documents = documents
        self.documents['ntext'] = self.documents['text'].apply(self.preprocessor.process)
        self.model = IndexModel(self.documents)

    # online
    def querying(self, query):
        query_terms = self.preprocessor.process(query)
        docs_res = self.retriever.retrieve(query_terms, self.model)
        if docs_res.shape[0]>0: #لعرض كلشي شغل شتغتلو
            docs_res['content'] = docs_res.apply(
                lambda row: self.documents[self.documents['id']==row.id]['text'].iloc[0], axis = 1)
        return docs_res



In [9]:
directory = 'Project1_datacoll'
documents_df = load_documents(directory)


index_model = IndexModel(documents_df)


search_engine = SearchEngine(index_model)


query1 = "success fast"
query2 = "success together"


print(search_engine.search(query1))


print(search_engine.search(query2))

NameError: name 'load_documents' is not defined

In [None]:
# import pandas as pd

# class IndexModel:
#     def __init__(self, documents_df=None, index_path=None, meta_path=None):
#         self.documents_df = documents_df
#         self.term_id = {}
#         self.doc_id = {}
#         self.tdim_index = []

#         if index_path and meta_path:
#             self.index_read(index_path, meta_path)
#         elif documents_df is not None:
#             self.index_new_create()

#     def index_new_create(self):
#         termdoc = self.documents_df.to_dict('list')

#         # تحديد الأرقام التعريفية للوثائق
#         self.doc_id = {doc: i for i, doc in enumerate(termdoc['id'])}

#         # جمع المصطلحات الفريدة وتحديد الأرقام التعريفية للمصطلحات
#         uniqterm = set()
#         for terms in termdoc['ntext']:
#             uniqterm.update(terms)
#         self.term_id = {term: i for i, term in enumerate(uniqterm)}

#         # إنشاء مصفوفة TDIM
#         num_terms = len(self.term_id)
#         num_docs = len(self.doc_id)
#         self.tdim_index = [[0] * num_docs for _ in range(num_terms)]

#         # تعبئة مصفوفة TDIM
#         for i, doc_terms in enumerate(termdoc['ntext']):
#             for term in doc_terms:
#                 term_index = self.term_id[term]
#                 doc_index = self.doc_id[termdoc['id'][i]]
#                 self.tdim_index[term_index][doc_index] = 1

#     def index_read(self, index_path, meta_path):
#         import pickle
        
#         # قراءة ملف الفهرس
#         with open(index_path, 'rb') as f:
#             self.tdim_index = pickle.load(f)
        
#         # قراءة ملف معلومات الفهرس
#         with open(meta_path, 'rb') as f:
#             meta_data = pickle.load(f)
#             self.term_id = meta_data[0]
#             self.doc_id = meta_data[1]

#     def vector_term_get(self, term):
#         try:
#             return self.tdim_index[self.term_id[term]]
#         except KeyError:
#             return [0] * len(self.doc_id)

#     def save_index(self, index_path, meta_path):
#         import pickle
        
#         # حفظ مصفوفة الفهرس
#         with open(index_path, 'wb') as f:
#             pickle.dump(self.tdim_index, f)
        
#         # حفظ معلومات الفهرس
#         meta_data = [self.term_id, self.doc_id]
#         with open(meta_path, 'wb') as f:
#             pickle.dump(meta_data, f)

# # مثال تطبيقي
# data = {'id': ["TheJungle", "FruitTypes", "HealthyLife"],
#         'ntext': [['tree', 'garden', 'juice'], ['apple', 'fruit'], ['apple', 'juice', 'fruit']]}

# documents_df = pd.DataFrame(data)

# # إنشاء نموذج الفهرسة
# index_model = IndexModel(documents_df=documents_df)

# # إنشاء الفهرس
# index_model.index_new_create()

# # طباعة الأرقام التعريفية للمصطلحات والوثائق
# print(f"Term Id: {index_model.term_id}")
# print(f"Doc Id: {index_model.doc_id}")

# # طباعة مصفوفة TDIM
# print("TDIM Index:")
# for row in index_model.tdim_index:
#     print(row)

# # الحصول على متجه وقوع المصطلح "apple"
# apple_vector = index_model.vector_term_get('apple')
# print(f"متجه وقوع المصطلح 'apple': {apple_vector}")

# # حفظ الفهرس ومعلوماته
# index_model.save_index('index.myindex', 'meta.ids_td')

# # قراءة الفهرس ومعلوماته من الملفات
# new_index_model = IndexModel(index_path='index.myindex', meta_path='meta.ids_td')

# # طباعة الأرقام التعريفية للمصطلحات والوثائق من الفهرس المحمل
# print(f"Loaded Term Id: {new_index_model.term_id}")
# print(f"Loaded Doc Id: {new_index_model.doc_id}")

# # طباعة مصفوفة TDIM من الفهرس المحمل
# print("Loaded TDIM Index:")
# for row in new_index_model.tdim_index:
#     print(row)
