# Search Engine

1. Connectors
2. Indexation
3. HashMap
4. Save
5. Query
    * search(word)
    * search word0 AND word1 AND word2
    * search word0 OR word1 OR word2

### Deps

* Python3 (Python 3.7.3)
    * os
    * pickle

### Parties

#### Partie 1 : Obligatoire

#### Partie 2 : Index Incrémental

#### Partie 3 : Bonus 

## Paths

In [1]:
# Mettre le path vers le dossier pour fetch
path = "./data/20news-bydate-train/alt.atheism"

path_politics = "./data/20news-bydate-train/talk.politics.guns"

path_sci_space = "./data/20news-bydate-train/sci.space"
path_sci_elect = "./data/20news-bydate-train/sci.electronics/"
path_sci_med   = "./data/20news-bydate-train/sci.med/"

# Partie 1 : Obligatoire

### Connecteurs

File system: prendre tous les fichier lisisbles d'un répertoire

In [2]:
from os import listdir
from os.path import isfile, isdir, join

class Doc:
    def __init__(self, url, text):
        self.text = text # Raw strnig of the whole text  
        self.url = url
    pass


def fetch(path, recursive=True):
    DocList = []
    
    files = [f for f in listdir(path) if isfile(join(path, f))]
    dirs  = [d for d in listdir(path) if isdir(join(path, d))]
    
    for fname in files:
        f = open(path + "/" + fname, "r", errors="ignore")
        try:
            c = f.read()
            DocList.append(Doc(path + "/" + fname, c))
        except Exception:
            pass
        
        f.close
    
    if recursive:
        for d in dirs:
            DocList += fetch(join(path, d), recursive=recursive)              
    
    return DocList


# res = fetch("/home/sidore_m/projects/search_engine/data/20news-bydate-train/")

DocList = fetch(path)

# Debug
print(len(DocList))
# (DocList[0].url, DocList[0].text)

480


### Analyseur

* Transformer le texte brut des documents d'entrées en mots (effectuer des traitements sur ces mots)

In [3]:
class LowerProcessor:
    def __init__(self):
        pass
    def process(word):
        """
        word      - string
        return    - processed string
        """
        proc_word = word.lower()
        return proc_word
    
class AccentProcessor:
    def __init__(self):
        pass
    def process(word):
        """
        word      - string
        return    - processed string
        """
        proc_word = word.replace("'", " ")
        proc_word = proc_word.replace("`", " ")
        proc_word = proc_word.replace("\"", "")
        return proc_word
    
class PunctationProcessor:
    def __init__(self):
        pass
    def process(word):
        """
        word      - string
        return    - processed string
        """
        proc_word = word.replace(".", " ")
        proc_word = proc_word.replace(",", " ")
        proc_word = proc_word.replace(";", "")
        proc_word = proc_word.replace(":", "")
        proc_word = proc_word.replace("!", "")
        proc_word = proc_word.replace("?", "")
        proc_word = proc_word.replace("\n", " ")
        return proc_word
    
class SpecialProcessor:
    def __init__(self):
        pass
    def process(word):
        """
        word      - string
        return    - processed string
        """
        proc_word = word.replace("(", "")
        proc_word = proc_word.replace("-", " ")
        proc_word = proc_word.replace("_", " ")
        proc_word = proc_word.replace("=", " ")
        proc_word = proc_word.replace("*", " ")
        proc_word = proc_word.replace("<", "")
        proc_word = proc_word.replace(">", "")
        proc_word = proc_word.replace(")", "")
        proc_word = proc_word.replace("/", "")
        proc_word = proc_word.replace("\\", "")
        proc_word = proc_word.replace("[", "")
        proc_word = proc_word.replace("^", "")
        proc_word = proc_word.replace("]", "")
        proc_word = proc_word.replace("", "")
        proc_word = proc_word.replace(" ", "")
        proc_word = proc_word.replace("  ", "")
        return proc_word     

class TokenizedDoc:
    def __init__(self, words, url):
        self.words = words # list of strings (list of words)
        self.url = url
    pass    


def analyse(docs, processorsList):
    """
    docs           - list of Docs
    processorsList - list of Processors like TextProcessor
    return         - list of TokenizedDoc
    """
    tokdocs = []
    
    for d in docs:
        tokens = d.text.split(" ") # Split and tokenzie on ' ', return list of words
        # Process
        words = []
        for w in tokens:
            w = w.strip()
            for p in processorsList:
                 w = p.process(w)
            w_r = w.split()
            for w in w_r:
                if w != '' and w != ' ':        
                    words.append(w)
        tokdocs.append(TokenizedDoc(words, d.url))    
    
    return tokdocs
    
Processors = [PunctationProcessor, SpecialProcessor, AccentProcessor, LowerProcessor]
                                      
TokDocs = analyse(DocList, Processors)

len(TokDocs)
print(TokDocs[0].words[0], TokDocs[0].words[1], TokDocs[0].words[2],
      TokDocs[1].words[2], TokDocs[2].words[30], TokDocs[2].words[45])


from rashidsubject re bill you have


In [4]:
# for KKK in TokDocs:
#     print(KKK.words)

### Indexeur

* Transforme les listes de mots appartenant au document en des listes inversées de mots associés à des documents 

In [5]:
class Posting:
    def __init__(self, word, urls):
        """
        word - string
        urls - list of url
        """
        self.word = word
        self.urls = urls
    pass

def make_index(TokeneizedDocs):
    """
    TokeneizedDocs - list of TokenizedDoc
    """
    done_words = []
    postingList = []
    
    words = []
    for tokD in TokeneizedDocs:
        words += tokD.words
        
    print("Number of words ", len(words))    
    
        
    for w in words:
        if w in done_words:
            continue
                
        done_words.append(w)
        urls = []
        for d in TokeneizedDocs:
            if w in d.words:
                urls.append(d.url)
        postingList.append(Posting(w, urls))   
    
    return postingList

class Index:
    def __init__(self, urlToDocId, wordToDocIds, idToUrl):
        """
        urlToDocId   - Map/Dico <string, int> 
        wordToDocIds - Map/Dico <string, int[]> 
        idToUrl   - Map/Dico <int, string> 
        """
        self.urlToDocId = urlToDocId
        self.idToUrl = idToUrl
        self.wordToDocIds = wordToDocIds
    pass


def build(Postings):
    """
    Build Index from list of Posting
    """
    urlToId = {}
    idToUrl = {}
    wordToDocIds = {}
    ids = 0
    for i in Postings:
        u_list = []
        for j in i.urls:
            if not (j in urlToId):
                urlToId[j] = ids
                idToUrl[ids] = j
                u_list.append(ids)
                ids += 1
            else:
                u_list.append(urlToId[j])
        wordToDocIds[i.word] = u_list     
        
    return Index(urlToId, wordToDocIds, idToUrl)



postingList = make_index(TokDocs)
index = build(postingList)

print(len(TokDocs), len(postingList))
print(len(index.urlToDocId), len(index.wordToDocIds))

Number of words  150896
480 19895
480 19895


In [6]:
import pickle

path_bin_index = "/home/sido/projects/SearchEngine/"

def save(index, path):
    """
    Save index on disk
    usin pickle python lib
    """
    with open(path + "/" + "index.bin", "wb") as f:
        pickle.dump(index, f, pickle.HIGHEST_PROTOCOL)
        
save(index, path_bin_index)

def load(path):
    with open(path + "/" + "index.bin", "rb") as f:
        index = pickle.load(f)
    return index

index = load(path_bin_index)
print(len(index.urlToDocId), len(index.wordToDocIds))

480 19895


### Searcher

* Lire les listes inversées afin de répondre à une requete
* Pouvoir lire un index sauvegardé précédemment sur disque


In [7]:
# an object 'Index' as index should exist !

class Searcher:
    def __init__(self):
        self.index = None
    
    def load(self, path):
        with open(path + "/" + "index.bin", "rb") as f:
            self.index = pickle.load(f)
            
    
    def search(self, word, nb_urls=50):
        """
        Search word, return urls for this word
        word      - string
        nb_urls   - number of result urls to give back, if 0 return all
        index_obj - specify an indexobject
        """
        if not (word in self.index.wordToDocIds):
            return "No results, word not indexed"
        urls = self.index.wordToDocIds[word]
        res = ""
        for k in urls:
            res += self.index.idToUrl[k] + "\n"
        return res


    def searchAllOf(self, words):
        """
        AND oper
        words - list of words
        """
        and_urls = []
        first = True
        for w in words:
            if not (w in self.index.wordToDocIds):
                return "No results, one of the words is not indexed"
            urls = self.index.wordToDocIds[w]
            if first:
                and_urls = urls
                first = False
            else:
                and_urls = list(set(and_urls) & set(urls))
        res = ""
        for k in and_urls:
            res += self.index.idToUrl[k] + "\n"
        return res

    def searchOneOf(self, words):
        """
        OR oper
        """
        or_urls = []
        for w in words:
            if not (w in self.index.wordToDocIds):
                return "No results, one of the words is not indexed"
            urls = self.index.wordToDocIds[w]
            or_urls = list(set(or_urls) | set(urls))
        res = ""
        for k in or_urls:
            res += self.index.idToUrl[k] + "\n"
        return res



In [8]:
searcher = Searcher()
searcher.load(path_bin_index)

print(searcher.search('sports'))

./data/20news-bydate-train/alt.atheism/53090
./data/20news-bydate-train/alt.atheism/53373
./data/20news-bydate-train/alt.atheism/53136
./data/20news-bydate-train/alt.atheism/53521



In [9]:
print(searcher.searchAllOf(['sports', 'is']))

./data/20news-bydate-train/alt.atheism/53521
./data/20news-bydate-train/alt.atheism/53136
./data/20news-bydate-train/alt.atheism/53373
./data/20news-bydate-train/alt.atheism/53090



In [10]:
print(searcher.searchOneOf(['sports', 'and', 'of']))

./data/20news-bydate-train/alt.atheism/53120
./data/20news-bydate-train/alt.atheism/51317
./data/20news-bydate-train/alt.atheism/51201
./data/20news-bydate-train/alt.atheism/51318
./data/20news-bydate-train/alt.atheism/53229
./data/20news-bydate-train/alt.atheism/53177
./data/20news-bydate-train/alt.atheism/53230
./data/20news-bydate-train/alt.atheism/53150
./data/20news-bydate-train/alt.atheism/53067
./data/20news-bydate-train/alt.atheism/53434
./data/20news-bydate-train/alt.atheism/51285
./data/20news-bydate-train/alt.atheism/53466
./data/20news-bydate-train/alt.atheism/53440
./data/20news-bydate-train/alt.atheism/53113
./data/20news-bydate-train/alt.atheism/53183
./data/20news-bydate-train/alt.atheism/53754
./data/20news-bydate-train/alt.atheism/53239
./data/20news-bydate-train/alt.atheism/53139
./data/20news-bydate-train/alt.atheism/53532
./data/20news-bydate-train/alt.atheism/51260
./data/20news-bydate-train/alt.atheism/51184
./data/20news-bydate-train/alt.atheism/53056
./data/20n

# Partie 2 : Index Incrémental

### Indexation Incrementale

In [11]:
class Generation:
    def __init__(self, gen_id, wordToDocIds):
        self.gen_id = gen_id
        self.wordToDocIds = wordToDocIds

class IndexIncr:
    def __init__(self):
        self.ids = 0
        self.urlToDocId = {}
        self.idToDocUrl = {}
        self.docIdToGeneration = {}  # key (gen) = [doc id, ...]
        self.generations = []
        # Bonus 2 : Suppr vector
        self.suppressions = []
    
    def build_gen(self, Postings):
        """
        Build Index from list of Posting
        """
        gen_id = len(self.generations)
        wordToDocIds = {}
        for i in Postings:
            u_list = []
            for j in i.urls:
                if not (j in self.urlToDocId):
                    self.urlToDocId[j] = self.ids
                    self.idToDocUrl[self.ids] = j
                    self.docIdToGeneration[self.ids] = gen_id
                    u_list.append(self.ids)
                    self.ids += 1
                else:
                    u_list.append(self.urlToDocId[j])
            wordToDocIds[i.word] = u_list        
        self.generations.append(Generation(gen_id, wordToDocIds))
    


In [12]:
main_index = IndexIncr()

main_index.build_gen(postingList)

DocList1 = fetch(path_politics)
DocList2 = fetch(path_sci_space)

TokDocs1 = analyse(DocList1, Processors)
TokDocs2 = analyse(DocList2, Processors)

postingList1 = make_index(TokDocs1)
postingList2 = make_index(TokDocs2)



Number of words  182486
Number of words  163558


In [13]:
main_index.build_gen(postingList1)

In [14]:
main_index.build_gen(postingList2)

In [15]:
import pickle

path_bin_index = "."

def save_incr(index, path):
    """
    Save index on disk
    usin pickle python lib
    """
    with open(path + "/" + "index_incr.bin", "wb") as f:
        pickle.dump(index, f, pickle.HIGHEST_PROTOCOL)
        
save_incr(main_index, path_bin_index)

def load_incr(path):
    with open(path + "/" + "index_incr.bin", "rb") as f:
        index = pickle.load(f)
    return index

index_incr = load_incr(path_bin_index)
index_incr

<__main__.IndexIncr at 0x7f226797bdd0>

In [16]:
# For the SearchIncrIndex, I prefer retruning python list  as result,
# and not like previously a string

class SearchIncrIndex:
    def __init__(self, index_incr=None):
        if None == index_incr:
            self.load(path_bin_index)
        else:
            self.index = index_incr
    
    def load(self, path):
        with open(path + "/" + "index_incr.bin", "rb") as f:
            self.index = pickle.load(f)
    
    def id_to_url(self, id_l):
        res = []
        for i in id_l:
            res.append(self.index.idToDocUrl[i])
        return res
    
    def search(self, word):
        res_id = self.search_ids(word)
        res_urls = self.id_to_url(res_id)
        return res_urls
    
    def search_ids(self, word):
        """
        Search word, return urls for this word
        word      - string
        """
        gs = self.index.generations
        res = []
        gl = len(self.index.generations)
        current_urls = []
        for g in reversed(gs):  # Get last first
            if not (word in g.wordToDocIds):
                continue
            preserved_urls = []
            urls_ids = list.copy(g.wordToDocIds[word])
            urls = []
            for u in urls_ids:
                urls.append(self.index.idToDocUrl[u])
            for u in urls:
                if not (u in current_urls):
                    preserved_urls.append(u)
                    current_urls.append(u)
            urls_ids = []
            for u in preserved_urls:
                urls_ids.append(self.index.urlToDocId[u])
            res = res + urls_ids
        return res


    def searchAllOf(self, words):
        """
        AND oper
        words - list of words
        """
        and_urls = []
        first = True
        for w in words:
            if first:
                and_urls = self.search_ids(w)
                first = False
            else:
                and_urls = list(set(and_urls) & set(self.search_ids(w)))
        res_urls = self.id_to_url(and_urls)
        return res_urls

    def searchOneOf(self, words):
        """
        OR oper
        """
        or_urls = []
        for w in words:
            or_urls = list(set(or_urls) | set(self.search_ids(w)))
        res_urls = self.id_to_url(or_urls)
        return res_urls


In [17]:
incr_searcher = SearchIncrIndex(index_incr)

In [18]:
incr_searcher.search('sports')

['./data/20news-bydate-train/talk.politics.guns/54215',
 './data/20news-bydate-train/talk.politics.guns/54690',
 './data/20news-bydate-train/talk.politics.guns/54235',
 './data/20news-bydate-train/alt.atheism/53090',
 './data/20news-bydate-train/alt.atheism/53373',
 './data/20news-bydate-train/alt.atheism/53136',
 './data/20news-bydate-train/alt.atheism/53521']

In [19]:
incr_searcher.searchAllOf(['sports', 'is'])

['./data/20news-bydate-train/alt.atheism/53521',
 './data/20news-bydate-train/talk.politics.guns/54235',
 './data/20news-bydate-train/alt.atheism/53136',
 './data/20news-bydate-train/talk.politics.guns/54215',
 './data/20news-bydate-train/alt.atheism/53090',
 './data/20news-bydate-train/talk.politics.guns/54690',
 './data/20news-bydate-train/alt.atheism/53373']

In [20]:
incr_searcher.searchOneOf(['sports', 'and', 'of'])

['./data/20news-bydate-train/alt.atheism/53120',
 './data/20news-bydate-train/alt.atheism/51317',
 './data/20news-bydate-train/alt.atheism/51201',
 './data/20news-bydate-train/alt.atheism/51318',
 './data/20news-bydate-train/alt.atheism/53229',
 './data/20news-bydate-train/alt.atheism/53177',
 './data/20news-bydate-train/alt.atheism/53230',
 './data/20news-bydate-train/alt.atheism/53150',
 './data/20news-bydate-train/alt.atheism/53067',
 './data/20news-bydate-train/alt.atheism/53434',
 './data/20news-bydate-train/alt.atheism/51285',
 './data/20news-bydate-train/alt.atheism/53466',
 './data/20news-bydate-train/alt.atheism/53440',
 './data/20news-bydate-train/alt.atheism/53113',
 './data/20news-bydate-train/alt.atheism/53183',
 './data/20news-bydate-train/alt.atheism/53754',
 './data/20news-bydate-train/alt.atheism/53239',
 './data/20news-bydate-train/alt.atheism/53139',
 './data/20news-bydate-train/alt.atheism/53532',
 './data/20news-bydate-train/alt.atheism/51260',
 './data/20news-byda

In [21]:
index_incr.generations

[<__main__.Generation at 0x7f2267809610>,
 <__main__.Generation at 0x7f226797bf50>,
 <__main__.Generation at 0x7f2266b04b10>]

In [22]:
DocList3 = fetch(path)

TokDocs3 = analyse(DocList3, Processors)

postingList3 = make_index(TokDocs3)

Number of words  150896


In [23]:
index_incr.build_gen(postingList3)

In [24]:
incr_searcher.searchAllOf(['sports', 'from'])

['./data/20news-bydate-train/alt.atheism/53521',
 './data/20news-bydate-train/talk.politics.guns/54235',
 './data/20news-bydate-train/alt.atheism/53136',
 './data/20news-bydate-train/talk.politics.guns/54215',
 './data/20news-bydate-train/alt.atheism/53090',
 './data/20news-bydate-train/talk.politics.guns/54690',
 './data/20news-bydate-train/alt.atheism/53373']

In [25]:
incr_searcher.searchAllOf(['from'])

['./data/20news-bydate-train/alt.atheism/53120',
 './data/20news-bydate-train/alt.atheism/51317',
 './data/20news-bydate-train/alt.atheism/51201',
 './data/20news-bydate-train/alt.atheism/51318',
 './data/20news-bydate-train/alt.atheism/53229',
 './data/20news-bydate-train/alt.atheism/53177',
 './data/20news-bydate-train/alt.atheism/53230',
 './data/20news-bydate-train/alt.atheism/53150',
 './data/20news-bydate-train/alt.atheism/53067',
 './data/20news-bydate-train/alt.atheism/53434',
 './data/20news-bydate-train/alt.atheism/51285',
 './data/20news-bydate-train/alt.atheism/53466',
 './data/20news-bydate-train/alt.atheism/53440',
 './data/20news-bydate-train/alt.atheism/53113',
 './data/20news-bydate-train/alt.atheism/53183',
 './data/20news-bydate-train/alt.atheism/53754',
 './data/20news-bydate-train/alt.atheism/53239',
 './data/20news-bydate-train/alt.atheism/53139',
 './data/20news-bydate-train/alt.atheism/53532',
 './data/20news-bydate-train/alt.atheism/51260',
 './data/20news-byda

In [26]:
incr_searcher = SearchIncrIndex(index_incr)
incr_searcher.search('sports')

['./data/20news-bydate-train/alt.atheism/53090',
 './data/20news-bydate-train/alt.atheism/53373',
 './data/20news-bydate-train/alt.atheism/53136',
 './data/20news-bydate-train/alt.atheism/53521',
 './data/20news-bydate-train/talk.politics.guns/54215',
 './data/20news-bydate-train/talk.politics.guns/54690',
 './data/20news-bydate-train/talk.politics.guns/54235']

In [27]:
incr_searcher.searchAllOf(['sports', 'from'])

['./data/20news-bydate-train/alt.atheism/53521',
 './data/20news-bydate-train/talk.politics.guns/54235',
 './data/20news-bydate-train/alt.atheism/53136',
 './data/20news-bydate-train/talk.politics.guns/54215',
 './data/20news-bydate-train/alt.atheism/53090',
 './data/20news-bydate-train/talk.politics.guns/54690',
 './data/20news-bydate-train/alt.atheism/53373']

In [28]:
incr_searcher.searchAllOf(['from', 'sido'])

[]

In [29]:
incr_searcher.searchAllOf(['from', 'weapons'])

['./data/20news-bydate-train/talk.politics.guns/54314',
 './data/20news-bydate-train/talk.politics.guns/54316',
 './data/20news-bydate-train/talk.politics.guns/54191',
 './data/20news-bydate-train/sci.space/60184',
 './data/20news-bydate-train/talk.politics.guns/54173',
 './data/20news-bydate-train/talk.politics.guns/54436',
 './data/20news-bydate-train/talk.politics.guns/54406',
 './data/20news-bydate-train/talk.politics.guns/54231',
 './data/20news-bydate-train/talk.politics.guns/54525',
 './data/20news-bydate-train/talk.politics.guns/53299',
 './data/20news-bydate-train/talk.politics.guns/54681',
 './data/20news-bydate-train/talk.politics.guns/54366',
 './data/20news-bydate-train/talk.politics.guns/53328',
 './data/20news-bydate-train/talk.politics.guns/54116',
 './data/20news-bydate-train/talk.politics.guns/54213',
 './data/20news-bydate-train/talk.politics.guns/54201',
 './data/20news-bydate-train/talk.politics.guns/54269',
 './data/20news-bydate-train/talk.politics.guns/54387',
 


# Partie 3 : Bonus 

### Bonus 1 : MetaData
### Bonus 2 : Suppression of Document

In [30]:
from os import listdir, stat
from os.path import isfile, isdir, join


class Doc:
    def __init__(self, url, text, metadata={}):
        self.text = text # Raw strnig of the whole text  
        self.url = url
        self.metadata = metadata
    pass


def fetch(path, recursive=True):
    DocList = []
    
    files = [f for f in listdir(path) if isfile(join(path, f))]
    dirs  = [d for d in listdir(path) if isdir(join(path, d))]
    
    for fname in files:
        f = open(path + "/" + fname, "r", errors="ignore")
        try:
            c = f.read()
            metadata = {}
            metadata["name"] = fname
            metadata["file"] = fname
            metadata["date"] = stat(path + "/" + fname).st_mtime
            DocList.append(Doc(path + "/" + fname, c, metadata))
        except Exception:
            pass
        
        f.close
    
    if recursive:
        for d in dirs:
            DocList += fetch(join(path, d), recursive=recursive)              
    
    return DocList


# Changer le Path ici
path = "./data/20news-bydate-train/sci.space"

DocList = fetch(path)

# Debug
print(len(DocList))
(DocList[0].url, DocList[0].text, DocList[0].metadata)

593


('./data/20news-bydate-train/sci.space/60919',
 "From: higgins@fnalf.fnal.gov (Bill Higgins-- Beam Jockey)\nSubject: Re: Science News article on Federal R&D\nOrganization: Fermi National Accelerator Laboratory\nLines: 24\nNNTP-Posting-Host: fnalf.fnal.gov\n\nIn article <C5r2DK.764@skates.gsfc.nasa.gov>, xrcjd@resolve.gsfc.nasa.gov (Charles J. Divine) writes:\n> Just a pointer to the article in the current Science News article\n> on Federal R&D funding.\n> \n> Very briefly, all R&D is being shifted to gaining current \n> competitive advantage from things like military and other work that\n> does not have as much commercial utility.\n> -- \n> Chuck Divine\n\nGulp.\n\n[Disclaimer:  This opinion is mine and does not represent the views of\nFermilab, Universities Research Association, the Department of Energy,\nor the 49th Ward Regular Science Fiction Organization.]\n \n-- \n     O~~*           /_) ' / /   /_/ '  ,   ,  ' ,_  _           \\|/\n   - ~ -~~~~~~~~~~~/_) / / /   / / / (_) (_) / 

In [31]:
class LowerProcessor:
    def __init__(self):
        pass
    def process(word):
        """
        word      - string
        return    - processed string
        """
        proc_word = word.lower()
        return proc_word
    
class AccentProcessor:
    def __init__(self):
        pass
    def process(word):
        """
        word      - string
        return    - processed string
        """
        proc_word = word.replace("'", " ")
        proc_word = proc_word.replace("`", " ")
        proc_word = proc_word.replace("\"", "")
        return proc_word
    
class PunctationProcessor:
    def __init__(self):
        pass
    def process(word):
        """
        word      - string
        return    - processed string
        """
        proc_word = word.replace(".", " ")
        proc_word = proc_word.replace(",", " ")
        proc_word = proc_word.replace(";", "")
        proc_word = proc_word.replace(":", "")
        proc_word = proc_word.replace("!", "")
        proc_word = proc_word.replace("?", "")
        proc_word = proc_word.replace("\n", " ")
        return proc_word
    
class SpecialProcessor:
    def __init__(self):
        pass
    def process(word):
        """
        word      - string
        return    - processed string
        """
        proc_word = word.replace("(", "")
        proc_word = proc_word.replace("-", " ")
        proc_word = proc_word.replace("_", " ")
        proc_word = proc_word.replace("=", " ")
        proc_word = proc_word.replace("*", " ")
        proc_word = proc_word.replace("<", "")
        proc_word = proc_word.replace(">", "")
        proc_word = proc_word.replace(")", "")
        proc_word = proc_word.replace("/", "")
        proc_word = proc_word.replace("\\", "")
        proc_word = proc_word.replace("[", "")
        proc_word = proc_word.replace("^", "")
        proc_word = proc_word.replace("]", "")
        proc_word = proc_word.replace("", "")
        proc_word = proc_word.replace(" ", "")
        proc_word = proc_word.replace("  ", "")
        return proc_word     

class TokenizedDoc:
    def __init__(self, words, url):
        self.words = words # list of strings (list of words)
        self.url = url
    pass    


def analyse(docs, processorsList):
    """
    docs           - list of Docs
    processorsList - list of Processors like TextProcessor
    return         - list of TokenizedDoc
    """
    tokdocs = []
    
    for d in docs:
        tokens = d.text.split(" ") # Split and tokenzie on ' ', return list of words
        # Process
        words = []
        for w in tokens:
            w = w.strip()
            for p in processorsList:
                 w = p.process(w)
            w_r = w.split()
            for w in w_r:
                if w != '' and w != ' ':        
                    words.append(w)
        tokdocs.append(TokenizedDoc(words, d.url))    
    
    return tokdocs
    
Processors = [PunctationProcessor, SpecialProcessor, AccentProcessor, LowerProcessor]
                                      
TokDocs = analyse(DocList, Processors)

len(TokDocs)

593

In [32]:
class Posting:
    def __init__(self, word, urls):
        """
        word - string
        urls - list of url
        """
        self.word = word
        self.urls = urls
    pass

def make_index(TokeneizedDocs):
    """
    TokeneizedDocs - list of TokenizedDoc
    """
    done_words = []
    postingList = []
    
    words = []
    for tokD in TokeneizedDocs:
        words += tokD.words
        
    print("Number of words ", len(words))     
    for w in words:
        if w in done_words:
            continue
                
        done_words.append(w)
        urls = []
        for d in TokeneizedDocs:
            if w in d.words:
                urls.append(d.url)
        postingList.append(Posting(w, urls))   
    
    return postingList


postingList = make_index(TokDocs)

Number of words  163558


In [33]:
class Generation:
    def __init__(self, gen_id, wordToDocIds, didToMetaData):
        self.gen_id = gen_id
        self.wordToDocIds = wordToDocIds
        # Bonus 1 : Meta Datas
        self.didToMetaData = didToMetaData

class IndexIncr:
    def __init__(self):
        self.ids = 0
        self.urlToDocId = {}
        self.idToDocUrl = {}
        self.docIdToGeneration = {}  # key (gen) = [doc id, ...]
        self.generations = []
        # Bonus 2 : Suppr vector
        self.suppressions = []
    
    def get_doc(self, doclist, url):
        for i in doclist:
            if i.url == url:
                return i
        return None
    
    def rm_doc(self, url):
        """
        Remove a document
        url - document url to remove
        """
        url_id = self.urlToDocId[url]
        if not (url_id in self.suppressions):
            self.suppressions.append(url_id)
        # print(self.suppressions)

    
    def build_gen(self, Postings, doclist):
        """
        Build Index from list of Posting
        """
        gen_id = len(self.generations)
        wordToDocIds = {}
        docIdToMetaData = {} # Dict of MetaDatas
        for i in Postings:
            u_list = []
            for j in i.urls:
                if not (j in self.urlToDocId):
                    self.urlToDocId[j] = self.ids
                    self.idToDocUrl[self.ids] = j
                    self.docIdToGeneration[self.ids] = gen_id
                    u_list.append(self.ids)
                    x = self.get_doc(doclist, j)
                    if x:
                        docIdToMetaData[self.ids] = x.metadata
                    else:
                        docIdToMetaData[self.ids] = None
                    self.ids += 1
                else:
                    u_list.append(self.urlToDocId[j])
            wordToDocIds[i.word] = u_list        
        self.generations.append(Generation(gen_id, wordToDocIds, docIdToMetaData))
    

In [34]:
main_index = IndexIncr()

main_index.build_gen(postingList, DocList)


DocList1 = fetch(path_politics)
DocList2 = fetch(path_sci_space)
DocList3 = fetch(path_sci_elect)
DocList4 = fetch(path_sci_med)

TokDocs1 = analyse(DocList1, Processors)
TokDocs2 = analyse(DocList2, Processors)
TokDocs3 = analyse(DocList3, Processors)
TokDocs4 = analyse(DocList4, Processors)


postingList1 = make_index(TokDocs1)
postingList2 = make_index(TokDocs2)
postingList3 = make_index(TokDocs3)
postingList4 = make_index(TokDocs4)

main_index.build_gen(postingList1, DocList1)
main_index.build_gen(postingList2, DocList2)
main_index.build_gen(postingList3, DocList3)
main_index.build_gen(postingList4, DocList4)


Number of words  182486
Number of words  163558
Number of words  113778
Number of words  162406


In [35]:
class FoundDocument:
    def __init__(self, url, metaDatas):
        self.url = url
        self.metaDatas = metaDatas

class SearchIncrIndex:
    def __init__(self, index_incr=None):
        if None == index_incr:
            self.load(path_bin_index)
        else:
            self.index = index_incr
    
    def load(self, path):
        with open(path + "/" + "index_incr.bin", "rb") as f:
            self.index = pickle.load(f)
    
    def id_to_url(self, id_l):
        res = []
        for i in id_l:
            res.append(self.index.idToDocUrl[i])
        return res
    
    # Bonus 1 Search Metadatas, for 1 word search
    def search_meta(self, word):
        res = []
        res_ids = self.search_ids(word)
        for i in res_ids:
            gen_id = self.index.docIdToGeneration[i] # Give gen of this doc id
            gen = self.index.generations[gen_id]
            url = self.index.idToDocUrl[i]
            metas = gen.didToMetaData[i]
            res.append(FoundDocument(url, metas))
        return res
    
    def search(self, word):
        res_id = self.search_ids(word)
        res_urls = self.id_to_url(res_id)
        return res_urls
    
    def search_ids(self, word):
        """
        Search word, return urls for this word
        word      - string
        """
        gs = self.index.generations
        res = []
        gl = len(self.index.generations)
        current_urls = []
        for g in reversed(gs):  # Get last first
            if not (word in g.wordToDocIds):
                continue
            preserved_urls = []
            urls_ids = list.copy(g.wordToDocIds[word])
            urls = []
            for u in urls_ids:
                urls.append(self.index.idToDocUrl[u])
            for u in urls:
                if not (u in current_urls):
                    preserved_urls.append(u)
                    current_urls.append(u)
            urls_ids = []
            for u in preserved_urls:
                # Bonus 2 :  Handle supp
                url_id = self.index.urlToDocId[u]
                if url_id in self.index.suppressions:
                    print("sup found", self.index.suppressions)
                    continue
                urls_ids.append(url_id)
            res = res + urls_ids
        return res


    def searchAllOf(self, words):
        """
        AND oper
        words - list of words
        """
        and_urls = []
        first = True
        for w in words:
            if first:
                and_urls = self.search_ids(w)
                first = False
            else:
                and_urls = list(set(and_urls) & set(self.search_ids(w)))
        res_urls = self.id_to_url(and_urls)
        return res_urls

    def searchOneOf(self, words):
        """
        OR oper
        """
        or_urls = []
        for w in words:
            or_urls = list(set(or_urls) | set(self.search_ids(w)))
        res_urls = self.id_to_url(or_urls)
        return res_urls

In [36]:
incr_searcher = SearchIncrIndex(main_index)

In [37]:
incr_searcher.search('space')

['./data/20news-bydate-train/sci.med//58957',
 './data/20news-bydate-train/sci.med//58822',
 './data/20news-bydate-train/sci.med//58965',
 './data/20news-bydate-train/sci.med//59126',
 './data/20news-bydate-train/sci.med//58568',
 './data/20news-bydate-train/sci.med//59087',
 './data/20news-bydate-train/sci.med//58876',
 './data/20news-bydate-train/sci.med//59168',
 './data/20news-bydate-train/sci.med//59072',
 './data/20news-bydate-train/sci.med//58569',
 './data/20news-bydate-train/sci.med//58974',
 './data/20news-bydate-train/sci.med//59014',
 './data/20news-bydate-train/sci.electronics//53730',
 './data/20news-bydate-train/sci.electronics//53777',
 './data/20news-bydate-train/sci.electronics//53844',
 './data/20news-bydate-train/sci.electronics//53512',
 './data/20news-bydate-train/sci.electronics//53563',
 './data/20news-bydate-train/sci.electronics//53652',
 './data/20news-bydate-train/sci.electronics//53877',
 './data/20news-bydate-train/sci.electronics//53873',
 './data/20news-

In [38]:
res = incr_searcher.search_meta('space')
for r in res:
    print(r.url, r.metaDatas)

./data/20news-bydate-train/sci.med//58957 {'name': '58957', 'file': '58957', 'date': 1047990274.0}
./data/20news-bydate-train/sci.med//58822 {'name': '58822', 'file': '58822', 'date': 1047990274.0}
./data/20news-bydate-train/sci.med//58965 {'name': '58965', 'file': '58965', 'date': 1047990274.0}
./data/20news-bydate-train/sci.med//59126 {'name': '59126', 'file': '59126', 'date': 1047990275.0}
./data/20news-bydate-train/sci.med//58568 {'name': '58568', 'file': '58568', 'date': 1047990273.0}
./data/20news-bydate-train/sci.med//59087 {'name': '59087', 'file': '59087', 'date': 1047990275.0}
./data/20news-bydate-train/sci.med//58876 {'name': '58876', 'file': '58876', 'date': 1047990274.0}
./data/20news-bydate-train/sci.med//59168 {'name': '59168', 'file': '59168', 'date': 1047990275.0}
./data/20news-bydate-train/sci.med//59072 {'name': '59072', 'file': '59072', 'date': 1047990274.0}
./data/20news-bydate-train/sci.med//58569 {'name': '58569', 'file': '58569', 'date': 1047990273.0}
./data/20n

In [39]:
incr_searcher.search('weapon')

['./data/20news-bydate-train/sci.space/60946',
 './data/20news-bydate-train/sci.space/61017',
 './data/20news-bydate-train/sci.space/61085',
 './data/20news-bydate-train/talk.politics.guns/54943',
 './data/20news-bydate-train/talk.politics.guns/53302',
 './data/20news-bydate-train/talk.politics.guns/53319',
 './data/20news-bydate-train/talk.politics.guns/53331',
 './data/20news-bydate-train/talk.politics.guns/54191',
 './data/20news-bydate-train/talk.politics.guns/54436',
 './data/20news-bydate-train/talk.politics.guns/53322',
 './data/20news-bydate-train/talk.politics.guns/54681',
 './data/20news-bydate-train/talk.politics.guns/54574',
 './data/20news-bydate-train/talk.politics.guns/54204',
 './data/20news-bydate-train/talk.politics.guns/54526',
 './data/20news-bydate-train/talk.politics.guns/54144',
 './data/20news-bydate-train/talk.politics.guns/54215',
 './data/20news-bydate-train/talk.politics.guns/54435',
 './data/20news-bydate-train/talk.politics.guns/54177',
 './data/20news-byd

In [40]:
incr_searcher.search_meta('weapon')

[<__main__.FoundDocument at 0x7f22638a2c10>,
 <__main__.FoundDocument at 0x7f22638a2b90>,
 <__main__.FoundDocument at 0x7f22638a2a50>,
 <__main__.FoundDocument at 0x7f22638a2850>,
 <__main__.FoundDocument at 0x7f22638a2f10>,
 <__main__.FoundDocument at 0x7f22638a2a10>,
 <__main__.FoundDocument at 0x7f22638a2810>,
 <__main__.FoundDocument at 0x7f22638a2ad0>,
 <__main__.FoundDocument at 0x7f22638a2910>,
 <__main__.FoundDocument at 0x7f22638a2990>,
 <__main__.FoundDocument at 0x7f22638a2b10>,
 <__main__.FoundDocument at 0x7f22638a28d0>,
 <__main__.FoundDocument at 0x7f22638a2890>,
 <__main__.FoundDocument at 0x7f22638a2950>,
 <__main__.FoundDocument at 0x7f22638a27d0>,
 <__main__.FoundDocument at 0x7f22638a2d90>,
 <__main__.FoundDocument at 0x7f22638a2e50>,
 <__main__.FoundDocument at 0x7f22638a29d0>,
 <__main__.FoundDocument at 0x7f22638a2c50>,
 <__main__.FoundDocument at 0x7f22638a2b50>,
 <__main__.FoundDocument at 0x7f22638a2dd0>,
 <__main__.FoundDocument at 0x7f22638afad0>,
 <__main__

In [41]:
main_index.rm_doc('./data/20news-bydate-train/sci.space/60946')
main_index.rm_doc('./data/20news-bydate-train/sci.space/61085')

In [42]:
incr_searcher = SearchIncrIndex(main_index)

incr_searcher.search('weapon')

sup found [32, 157]
sup found [32, 157]


['./data/20news-bydate-train/sci.space/61017',
 './data/20news-bydate-train/talk.politics.guns/54943',
 './data/20news-bydate-train/talk.politics.guns/53302',
 './data/20news-bydate-train/talk.politics.guns/53319',
 './data/20news-bydate-train/talk.politics.guns/53331',
 './data/20news-bydate-train/talk.politics.guns/54191',
 './data/20news-bydate-train/talk.politics.guns/54436',
 './data/20news-bydate-train/talk.politics.guns/53322',
 './data/20news-bydate-train/talk.politics.guns/54681',
 './data/20news-bydate-train/talk.politics.guns/54574',
 './data/20news-bydate-train/talk.politics.guns/54204',
 './data/20news-bydate-train/talk.politics.guns/54526',
 './data/20news-bydate-train/talk.politics.guns/54144',
 './data/20news-bydate-train/talk.politics.guns/54215',
 './data/20news-bydate-train/talk.politics.guns/54435',
 './data/20news-bydate-train/talk.politics.guns/54177',
 './data/20news-bydate-train/talk.politics.guns/53294',
 './data/20news-bydate-train/talk.politics.guns/54286',
 

In [44]:
save_incr(main_index, path_bin_index)