In [1]:
import html
import re
from IPython.display import display, HTML

# Implementierung eines einfachen invertierten Index

Basierend auf https://medium.com/@fro_g/writing-a-simple-inverted-index-in-python-3c8bcb52169a 

## Basis-Datenstrukturen

Zur vereinfachten Inspektion wird die Häufigkeit eines Terms pro Dokument in der Klasse *Appearance* mit den beiden
Attributen *docId* und *frequency* gespeichert und als Dictionary ausgegeben. Hierdurch ist die Datenstruktur
zwar einfach interpretierbar, aber stark redundant. Kompakter wäre die Repräsentation durch ein Python-Tupel.

Die Database ist eine einfache In-Memory-Datenbank. Persistente Speicherung dieser DB ist bisher nicht implementiert.

In [2]:
class Appearance:
    """
    One posting of the inverted index: which document a term occurs in and
    how many times it occurs there.
    """
    def __init__(self, docId, frequency):
        # Identifier of the document the term was found in.
        self.docId = docId
        # Number of occurrences of the term inside that document.
        self.frequency = frequency

    def __repr__(self):
        """
        Render the posting as the string form of its attribute dictionary,
        e.g. {'docId': '1', 'frequency': 2}.
        """
        attributes = vars(self)
        return str(attributes)
    
class Database:
    """
    Minimal in-memory document store keyed by document id.
    """
    def __init__(self):
        # Maps document['id'] -> the document dictionary itself.
        self.db = {}

    def __repr__(self):
        """
        Render the store as the string form of its attribute dictionary.
        """
        return str(vars(self))

    def get(self, id):
        """
        Return the document stored under the given id, or None if absent.
        """
        return self.db.get(id)

    def add(self, document):
        """
        Store the document under its 'id' key, replacing any previous entry.
        Returns None.
        """
        key = document['id']
        self.db[key] = document

    def remove(self, document):
        """
        Delete the entry stored under the document's 'id' key.
        Returns the removed document, or None if nothing was stored.
        """
        key = document['id']
        return self.db.pop(key, None)

## Invertierter Index

*index_document* ist die zentrale Funktion zur Integration eines neuen Dokuments in den invertierten Index. 
Diese Funktion nutzt eine simple *whitespace tokenization* und entfernt radikal alle Satzzeichen (auch Satzzeichen, die in Termen verwendet werden). Für eine reale Implementierung sind sowohl die Tokenization als auch die Löschung der Satzzeichen separat durchzuführen und *index_document* so umzubauen, dass es nur noch eine Tokenliste nutzt.

Darüber hinaus wird keine weitere Normalisierung der Terme durchgeführt. Dies kann als kleine Übungsaufgabe integriert werden.

In [3]:
class InvertedIndex:
    """
    Inverted index mapping each term to a list of Appearance postings, one
    per indexed document that contains the term.

    Documents and queries are tokenized the same way: every character that
    is neither a word character nor whitespace is removed, then the text is
    split on runs of whitespace. No case folding is applied, so 'The' and
    'the' are distinct terms.
    """
    def __init__(self, db):
        # term -> list of Appearance objects, in indexing order.
        self.index = dict()
        # Document store; only its add() method is used by this class.
        self.db = db

    def __repr__(self):
        """
        String representation of the InvertedIndex object
        """
        return str(self.index)

    @staticmethod
    def _tokenize(text):
        """
        Strip punctuation from text and split it into a list of terms.
        """
        clean_text = re.sub(r'[^\w\s]', '', text)
        # split() without an argument splits on any run of whitespace
        # (spaces, tabs, newlines) and never yields empty strings, so
        # empty text or doubled spaces do not create a '' term.
        return clean_text.split()

    def index_document(self, document):
        """
        Process a given document, save it to the DB and update the index.

        document is a mapping with the keys 'id' and 'text'. One posting
        per distinct term is appended to the index. Returns the document.

        Note: indexing a document with an already indexed id appends a
        second posting per term; earlier postings are not replaced.
        """
        doc_id = document['id']

        # Count how often each term occurs in this document.
        frequencies = dict()
        for term in self._tokenize(document['text']):
            frequencies[term] = frequencies.get(term, 0) + 1

        # Append in place instead of rebuilding every posting list.
        for term, frequency in frequencies.items():
            self.index.setdefault(term, []).append(Appearance(doc_id, frequency))

        # Add the document into the database
        self.db.add(document)

        return document

    def lookup_query(self, query):
        """
        Returns the dictionary of terms with their correspondent Appearances.

        The query is tokenized exactly like indexed text, so punctuation and
        extra whitespace in the query do not prevent a match. This is a very
        naive search: each term is looked up independently, and terms that
        are not in the index are omitted from the result.
        """
        return {
            term: self.index[term]
            for term in self._tokenize(query)
            if term in self.index
        }

## Hilfsfunktion zur Hervorhebung von gefundenen Termen

In [4]:
def highlight_term(id, term, text):
    """
    Build an HTML snippet for one search hit with the matched term in bold.

    id, term and text are HTML-escaped before being inserted, so markup
    characters in a document or in the user's query are shown literally
    instead of being interpreted. Only whole-word occurrences of term are
    wrapped in <b>...</b>, matching the token-based index. An empty term
    highlights nothing.

    Returns a string of the form '--- document <id>: <text with markup>'.
    """
    escaped_id = html.escape(str(id))

    if not term:
        # An empty needle would otherwise match between every character.
        return "--- document {id}: {replaced}".format(id=escaped_id, replaced=html.escape(text))

    # Lookarounds instead of \b so terms that start or end with a
    # non-word character are still matched as a unit.
    pattern = r'(?<!\w)(' + re.escape(term) + r')(?!\w)'

    # With one capture group, re.split alternates: plain text at even
    # positions, matched term at odd positions.
    pieces = re.split(pattern, text)
    rendered = [
        '<b>{term}</b>'.format(term=html.escape(piece)) if position % 2 else html.escape(piece)
        for position, piece in enumerate(pieces)
    ]
    return "--- document {id}: {replaced}".format(id=escaped_id, replaced=''.join(rendered))

## Testdaten

In [5]:
# Fresh in-memory store and an index that writes documents into it.
db = Database()
index = InvertedIndex(db)

# Three small sample documents; ids are strings.
document1 = dict(id='1', text='The big sharks of Belgium drink beer.')
document2 = dict(id='2', text='Belgium has great beer. They drink beer all the time.')
document3 = dict(id='3', text='Brussels is the capital of Belgium.')

# index_document returns the document it was given, so the cell's last
# expression evaluates to document3.
index.index_document(document1)
index.index_document(document2)
index.index_document(document3)

{'id': '3', 'text': 'Brussels is the capital of Belgium.'}

In [6]:
# Database defines no __str__, so print() falls back to its __repr__,
# which dumps the instance __dict__ (the single 'db' attribute).
print(db)

{'db': {'1': {'text': 'The big sharks of Belgium drink beer.', 'id': '1'}, '3': {'text': 'Brussels is the capital of Belgium.', 'id': '3'}, '2': {'text': 'Belgium has great beer. They drink beer all the time.', 'id': '2'}}}


In [7]:
# InvertedIndex.__repr__ returns str(self.index): every term mapped to its
# list of Appearance postings.
print(index)

{'all': [{'docId': '2', 'frequency': 1}], 'the': [{'docId': '2', 'frequency': 1}, {'docId': '3', 'frequency': 1}], 'time': [{'docId': '2', 'frequency': 1}], 'of': [{'docId': '1', 'frequency': 1}, {'docId': '3', 'frequency': 1}], 'They': [{'docId': '2', 'frequency': 1}], 'drink': [{'docId': '1', 'frequency': 1}, {'docId': '2', 'frequency': 1}], 'The': [{'docId': '1', 'frequency': 1}], 'Belgium': [{'docId': '1', 'frequency': 1}, {'docId': '2', 'frequency': 1}, {'docId': '3', 'frequency': 1}], 'is': [{'docId': '3', 'frequency': 1}], 'capital': [{'docId': '3', 'frequency': 1}], 'sharks': [{'docId': '1', 'frequency': 1}], 'beer': [{'docId': '1', 'frequency': 1}, {'docId': '2', 'frequency': 2}], 'Brussels': [{'docId': '3', 'frequency': 1}], 'great': [{'docId': '2', 'frequency': 1}], 'big': [{'docId': '1', 'frequency': 1}], 'has': [{'docId': '2', 'frequency': 1}]}


In [9]:
# Ask for a space-separated query and fetch the postings of each matching term.
search_term = input("Enter term(s) to search: ")
result = index.lookup_query(search_term)

for term, appearances in result.items():
    for appearance in appearances:
        # Each posting carries the docId used to fetch the full text from the DB.
        document = db.get(appearance.docId)
        snippet = highlight_term(appearance.docId, term, document['text'])
        display(HTML(snippet))
    # Separator printed after the hits of each term.
    print("-----------------------------")

Enter term(s) to search: Belgium beer


-----------------------------


-----------------------------
