# RadixTree
Implementation eines RadixTrees.

In [1]:
class RadixTree:
    """node of a radix tree (prefix tree with compressed edges).

    Contains a dict, which links to multiple RadixTree's. The key to another tree is a non-empty str
    (the edge label) and if the path from the root to a node spells a stored word, the
    isWord-variable of that node is True
    """

    def __init__(self, isWord=False, transitions=None):
        """creates a node

        Args:
            isWord (bool, optional): True if the path leading to this node is a stored word. Defaults to False.
            transitions (dict, optional): links edge labels (str) to child RadixTree's. Defaults to a new, empty dict.
        """
        # None is used as default so that every node gets its own dict
        if transitions is None:
            transitions = dict()
        self.isWord = isWord
        self.transitions = transitions

    def insertWord(self, word):
        """inserts a word into the tree

        Args:
            word (str): is the str, that gets inserted
        """
        if word == "":
            # the empty word ends at this very node. Storing it under an empty edge label would
            # hide it from getSimilarWordsOfSameLength, which checks isWord for the empty word
            self.isWord = True
            return

        # goes through 'tobi' with 'tobi', 'tob', 'to', 't' (longest prefix first)
        for prefixLength in range(len(word), 0, -1):
            possibleTransition = word[:prefixLength]
            remainder = word[prefixLength:]

            # case 1: an edge with exactly this label exists --> follow it
            child = self.transitions.get(possibleTransition)
            if child is not None:
                if remainder == "":
                    child.isWord = True
                else:
                    child.insertWord(remainder)
                return

            # case 2: the prefix is the beginning of a longer edge label --> split that edge
            # (list() takes a snapshot of the keys, because the dict is modified below)
            for key in list(self.transitions):
                if key.startswith(possibleTransition):
                    oldChild = self.transitions.pop(key)
                    splitNode = RadixTree(remainder == "", {key[prefixLength:]: oldChild})
                    self.transitions[possibleTransition] = splitNode
                    if remainder != "":
                        splitNode.insertWord(remainder)
                    return

        # case 3: no edge shares a prefix with the word --> new edge for the whole word
        self.transitions[word] = RadixTree(True, dict())

    def __strRecursive__(self, timesOfIndentation, lengthOfTransitionString):
        """generates the str for this node and everything below it

        Args:
            timesOfIndentation (int): number of '_' in front of the line of this node
            lengthOfTransitionString (int): length of the edge label leading to this node

        Returns:
            str: '.' if this node is a word, followed by one line for every node below (edge labels sorted)
        """
        result = "." if self.isWord else ""
        # children are indented by the full length of the path leading to this node
        childIndentation = timesOfIndentation + lengthOfTransitionString
        for key in sorted(self.transitions):
            recursiveResult = self.transitions[key].__strRecursive__(childIndentation, len(key))
            result += "\n" + childIndentation * "_" + key + recursiveResult
        return result

    def __str__(self):
        """generates a readable str, containing all class variables (the tree).

        Returns:
            str: the generated str
        """
        return self.__strRecursive__(0, 0)

    def getSimilarWordsOfSameLength(self, maximumDifferentLetters, word):
        """compares the given word with entries of the same length. Returns all of them with less or equal different letters than with maximumDifferentLetters described

        Args:
            maximumDifferentLetters (int): limits the amount of accepted different letters when comparing 2 words
            word (str): the given word

        Returns:
            list: returns a list of similar words with the same length. Does contain the word itself, if it is stored
        """
        if word == "":
            # the whole word is consumed: it is a match only if a stored word ends here
            return [word] if self.isWord else []

        resultList = list()
        for key, child in self.transitions.items():
            if len(key) > len(word):
                # following this edge would produce words longer than the given one
                continue

            differences = 0
            for wordLetter, keyLetter in zip(word, key):
                if wordLetter != keyLetter:
                    differences += 1
                    if differences > maximumDifferentLetters:
                        break
            if differences > maximumDifferentLetters:
                continue

            # the rest of the word may only use the substitutions that are still left
            resultTmp = child.getSimilarWordsOfSameLength(maximumDifferentLetters - differences,
                                                          word[len(key):])
            resultList.extend([key + tmp for tmp in resultTmp])
        return resultList


# RadixTreesByWordLength
Ein Container, der Wörter je nach Länge in unterschiedliche RadixTrees ablegt und ansonsten wie ein Tree agiert.

In [2]:
class RadixTreesByWordLength:
    """container that keeps one RadixTree per word length and offers the same methods as a single tree
    """

    def __init__(self):
        # links a word length (int) to the RadixTree that stores all words of that length
        self.radixTrees = dict()

    def insertWord(self, word):
        """inserts a word into the tree that belongs to its length (the tree is created on first use)

        Args:
            word (str): is the str, that gets inserted
        """
        bucket = len(word)
        responsibleTree = self.radixTrees.get(bucket)
        if responsibleTree is None:
            responsibleTree = RadixTree()
            self.radixTrees[bucket] = responsibleTree
        responsibleTree.insertWord(word)

    def __str__(self):
        """generates a readable str with one section per word length, ordered by ascending length

        Returns:
            str: the generated str
        """
        sections = []
        for size in sorted(int(storedLength) for storedLength in self.radixTrees.keys()):
            header = "RadixTree with words of length " + str(size) + ":"
            sections.append(header + str(self.radixTrees[size]) + "\n")
        return "".join(sections)

    def getSimilarWordsOfSameLength(self, maximumDifferentLetters, word):
        """looks up the tree for the length of the given word and returns its similar entries

        Args:
            maximumDifferentLetters (int): limits the amount of accepted different letters when comparing 2 words
            word (str): the given word

        Returns:
            list: similar words with the same length (empty, if no word of that length was inserted)
        """
        responsibleTree = self.radixTrees.get(len(word))
        if responsibleTree is None:
            return []
        return responsibleTree.getSimilarWordsOfSameLength(maximumDifferentLetters, word)


# SubstitutionMatrix
Implementation einer Substitutionsmatrix.

In [3]:
class SubstitutionMatrix:
    """Stores letter-variations as a matrix. The matrix is a dict linking the first letter to a second dict, linking the second letter to its occurrence (int).

    Every transition is counted in both directions, so the matrix stays symmetric.
    """

    def __init__(self):
        """creates a matrix for the lower case letters 'a'-'z' with all counters set to 0
        """
        self.letters = [
            'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u',
            'v', 'w', 'x', 'y', 'z']
        # every row gets its own dict, so that the rows do not share their counters
        self.matrix = {letter: dict.fromkeys(self.letters, 0) for letter in self.letters}
        self.totalLetterTransitions = 0

    def addLetterTransition(self, fr, to):
        """adds the transition from one letter to another in the matrix

        Args:
            fr (str): a single lower character of the alphabet
            to (str): a second single lower character of the alphabet

        Returns:
            bool: is False if one of the characters was not accepted, True if both were accepted
        """
        if fr not in self.letters or to not in self.letters:
            return False

        # counted in both directions; for fr == to both lines hit the same entry (it grows by 2)
        self.matrix[fr][to] += 1
        self.matrix[to][fr] += 1
        self.totalLetterTransitions += 1
        return True

    def getMatrixEntryInPercent(self, fr, to):
        """returns a specific matrix-entry in %

        Args:
            fr (str): a single lower character of the alphabet
            to (str): a second single lower character of the alphabet

        Returns:
            float: returns -1.0 if one of the given str's was not accepted, the matrix-entry in percent otherwise (0.0 as long as no transition was added)
        """
        if fr not in self.letters or to not in self.letters:
            return -1.0

        if self.totalLetterTransitions == 0:
            return 0.0
        return (self.matrix[fr][to] / self.totalLetterTransitions) * 100

    def _formatEntry(self, percent):
        """renders a percentage as a str of exactly 4 characters

        Args:
            percent (float): the value to render

        Returns:
            str: the value, padded with leading zeros or cut off after 4 characters
        """
        strEntry = str(percent)
        if "e" in strEntry:
            # str() switches to scientific notation for tiny values (e.g. '1e-05'). Cutting that
            # off after 4 characters would print '1e-0', so fixed-point notation is used instead
            strEntry = format(percent, "f")
        if len(strEntry) < 4:
            strEntry = (4 - len(strEntry)) * "0" + strEntry
        return strEntry[:4]

    def __str__(self):
        """generates a readable str for the matrix

        Returns:
            str: the generated str (a title, a header line with all letters and one tab-separated line per letter)
        """
        lines = ["Substitutionmatrix in %:", "".join("\t" + to for to in self.letters)]
        for fr in self.letters:
            entries = [self._formatEntry(self.getMatrixEntryInPercent(fr, to)) for to in self.letters]
            lines.append(fr + "".join("\t" + entry for entry in entries))
        return "\n".join(lines) + "\n"

    def addLetterVariationsToMatrix(self, name, similarNames):
        """retrieves differences of names with the original and inserts these letter-variations into the substitution-matrix

        Args:
            name (str): is the original name
            similarNames (list): list of similar names as str's. If a name has a different length than the original, only the positions that exist in both are compared
        """
        for similarName in similarNames:
            # zip stops at the shorter str, so a shorter similar name can not cause an IndexError
            for originalLetter, similarLetter in zip(name, similarName):
                if originalLetter != similarLetter:
                    self.addLetterTransition(originalLetter, similarLetter)

    # here is a simple way of storing and retrieving the matrix in an external file
    # (disabled: the str below is never executed)
    """
    def save(self):
        f = open("backupFiles/SubstitutionMatrix.txt", "w")
        tmp = str(self.totalLetterTransitions) + ";"
        for key1 in self.matrix.keys():
            tmp += str(key1) + ":"
            for key2 in self.matrix[key1]:
                tmp += str(key2) + "-" + str(self.matrix[key1][key2]) + ","
            tmp = tmp[:-1] + "\n"
        f.write(tmp[:-1])
        f.close()
        
    def loadBackup(self):
        f = open("backupFiles/SubstitutionMatrix.txt", "r")
        backup = (f.read()).split(';')
        self.totalLetterTransitions = int(backup[0])
        backup = backup[1].split("\n")
        for entryStr in backup:
            key1 = entryStr[0]
            for entry in entryStr[2:].split(','):
                key2 = entry[0]
                self.matrix[key1][key2] = int(entry[2:])
        f.close()
    """


# Demo
Extraktion der Variationen in Vornamen.

In [5]:
#import re
# install and import Entrez and Medline first
import subprocess
import sys

def install(package):
    """installs a package with pip, using the interpreter that runs this script

    Args:
        package (str): name of the package, as it is passed to 'pip install'

    Raises:
        subprocess.CalledProcessError: if pip exits with a non-zero return code
    """
    pipCommand = [sys.executable, "-m", "pip", "install", package]
    subprocess.check_call(pipCommand)

try:
    from Bio import Entrez, Medline
except ImportError:
    # Only a failed import triggers the installation; a bare 'except' would also swallow
    # unrelated problems such as a KeyboardInterrupt.
    # One of these 2 lines should work
    # !pip install Bio
    install('Bio')
    # retry the import now that the package is installed
    from Bio import Entrez, Medline


def getPapers(myQuery, maxPapers, myEmail="freytag64@gmail.com"):
    """retrieves some Papers from Pubmed

    Args:
        myQuery (str): is the given Query 
        maxPapers (int): is a limit of the number of papers, which will be retrieved
        myEmail (str, optional): an email, which is set as Entrez.email. Defaults to "freytag64@gmail.com".

    Returns:
        list: papers as list of dictionarys containing abstract, authors, ... (empty, if the search found nothing)
    """
    # Get articles from PubMed
    Entrez.email = myEmail

    # step 1: search for the ids of matching papers. The handle is closed even if parsing fails
    searchHandle = Entrez.esearch(db="pubmed", term=myQuery, retmax=maxPapers)
    try:
        record = Entrez.read(searchHandle)
    finally:
        searchHandle.close()
    idlist = record["IdList"]
    print("\nThere are %d records for %s." % (len(idlist), myQuery.strip()))

    if not idlist:
        # nothing to fetch, so the second request is skipped
        return []

    # step 2: fetch the papers. Medline.parse reads lazily, so the records are collected
    # in a list before the handle is closed
    fetchHandle = Entrez.efetch(db="pubmed", id=idlist, rettype="medline", retmode="text")
    try:
        return list(Medline.parse(fetchHandle))
    finally:
        fetchHandle.close()


def retrieveAllFirstNames(records):
    """takes list of papers (each is a dict) and extracts the first names of the authors as a combined list

    The full author names (key 'FAU') are expected to be formatted like 'Lastname, Firstnames'.
    Papers without 'FAU' and names without ',' are skipped.

    Args:
        records (list): list of papers

    Returns:
        list: list of first names as str's (single letters like initials are left out)
    """
    firstNameList = list()
    for record in records:
        if 'FAU' not in record:
            continue
        for fullName in record['FAU']:
            # since names are formatted like 'Lastname, Firstnames', split by ',' to get Firstnames
            nameParts = fullName.split(',')
            if len(nameParts) < 2:
                # no ',' in the name, so there is no first-name part to read
                continue
            firstNameList.extend(part for part in nameParts[1].split() if len(part) > 1)

            # known limitations:
            # - works for name-pairs like "le Roux, Marlene F", since only the part behind ',' is used
            # - "Something, A Mohammed" only yields 'Mohammed', since A is just a single letter
            # - names like Al-Abehd with '-' are just added as 'Al-Abehd' too
            # - sometimes accepts stuff like 'jr'
            # a better extraction can be generated by using Multiple Sequence Alignment from the lectures.
            # An earlier alternative used re.findall(r' ([a-zA-Z_-][a-zA-Z_-]+)', fullName) instead.

    return firstNameList



# TODO: test for bugs with test-cases (highly encouraged!)
if __name__ == "__main__":
    # Demo: fill the trees with known first names, then compare the first names of PubMed
    # authors with them and count the letter-substitutions in a matrix.
    import time

    maxPapers = 60  # limit the number of papers retrieved

    # Alternative: Save everything in a single tree --> then use "tree=RadixTree()"
    tree = RadixTreesByWordLength()  # saves names in radix-trees. One for each word-length

    matrix = SubstitutionMatrix()  # saves letter-transitions in a matrix

    myQuery = "(\"2021/01/20\"[Date - Publication] : \"2021/01/20\"[Date - Publication])"
    records = getPapers(myQuery, maxPapers)

    firstnames = retrieveAllFirstNames(records)

    # read the known names (separated by whitespace). 'with' closes the file on its own
    start_time = time.time()
    with open("2022FirstNames", "r") as myfile:
        names = set(myfile.read().split())
    n = len(names)
    print("reading it in took --- %s seconds ---" % (time.time() - start_time))
    print("there exist " + str(n) + " different names")

    # insert the known names into the tree and report the progress once per percent
    lastPercentage = 0
    for i, knownName in enumerate(names):
        percentage = int(i * 100 / n)
        if percentage > lastPercentage:
            lastPercentage = percentage
            print(str(i) + " names read in --> in percent: " + str(percentage))
        tree.insertWord(knownName)

    # NOTE(review): the names of the file are only used as dictionary. The former
    # 'firstnames.extend(list(names))' had no effect, because the set was emptied with pop()
    # before. Confirm whether the names of the file should be compared with each other too.

    # very simple function to get similar names (names of same length and only limited letter-substitutions).
    # TODO: method of getting similar names sucks. A better way would be, for example, a Levensthein-like method that weigths substitutions less if they are likely in our substitution-matrix
    maximumOfSubsitutions = 1
    for name in firstnames:
        name = str(name).lower()

        tree.insertWord(name)

        # the name itself is always part of the result, but is no variation of itself
        similarNames = [similarName
                        for similarName in tree.getSimilarWordsOfSameLength(maximumOfSubsitutions, name)
                        if similarName != name]
        #if similarNames != []:
        #    print(name + " --> " + similarNames.__str__())

        # this function is a bit redundant, since the getSimilarWordsOfSameLength-method could do the job too, but easier to read
        matrix.addLetterVariationsToMatrix(name, similarNames)

    #print(tree)
    print(matrix)

# here is a possible way for looping over a lot of papers
#"""
#maxPapers = 100 #limit the number of papers retrieved each loop
#year = "2021"
#daysOfMonth = {"01":31, "02":28, "03":31, "04":30, "05":31, "06":30, "07":31, "08":31, "09":30, "10":31, "11":30, "12":31}
#for month in daysOfMonth.keys():
#    for day in range(1, daysOfMonth[month] + 1):
#        #get Papers
#        myQuery = "(\"" + year + "/" + month + "/" + str(day) + "\"[Date - Publication] : \"" + year + "/" + month + "/" + str(day) + "\"[Date - Publication])"
#        records = getPapers(myQuery, maxPapers)
#
#        #...
#        
#        pass
#    pass
#"""


There are 60 records for ("2021/01/20"[Date - Publication] : "2021/01/20"[Date - Publication]).
reading it in took --- 3.472443103790283 seconds ---
there exist 272358 different names
2724 names read in --> in percent: 1
5448 names read in --> in percent: 2
8171 names read in --> in percent: 3
10895 names read in --> in percent: 4
13618 names read in --> in percent: 5
16342 names read in --> in percent: 6
19066 names read in --> in percent: 7
21789 names read in --> in percent: 8
24513 names read in --> in percent: 9
27236 names read in --> in percent: 10
29960 names read in --> in percent: 11
32683 names read in --> in percent: 12
35407 names read in --> in percent: 13
38131 names read in --> in percent: 14
40854 names read in --> in percent: 15
43578 names read in --> in percent: 16
46301 names read in --> in percent: 17
49025 names read in --> in percent: 18
51749 names read in --> in percent: 19
54472 names read in --> in percent: 20
57196 names read in --> in percent: 21
59919 na

'\nmaxPapers = 100 #limit the number of papers retrieved each loop\nyear = "2021"\ndaysOfMonth = {"01":31, "02":28, "03":31, "04":30, "05":31, "06":30, "07":31, "08":31, "09":30, "10":31, "11":30, "12":31}\nfor month in daysOfMonth.keys():\n    for day in range(1, daysOfMonth[month] + 1):\n        #get Papers\n        myQuery = "("" + year + "/" + month + "/" + str(day) + ""[Date - Publication] : "" + year + "/" + month + "/" + str(day) + ""[Date - Publication])"\n        records = getPapers(myQuery, maxPapers)\n\n        #...\n        \n        pass\n    pass\n'