<a href="https://colab.research.google.com/github/PeterNaggschga/Letter-Variations-in-First-Names-IS/blob/main/LetterVariationsFirstNames.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# RadixTree
Implementation eines RadixTrees.

In [None]:
class RadixTree:
    """contains a dict, which links to multiple RadixTree's. The key to another tree is a str and
    if a transition ends in a word, the isWord-variable is True
    """

    def __init__(self, isWord=False, transitions=None):
        if transitions is None:
            transitions = dict()
        self.isWord = isWord
        self.transitions = transitions

    def insertWord(self, word):
        """inserts a word into the tree

        Args:
            word (str): is the str, that gets inserted
        """
        for i in range(0, len(word)):
            # goes through 'tobi' with 'tobi', 'tob', 'to', 't'
            possibleTransition = word[:len(word) - i]
            if self.transitions.get(possibleTransition) is not None:
                child = self.transitions.get(possibleTransition)
                if possibleTransition == word:
                    child.isWord = True
                else:
                    child.insertWord(word[len(possibleTransition):])
                return

            for key in self.transitions.keys():
                if possibleTransition == key[:len(possibleTransition)]:
                    child = self.transitions.pop(key)
                    newDict = dict()
                    newDict[key[len(possibleTransition):]] = child
                    self.transitions[possibleTransition] = RadixTree(possibleTransition == word, newDict)
                    if possibleTransition != word:
                        self.transitions[possibleTransition].insertWord(word[len(possibleTransition):])
                    return

        self.transitions[word] = RadixTree(True, dict())

    def __strRecursive__(self, timesOfIndentation, lengthOfTransitionString):
        result = ""
        if self.isWord:
            result += "."
        keys = list(self.transitions.keys())
        keys.sort()
        for key in keys:
            recursiveResult = self.transitions[key].__strRecursive__(timesOfIndentation + lengthOfTransitionString,
                                                                     len(key))
            result += "\n" + (timesOfIndentation + lengthOfTransitionString) * "_" + key + recursiveResult
        return result

    def __str__(self):
        """generates a readable str, containing all class variables (the tree).

        Returns:
            str: the generated str
        """
        return self.__strRecursive__(0, 0)

    def getSimilarWordsOfSameLength(self, maximumDifferentLetters, word):
        """compares the given word with entries of the same length. Returns all of them with less or equal different letters than with maximumDifferentLetters described

        Args:
            maximumDifferentLetters (int): limits the amount of accepted different letters when comparing 2 words
            word (str): the given word

        Returns:
            list: returns a list of similar words with the same length. Does contain itself
        """
        if word == "":
            return [word] if self.isWord else []

        resultList = list()
        for key in self.transitions.keys():
            if len(key) > len(word):
                continue
            differences = 0
            for i in range(0, len(key)):
                differences += word[i] != key[i]
                if differences > maximumDifferentLetters:
                    break

            if differences > maximumDifferentLetters:
                continue
            resultTmp = self.transitions[key].getSimilarWordsOfSameLength(maximumDifferentLetters - differences,
                                                                          word[len(key):])
            if resultTmp:
                resultList.extend([key + tmp for tmp in resultTmp])
        return resultList


# RadixTreesByWordLength
Ein Container, der Wörter je nach Länge in unterschiedliche RadixTrees ablegt und ansonsten wie ein Tree agiert.

In [2]:
class RadixTreesByWordLength:
    """contains a dict, which links to multiple RadixTree's, each storing words of the same length
    """

    def __init__(self):
        self.radixTrees = dict()

    def insertWord(self, word):
        """inserts a word into the tree

        Args:
            word (str): is the str, that gets inserted
        """
        length = len(word)
        if self.radixTrees.get(length) is None:
            self.radixTrees[length] = RadixTree()
        self.radixTrees[length].insertWord(word)

    def __str__(self):
        """generates a readable str, containing all class variables (the tree)

        Returns:
            str: the generated str
        """
        tmp = ""
        lengthStr = self.radixTrees.keys()
        sortedLengths = [int(lengths) for lengths in lengthStr]
        sortedLengths.sort()

        for length in sortedLengths:
            tmp += "RadixTree with words of length " + str(length) + ":"
            tmp += self.radixTrees[length].__str__()
            tmp += "\n"
        return tmp

    def getSimilarWordsOfSameLength(self, maximumDifferentLetters, word):
        """compares the given word with entries of the same length. Returns all of them with less or equal different letters than with maximumDifferentLetters described

        Args:
            maximumDifferentLetters (int): limits the amount of accepted different letters when comparing 2 words
            word (str): the given word

        Returns:
            list: returns a list of similar words with the same length. Does contain itself
        """
        length = len(word)
        if self.radixTrees.get(length) is None:
            return []
        return self.radixTrees[length].getSimilarWordsOfSameLength(maximumDifferentLetters, word)


# Name Extraction

In [5]:
# install and import Entrez and Medline first
import subprocess
import sys

def install(package):
    subprocess.check_call([sys.executable, "-m", "pip", "install", package])

try:
    from Bio import Entrez, Medline
except:
    # One of these 2 lines should work
    # !pip install Bio
    install('Bio')
from Bio import Entrez, Medline


def getPapers(myQuery, maxPapers, myEmail="freytag64@gmail.com"):
    """retrieves some Papers from Pubmed

    Args:
        myQuery (str): is the given Query 
        maxPapers (int): is a limit of the number of papers, which will be retrieved
        myEmail (str, optional): an email. Defaults to "freytag64@gmail.com".

    Returns:
        list: papers as list of dictionarys containing abstract, authors, ...
    """
    # Get articles from PubMed
    Entrez.email = myEmail
    record = Entrez.read(Entrez.esearch(db="pubmed", term=myQuery, retmax=maxPapers))
    idlist = record["IdList"]
    print("\nThere are %d records for %s." % (len(idlist), myQuery.strip()))
    records = Medline.parse(Entrez.efetch(db="pubmed", id=idlist, rettype="medline", retmode="text"))
    return list(records)


def retrieveAllFirstNames(records):
    """takes list of papers (each is a dict) and extracts the first names as a combined list with a regular expression 

    Args:
        records (list): list of papers

    Returns:
        list: list of first names as str's
    """
    # retrieves first names from authors with regular expressions
    firstNameList = list()
    for record in filter(lambda x: 'FAU' in x, records):
        if(record['FAU'] == None):
            continue
        
        for fullName in record['FAU']:
            # since names are formatted like 'Lastname, Firstnames', split by ',' to get Firstnames
            firstName = fullName.split(',')
            if len(firstName) >= 2:
                firstName = firstName[1].strip()
                names1 = list(filter(lambda x: len(x) > 1, firstName.split()))
                for name in names1:
                    if(name.lstrip()[1:].startswith(".")):
                        names1.remove(name)
                    if(name.startswith([a-z or A-Z] + "-" + [a-z or A-Z])):  #check or
                        names1.remove(name)
                    else:
                        pass

    return firstNameList

In [6]:
if __name__ == "__main__":
    maxPapers = 20000  # limit the number of papers retrieved

    #delete last entries in file
    with open("2022FirstNames", "w+") as myfile:
        try:
            myfile.write("")
        finally:
            myfile.close()

    year = "2022"
    daysOfMonth = {"01":31, "02":28, "03":31, "04":30, "05":31, "06":30, "07":31, "08":31, "09":30, "10":31, "11":30, "12":31}
    for month in daysOfMonth.keys():
        for day in range(1, daysOfMonth[month] + 1):
            try:
                myQuery = "(\"" + year + "/" + month + "/" + str(day) + "\"[Date - Publication] : \"" + year + "/" + month + "/" + str(day) + "\"[Date - Publication])"
                records = getPapers(myQuery, maxPapers)
                
                firstnames = retrieveAllFirstNames(records)

                with open("2022FirstNames", "a") as myfile:
                    try:
                        tmp = ""
                        for name in firstnames:
                            name = name.lower()
                            tmp += name + " "
                        myfile.write(tmp)
                    finally:
                        myfile.close()
                print(str(len(firstnames)) + " new names read in")
            except Exception as e: 
                print(str(e))  
            pass
        pass


There are 9999 records for ("2022/01/1"[Date - Publication] : "2022/01/1"[Date - Publication]).
name 'a' is not defined

There are 984 records for ("2022/01/2"[Date - Publication] : "2022/01/2"[Date - Publication]).


KeyboardInterrupt: ignored

#Clustering
Clustern der Namen mit einer DBSCAN-ähnlichen Methode.

Unterschiede:
- zu große Cluster werden strenger erneut geclustert (maximale Größe: 1000 Namen)
- beim Prüfen, ob ein Knoten ein Kernknoten ist, zählen bereits erkundete Knoten nicht mehr
- Namen mit Länge 2 oder kürzer werden ignoriert

Ein Kernknoten braucht mindestens 6 unbesuchte Nachbarn um als Kernknoten zu gelten.

Eliminiert nicht erreichbare Knoten.

Knoten: einzelne Namen

Knotenübergänge (ungerichtet) zwischen Namen gleicher Länge mit maximaler Hamming-Distanz 1

In [None]:
def getNames() -> list:
    """reads names from external file

    Returns:
        list: list of firstnames
    """
    print("reads in names from external list")
    names = list()
    with open("2022FirstNames", "r") as myfile:
        try:
            names = myfile.read().split()
        finally:
            myfile.close()

    print("names in total: " + str(len(names)))
    
    return names

def getClusters() -> list:
    """reads in clusters from external file

    Returns:
        list: is list of clusters (list of names)
    """
    print("reads in clusters from external list")
    clusters = list()
    with open("clusteredNames", "r") as myfile:
        try:
            clusterAsStrings = myfile.read().split("\n")
            for string in clusterAsStrings:
                clusters.append(string.split())
        finally:
            myfile.close()

    print("clusters in total: " + str(len(clusters)))
    
    return clusters
    

def Dbscan(names : list, min_samples : int, min_word_length : int, max_cluster_length : int) -> list:
    """clusters list of names with DBSCAN

    Args:
        names (list): list of names
        min_samples (int): the number of samples in a neighborhood for a point to be considered a core point
        min_word_length (int): only words with this length or longer are clustered
        max_cluster_length (int): maximum size of cluster. If found cluster are bigger, they will devided
    Returns:
        list: list of clusters (each cluster is a list of names)
    """
    
    tree = RadixTreesByWordLength()
    clusters = list()
    names = list(set(names)) #remove duplicates
    names = list(filter(lambda x : len(x) >= min_word_length, names))
    names.sort(reverse=True)
    
    tmp = list()
    for n in names:
        tmp.append(n.lower())
    names = tmp
    
    #construct the RadixTree
    for n in names:
        tree.insertWord(n)
    
    #clustering
    while len(names) > 0:
        name = names.pop()
        cluster = set()
        stack = [name]
        
        #fill cluster
        while len(stack) > 0:
            node = stack.pop()
            
            #check if name is a core-sample
            neighbors = tree.getSimilarWordsOfSameLength(1, node)
            #already found nodes are not considered as neighbors
            toBeRemoved = list()
            for n in neighbors:
                if n in cluster:
                    toBeRemoved.append(n)
                    continue
                for c in clusters:
                    if n in c:
                        toBeRemoved.append(n)
            for n in toBeRemoved:
                neighbors.remove(n) 
                
            #add new names to cluster
            if len(neighbors) >= min_samples:
                for n in filter(lambda x : x in names, neighbors):
                    names.remove(n)
                stack.extend(neighbors)
                cluster.update(set(neighbors))
        
        cluster = list(cluster)
        #add filled cluster
        if len(cluster) > 1 and len(cluster) <= max_cluster_length:
            clusters.append(cluster)
        #if a cluster is too big, cluster it again
        elif len(cluster) > max_cluster_length:
            smallerClusters = Dbscan(cluster, min_samples + 5, min_word_length, max_cluster_length)
            clusters.extend(smallerClusters)    

    return clusters
            

In [None]:
names = getNames() #gets names from file 2022FirstNames (needs to exist)

print("clustering (takes a while):")
clusters = Dbscan(names, 3, 3, 600)

print(str(len(clusters)) + " different clusters")
i = 0
for c in clusters:
    i += len(c)
print(str(i) + " names clustered")

#save in file
with open("clusteredNames", "w+") as myfile:
    try:
        tmp = ""
        for cluster in clusters:
            for name in cluster:
                tmp += name + " "
            tmp += "\n"
        myfile.write(tmp)
    finally:
        myfile.close()

reads in names from external list


FileNotFoundError: ignored

# SubstitutionMatrix
Implementation einer Substitutionsmatrix, angelehnt an BLOSUM. Ignoriert aktuell alle Buchstaben, die nicht im ASCII kodiert sind.

Matrixeinträge werden nach folgender Formel berechnet: 
$$ S_{i,j} = \frac{1}{\lambda_2} \log_2{(\lambda_1 \cdot \frac{p_{ij}}{q_i \cdot q_j})} $$

$ p_{ij} $ ... Wahrscheinlichkeit, dass (innerhalb eines Clusters) i durch j ersetzt wird \\
$ q_i, q_j $ ... Wahrscheinlichkeiten, dass i bzw. j in einem Wort auftritt \\
$ \lambda_i $ ... (frei wählbare) Skalierungsfaktoren \\
Dabei wird $ \lambda_1 $ genutzt, um die Werte in der Matrix in positive oder negative Richtung zu verschieben. 
$ \lambda_2 $ skaliert größe der Werte linear. 
Die Funktion SubstitutionMatrix.autoscale() skaliert die Werte der Matrix standardmäßig so, dass für alle Werte gilt $ S_{i,j} \in [-5, 5] $ (Abweichungen von bis zu 0.1% möglich).

---

Die Substitutionswahrscheinlichkeit $p_{ij}$ wird iterativ für jedes Cluster $C$ berechnet. Dabei werden alle Wörter des Clusters zeichenweise an den Stellen $k=1,...,l_C$ betrachtet, wobei $l_C$ die Länge der Wörter im Cluster $C$ angibt. Nun wird für jedes Buchstabenpaar $(i, j)$ die Wahrscheinlichkeit des gemeinsamen Auftretens in $C$ an Stelle $k$ wie folgt berechnet:

$$ p_{ij, k}^{(C)} = \frac{f_k^{(C)}(i, j)}{\displaystyle \sum_{a = 1} ^{n} \sum_{b = a} ^{n} f_k^{(C)}(a, b)} $$

$n$ ... Anzahl Buchstaben im betrachteten Alphabet (bei uns gilt n = 26)

Dabei gilt für die Anzahl möglicher Buchstabenpaare $(i, j)$ in Cluster $C$ an Stelle $k$:

Für $i \neq j$ gilt: 
$$ f_k^{(C)}(i, j) = o_{k, i}^{(C)} \cdot o_{k, j}^{(C)} $$

sonst: 
$$ f_k^{(C)}(i, i) = \frac{o_{k, i}^{(C)} \cdot (o_{k, i}^{(C)} - 1)}{2} $$

wobei $o_{k, i}^{(C)}$ angibt, wie oft im Cluster $C$ an Stelle $k$ der Buchstabe an Stelle $i$ des Alphabets vorkommt.

---

Die so erhaltenen Substitutionswahrscheinlichkeit $p_{ij, k}^{(C)}$ werden nun aufaddiert. Dabei wird nach der Anzahl der entsprechenden Substitutionen $o_{k, i}^{(C)}$ gewichtet:

$$ p_{ij} = \frac{1}{o_{ij}} \displaystyle \sum_{C \in M} \sum_{k=1}^n ((o_{k, i}^{(C)} + o_{k, j}^{(C)}) \cdot p_{ij, k}^{(C)} ) $$

wobei $M$ die Menge aller Cluster und $o_{ij}$ die Gesamtzahl der gemeinsamen Vorkommen von $i$ und $j$ darstellt.

---

https://bioinformaticshome.com/bioinformatics_tutorials/sequence_alignment/substitution_matrices.html

In [None]:
import numpy as np
from collections import defaultdict


class SubstitutionMatrix:

    def __init__(self, names: list, clusters: list, inner_scaling: float = 10, outer_scaling: float = 1):
        """
        Initializes and calculates a Substitution-Matrix out of the given names, clustering and scaling

        :param names: list of all names used to calculate matrix
        :param clusters: list of clusters which are lists of similar names
        :param scaling: optional scaling factor
        """
        self.letters = [
            'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u',
            'v', 'w', 'x', 'y', 'z']

        self.name_occurrences = self.calc_name_occurrences(names)

        self.letter_prob = self.calc_letter_prob()

        self.clusters = clusters

        self.inner_scaling = inner_scaling

        self.outer_scaling = outer_scaling

        self.sub_probs = self.calc_sub_probs()

        print("Calculating substitution matrix...")
        self.substitution_matrix = self.calc_substitution_matrix()
        print("done")

    def calc_name_occurrences(self, all_names: list) -> dict:
        """
        Calculates the number of duplicates for each name in all_names

        :param all_names: list of all names used to calculate the matrix
        :return: a dictionary mapping every name to its number of occurrences
        """
        print("Calculating name occurrences...")

        result = dict()
        names, count = np.unique(all_names, return_counts=True)
        for i in range(len(names)):
            result[names[i]] = count[i]

        print("done")
        return result

    def calc_letter_prob(self) -> dict:
        """
        Calculates the probability of every letter to be in a name

        :return: a dictionary mapping every letter to its probability of being in a name
        """
        print("Calculating probability of letters...")

        letter_counts = defaultdict(int)

        n = 0
        for (name, count) in self.name_occurrences.items():
            for x in {c for c in name}:
                if x in self.letters:
                    letter_counts[x] += count
            n += count

        letter_prob = defaultdict(int)
        for (x, occurrences) in letter_counts.items():
            letter_prob[x] = occurrences / n

        print("done")
        return letter_prob

    def calc_sub_probs(self) -> dict:
        """
        Calculates the probability of one letter to be substituted by another letter

        :return: a matrix with substitution probabilities for each letter
        """
        print("Calculating probability of substitutions...")

        sub_probs = dict()
        for x in self.letters:
            sub_probs[x] = dict()
            for y in self.letters:
                sub_probs[x][y] = (0, 0)  # (prob, #examples)

        for cluster in self.clusters:
            for i in range(len(cluster[0])):
                occurrences = defaultdict(int)
                for name in cluster:
                    n = self.name_occurrences[name]
                    letter = name[i]
                    if letter in self.letters:
                        occurrences[letter] += n

                f = lambda x, y: (occurrences[x] * (occurrences[x] - 1) / 2) if x == y else occurrences[x] * occurrences[y]
                keys = list(occurrences.keys())
                for a in keys:
                    for b in keys:
                        (p_old, n_old) = sub_probs[a][b]
                        fab = f(a, b)
                        if fab:
                            p_new = fab / sum([f(x, y) for x in keys for y in keys[keys.index(x):]])
                        else:
                            p_new = 0
                        
                        n_new = occurrences[a] + occurrences[b]
                        sub_probs[a][b] = ((1 / (n_old + n_new)) * (n_old * p_old + n_new * p_new), n_old + n_new)

        for a in self.letters:
            for b in self.letters:
                sub_probs[a][b] = sub_probs[a][b][0]
        print("done")
        return sub_probs

    def calc_substitution_matrix(self) -> dict:
        """
        Calculates the Substitution-Matrix

        :return: a matrix with substitution values for each letter
        """
        matrix = defaultdict(dict)
        min = float("inf")
        for a in self.letters:
            for b in self.letters:
                try:
                    if self.sub_probs[a][b] == 0:
                        matrix[a][b] = "min"
                    else:
                        matrix[a][b] = (1 / self.outer_scaling) * np.log(self.inner_scaling * self.sub_probs[a][b] / (self.letter_prob[a] * self.letter_prob[b]))
                        if matrix[a][b] < min:
                            min = matrix[a][b]
                            # print((a, b))
                except ZeroDivisionError:
                    matrix[a][b] = "min"

        for a in matrix.keys():
            for b in matrix[a].keys():
                if matrix[a][b] == "min":
                    # print((a, b))
                    matrix[a][b] = min

        return matrix

    def autoscale(self, exp_bounds: float = 5, derivation: float = 0.001):
        print(f"Scaling automatically for bounds {-exp_bounds} +/- {derivation} and {exp_bounds} +/- {derivation}...")
        (min, max) = self.get_bounds()
        print(f"Old scalings (l1, l2) = {(self.inner_scaling, self.outer_scaling)}")
        
        correction = self.inner_scaling / 2
        direction = 1 if abs(min) > abs(min) else -1
        while abs(abs(max) - abs(min)) > derivation:
            new_scaling = self.inner_scaling + direction * correction
            if new_scaling <= 0:
                new_scaling = 1
            self.set_scaling(inner_scaling=new_scaling)

            # print((new_scaling, min, max))
            (min, max) = self.get_bounds()
            direction_new = 1 if abs(min) > abs(max) else -1
            if direction_new != direction:
                correction /= 2
                direction = direction_new
        
        (min, max) = self.get_bounds()
        outer_scaling = (max + abs(min)) / (2 * exp_bounds)
        self.set_scaling(self.inner_scaling, outer_scaling)
        print(f"(min, max) = {self.get_bounds()}")

        print(f"New scalings (l1, l2) = {(self.inner_scaling, self.outer_scaling)}")
        print("done")

    def get_bounds(self):
        """
        Returns a triple of values representing minimum, maximum and average of the substitution matrix

        :return:
        """
        vals = [self.substitution_matrix[a][b] for a in self.letters for b in self.letters]
        return (np.min(vals), np.max(vals))

    def set_names(self, names: list, clusters: list = None):
        """
        Sets a new list of names (and a new clustering) as basis and calculates the new Substitution-Matrix

        :param names: list(str)
        :param clusters: list(list(str))
        :return:
        """
        self.__init__(names, clusters if clusters else self.clusters, self.scaling)

    def set_clusters(self, clusters: list):
        """
        Sets a new clustering and calculates the new Substitution-Matrix

        :param clusters: list(list(str))
        :return:
        """
        self.clusters = clusters
        self.sub_probs = self.calc_sub_probs()
        self.substitution_matrix = self.calc_substitution_matrix()

    def set_scaling(self, inner_scaling: float = 1, outer_scaling: float = 1):
        """
        Set a new scaling and calculate the new Substitution-Matrix
        :param scaling:
        :return:
        """
        if self.inner_scaling != inner_scaling or self.outer_scaling != outer_scaling:
            self.inner_scaling = inner_scaling
            self.outer_scaling = outer_scaling
            self.substitution_matrix = self.calc_substitution_matrix()

    def __str__(self):
        tmp = "Substitutionsmatrix:\n  "
        for to in self.letters:
            tmp += "\t" + to
        tmp += "\n"

        for fr in self.letters:
            tmp += fr
            for to in self.letters:
                strEntry = str(self.substitution_matrix[fr][to])
                if len(strEntry) < 4:
                    strEntry = (4 - len(strEntry)) * "0" + strEntry
                tmp += "\t" + strEntry[:4]
            tmp += "\n"
        return tmp


In [None]:
names = [("klara", 2), ("clara", 4), ("clars", 4), ("klaus", 10), ("claus", 5), ("klaas", 15)]
all_names = []
for (name, count) in names:
    for i in range(count):
        all_names.append(name)

clusters = [["klaus", "claus", "klaas"], ["klara", "clara", "clars"]]

matrix = SubstitutionMatrix(all_names, clusters)
matrix.autoscale()
print(matrix)

Calculating name occurrences...
done
Calculating probability of letters...
done
Calculating probability of substitutions...


RuntimeError: ignored

# Demo

In [None]:
maxPapers = 10000  # limit the number of papers retrieved
myQuery = "(\"2021/01/20\"[Date - Publication] : \"2021/01/20\"[Date - Publication])"
records = getPapers(myQuery, maxPapers)


There are 4926 records for ("2021/01/20"[Date - Publication] : "2021/01/20"[Date - Publication]).


In [None]:
names = retrieveAllFirstNames(records)

print(list(np.unique(names)))

['2nd', '3rd', 'a-rang', 'a-reum', 'aa', 'aaditya', 'aafke', 'aakanksha', 'aaliyah', 'aalt', 'aamer', 'aamir', 'aanchal', 'aarent', 'aarno', 'aaron', 'aarooran', 'aarti', 'aasems', 'aayush', 'abani', 'abate', 'abay', 'abbas', 'abbie', 'abby', 'abd', 'abdalazeem', 'abdalghani', 'abdallah', 'abdel', 'abdel-karim', 'abdel-moneim', 'abdel-rahman', 'abdelahhad', 'abdelaziz', 'abdelhafid', 'abdelhamid', 'abdelkader', 'abdellatif', 'abdelmoumen', 'abdelouahed', 'abdelsattar', 'abdelwahab', 'abdenacer', 'abdenbi', 'abderrahim', 'abderraouf', 'abdessamad', 'abdolreza', 'abdou', 'abdoulaye', 'abdul', 'abdul-ilah', 'abdul-rahim', 'abdulaziz', 'abdulhak', 'abdulkadir', 'abdulkarim', 'abdullah', 'abdullatif', 'abdulmajeed', 'abdulmunem', 'abdulrahman', 'abdulwahed', 'abdurrahman', 'abdurrashid', 'abebaw', 'abelardo', 'abhas', 'abhay', 'abhijit', 'abhik', 'abhilash', 'abhilasha', 'abhinav', 'abhiram', 'abhisek', 'abhishek', 'abi', 'abid', 'abigail', 'abimael', 'abina', 'abinaya', 'abiodun', 'abir', 

In [None]:
clusters = Dbscan(names, 3, 4, 600)
for cluster in clusters:
    print(cluster)

In [None]:
matrix = SubstitutionMatrix(names, clusters)
matrix.autoscale()
#matrix.set_scaling(inner_scaling=1)

print(matrix)

Calculating name occurrences...
done
Calculating probability of letters...
done
Calculating probability of substitutions...
done
Calculating substitution matrix...
done
Scaling automatically for bounds -5 +/- 0.001 and 5 +/- 0.001...
Old scalings (l1, l2) = (10, 1)
(min, max) = (-4.999792159289524, 5.000207840710475)
New scalings (l1, l2) = (8.642578125, 1.599317953274027)
done
Substitutionsmatrix:
  	a	b	c	d	e	f	g	h	i	j	k	l	m	n	o	p	q	r	s	t	u	v	w	x	y	z
a	1.73	1.35	0.13	-0.6	1.12	0.94	1.97	0.85	0.66	0.94	0.86	0.54	0.09	0.57	1.20	0.33	1.15	-0.1	0.57	0.34	1.17	-3.6	2.51	0.01	0.94	0.08
b	1.35	4.34	0.62	1.59	1.94	2.29	2.51	2.16	1.78	2.45	2.46	1.72	2.04	1.21	2.33	2.34	3.46	2.02	1.15	2.34	2.71	3.20	2.51	3.17	1.74	2.10
c	0.13	0.62	3.63	1.44	0.18	1.26	1.46	1.80	1.50	2.08	2.93	0.73	1.44	1.64	0.97	1.66	1.91	1.91	2.19	1.58	-2.2	1.61	2.06	2.13	0.17	3.72
d	-0.6	1.59	1.44	3.60	0.35	0.82	1.11	1.91	0.18	1.62	2.54	1.58	1.84	1.17	2.00	1.78	2.84	1.21	1.43	1.73	-4.9	2.19	1.24	2.24	1.90	2.79
e	1.12	1.94	0.1