# Chapter Six - String Clustering

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.append("../../../optimus")

In [3]:
from optimus import Optimus
op = Optimus("pandas")

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\LuisA\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


### Fingerprint

In [4]:
df = op.create.dataframe({ 
    "A": ["optimus", "prime optimus", "prime", "bumblebee", "megatron", "MEGATRON"], 
    "B": [1,2,3,4,5,6] 
}) 

In [5]:
df.string_clustering("A", "fingerprint")

{ 'A': { ('bumblebee', 2): { 'cluster': 'bumblebee',
                             'suggestions': ['bumblebee'],
                             'suggestions_size': 1,
                             'total_count': 1},
         ('megatron', 0): { 'cluster': 'megatron',
                            'suggestions': ['megatron', 'MEGATRON'],
                            'suggestions_size': 2,
                            'total_count': 2},
         ('optimus', 3): { 'cluster': 'optimus',
                           'suggestions': ['optimus'],
                           'suggestions_size': 1,
                           'total_count': 1},
         ('prime', 4): { 'cluster': 'prime',
                         'suggestions': ['prime'],
                         'suggestions_size': 1,
                         'total_count': 1},
         ('prime optimus', 1): { 'cluster': 'optimus prime',
                                 'suggestions': ['prime optimus'],
                                 'suggestions_size': 1

In [6]:
df.cols.fingerprint("A")

A  1 (object),B  2 (int64)
optimus,1
optimus⋅prime,2
prime,3
bumblebee,4
megatron,5
megatron,6


## N-gram fingerprint

In [7]:
df = op.create.dataframe({
    "A": ["optimus", "optimus prime", "prime", "bumblebee", "megatron", "MEGATRON"],
    "B": [1,2,3,4,5,6]
})

In [8]:
df.string_clustering("A", "ngram_fingerprint")

{ 'A': { ('bumblebee', 4): { 'cluster': 'beblbuebeelembum',
                             'suggestions': ['bumblebee'],
                             'suggestions_size': 1,
                             'total_count': 1},
         ('megatron', 0): { 'cluster': 'ateggameonrotr',
                            'suggestions': ['megatron', 'MEGATRON'],
                            'suggestions_size': 2,
                            'total_count': 2},
         ('optimus', 2): { 'cluster': 'immuoppttius',
                           'suggestions': ['optimus'],
                           'suggestions_size': 1,
                           'total_count': 1},
         ('optimus prime', 3): { 'cluster': 'immemuopprptrisptius',
                                 'suggestions': ['optimus prime'],
                                 'suggestions_size': 1,
                                 'total_count': 1},
         ('prime', 1): { 'cluster': 'immeprri',
                         'suggestions': ['prime'],
          

In [9]:
df.cols.ngram_fingerprint("A", output_cols="C")

A  1 (object),C  2 (object),B  3 (int64)
optimus,immuoppttius,1
optimus⋅prime,immemuopprptrisptius,2
prime,immeprri,3
bumblebee,beblbuebeelembum,4
megatron,ateggameonrotr,5
MEGATRON,ateggameonrotr,6


## Soundex

In [10]:
df = op.create.dataframe({ 
    "A": ["optimus", "prime aptimus", "bumblebee", "megatron", "MaGATRaN"],
    "B": [1,2,3,4,5] 
}) 

In [11]:
df.string_clustering("A", "soundex")

{ 'A': { ('bumblebee', 2): { 'cluster': 'B514',
                             'suggestions': ['bumblebee'],
                             'suggestions_size': 1,
                             'total_count': 1},
         ('megatron', 0): { 'cluster': 'M236',
                            'suggestions': ['megatron', 'MaGATRaN'],
                            'suggestions_size': 2,
                            'total_count': 2},
         ('optimus', 1): { 'cluster': 'O135',
                           'suggestions': ['optimus'],
                           'suggestions_size': 1,
                           'total_count': 1},
         ('prime aptimus', 3): { 'cluster': 'P651',
                                 'suggestions': ['prime aptimus'],
                                 'suggestions_size': 1,
                                 'total_count': 1}}}

In [12]:
df.cols.soundex("A", output_cols="C")

A  1 (object),C  2 (object),B  3 (int64)
optimus,O135,1
prime⋅aptimus,P651,2
bumblebee,B514,3
megatron,M236,4
MaGATRaN,M236,5


### Metaphone

In [13]:
df.string_clustering("A", "metaphone")

{ 'A': { ('bumblebee', 2): { 'cluster': 'BMBLB',
                             'suggestions': ['bumblebee'],
                             'suggestions_size': 1,
                             'total_count': 1},
         ('megatron', 0): { 'cluster': 'MKTRN',
                            'suggestions': ['megatron', 'MaGATRaN'],
                            'suggestions_size': 2,
                            'total_count': 2},
         ('optimus', 1): { 'cluster': 'OPTMS',
                           'suggestions': ['optimus'],
                           'suggestions_size': 1,
                           'total_count': 1},
         ('prime aptimus', 3): { 'cluster': 'PRM APTMS',
                                 'suggestions': ['prime aptimus'],
                                 'suggestions_size': 1,
                                 'total_count': 1}}}

In [14]:
df.cols.metaphone("A")

A  1 (object),B  2 (int64)
OPTMS,1
PRM⋅APTMS,2
BMBLB,3
MKTRN,4
MKTRN,5


In [15]:
df = op.create.dataframe({ 
    "A": ["optimus prime", "prime optimus", "prime", "bumblebee", "megatron", "MEGATRON", "argenis leon"], 
    "B": [1,2,3,4,5,6,7] 
}) 

### Double Metaphone

In [16]:
df.string_clustering("A", "double_metaphone")

{ 'A': { ('argenis leon', 2): { 'cluster': ('ARJNSLN', 'ARKNSLN'),
                                'suggestions': ['argenis leon'],
                                'suggestions_size': 1,
                                'total_count': 1},
         ('bumblebee', 3): { 'cluster': ('PMPLP', ''),
                             'suggestions': ['bumblebee'],
                             'suggestions_size': 1,
                             'total_count': 1},
         ('megatron', 0): { 'cluster': ('MKTRN', ''),
                            'suggestions': ['megatron', 'MEGATRON'],
                            'suggestions_size': 2,
                            'total_count': 2},
         ('optimus prime', 1): { 'cluster': ('APTMSPRM', ''),
                                 'suggestions': ['optimus prime'],
                                 'suggestions_size': 1,
                                 'total_count': 1},
         ('prime', 4): { 'cluster': ('PRM', ''),
                         'suggestions': [

In [17]:
df.cols.double_metaphone("A")

A  1 (object),B  2 (int64)
"('APTMSPRM',⋅'')",1
"('PRMPTMS',⋅'')",2
"('PRM',⋅'')",3
"('PMPLP',⋅'')",4
"('MKTRN',⋅'')",5
"('MKTRN',⋅'')",6
"('ARJNSLN',⋅'ARKNSLN')",7


### Match Rating Codex

In [18]:
df.string_clustering("A", "match_rating_codex")

{ 'A': { ('argenis leon', 4): { 'cluster': 'ARG LN',
                                'suggestions': ['argenis leon'],
                                'suggestions_size': 1,
                                'total_count': 1},
         ('bumblebee', 2): { 'cluster': 'BMBLB',
                             'suggestions': ['bumblebee'],
                             'suggestions_size': 1,
                             'total_count': 1},
         ('megatron', 0): { 'cluster': 'MGTRN',
                            'suggestions': ['megatron', 'MEGATRON'],
                            'suggestions_size': 2,
                            'total_count': 2},
         ('optimus prime', 3): { 'cluster': 'OPTPRM',
                                 'suggestions': ['optimus prime'],
                                 'suggestions_size': 1,
                                 'total_count': 1},
         ('prime', 1): { 'cluster': 'PRM',
                         'suggestions': ['prime'],
                         'sugg

In [19]:
df.cols.match_rating_codex("A")

A  1 (object),B  2 (int64)
OPTPRM,1
PRMTMS,2
PRM,3
BMBLB,4
MGTRN,5
MGTRN,6
ARG⋅LN,7


### NYSIIS

In [20]:
df.string_clustering("A", "nysiis")

{ 'A': { ('argenis leon', 5): { 'cluster': 'ARGANAS LAN',
                                'suggestions': ['argenis leon'],
                                'suggestions_size': 1,
                                'total_count': 1},
         ('bumblebee', 1): { 'cluster': 'BANBLABY',
                             'suggestions': ['bumblebee'],
                             'suggestions_size': 1,
                             'total_count': 1},
         ('megatron', 0): { 'cluster': 'MAGATRAN',
                            'suggestions': ['megatron', 'MEGATRON'],
                            'suggestions_size': 2,
                            'total_count': 2},
         ('optimus prime', 3): { 'cluster': 'OPTANAS PRAN',
                                 'suggestions': ['optimus prime'],
                                 'suggestions_size': 1,
                                 'total_count': 1},
         ('prime', 2): { 'cluster': 'PRAN',
                         'suggestions': ['prime'],
            

In [21]:
df.cols.nysiis("A")

A  1 (object),B  2 (int64)
OPTANAS⋅PRAN,1
PRANA⋅APTAN,2
PRAN,3
BANBLABY,4
MAGATRAN,5
MAGATRAN,6
ARGANAS⋅LAN,7


### Levenshtein

In [22]:
df = op.create.dataframe({  
    "name": ["John Doe", "alice", "alice", "John Doe", "álice", "john doe", "doe, john", "alice", "joohn dooe"] 
}) 

In [23]:
df.string_clustering("name", "levenshtein")

{ 'name': { ('alice', 1): { 'cluster': 'alice',
                            'suggestions': ['alice'],
                            'suggestions_size': 1,
                            'total_count': 9},
            ('alice', 2): { 'cluster': 'alice',
                            'suggestions': ['alice'],
                            'suggestions_size': 1,
                            'total_count': 9},
            ('alice', 4): { 'cluster': 'alice',
                            'suggestions': ['alice'],
                            'suggestions_size': 1,
                            'total_count': 9},
            ('alice', 7): { 'cluster': 'alice',
                            'suggestions': ['alice'],
                            'suggestions_size': 1,
                            'total_count': 9},
            ('doe john', 0): { 'cluster': 'doe john',
                               'suggestions': ['doe john'],
                               'suggestions_size': 1,
                               '

## Applying Suggestions

In [24]:
df = op.create.dataframe({  
    "name": ["John Doe", "alice", "alice", "John Doe", "álice",  
    "john doe", "doe, john", "alice", "joohn dooe"] 
}) 

In [25]:
clusters = df.string_clustering("name", "fingerprint") 

In [26]:
clusters

{ 'name': { ('John Doe', 0): { 'cluster': 'doe john',
                               'suggestions': [ 'John Doe',
                                                'john doe',
                                                'doe, john'],
                               'suggestions_size': 3,
                               'total_count': 4},
            ('alice', 1): { 'cluster': 'alice',
                            'suggestions': ['alice', 'álice'],
                            'suggestions_size': 2,
                            'total_count': 4},
            ('joohn dooe', 2): { 'cluster': 'dooe joohn',
                                 'suggestions': ['joohn dooe'],
                                 'suggestions_size': 1,
                                 'total_count': 1}}}

In [27]:
clusters.set_suggestion("alice", "Alice")
clusters.set_suggestion("joohn dooe", "John Doe")
clusters

{ 'name': { ('Alice', 1): { 'cluster': 'alice',
                            'suggestions': ['alice', 'álice'],
                            'suggestions_size': 2,
                            'total_count': 4},
            ('John Doe', 0): { 'cluster': 'doe john',
                               'suggestions': [ 'John Doe',
                                                'john doe',
                                                'doe, john'],
                               'suggestions_size': 3,
                               'total_count': 4},
            ('John Doe', 2): { 'cluster': 'dooe joohn',
                               'suggestions': ['joohn dooe'],
                               'suggestions_size': 1,
                               'total_count': 1}}}

In [28]:
clusters = df.string_clustering("name", "fingerprint") 
clusters.set_suggestion(0, "Alice")
clusters.set_suggestion(2, "John Doe") 
clusters

{ 'name': { ('Alice', 0): { 'cluster': 'doe john',
                            'suggestions': [ 'John Doe',
                                             'john doe',
                                             'doe, john'],
                            'suggestions_size': 3,
                            'total_count': 4},
            ('John Doe', 2): { 'cluster': 'dooe joohn',
                               'suggestions': ['joohn dooe'],
                               'suggestions_size': 1,
                               'total_count': 1},
            ('alice', 1): { 'cluster': 'alice',
                            'suggestions': ['alice', 'álice'],
                            'suggestions_size': 2,
                            'total_count': 4}}}

In [29]:
clusters = df.string_clustering("name", "fingerprint") 
clusters.set_suggestion(2, "John Doe", "name") 
clusters.set_suggestion(0, "Alice", "name")
clusters

{ 'name': { ('Alice', 0): { 'cluster': 'doe john',
                            'suggestions': [ 'John Doe',
                                             'john doe',
                                             'doe, john'],
                            'suggestions_size': 3,
                            'total_count': 4},
            ('John Doe', 2): { 'cluster': 'dooe joohn',
                               'suggestions': ['joohn dooe'],
                               'suggestions_size': 1,
                               'total_count': 1},
            ('alice', 1): { 'cluster': 'alice',
                            'suggestions': ['alice', 'álice'],
                            'suggestions_size': 2,
                            'total_count': 4}}}

In [30]:
clusters.set_suggestions(["Alice", "John Doe", "John Doe"])
clusters

{ 'name': { ('Alice', 0): { 'cluster': 'doe john',
                            'suggestions': [ 'John Doe',
                                             'john doe',
                                             'doe, john'],
                            'suggestions_size': 3,
                            'total_count': 4},
            ('John Doe', 1): { 'cluster': 'alice',
                               'suggestions': ['alice', 'álice'],
                               'suggestions_size': 2,
                               'total_count': 4},
            ('John Doe', 2): { 'cluster': 'dooe joohn',
                               'suggestions': ['joohn dooe'],
                               'suggestions_size': 1,
                               'total_count': 1}}}

In [31]:
display(df)
display(clusters.to_dict())

name  1 (object)
John⋅Doe
alice
alice
John⋅Doe
álice
john⋅doe
"doe,⋅john"
alice
joohn⋅dooe


{'name': {'Alice': ['John Doe', 'john doe', 'doe, john'],
  'John Doe': ['alice', 'álice', 'joohn dooe']}}

In [32]:
df.cols.replace(clusters)

name  1 (object)
Alice
John⋅Doe
John⋅Doe
Alice
John⋅Doe
Alice
Alice
John⋅Doe
John⋅Doe
