In [1]:
import pandas, numpy, time
from multiprocessing import Pool, cpu_count

N_ROWS = 5

# III. Grouping and Normalizing Officers:
## Abstrat 
By grouping the typos and spanish translations of the 'bearer' and 'to the bearer' nodes, the amount of nodes to be matched is reduced by ~35%. Around 25% of the remaining nodes are exact duplicates (or 14% of the database).
Furthermore, the same fuzzy matching algortihm used for the address dataset, found 11% of the remaining names to be over 85% similar.

The officers database is hereby reduced by over 24%, while 11% of the remaining are similar.
## 0. Import and cleansing:
Importing the dataset all lowercase, adding a parent_id column. Removing details from some names; ex: { boshen ltd./137-93, boshen ltd./136-83, boshen ltd./134-61 }. An upgrade could move these details to the notes. 

In [2]:
officer_nodes = pandas.read_csv('data/nodes.officer.csv', usecols=['node_id', 'name', 'countries'])

officer_nodes = officer_nodes.applymap( lambda _str: _str.lower() if isinstance( _str,str ) else _str )
officer_nodes.name = officer_nodes.name.map( lambda _str: _str.split( '/' )[0] if isinstance( _str,str ) else _str )
officer_nodes.name = officer_nodes.name.map( lambda _str: _str.replace( '.',' ' ) if isinstance( _str,str) else _str )
officer_nodes['parent_id'] = 0

officer_nodes.head( N_ROWS )

Unnamed: 0,node_id,name,countries,parent_id
0,12000001,kim soo in,south korea,0
1,12000002,tian yuan,china,0
2,12000003,gregory john solomon,australia,0
3,12000004,matsuda masumi,japan,0
4,12000005,ho thuy nga,viet nam,0


This database is relatively large, thankfully over 50% of the entries seem to be duplicates:

In [3]:
def count_duplicates( df, col ):
    duplicates = df[ df.duplicated([ col ]) ]

    total = df.shape[0]
    unique = len( df[ col ].unique() )

    duplicate_count = total - unique
    duplicate_percent = int( (duplicate_count * 100) / total )

    print( f'* Total entries: {total} \n* Duplicates: {duplicate_count} or {duplicate_percent}%' )

count_duplicates( officer_nodes, 'name' )

* Total entries: 238402 
* Duplicates: 121511 or 50%


## 1. Normalizing "bearer" nodes:
Bearers cannot be grouped into a single ID as they could represent different officers, thus only typos and translations are normalized.
### TO CHECK:
The conditions "officer of the bearer" and "officer: the bearer" are importand distinctions.

In [4]:
officer_nodes.groupby('name').size().sort_values(ascending=False).head( N_ROWS )

name
the bearer        71421
el portador        9351
bearer              674
the  bearer         114
formia limited      107
dtype: int64

Grouping 'the barer/to the bearer', to their typos and spanish translations ('el portador/al portador'):

In [5]:
bearer_nodes = officer_nodes.name.isin( 
    ['the  bearer', 'el portador', 'bearer', 'baerer',
    'the baerer', '-the bearer', 'the beare', 'the ebarer'] ) 
    
to_bearer_nodes = officer_nodes.name.isin( 
    ['al protador', 'al portador', 'to bearer'] ) 

officer_nodes.loc[ bearer_nodes, 'name' ] = 'the bearer'
officer_nodes.loc[ to_bearer_nodes, 'name' ] = 'to the bearer'

all_bearer_nodes = officer_nodes.loc[ (officer_nodes.name == 'to the bearer') | (officer_nodes.name == 'the bearer') ]

print( 'ALL BEARER:', all_bearer_nodes.shape[0] )
officer_nodes.groupby('name').size().sort_values(ascending=False).head( N_ROWS )

ALL BEARER: 81740


name
the bearer                         81629
to the bearer                        111
formia limited                       107
north atlantic services limited      103
first court limited                   96
dtype: int64

## 2. Removing direct duplicates:
25% of non "bearer" nodes are exact duplicates:

In [6]:
all_bearer_nodes = officer_nodes.loc[ (officer_nodes.name == 'to the bearer') | (officer_nodes.name == 'the bearer') ]
other_nodes = officer_nodes[ ~officer_nodes.isin(all_bearer_nodes) ].dropna()

print('OTHER NODES')
count_duplicates( other_nodes, 'name' )

OTHER NODES
* Total entries: 135871 
* Duplicates: 34261 or 25%


Maping the parent's ids to their exact duplicates.

In [7]:
other_duplicates = other_nodes.name.duplicated(keep=False)
group_duplicates = other_nodes.loc[ other_duplicates ].groupby('name')

def map_parent_id(df):
    # parent_id is the first row's node_id
    parent_id = df.node_id.iloc[0]
    # children are the rest of the rows
    df.iloc[ 1:, df.columns.get_indexer(['parent_id']) ] = parent_id
    return df

def gapply_parallel(df_group, func):
    t1 = time.time()

    df_list = [ group for name, group in df_group ]

    with Pool( cpu_count() ) as pool:
        results = pool.map(func, df_list)

    result_df = pandas.concat( results )

    print( f'* Total in:', time.time()-t1 )
    return result_df

exact_match = gapply_parallel( group_duplicates, map_parent_id )
exact_match[[ 'node_id', 'parent_id' ]] = exact_match[ ['node_id', 'parent_id'] ].astype('Int64')

officer_nodes.update( exact_match )
officer_nodes[ officer_nodes.parent_id > 0 ].head( N_ROWS )

* Total in: 22.011976718902588


Unnamed: 0,node_id,name,countries,parent_id
43,12000044,noble nominees limited,belize,12000042
183,12002698,tan sun-hua,philippines,12000007
459,12002971,omni capital assets ltd,anguilla,12002704
462,12002974,omni capital assets ltd,anguilla,12002970
616,12003131,lilay ltd,anguilla,12002703


In [8]:
no_exact_duplicates = officer_nodes.loc[ officer_nodes.parent_id == 0 ]
total_nodes = officer_nodes.shape[0]
reduction = int( 100 - (no_exact_duplicates.shape[0] * 100) / total_nodes )

print( f'* Total Reduction: {reduction}%')

* Total Reduction: 14%


## 3. Fuzzy matching names:
Separating the nodes to match:
### NOTE: 
Due to the small string size, this method is not 100% accurate, another, better, matching algorithm might be;
- Pruning; a simple linear algorithm to filter out irrelevant choices before the fuzzywuzzy match.
- Combining: combine multiple fuzzy match algorithm; metaphone, soundex, et.. into a single result.


In [9]:
to_match = no_exact_duplicates.loc[ (no_exact_duplicates.name != 'the bearer') & (no_exact_duplicates.name != 'to the bearer') ]
to_match.shape[0]

122401

Comparing 122,549 nodes to each other would take an impossible amount of time and resources, thus we match the nodes by country groups, forgetting about outliers for now. Due to the small lenght of the strings a high threshold is set for the same matching algorithm used in the address dataset. 


In [10]:
to_match.groupby('countries').size().sort_values(ascending=False).head( N_ROWS )

countries
china                     17938
hong kong                 10238
british virgin islands     7788
united kingdom             3971
russia                     3357
dtype: int64

Fuzzy matching the strings:

Again, this notebook has been limited for demonstration purposes and to spare you 40min and the real results are loaded later.

In [11]:
from multiprocessing import Pool, cpu_count
from fuzzywuzzy import fuzz, process
import time

# TODO: sort the groups by size, so the computation can start on the biggest group first (i.e: China with 20k address). This would decrease the total computation time.

THRESHOLD = 85
SELECTOR = 'name'
GROUP = to_match.groupby('countries')
LIMIT = 50

def calculate_string_similarity(df):
    t1 = time.time()

    choices = set( df[ SELECTOR ].unique() )
    seen = set()
    results = dict()

    choice_count = len(choices)
    for i in range( choice_count-1 ):
        choice = list(choices)[i]
        if len(choice) < 2:
            continue

        seen.add(choice)
        new_choices = choices.difference(seen)
        if len(new_choices) == 0:
            break

        res = process.extract(choice, new_choices, scorer=fuzz.token_sort_ratio, limit=10000)
        res = [ r[0] for r in res if r[1] > THRESHOLD ]

        if len(res):
            seen.update(res)
            results[choice] = res
        i += 1 

    tt = time.time() - t1
    print( f'* Compared {choice_count} in {tt} sec' )
    return results

def gapply_parallel(df_group, func):
    t1 = time.time()

    if LIMIT:
        df_list = [ group for name, group in df_group if group.shape[0] < LIMIT ]
    else:
        df_list = [ group for name, group in df_group ]

    results = {}
    with Pool( cpu_count() ) as pool:
        for res in pool.map(func, df_list):
            results.update( res )

    results = pandas.DataFrame.from_dict(results,  orient='index')
    results.reset_index(inplace=True)

    print( f'* Total Elapsed Time:', time.time()-t1 )
    return results


similar_str = gapply_parallel(GROUP, calculate_string_similarity)
similar_str = similar_str.rename( {'index': 'parent' } , axis=1)

* Compared 3 in 0.0018012523651123047 sec
* Compared 21 in 0.008484601974487305 sec
* Compared 23 in 0.009030580520629883 sec
* Compared 18 in 0.004841327667236328 sec
* Compared 23 in 0.008645296096801758 sec
* Compared 28 in 0.016125917434692383 sec
* Compared 20 in 0.006034374237060547 sec
* Compared 14 in 0.005669116973876953 sec
* Compared 26 in 0.029726028442382812 sec
* Compared 1 in 0.0005209445953369141 sec
* Compared 34 in 0.016704797744750977 sec
* Compared 16 in 0.0035254955291748047 sec
* Compared 30 in 0.015091896057128906 sec
* Compared 45 in 0.025414228439331055 sec
* Compared 8 in 0.00164794921875 sec
* Compared 14 in 0.004046916961669922 sec
* Compared 41 in 0.021744489669799805 sec
* Compared 34 in 0.02644968032836914 sec
* Compared 23 in 0.01836109161376953 sec
* Compared 20 in 0.004438877105712891 sec
* Compared 38 in 0.022194623947143555 sec
* Compared 17 in 0.004699230194091797 sec
* Compared 17 in 0.004713773727416992 sec
* Compared 7 in 0.0012426376342773438 se

In [12]:
similar_str.head( N_ROWS )

Unnamed: 0,parent,0,1,2,3
0,maloku endrit,endrit maloku,,,
1,"norma graciela woscoff, roberto goldwaser, gab...","roberto goldwaser, norma graciela woscoff, gab...","gabriela gisela goldwaser, roberto goldwaser, ...","pablo javier isaac goldwaser, roberto goldwase...",
2,maria inmaculada zamora bonet,maria immaculada zamora bonet,,,
3,collette laurens,colette laurens,ms colette laurens,,
4,julia bonet fite,julia bonet fiter,,,


Creating the edges:

In [13]:
to_match.set_index('node_id', inplace=True)

def string_to_id(df):
    def func(x):
        if isinstance(x, str):
            return to_match.index[ to_match.name == x ][0]
        return 0
        
    return df.applymap(func)

def flatten_to_pair(df):
    return ( df.astype('Int64').where(df.ne(0))
                    .set_index('parent')
                    .stack()
                    .reset_index(level=0, name='child_id') )

def apply_parallel(df, func):
    t1 = time.time()

    _cpu_count = cpu_count()
    df_split = numpy.array_split( df, _cpu_count )

    with Pool( _cpu_count ) as pool:
        res = pool.map( func, df_split )
        try:
            df = pandas.concat( res )
        except ValueError:
            # result could be a list of Nones
            pass 
            
    print(f'Total time:', time.time()-t1 )
    return df

similar_nodes = apply_parallel(similar_str, string_to_id)
similar_nodes = apply_parallel(similar_nodes, flatten_to_pair)
similar_nodes.rename( { 'parent' : 'parent_id', 'child_id' : 'node_id' }, axis=1, inplace=True)
similar_nodes.reset_index(drop=True, inplace=True)

similar_nodes.head( N_ROWS )

Total time: 1.6565921306610107
Total time: 0.29891467094421387


Unnamed: 0,parent_id,node_id
0,13009880,12212223
1,12130663,12130662
2,12130663,12130661
3,12130663,12130660
4,12132951,12142237


## 4. Quantifying and saving results:
Loading real similarity results and updating the officer dataset.

In [14]:
similar_nodes = pandas.read_csv('data/edges.officers.csv')

res = officer_nodes.set_index('node_id')
similar_ = similar_nodes.set_index('node_id')

res.update(similar_)
res.parent_id = res.parent_id.astype('Int64')

res.head( N_ROWS )

Unnamed: 0_level_0,name,countries,parent_id
node_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
12000001,kim soo in,south korea,0
12000002,tian yuan,china,12169128
12000003,gregory john solomon,australia,0
12000004,matsuda masumi,japan,0
12000005,ho thuy nga,viet nam,0


Quantifying result:
## WRONG 

In [15]:
no_dup = res[ res.parent_id == 0 ].shape[0]
total = res.shape[0]
reduction = int( 100 - (no_dup*100)/total)

print( f'Total reduction: {reduction}%')

Total reduction: 24%


# Results might be better saved as new "similar to" nodes in a "similar.nodes.csv"

In [16]:
# TODO: update "data/clean.edges.csv"