In [20]:
import pandas

N_ROWS = 2

# I. Normalizing Edges:
Edges represent links or relationships. They are the connections between the entities, officers, intermediaries and address present in the dataset. 

In [21]:
connections = pandas.read_csv('data/raw/edges.csv', usecols=[0, 1, 2, 3], skiprows=1,
                            names=['start_id', 'type', 'end_id', 'text'])
connections.head(N_ROWS)

Unnamed: 0,start_id,type,end_id,text
0,10000035,registered_address,14095990,registered address
1,10000044,registered_address,14091035,registered address


## 1. Normalizing relationship types:
Types and counts of the 'relationships':

In [22]:
connections.groupby('type').size()

type
intermediary_of       213634
officer_of            309363
registered_address    151105
dtype: int64

Sanitizing the types to better fit the adjacent data.

In [23]:
def normalize_type(txt):
    if '_of' in txt:
        return txt.replace('_of', '')
    else:
        return txt.replace('registered_', '')

connections.type = connections.type.apply(normalize_type)
connections.groupby('type').size()

type
address         151105
intermediary    213634
officer         309363
dtype: int64

## 2. Normalizing the relationship descriptions:
The text column has a lot of outliers and the address' description is a descrepency.

In [24]:
connections.groupby('text').size().sort_values(ascending=False)

text
shareholder of                                  293961
intermediary of                                 213634
registered address                              151105
beneficiary of                                   15150
beneficial owner of                                 79
director of                                         48
power of attorney of                                29
owner, director and shareholder of                  28
owner of                                            16
beneficiary, shareholder and director of             6
director / shareholder of                            6
director / beneficial owner of                       4
sole shareholder of                                  4
connected of                                         3
secretary of                                         3
member of foundation council of                      3
principal beneficiary of                             3
director (rami makhlouf) of                          2
direc

Normalizing the type 'address' to read "start_id is address of end_id" just like the other types. We thus swap start_id and end_id and update the description.

In [25]:
connections['temp'] = connections.start_id.copy()
selector = connections.type == 'address'
# swap start_id for end_id 
connections.loc[selector, 'start_id'] = connections.loc[selector, 'end_id']
# swap end_id for temp
connections.loc[selector, 'end_id'] = connections.loc[selector, 'temp']
del connections['temp']

connections.loc[connections.type == 'address', 'text'] = 'address of'

connections.head(N_ROWS)

Unnamed: 0,start_id,type,end_id,text
0,14095990,address,10000035,address of
1,14091035,address,10000044,address of


Some descriptions like "director / shareholder / beneficial owner of"  descrbes three different relationships, we expand them into different rows.

In [26]:
def normalize_separator(txt):
    for ch in [',', '-', ' and']:
        if ch in txt:
            txt = txt.replace(ch, '/')
    return txt

connections.text = connections['text'].map(normalize_separator)
connections.text = connections['text'].str.split('/', expand=True)

connections.groupby('text').size().sort_values(ascending=False)

text
shareholder of                               293961
intermediary of                              213634
address of                                   151105
beneficiary of                                15150
beneficial owner of                              79
director of                                      48
power of attorney of                             29
owner                                            28
owner of                                         16
director                                         12
beneficiary                                       6
sole shareholder of                               4
principal beneficiary of                          3
secretary of                                      3
connected of                                      3
member of foundation council of                   3
signatory of                                      2
director (rami makhlouf) of                       2
sole signatory of                                 2
preside

We futher normalise the descriptions by enforcing 'of' and omiting unwanted details, these details could be moved to the "note" column if needed.

In [27]:
def enforce_description(txt):
    to_del = ['first', 'sole', 'principal', 'authorized', 'power of ', 'of a mortgage ', 
            'of foundation council of', '(rami makhlouf) ', '(through julex foundation)']
    for word in to_del:
        if word in txt:
            txt = txt.replace(word, '')
    if 'of' not in txt:
        txt += ' of'
    return ' '.join(txt.split())

connections.text = connections['text'].map(enforce_description)
connections.groupby('text').size().sort_values(ascending=False)

text
shareholder of         293966
intermediary of        213634
address of             151105
beneficiary of          15160
beneficial owner of        79
director of                63
owner of                   44
attorney of                30
signatory of                6
member of                   4
secretary of                3
president of                3
connected of                3
protector of                1
grantee of                  1
dtype: int64

NOTE: There are two types of ownerships: legal owners, whom legaly own the property, and beneficiary owners, whom enjoy the ownership while not legaly owning the property.

In [28]:
connections.to_csv('data/processing/edges.csv', index=False)