In [1]:
import sys
import os
import py_entitymatching as em
print('magellan version:' + em.__version__)
import re
import csv
import pandas as pd
import math

from cleaner import Cleaner
from constants import csv_headers

magellan version:0.1.0


In [2]:
working_dir = os.path.dirname(os.getcwd())
path_to_csv_dir = working_dir + os.sep + 'csv_files'+ os.sep

# STEP 1 - PRE-PROCESSING DATA

In this stage, we need to preprocess data before applying Megellan. This is because our datasets (especially the AOM dataset) are quite dirty, and therefore adversely affecting Megellan's blocking and matching functions. For example, states can take any value of "CA", "California", or "CA - California".

In this step, we will clean the following variables:
* Country name (e.g. Whed data has 2 Belgiums: (1) Belgium - French Community and (2) Belgium - Flemish Community)
* State name
* City name
* Affiliation name
* Email server domain (we will only capture the university information from the email server domain - if there is any)


### 1.A. Clean AOM data

In [4]:
aom_cleaner = Cleaner(path_to_csv_dir + '_aom.csv', csv_headers.AOM_INDEX, csv_headers.AOM)

aom_cleaner.clean_affiliation('a_name')
aom_cleaner.clean_email_server('a_email_server')
aom_cleaner.clean_city('a_city')
aom_cleaner.clean_country('a_country')
aom_cleaner.clean_states('a_prov', 'a_country')

aom_cleaner.to_csv(path_to_csv_dir + '_aom_cleaned.csv')
aom_cleaner.data.head(n=3)

Unnamed: 0_level_0,a_name,a_city,a_prov,a_country,a_email_server
person_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,academy management,briarcliff manor,new york,united states,aom
4,northeastern university,boston,massachusetts,united states,gmail
5,skidmore college,saratoga springs,new york,united states,skidmore


### 1.B. Clean WHED data

In [3]:
whed_cleaner = Cleaner(path_to_csv_dir + '_whed.csv', csv_headers.WHED_INDEX, csv_headers.WHED, encoding = 'ISO-8859-1')

whed_cleaner.clean_affiliation('a_name')
whed_cleaner.clean_city('a_city')
whed_cleaner.clean_country('a_country')

whed_cleaner.to_csv(path_to_csv_dir + '_whed_cleaned.csv')
whed_cleaner.data.head(n=3)

Unnamed: 0_level_0,a_name,a_country,a_city,a_prov,a_web
a_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2,pampanga state agricultural university,philippines,magalang,Pampanga,http://www.pac.edu.ph
4,les roches international school hotel management,switzerland,bluche crans montana,Bluche-Crans-Montana,http://www.lesroches.edu
6,dharma gate budapest buddhist university,hungary,budapest,,http://www.tkbf.eu


# STEP 2 - MAGELLAN - BLOCKING

In [3]:

AOM = em.read_csv_metadata(path_to_csv_dir + '_aom_cleaned.csv', key = csv_headers.AOM_INDEX)
print(em.get_key(AOM))

WHED = em.read_csv_metadata(path_to_csv_dir + '_whed_cleaned.csv', key = csv_headers.WHED_INDEX)
print(em.get_key(WHED))


Metadata file is not present in the given path; proceeding to read the csv file.
Metadata file is not present in the given path; proceeding to read the csv file.


person_id
a_id


In [4]:
# building inverted index based on B
# sample_WHED, sample_AOM = em.down_sample(WHED, AOM, size=3000, y_param=10)
sample_WHED, sample_AOM = em.down_sample(WHED, AOM, size=300, y_param=10)
print(len(sample_WHED))
print(len(sample_AOM))
em.show_properties(sample_WHED)
em.show_properties(sample_AOM)

0%                          100%
[##############################] | ETA: 00:00:02 | ETA: 00:00:01 | ETA: 00:00:01 | ETA: 00:00:01 | ETA: 00:00:01 | ETA: 00:00:01 | ETA: 00:00:01 | ETA: 00:00:01 | ETA: 00:00:01 | ETA: 00:00:01 | ETA: 00:00:01 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00

2195
300
id: 4536078632
key: a_id
id: 4536079080
key: person_id



Total time elapsed: 00:00:01


In [16]:
def match_country_prov(ltuple, rtuple):
#   return True  - if the intention is to drop the pair
#          False - if the intention is to keep the tuple pair
    l_country = ltuple['a_country']
    r_country = rtuple['a_country']
    l_prov = ltuple['a_prov']
    r_prov = rtuple['a_prov']
    
    if l_country != r_country:
        return True
    elif l_country == "united states":
        if l_prov != r_prov:
            return True
        else:
            return False
    else:
        return False

def match_overlap(ltuple, rtuple):
    # There are cases where the AOM does not have affliation name
    # This does not happen with WHED
    if pd.isnull(ltuple['a_name']) or pd.isnull(rtuple['a_name']):
        return True
    
    l_name = ltuple['a_name']
    r_name = rtuple['a_name']
    
    l_name = re.sub(r"(university|school|institute)", "", l_name).strip()
    r_name = re.sub(r"(university|school|institute)", "", r_name).strip()
    
    
#     print(em.overlap_coeff(l_tokens, r_tokens))

    l_tokens = em.tok_wspace(ltuple['a_name'])
    r_tokens = em.tok_wspace(rtuple['a_name'])
    
    if len(l_tokens) > 1 and len(r_tokens) > 1:
#         if "university" in l_tokens:
#             l_tokens.remove("university") 
#         if "university" in r_tokens:
#             r_tokens.remove("university") 
        return em.overlap_coeff(l_tokens, r_tokens) < 1
    else:
        l_tokens = em.get_tokenizers_for_blocking()['qgm_3'](ltuple['a_name'])
        r_tokens = em.get_tokenizers_for_blocking()['qgm_3'](rtuple['a_name'])
        return em.overlap_coeff(l_tokens, r_tokens) < 0.8
#         return False
    
def match_combined(ltuple, rtuple):
    if match_country_prov(ltuple, rtuple):
        return True
    else:
        return match_overlap(ltuple, rtuple)
    
def blocking(A, B, A_headers, B_headers):
    bb = em.BlackBoxBlocker()
    bb.set_black_box_function(match_combined)
    C = bb.block_tables(A, B, l_output_attrs=A_headers, r_output_attrs=B_headers)
    return C

C = blocking(sample_WHED, sample_AOM, csv_headers.WHED, csv_headers.AOM)


0%                          100%
[##############################] | ETA: 00:00:30 | ETA: 00:00:27 | ETA: 00:00:26 | ETA: 00:00:25 | ETA: 00:00:24 | ETA: 00:00:23 | ETA: 00:00:22 | ETA: 00:00:21 | ETA: 00:00:19 | ETA: 00:00:18 | ETA: 00:00:17 | ETA: 00:00:16 | ETA: 00:00:15 | ETA: 00:00:15 | ETA: 00:00:14 | ETA: 00:00:13 | ETA: 00:00:12 | ETA: 00:00:11 | ETA: 00:00:10 | ETA: 00:00:09 | ETA: 00:00:08 | ETA: 00:00:07 | ETA: 00:00:06 | ETA: 00:00:05 | ETA: 00:00:04 | ETA: 00:00:03 | ETA: 00:00:02 | ETA: 00:00:01 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00
Total time elapsed: 00:00:28


In [17]:
C.head(n = 50000)


Unnamed: 0,_id,ltable_a_id,rtable_person_id,ltable_a_name,ltable_a_city,ltable_a_prov,ltable_a_country,ltable_a_web,rtable_a_name,rtable_a_city,rtable_a_prov,rtable_a_country,rtable_a_email_server
0,0,17266,58431,carlos iii university madrid,getafe,Madrid,spain,http://www.uc3m.es,university carlos iii de madrid,madrid,,spain,uc3m
1,1,223,24865,ajou university,suwon si,Gyeonggi-do,south korea,http://www.ajou.ac.kr,ajou university,suwon,gyunggi-do,south korea,ajou
2,2,17460,52224,university granada,granada,,spain,http://www.ugr.es,university granada,granada,provincia,spain,ugr
3,3,6790,57992,griffith university,southport,Queensland,australia,http://www.griffith.edu.au,griffith university,nathan,qld,australia,griffith
4,4,6790,56196,griffith university,southport,Queensland,australia,http://www.griffith.edu.au,griffith university,brisbane,qld,australia,griffithuni
5,5,867,22172,klagenfurt university,klagenfurt,Carinthia,austria,http://www.uni-klu.ac.at,alpen adria university klagenfurt austria,klagenfurt,,austria,aau
6,6,6977,45293,stockholm school economics,stockholm,,sweden,http://www.hhs.se,stockholm school economics,stockholm,,sweden,hhs
7,7,6978,31817,copenhagen business school,frederiksberg,,denmark,http://www.cbs.dk,copenhagen business school,copenhagen,frederiksberg,denmark,cbs
8,8,6978,4695,copenhagen business school,frederiksberg,,denmark,http://www.cbs.dk,copenhagen business school,frederiksberg,,denmark,gmail
9,9,6979,6377,bi norwegian business school,oslo,,norway,http://www.bi.no,bi norwegian business school,oslo,,norway,bi


In [18]:
C.size

2067

In [19]:

em.save_table(C, path_to_csv_dir + 'matching_pairs_table.pkl')

File already exists at /Users/carepjan/code/website/stage3/csv_files/matching_pairs_table.pkl; Overwriting it
Metadata file already exists at /Users/carepjan/code/website/stage3/csv_files/matching_pairs_table.pklmetadata. Overwriting it


True

In [20]:
em.to_csv_metadata(C, path_to_csv_dir + 'matching_pairs_metadata.csv')

File already exists at /Users/carepjan/code/website/stage3/csv_files/matching_pairs_metadata.csv; Overwriting it
Metadata file already exists at /Users/carepjan/code/website/stage3/csv_files/matching_pairs_metadata.metadata. Overwriting it


True

In [12]:
S = em.sample_table(C, 40)

In [40]:
sample_WHED.set_index([csv_headers.WHED_INDEX])
print(em.get_key(sample_WHED))

a_id


In [41]:
print(em.get_key(sample_AOM))

person_id


In [44]:
print(em.show_properties(sample_AOM))
print(em.show_properties(C))

id: 4673743000
key: person_id
None
id: 4706378920
key: _id
fk_ltable: ltable_a_id
fk_rtable: rtable_person_id
ltable(obj.id): 4560893432
rtable(obj.id): 4673743000
None


In [46]:
print(em.show_properties(C))

id: 4706378920
key: _id
fk_ltable: ltable_a_id
fk_rtable: rtable_person_id
ltable(obj.id): 4560893432
rtable(obj.id): 4673743000
None


In [51]:
sample_AOM.to_csv(path_to_csv_dir + 'sample_AOM.csv', encoding = 'UTF-8', index_label = csv_headers.AOM_INDEX)
sample_WHED.to_csv(path_to_csv_dir + 'sample_WHED.csv', encoding = 'UTF-8', index_label = csv_headers.WHED_INDEX)