In [2]:
import sys
import os
import py_entitymatching as em
print('magellan version:' + em.__version__)
import re
import csv
import pandas as pd

from cleaner import Cleaner
from constants import csv_headers

magellan version:0.1.0


In [3]:
working_dir = os.path.dirname(os.getcwd())
path_to_csv_dir = working_dir + os.sep + 'csv_files'+ os.sep

# STEP 1 - PRE-PROCESSING DATA

In this stage, we need to preprocess data before applying Megellan. This is because our datasets (especially the AOM dataset) are quite dirty, and therefore adversely affecting Megellan's blocking and matching functions. For example, states can take any value of "CA", "California", or "CA - California".

In this step, we will clean the following variables:
* Country name (e.g. Whed data has 2 Belgiums: (1) Belgium - French Community and (2) Belgium - Flemish Community)
* State name
* City name
* Affiliation name
* Email server domain (we will only capture the university information from the email server domain - if there is any)


### 1.A. Clean AOM data

In [4]:
aom_cleaner = Cleaner(path_to_csv_dir + '_aom.csv', csv_headers.AOM_INDEX, csv_headers.AOM)

aom_cleaner.clean_affiliation('a_name')
aom_cleaner.clean_email_server('a_email_server')
aom_cleaner.clean_city('a_city')
aom_cleaner.clean_country('a_country')
aom_cleaner.clean_states('a_prov', 'a_country')

aom_cleaner.to_csv(path_to_csv_dir + '_aom_cleaned.csv')
aom_cleaner.data.head(n=3)

Unnamed: 0_level_0,a_name,a_city,a_prov,a_country,a_email_server
person_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,academy management,briarcliff manor,new york,united states,aom
4,nor astern university,boston,massachusetts,united states,gmail
5,skidmore college,saratoga springs,new york,united states,skidmore


### 1.B. Clean WHED data

In [5]:
whed_cleaner = Cleaner(path_to_csv_dir + '_whed.csv', csv_headers.WHED_INDEX, csv_headers.WHED, encoding = 'ISO-8859-1')

whed_cleaner.clean_affiliation('a_name')
whed_cleaner.clean_city('a_city')
whed_cleaner.clean_country('a_country')

whed_cleaner.to_csv(path_to_csv_dir + '_whed_cleaned.csv')
whed_cleaner.data.head(n=3)

Unnamed: 0_level_0,a_name,a_country,a_city,a_prov,a_web
a_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2,pampanga state agricultural university,philippines,magalang,Pampanga,http://www.pac.edu.ph
4,les roches international school hotel management,switzerland,bluche crans montana,Bluche-Crans-Montana,http://www.lesroches.edu
6,dharma gate budapest buddhist university,hungary,budapest,,http://www.tkbf.eu


# STEP 2 - MAGELLAN - BLOCKING

In [6]:

AOM = em.read_csv_metadata(path_to_csv_dir + '_aom_cleaned.csv', key = csv_headers.AOM_INDEX)
print(em.get_key(AOM))

WHED = em.read_csv_metadata(path_to_csv_dir + '_whed_cleaned.csv', key = csv_headers.WHED_INDEX)
print(em.get_key(WHED))


Metadata file is not present in the given path; proceeding to read the csv file.
Metadata file is not present in the given path; proceeding to read the csv file.


person_id
a_id


In [8]:
# building inverted index based on B
sample_WHED, sample_AOM = em.down_sample(WHED, AOM, size=2000, y_param=10)
print(len(sample_WHED))
print(len(sample_AOM))
em.show_properties(sample_WHED)
em.show_properties(sample_AOM)

0%                          100%
[##############################] | ETA: 00:00:12 | ETA: 00:00:10 | ETA: 00:00:10 | ETA: 00:00:09 | ETA: 00:00:09 | ETA: 00:00:08 | ETA: 00:00:08 | ETA: 00:00:07 | ETA: 00:00:07 | ETA: 00:00:07 | ETA: 00:00:06 | ETA: 00:00:06 | ETA: 00:00:06 | ETA: 00:00:05 | ETA: 00:00:05 | ETA: 00:00:04 | ETA: 00:00:04 | ETA: 00:00:04 | ETA: 00:00:03 | ETA: 00:00:03 | ETA: 00:00:03 | ETA: 00:00:02 | ETA: 00:00:02 | ETA: 00:00:02 | ETA: 00:00:01 | ETA: 00:00:01 | ETA: 00:00:01 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00

8872
2000
id: 4633726312
key: a_id
id: 4633726536
key: person_id



Total time elapsed: 00:00:10


In [9]:
sample_WHED.head(n=5)

Unnamed: 0,a_id,a_name,a_country,a_city,a_prov,a_web
16384,21848,zhongyuan university technology,china,zhengzhou,Henan Province,http://www.zzti.edu.cn
0,2,pampanga state agricultural university,philippines,magalang,Pampanga,http://www.pac.edu.ph
16386,21850,zhytomyr state ivan franko university,ukraine,zytomyr,Zytomyr Region,http://www.academy.zt.ua
16387,21851,ziauddin university,pakistan,karachi,Sindh,http://www.zu.edu.pk
16385,21849,zhoukou normal university,china,zhoukou,Henan,http://www.zknu.edu.cn/


In [10]:
sample_AOM.head(n=5)

Unnamed: 0,person_id,a_name,a_city,a_prov,a_country,a_email_server
4783,35712,renmin university china,beijing,,china,ruc
9311,59456,north china university water conservancy and electric power,zhengzhou,he,china,126
3209,20917,university chicago,chicago,illinois,united states,chicagobooth
5217,39424,university complutense de madrid,madrid,,spain,ucm
7451,55302,washington state university,pullman,washington,united states,wsu


In [None]:
def match_country_prov(ltuple, rtuple):
#   return True  - if the intention is to drop the pair
#          False - if the intention is to keep the tuple pair
    l_country = ltuple['a_country']
    r_country = rtuple['a_country']
    l_prov = ltuple['a_prov']
    r_prov = rtuple['a_prov']
    
    if l_country != r_country:
        return True
    elif l_country == "united states":
        if l_prov != r_prov:
            return True
        else:
            return False
    else:
        return False

def match_domain(ltuple, rtuple):
    l_web = ltuple['a_web']
    r_email = rtuple['a_email_server']
    if str(r_email) in str(l_web):
        return False
    else:
        return True

    
def match_overlap(ltuple, rtuple):
    l_tokens = em.tok_wspace(ltuple['a_name'])
    r_tokens = em.tok_wspace(rtuple['a_name'])
    print(l_tokens)
    print(r_tokens)    
    
    if len(l_tokens) > 1 and len(r_tokens) > 1:
        if "university" in l_tokens:
            l_tokens.remove("university") 
        if "university" in r_tokens:
            r_tokens.remove("university") 
        return em.overlap_coeff(l_tokens, r_tokens) == 0
    else:
        return False
    
def match_combined(ltuple, rtuple):
    if match_country_prov(ltuple, rtuple):
        return True
    elif match_domain(ltuple, rtuple):        
        return match_overlap(ltuple, rtuple)
    else:
        return False
    
def blocking(A, B):
    bb = em.BlackBoxBlocker()
    bb.set_black_box_function(match_combined)
    C = bb.block_tables(A, B, l_output_attrs=['a_name','a_country','a_city','a_prov','a_web'], r_output_attrs=['a_name','a_country','a_city','a_prov','a_email_server'] )
    return C

C = blocking(sample_WHED, sample_AOM)


0%                          100%
[                              ]

['zhongyuan', 'university', 'technology']
['renmin', 'university', 'china']
['zhongyuan', 'university', 'technology']
['north', 'china', 'university', 'water', 'conservancy', 'and', 'electric', 'power']
['zhongyuan', 'university', 'technology']
['huazhong', 'university', 'science', 'technology']
['zhongyuan', 'university', 'technology']
['university', 'hong', 'kong']
['zhongyuan', 'university', 'technology']
['ceibs']
['zhongyuan', 'university', 'technology']
['sichuan', 'university']
['zhongyuan', 'university', 'technology']
['xi', 'an', 'jiaotong', 'university']
['zhongyuan', 'university', 'technology']
['gsm', 'peking', 'university']
['zhongyuan', 'university', 'technology']
['chinese', 'university', 'hong', 'kong']
['zhongyuan', 'university', 'technology']
['jilin', 'university']
['zhongyuan', 'university', 'technology']
['beijing', 'normal', 'university']
['zhongyuan', 'university', 'technology']
['shandong', 'university', '/', 'ludong', 'university']
['zhongyuan', 'university', '

[#                             ] | ETA: 00:19:37

['friedrich', 'alexander', 'university', 'erlangen', 'nuremberg']
['alanus', 'university', 'arts', 'social', 'sciences']
['ludwig', 'maximilian', 'university', 'munich']
['alanus', 'university', 'arts', 'social', 'sciences']
['whu', 'otto', 'beisheim', 'school', 'management']
['alanus', 'university', 'arts', 'social', 'sciences']
['university', 'mannheim']
['alanus', 'university', 'arts', 'social', 'sciences']
['university', 'erlangen', 'nuremberg']
['alanus', 'university', 'arts', 'social', 'sciences']
['otto', 'von', 'guericke', 'university', 'magdeburg']
['alanus', 'university', 'arts', 'social', 'sciences']
['freie', 'university', 'berlin']
['alanus', 'university', 'arts', 'social', 'sciences']
['ruhr', 'university', 'bochum']
['alanus', 'university', 'arts', 'social', 'sciences']
['technische', 'university', 'dresden']
['alanus', 'university', 'arts', 'social', 'sciences']
['university', 'applied', 'management']
['alanus', 'university', 'arts', 'social', 'sciences']
['university',

In [11]:
C.head(n = 50000)


Unnamed: 0,_id,ltable_person_id,rtable_a_id,ltable_a_name,ltable_a_country,ltable_a_city,ltable_a_prov,ltable_a_email_server,rtable_a_name,rtable_a_country,rtable_a_city,rtable_a_prov,rtable_a_web
0,0,46555,13931,university de barcelona,spain,barcelona,,gmail,royal school dramatic art madrid,spain,madrid,,http://www.resad.es
1,1,46555,17165,university de barcelona,spain,barcelona,,gmail,nebrija university,spain,hoyos de manzanares,Madrid,http://www.nebrija.com
2,2,46555,19344,university de barcelona,spain,barcelona,,gmail,jaume i university,spain,castellon de la plana,Castellon,http://www.uji.es
3,3,46555,4784,university de barcelona,spain,barcelona,,gmail,school dramatic art valencia,spain,valencia,,http://www.esadvalencia.com
4,4,46555,17309,university de barcelona,spain,barcelona,,gmail,saint vincent martyr catholic university valencia,spain,valencia,,http://www.ucv.es
5,5,46555,3204,university de barcelona,spain,barcelona,,gmail,rafael orozco music conservatoire cordoba,spain,cordoba,,http://www.csmcordoba.com
6,6,46555,17285,university de barcelona,spain,barcelona,,gmail,catholic university avila,spain,avila,,http://www.ucavila.es
7,7,46555,17362,university de barcelona,spain,barcelona,,gmail,complutense university madrid,spain,madrid,,http://www.ucm.es
8,8,46555,3215,university de barcelona,spain,barcelona,,gmail,oscar espla music conservatoire alicante,spain,alicante,,http://www.csmalicante.es
9,9,46555,17564,university de barcelona,spain,barcelona,,gmail,university oviedo,spain,oviedo,Asturias,http://www.uniovi.es


In [13]:
C.size

3426683