In [4]:
import sys
import os
import py_entitymatching as em
print('magellan version:' + em.__version__)
import re
import csv
import pandas as pd

from cleaner import Cleaner
from constants import csv_headers

magellan version:0.1.0


In [5]:
working_dir = os.path.dirname(os.getcwd())
path_to_csv_dir = working_dir + os.sep + 'csv_files'+ os.sep

# STEP 1 - PRE-PROCESSING DATA

In this stage, we need to preprocess data before applying Megellan. This is because our datasets (especially the AOM dataset) are quite dirty, and therefore adversely affecting Megellan's blocking and matching functions. For example, states can take any value of "CA", "California", or "CA - California".

In this step, we will clean the following variables:
* Country name (e.g. Whed data has 2 Belgiums: (1) Belgium - French Community and (2) Belgium - Flemish Community)
* State name
* City name
* Affiliation name
* Email server domain (we will only capture the university information from the email server domain - if there is any)


### 1.A. Clean AOM data

In [6]:
aom_cleaner = Cleaner(path_to_csv_dir + '_aom.csv', csv_headers.AOM_INDEX, csv_headers.AOM)

aom_cleaner.clean_affiliation('a_name')
aom_cleaner.clean_email_server('a_email_server')
aom_cleaner.clean_city('a_city')
aom_cleaner.clean_country('a_country')
aom_cleaner.clean_states('a_prov', 'a_country')

aom_cleaner.to_csv(path_to_csv_dir + '_aom_cleaned.csv')
aom_cleaner.data.head(n=3)

Unnamed: 0_level_0,a_name,a_city,a_prov,a_country,a_email_server
person_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,academy management,briarcliff manor,new york,united states,aom
4,nor astern university,boston,massachusetts,united states,gmail
5,skidmore college,saratoga springs,new york,united states,skidmore


### 1.B. Clean WHED data

In [8]:
whed_cleaner = Cleaner(path_to_csv_dir + '_whed.csv', csv_headers.WHED_INDEX, csv_headers.WHED, encoding = 'ISO-8859-1')

whed_cleaner.clean_affiliation('a_name')
whed_cleaner.clean_city('a_city')
whed_cleaner.clean_country('a_country')
whed_cleaner.clean_states('a_prov', 'a_country')

whed_cleaner.to_csv(path_to_csv_dir + '_whed_cleaned.csv')
whed_cleaner.data.head(n=3)

Unnamed: 0_level_0,a_name,a_country,a_city,a_prov,a_web
a_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2,pampanga state agricultural university,philippines,magalang,pampanga,http://www.pac.edu.ph
4,les roches international school hotel management,switzerland,bluche crans montana,bluche-crans-montana,http://www.lesroches.edu
6,dharma gate budapest buddhist university,hungary,budapest,,http://www.tkbf.eu


# STEP 2 - MAGELLAN - BLOCKING

In [9]:

AOM = em.read_csv_metadata(path_to_csv_dir + '_aom_cleaned.csv', key = csv_headers.AOM_INDEX)
print(em.get_key(AOM))

WHED = em.read_csv_metadata(path_to_csv_dir + '_whed_cleaned.csv', key = csv_headers.WHED_INDEX)
print(em.get_key(WHED))


Metadata file is not present in the given path; proceeding to read the csv file.
Metadata file is not present in the given path; proceeding to read the csv file.


person_id
a_id


In [10]:
# building inverted index based on B
sample_WHED, sample_AOM = em.down_sample(WHED, AOM, size=3000, y_param=10)
print(len(sample_WHED))
print(len(sample_AOM))
em.show_properties(sample_WHED)
em.show_properties(sample_AOM)

0%                          100%
[##############################] | ETA: 00:00:18 | ETA: 00:00:17 | ETA: 00:00:16 | ETA: 00:00:16 | ETA: 00:00:15 | ETA: 00:00:14 | ETA: 00:00:13 | ETA: 00:00:13 | ETA: 00:00:12 | ETA: 00:00:11 | ETA: 00:00:11 | ETA: 00:00:10 | ETA: 00:00:10 | ETA: 00:00:09 | ETA: 00:00:09 | ETA: 00:00:08 | ETA: 00:00:07 | ETA: 00:00:07 | ETA: 00:00:06 | ETA: 00:00:06 | ETA: 00:00:05 | ETA: 00:00:04 | ETA: 00:00:04 | ETA: 00:00:03 | ETA: 00:00:03 | ETA: 00:00:02 | ETA: 00:00:01 | ETA: 00:00:01 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00

11178
3000
id: 4572803816
key: a_id
id: 4620538656
key: person_id



Total time elapsed: 00:00:18


In [11]:
sample_WHED.head(n=5)

Unnamed: 0,a_id,a_name,a_country,a_city,a_prov,a_web
1,4,les roches international school hotel management,switzerland,bluche crans montana,bluche-crans-montana,http://www.lesroches.edu
2,6,dharma gate budapest buddhist university,hungary,budapest,,http://www.tkbf.eu
5,9,aalborg university,denmark,aalborg,,http://www.aau.dk
6,10,aalto university,finland,espoo,,http://www.aalto.fi/fi/
7,15,aarhus university,denmark,aarhus,c,http://www.au.dk/en


In [12]:
sample_AOM.head(n=5)

Unnamed: 0,person_id,a_name,a_city,a_prov,a_country,a_email_server
4016,28976,eberhard karls university tubingen,tubingen,,germany,uni-tuebingen
2670,15691,babson college,babson park,massachusetts,united states,babson
2197,12025,providence college,providence,rhode island,united states,providence
3222,21062,university central lancashire,preston,,united kingdom,uclan
3873,27742,university georgia,athens,georgia,united states,uga


In [13]:
def match_country(ltuple, rtuple):
    l_country = ltuple['a_country']
    r_country = rtuple['a_country']
    if (l_country == "") or (r_country == ""):
        return None
    else:
        return ( l_country == r_country)

def match_country_us(ltuple, rtuple):
    return (rtuple['a_country'] == "united states")

def match_prov(ltuple, rtuple):
    l_prov = ltuple['a_prov']
    r_prov = rtuple['a_prov']
    if (l_prov == "") or (r_prov == ""):
        return None
    else:
        return (l_prov == r_prov)

def match_city(ltuple, rtuple):
    l_city = ltuple['a_city']
    r_city = rtuple['a_city']
    if (l_city  == "") or (r_city == ""):
        return None
    else:
        return (l_city == r_city)

def match_provcity(ltuple, rtuple):
    l_city = ltuple['a_city']
    r_city = rtuple['a_city']
    l_prov = ltuple['a_prov']
    r_prov = rtuple['a_prov']
    if (l_city  == "") or (r_city == "") or (l_prov == "") or (r_prov == ""):
        return None
    else:
        return (str(l_city) == str(r_prov)) or (str(l_prov) == str(r_city))

def match_domain(ltuple, rtuple):
    l_web = str(ltuple['a_web'])
    r_email = str(rtuple['a_email_server'])
    if (l_web == "") or (r_email == ""):
        return None
    else:
        l_web = l_web.split('.')
        return (r_email in l_web)

def match_overlap(ltuple, rtuple):
    l_name = str(ltuple['a_name'])
    r_name = str(rtuple['a_name'])
    
    if (l_name == "") or (r_name == ""):
        return None
    else:
        if l_name.count(" ") > 0 and r_name.count(" ") > 0:
            l_name = re.sub(r"(university|school|institute|college)","",l_name)
            r_name = re.sub(r"(university|school|institute|college)","",r_name)    
            l_tokens = em.tok_wspace(l_name)
            r_tokens = em.tok_wspace(r_name)
            return em.overlap_coeff(l_tokens, r_tokens) > 0.5
        else:
            return None
    
def allFalse(array):
    num_none = array.count(None)
    num_false = array.count(False)    
    return (num_none + num_false) == len(array) and num_none != len(array)
        
    
def match_combined(ltuple, rtuple):
    if match_country(ltuple, rtuple) is False:
        return True
    elif (match_country_us(ltuple,rtuple) is True) and (match_prov(ltuple,rtuple) is False):
        return True
    elif (match_country_us(ltuple,rtuple) is True) and (allFalse([match_overlap(ltuple, rtuple), 
                                                                   match_domain(ltuple, rtuple)]) is True):
        return True
    elif (match_country_us(ltuple,rtuple) is False) and allFalse([match_overlap(ltuple, rtuple), 
                                                                   match_domain(ltuple, rtuple)]) is True:
        return True
    else:
        return False
    
def blocking(A, B):
    bb = em.BlackBoxBlocker()
    bb.set_black_box_function(match_combined)
    C = bb.block_tables(A, B, l_output_attrs=['a_name','a_country','a_city','a_prov','a_web'], r_output_attrs=['a_name','a_country','a_city','a_prov','a_email_server'] )
    return C

C = blocking(sample_WHED, sample_AOM)


0%                          100%
[##############################] | ETA: 00:17:25 | ETA: 00:17:59 | ETA: 00:16:52 | ETA: 00:16:16 | ETA: 00:15:18 | ETA: 00:14:25 | ETA: 00:13:28 | ETA: 00:12:45 | ETA: 00:12:08 | ETA: 00:11:27 | ETA: 00:10:43 | ETA: 00:10:05 | ETA: 00:09:30 | ETA: 00:08:55 | ETA: 00:08:20 | ETA: 00:07:52 | ETA: 00:07:20 | ETA: 00:06:44 | ETA: 00:06:10 | ETA: 00:05:40 | ETA: 00:05:03 | ETA: 00:04:28 | ETA: 00:03:53 | ETA: 00:03:19 | ETA: 00:02:46 | ETA: 00:02:14 | ETA: 00:01:40 | ETA: 00:01:07 | ETA: 00:00:33 | ETA: 00:00:00 | ETA: 00:00:00
Total time elapsed: 00:16:39


In [17]:
C.size

137722

In [18]:
C.head(n=1000)

Unnamed: 0,_id,ltable_a_id,rtable_person_id,ltable_a_name,ltable_a_country,ltable_a_city,ltable_a_prov,ltable_a_web,rtable_a_name,rtable_a_country,rtable_a_city,rtable_a_prov,rtable_a_email_server
0,0,10,58872,aalto university,finland,espoo,,http://www.aalto.fi/fi/,aalto university,finland,helsinki,,aalto
1,1,10,52765,aalto university,finland,espoo,,http://www.aalto.fi/fi/,aalto university,finland,espoo,finland,aalto
2,2,10,60032,aalto university,finland,espoo,,http://www.aalto.fi/fi/,aalto university,finland,0,,aalto
3,3,10,47009,aalto university,finland,espoo,,http://www.aalto.fi/fi/,aalto university,finland,aalto,,aalto
4,4,10,59274,aalto university,finland,espoo,,http://www.aalto.fi/fi/,aalto university,finland,aalto,,aalto
5,5,10,50544,aalto university,finland,espoo,,http://www.aalto.fi/fi/,aalto university,finland,aalto,,aalto
6,6,10,33443,aalto university,finland,espoo,,http://www.aalto.fi/fi/,aalto university,finland,helsinki,,aalto
7,7,10,56106,aalto university,finland,espoo,,http://www.aalto.fi/fi/,aalto university,finland,aalto,,aalto
8,8,10,14261,aalto university,finland,espoo,,http://www.aalto.fi/fi/,aalto university,finland,espoo,,stratnet
9,9,10,48739,aalto university,finland,espoo,,http://www.aalto.fi/fi/,aalto university,finland,helsinki,uusimaa,aalto


In [5]:
em.to_csv_metadata(C, path_to_csv_dir + 'matching_pairs_table_overlap2_emailserver.csv')
em.save_table(C, path_to_csv_dir + 'matching_pairs_table_overlap2_emailserver.pkl')

NameError: name 'C' is not defined

In [None]:
sample_AOM.to_csv(path_to_csv_dir + 'sample_AOM.csv', encoding = 'UTF-8', index_label = csv_headers.AOM_INDEX)
sample_WHED.to_csv(path_to_csv_dir + 'sample_WHED.csv', encoding = 'UTF-8', index_label = csv_headers.WHED_INDEX)