In [1]:
import py_entitymatching as em
import pandas as pd
import os, sys
import logging
from os.path import expanduser

In [2]:
# Set up logger
logging.basicConfig()

# Set paths
folder = "/Documents/CS838/Stage3/datasets"
table_a_file = "songs.csv"
table_b_file = "tracks.csv"

# Set directory path
datasets_dir = expanduser("~") + folder

# Set path for tables
path_A = datasets_dir + os.sep + table_a_file
path_B = datasets_dir + os.sep + table_b_file

In [3]:
# Read csv files
A = em.read_csv_metadata(path_A, key='id')
B = em.read_csv_metadata(path_B, key='id')



In [4]:
print('Number of tuples in A: ' + str(len(A)))
print('Number of tuples in B: ' + str(len(B)))
print('Number of tuples in A X B (i.e the cartesian product): ' + str(len(A)*len(B)))

Number of tuples in A: 961593
Number of tuples in B: 734485
Number of tuples in A X B (i.e the cartesian product): 706275634605


In [5]:
# Downsample tables
sample_A, sample_B = em.down_sample(A, B, 10000, 1.5, show_progress=True, verbose=True)

0%                          100%
[##############################] | ETA: 00:04:32 | ETA: 00:04:42 | ETA: 00:04:58 | ETA: 00:04:45 | ETA: 00:04:35 | ETA: 00:04:20 | ETA: 00:04:11 | ETA: 00:04:00 | ETA: 00:03:47 | ETA: 00:03:33 | ETA: 00:03:26 | ETA: 00:03:16 | ETA: 00:03:06 | ETA: 00:02:54 | ETA: 00:02:41 | ETA: 00:02:28 | ETA: 00:02:15 | ETA: 00:02:05 | ETA: 00:01:55 | ETA: 00:01:45 | ETA: 00:01:34 | ETA: 00:01:23 | ETA: 00:01:13 | ETA: 00:01:01 | ETA: 00:00:51 | ETA: 00:00:41 | ETA: 00:00:30 | ETA: 00:00:20 | ETA: 00:00:10 | ETA: 00:00:00 | ETA: 00:00:00
Total time elapsed: 00:05:06


In [34]:
# Save downsampled tables to csv
sample_A.to_csv(datasets_dir + os.sep + "sampleA.csv")
sample_B.to_csv(datasets_dir + os.sep + "sampleB.csv")

In [39]:
sample_B.head()

Unnamed: 0,id,title,year,episode,song,artists
505168,276654,The Tonight Show Starring Johnny Carson,1962.0,(1989-11-14),Johnnys Theme,paul anka
226722,437417,Father of the Bird,1997.0,,The Flying Trapeze,gaston lyle
641892,608033,Secretary,2002.0,,Its So Strange (The Way Love Works),the honeydogs+jeff barry+ellie greenwich+neil diamond
406210,170096,OchÌ©ntame... otra vez,2014.0,Vidas de copla (#2.10),Torbellino de colores,lola flores+rafael de leÌ_n+juan solano
429181,595711,River of No Return,1954.0,,Down in the Meadow,marilyn monroe+lionel newman


In [42]:
# Perform 1-word overlap blocking on artist name
ob = em.OverlapBlocker()
C1 = ob.block_tables(sample_A, sample_B, 'artist_name', 'artists', 
                     rem_stop_words=True, word_level=True, overlap_size=1, 
                     l_output_attrs=['id', 'title', 'artist_name', 'year'], 
                     r_output_attrs=['id', 'title', 'year', 'episode', 'song', 'artists'],
                     show_progress=True)
C1.to_csv(datasets_dir + os.sep + "C1.csv")

0%                          100%
[##############################] | ETA: 00:00:01 | ETA: 00:00:06 | ETA: 00:00:04 | ETA: 00:00:03 | ETA: 00:00:03 | ETA: 00:00:02 | ETA: 00:00:02 | ETA: 00:00:02 | ETA: 00:00:02 | ETA: 00:00:02 | ETA: 00:00:01 | ETA: 00:00:01 | ETA: 00:00:01 | ETA: 00:00:01 | ETA: 00:00:01 | ETA: 00:00:01 | ETA: 00:00:01 | ETA: 00:00:01 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00
Total time elapsed: 00:00:02


In [43]:
# Perform 1-word overlap blocking on title
C2 = ob.block_tables(sample_A, sample_B, 'title', 'song', 
                     rem_stop_words=True, word_level=True, overlap_size=1, 
                     l_output_attrs=['id', 'title', 'artist_name', 'year'], 
                     r_output_attrs=['id', 'title', 'year', 'episode', 'song', 'artists'], 
                     show_progress=True)
C2.to_csv(datasets_dir + os.sep + "C2.csv")

0%                          100%
[##############################] | ETA: 00:00:09 | ETA: 00:00:11 | ETA: 00:00:11 | ETA: 00:00:11 | ETA: 00:00:11 | ETA: 00:00:11 | ETA: 00:00:10 | ETA: 00:00:10 | ETA: 00:00:09 | ETA: 00:00:08 | ETA: 00:00:08 | ETA: 00:00:07 | ETA: 00:00:07 | ETA: 00:00:07 | ETA: 00:00:06 | ETA: 00:00:05 | ETA: 00:00:05 | ETA: 00:00:05 | ETA: 00:00:04 | ETA: 00:00:04 | ETA: 00:00:03 | ETA: 00:00:03 | ETA: 00:00:03 | ETA: 00:00:02 | ETA: 00:00:02 | ETA: 00:00:01 | ETA: 00:00:01 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00
Total time elapsed: 00:00:12


In [44]:
# Combine blocker outputs for 1 work overlap on artist and title
C = em.combine_blocker_outputs_via_union([C1, C2])
C.to_csv(datasets_dir + os.sep + "C.csv")

In [202]:
# Read in files as a hack to fix KeyError (explained in writeup)
path_sA = datasets_dir + os.sep + "sampleA.csv"
path_sB = datasets_dir + os.sep + "sampleB.csv"
path_C = datasets_dir + os.sep + "C.csv"

# Read and set metadata for downsampled B
sA = em.read_csv_metadata(path_sA, key='id')

# Read and set metadata for downsampled B
sB = em.read_csv_metadata(path_sB, key='id')

# Read and set metadata for blocked C1
C = em.read_csv_metadata(path_C, key='_id', ltable=sA, rtable=sB, fk_ltable='ltable_id', fk_rtable='rtable_id')



In [58]:
# Debug overlap blocker
corres = [('title','song'), ('artist_name','artists'), ('year', 'year')]
dbg = em.debug_blocker(C, sA, sB, output_size=200, attr_corres=corres)

In [47]:
dbg.to_csv(datasets_dir + os.sep + 'dbg.csv')

In [48]:
# Sample candidate set
S = em.sample_table(C, 450)
S.to_csv(datasets_dir + os.sep + 'S.csv')

In [62]:
# Too many non-matches so try rule based blocking blocking
block_t = em.get_tokenizers_for_blocking()
block_s = em.get_sim_funs_for_blocking()
block_c = em.get_attr_corres(sample_A, sample_B)
block_c['corres']

[('id', 'id'), ('title', 'title'), ('year', 'year')]

In [63]:
block_c['corres'] = corres
block_c['corres']

[('title', 'song'), ('artist_name', 'artists'), ('year', 'year')]

In [64]:
atypes1 = em.get_attr_types(sample_A)
atypes2 = em.get_attr_types(sample_B)
block_f = em.get_features(sample_A, sample_B, atypes1, atypes2, block_c, block_t, block_s)
block_f

Unnamed: 0,feature_name,left_attribute,right_attribute,left_attr_tokenizer,right_attr_tokenizer,simfunction,function,function_source,is_auto_generated
0,title_song_jac_qgm_3_qgm_3,title,song,qgm_3,qgm_3,jaccard,<function title_song_jac_qgm_3_qgm_3 at 0x10f298f50>,from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...,True
1,title_song_cos_dlm_dc0_dlm_dc0,title,song,dlm_dc0,dlm_dc0,cosine,<function title_song_cos_dlm_dc0_dlm_dc0 at 0x10f298de8>,from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...,True
2,title_song_jac_dlm_dc0_dlm_dc0,title,song,dlm_dc0,dlm_dc0,jaccard,<function title_song_jac_dlm_dc0_dlm_dc0 at 0x121010c80>,from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...,True
3,title_song_mel,title,song,,,monge_elkan,<function title_song_mel at 0x121010c08>,from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...,True
4,title_song_lev_dist,title,song,,,lev_dist,<function title_song_lev_dist at 0x121010b90>,from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...,True
5,title_song_lev_sim,title,song,,,lev_sim,<function title_song_lev_sim at 0x121010d70>,from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...,True
6,title_song_nmw,title,song,,,needleman_wunsch,<function title_song_nmw at 0x120f20050>,from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...,True
7,title_song_sw,title,song,,,smith_waterman,<function title_song_sw at 0x120f200c8>,from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...,True
8,artist_name_artists_jac_qgm_3_qgm_3,artist_name,artists,qgm_3,qgm_3,jaccard,<function artist_name_artists_jac_qgm_3_qgm_3 at 0x120f20140>,from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...,True
9,artist_name_artists_cos_dlm_dc0_dlm_dc0,artist_name,artists,dlm_dc0,dlm_dc0,cosine,<function artist_name_artists_cos_dlm_dc0_dlm_dc0 at 0x120f201b8>,from py_entitymatching.feature.simfunctions import *\nfrom py_entitymatching.feature.tokenizers ...,True


In [67]:
# 3-gram Jaccard on song and title
rule1 = ['title_song_jac_qgm_3_qgm_3(ltuple, rtuple) < 0.5']
rb = em.RuleBasedBlocker()
rb.add_rule(rule1, block_f, rule_name='rule_1')
D = rb.block_candset(C)
D.to_csv(datasets_dir + os.sep + "D.csv")

0%                          100%
[                              ]0%                          100%
[##############################] | ETA: 00:06:09 | ETA: 00:05:54 | ETA: 00:05:37 | ETA: 00:05:17 | ETA: 00:05:01 | ETA: 00:04:46 | ETA: 00:04:32 | ETA: 00:04:19 | ETA: 00:04:10 | ETA: 00:03:59 | ETA: 00:03:47 | ETA: 00:03:34 | ETA: 00:03:22 | ETA: 00:03:09 | ETA: 00:02:57 | ETA: 00:02:45 | ETA: 00:02:32 | ETA: 00:02:20 | ETA: 00:02:08 | ETA: 00:01:57 | ETA: 00:01:45 | ETA: 00:01:33 | ETA: 00:01:21 | ETA: 00:01:09 | ETA: 00:00:58 | ETA: 00:00:49 | ETA: 00:00:36 | ETA: 00:00:24 | ETA: 00:00:12 | ETA: 00:00:00 | ETA: 00:00:00
Total time elapsed: 00:06:11


In [80]:
# Debug rule-based blocking (3-gram Jaccard) on C
dbg2 = em.debug_blocker(D, sA, sB, output_size=200, attr_corres=corres)
dbg2.to_csv(datasets_dir + os.sep + "dbg2.csv")

In [71]:
# D has too many matches removed so try going back
# and doing overlap blocking on songs in C1 with
# more stop words
stop = ['de', 'del', 'du', 'of', 'la', 'le']

for word in stop:
    ob.stop_words.append(word)
    
E = ob.block_candset(C1, 'title', 'song', rem_stop_words=True, verbose=True)

E.to_csv(datasets_dir + os.sep + "E.csv")

0%                          100%
[##############################] | ETA: 00:00:06 | ETA: 00:00:06 | ETA: 00:00:06 | ETA: 00:00:06 | ETA: 00:00:06 | ETA: 00:00:06 | ETA: 00:00:05 | ETA: 00:00:05 | ETA: 00:00:05 | ETA: 00:00:04 | ETA: 00:00:04 | ETA: 00:00:04 | ETA: 00:00:04 | ETA: 00:00:03 | ETA: 00:00:03 | ETA: 00:00:03 | ETA: 00:00:03 | ETA: 00:00:02 | ETA: 00:00:02 | ETA: 00:00:02 | ETA: 00:00:02 | ETA: 00:00:01 | ETA: 00:00:01 | ETA: 00:00:01 | ETA: 00:00:01 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00
Total time elapsed: 00:00:07


In [72]:
ob.stop_words

['a',
 'an',
 'and',
 'are',
 'as',
 'at',
 'be',
 'by',
 'for',
 'from',
 'has',
 'he',
 'in',
 'is',
 'it',
 'its',
 'on',
 'that',
 'the',
 'to',
 'was',
 'were',
 'will',
 'with',
 'de',
 'del',
 'du',
 'of',
 'la',
 'le',
 'de',
 'del',
 'du',
 'of',
 'la',
 'le',
 'de',
 'del',
 'du',
 'of',
 'la',
 'le']

In [88]:
# Title-Song 1 word overlap blocking on C1 with more stop words added
stop2 = ['i', 'im', 'my', 'me', 'you', 'your', 'we', 'our', 'el', 'after', 'theme', 'y', 'that', 'like', 'little', 'all', 'love']

for word in stop2:
    ob.stop_words.append(word)
    
ob.stop_words = list(set(ob.stop_words))

E1 = ob.block_candset(C1, 'title', 'song', rem_stop_words=True, verbose=True)

E1.to_csv(datasets_dir + os.sep + "E1.csv")

0%                          100%
[##############################] | ETA: 00:00:07 | ETA: 00:00:07 | ETA: 00:00:06 | ETA: 00:00:06 | ETA: 00:00:05 | ETA: 00:00:05 | ETA: 00:00:05 | ETA: 00:00:05 | ETA: 00:00:05 | ETA: 00:00:04 | ETA: 00:00:04 | ETA: 00:00:04 | ETA: 00:00:04 | ETA: 00:00:03 | ETA: 00:00:03 | ETA: 00:00:03 | ETA: 00:00:03 | ETA: 00:00:02 | ETA: 00:00:02 | ETA: 00:00:02 | ETA: 00:00:02 | ETA: 00:00:01 | ETA: 00:00:01 | ETA: 00:00:01 | ETA: 00:00:01 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00
Total time elapsed: 00:00:07


In [89]:
# Debug title-song overlap blocker on C1
dbg3 = em.debug_blocker(E1, sA, sB, output_size=200, attr_corres=corres)
dbg3.to_csv(datasets_dir + os.sep + "dbg3.csv")

In [203]:
# Modify sample_B2 to remove "+" between artist names and replace with a single whitespace
# to improve recall
# Rerun 1-word overlap blocking on artist_name and artists

# Read and set metadata for downsampled B1 (B with '+' removed from artists)
sB1 = em.read_csv_metadata(datasets_dir + os.sep + "sampleB1.csv", key='id')



In [161]:
C1a = ob.block_tables(sA, sB1, 'artist_name', 'artists', 
                     rem_stop_words=True, word_level=True, overlap_size=1, 
                     l_output_attrs=['id', 'title', 'artist_name', 'year'], 
                     r_output_attrs=['id', 'title', 'year', 'episode', 'song', 'artists'],
                     show_progress=True)

C1a.to_csv(datasets_dir + os.sep + "C1a.csv")

0%                          100%
[##############################] | ETA: 00:00:02 | ETA: 00:00:02 | ETA: 00:00:02 | ETA: 00:00:02 | ETA: 00:00:02 | ETA: 00:00:02 | ETA: 00:00:02 | ETA: 00:00:02 | ETA: 00:00:01 | ETA: 00:00:01 | ETA: 00:00:01 | ETA: 00:00:01 | ETA: 00:00:01 | ETA: 00:00:01 | ETA: 00:00:01 | ETA: 00:00:01 | ETA: 00:00:01 | ETA: 00:00:01 | ETA: 00:00:01 | ETA: 00:00:01 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00
Total time elapsed: 00:00:02


In [162]:
# Title-Song 1 word overlap blocking on C1a
C1b = ob.block_candset(C1a, 'title', 'song', rem_stop_words=True, verbose=True)

C1b.to_csv(datasets_dir + os.sep + "C1b.csv")

0%                          100%
[##############################] | ETA: 00:00:09 | ETA: 00:00:08 | ETA: 00:00:08 | ETA: 00:00:08 | ETA: 00:00:08 | ETA: 00:00:08 | ETA: 00:00:08 | ETA: 00:00:07 | ETA: 00:00:07 | ETA: 00:00:06 | ETA: 00:00:06 | ETA: 00:00:06 | ETA: 00:00:05 | ETA: 00:00:05 | ETA: 00:00:04 | ETA: 00:00:04 | ETA: 00:00:04 | ETA: 00:00:03 | ETA: 00:00:03 | ETA: 00:00:03 | ETA: 00:00:02 | ETA: 00:00:02 | ETA: 00:00:02 | ETA: 00:00:01 | ETA: 00:00:01 | ETA: 00:00:01 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00
Total time elapsed: 00:00:09


In [163]:
# Debug title-song overlap blocker on C1
dbg4 = em.debug_blocker(C1b, sA, sB1, output_size=200, attr_corres=corres)
dbg4.to_csv(datasets_dir + os.sep + "dbg4.csv")

In [165]:
# Sample candidate set
S1 = em.sample_table(C1b, 450)
S1.to_csv(datasets_dir + os.sep + 'S1.csv')

In [166]:
# Too many non-matches in C1b
# Add rule to check if title begin with the same word
# Check against the first two words in case one title starts with
# an article and the other omits it (ex. "The New Machine" and "New Machine")
def title_song_function(x, y):
    if len(x)==0 or len(y)==(0):
        return True
    
    x_title = x['title'].lower().split()
    y_title = y['song'].lower().split()
    
    x1 = x_title[0]
    
    x2 = ''
    if len(x_title) > 1:
        x2 = x_title[1]
        
    y1 = y_title[0]
    
    y2 = ''
    if len(y_title) > 1:
        y2 = y_title[1]
    
    if (x1 == y1 or x1 == y2 or x2 == y1 or x2 == y2):
        return False
    else:
        return True

In [167]:
bb = em.BlackBoxBlocker()
bb.set_black_box_function(title_song_function)

In [168]:
C3 = bb.block_candset(C1b)

0%                          100%
[##############################] | ETA: 00:00:02 | ETA: 00:00:01 | ETA: 00:00:01 | ETA: 00:00:01 | ETA: 00:00:01 | ETA: 00:00:01 | ETA: 00:00:01 | ETA: 00:00:01 | ETA: 00:00:01 | ETA: 00:00:01 | ETA: 00:00:01 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00
Total time elapsed: 00:00:01


In [169]:
# Debug title-song overlap blocker on C3
dbg5 = em.debug_blocker(C3, sA, sB1, output_size=200, attr_corres=corres)
dbg5.to_csv(datasets_dir + os.sep + "dbg5.csv")

In [170]:
# Sample candidate set
S2 = em.sample_table(C3, 450)
S2.to_csv(datasets_dir + os.sep + 'S2.csv')

In [173]:
# Missing some matches in C3
# Create another overlap block on 2-word overlap on song
# and 1 word overlap on artist
C4 = ob.block_tables(sA, sB1, 'artist_name', 'artists', 
                     rem_stop_words=True, word_level=True, overlap_size=2, 
                     l_output_attrs=['id', 'title', 'artist_name', 'year'], 
                     r_output_attrs=['id', 'title', 'year', 'episode', 'song', 'artists'],
                     show_progress=True)

C4 = ob.block_candset(C4, 'title', 'song', 
                     rem_stop_words=True, word_level=True, overlap_size=2, 
                     show_progress=True)

C4.to_csv(datasets_dir + os.sep + "C4.csv")

0%                          100%
[##############################] | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00
Total time elapsed: 00:00:00
0%                          100%
[##############################] | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 

In [174]:
# Combine C3 and C4: 1-word overlap on artist and title and 
# either the one of the few words of the title match or 
# there is a 2-word overlap for the title
C5 = em.combine_blocker_outputs_via_union([C3, C4])
C5.to_csv(datasets_dir + os.sep + "C5.csv")

In [175]:
# Debug title-song overlap blocker on C5
dbg6 = em.debug_blocker(C5, sA, sB1, output_size=200, attr_corres=corres)
dbg6.to_csv(datasets_dir + os.sep + "dbg6.csv")

In [176]:
# Sample candidate set
S3 = em.sample_table(C5, 400)
S3.to_csv(datasets_dir + os.sep + 'S3.csv')

In [177]:
# Read in labelled data
path_G = datasets_dir + os.sep + "G.csv"
G = em.read_csv_metadata(path_G, encoding='utf-8',
                         key='_id',
                         ltable=A, rtable=B, 
                         fk_ltable='ltable_id', fk_rtable='rtable_id')



In [178]:
len(G)

390

In [179]:
# Split D into development set (I) and evaluation set (J)
IJ = em.split_train_test(G, train_proportion=0.7, random_state=0)
I = IJ['train']
J = IJ['test']

In [180]:
# Create a set of ML-matchers
dt = em.DTMatcher(name='DecisionTree', random_state=0)
svm = em.SVMMatcher(name='SVM', random_state=0)
rf = em.RFMatcher(name='RF', random_state=0)
lg = em.LogRegMatcher(name='LogReg', random_state=0)
ln = em.LinRegMatcher(name='LinReg')
nb = em.NBMatcher(name='NaiveBayes')

In [207]:
# Generate features
# sample_A, sample_B, atypes1, atypes2, block_c, block_t, block_s
atypes1 = em.get_attr_types(sA)
atypes2 = em.get_attr_types(sB1)
block_c = em.get_attr_corres(sA, sB1)
F = em.get_features(sA, sB1, atypes1, atypes2, block_c, block_t, block_s)

SyntaxError: invalid syntax (<string>, line 3)

In [211]:
# Delete first column from each table to resolve error
del sA['Unnamed: 0']
del sB1['Unnamed: 0']

In [249]:
# Generate features
# sample_A, sample_B, atypes1, atypes2, block_c, block_t, block_s
atypes1 = em.get_attr_types(sA)
atypes2 = em.get_attr_types(sB1)
block_c = em.get_attr_corres(sA, sB1)
block_c['corres'] = [('id', 'id'), ('title','song'), ('artist_name','artists'), ('year', 'year')]
F = em.get_features(sA, sB1, atypes1, atypes2, block_c, block_t, block_s)

In [250]:
# Convert the I into a set of feature vectors using F
H = em.extract_feature_vecs(I, 
                            feature_table=F, 
                            attrs_after='gold',
                            show_progress=True)

0%                          100%
[##############################] | ETA: 00:00:04 | ETA: 00:00:03 | ETA: 00:00:02 | ETA: 00:00:02 | ETA: 00:00:02 | ETA: 00:00:02 | ETA: 00:00:01 | ETA: 00:00:01 | ETA: 00:00:01 | ETA: 00:00:01 | ETA: 00:00:01 | ETA: 00:00:01 | ETA: 00:00:01 | ETA: 00:00:01 | ETA: 00:00:01 | ETA: 00:00:01 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00
Total time elapsed: 00:00:02


In [251]:
# Select the best ML matcher using CV
result = em.select_matcher([dt, rf, svm, ln, lg, nb], table=H, 
        exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'gold'],
        k=5,
        target_attr='gold', metric='precision', random_state=0)

result['cv_stats']

Unnamed: 0,Name,Matcher,Num folds,Fold 1,Fold 2,Fold 3,Fold 4,Fold 5,Mean score
0,DecisionTree,<py_entitymatching.matcher.dtmatcher.DTMatcher object at 0x121f2d690>,5,0.692308,0.923077,0.913043,0.88,0.961538,0.873993
1,RF,<py_entitymatching.matcher.rfmatcher.RFMatcher object at 0x121f2d490>,5,0.904762,0.913043,0.88,1.0,1.0,0.939561
2,SVM,<py_entitymatching.matcher.svmmatcher.SVMMatcher object at 0x121f2d2d0>,5,1.0,1.0,0.928571,1.0,1.0,0.985714
3,LinReg,<py_entitymatching.matcher.linregmatcher.LinRegMatcher object at 0x121f2d510>,5,0.863636,0.925926,0.913043,1.0,0.933333,0.927188
4,LogReg,<py_entitymatching.matcher.logregmatcher.LogRegMatcher object at 0x121f2de10>,5,0.863636,0.923077,0.88,1.0,0.964286,0.9262
5,NaiveBayes,<py_entitymatching.matcher.nbmatcher.NBMatcher object at 0x121f2d710>,5,0.9,0.916667,0.846154,1.0,0.961538,0.924872


In [252]:
result = em.select_matcher([dt, rf, svm, ln, lg, nb], table=H, 
        exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'gold'],
        k=5,
        target_attr='gold', metric='recall', random_state=0)
result['cv_stats']

Unnamed: 0,Name,Matcher,Num folds,Fold 1,Fold 2,Fold 3,Fold 4,Fold 5,Mean score
0,DecisionTree,<py_entitymatching.matcher.dtmatcher.DTMatcher object at 0x121f2d690>,5,0.947368,0.923077,0.954545,0.846154,0.78125,0.890479
1,RF,<py_entitymatching.matcher.rfmatcher.RFMatcher object at 0x121f2d490>,5,1.0,0.807692,1.0,0.884615,0.84375,0.907212
2,SVM,<py_entitymatching.matcher.svmmatcher.SVMMatcher object at 0x121f2d2d0>,5,0.789474,0.5,0.590909,0.5,0.28125,0.532327
3,LinReg,<py_entitymatching.matcher.linregmatcher.LinRegMatcher object at 0x121f2d510>,5,1.0,0.961538,0.954545,0.884615,0.875,0.93514
4,LogReg,<py_entitymatching.matcher.logregmatcher.LogRegMatcher object at 0x121f2de10>,5,1.0,0.923077,1.0,0.923077,0.84375,0.937981
5,NaiveBayes,<py_entitymatching.matcher.nbmatcher.NBMatcher object at 0x121f2d710>,5,0.947368,0.846154,1.0,0.846154,0.78125,0.884185


In [254]:
# Convert J into a set of feature vectors using F
L = em.extract_feature_vecs(J, feature_table=F, attrs_after='gold', show_progress=False)

In [255]:
# Test matchers
models = [dt, svm, rf, lg, ln, nb]

def train(models):
    for model in models:
        # Train using feature vectors from I 
        model.fit(table=H, 
        exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'gold'], 
        target_attr='gold')

        # Predict on L 
        predictions = model.predict(table=L, exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'gold'], 
              append=True, target_attr='predicted', inplace=False)
        
        # Evaluate the predictions
        print ('Predictions of ' + model.name + ' on J')
        eval_result = em.eval_matches(predictions, 'gold', 'predicted')
        em.print_eval_summary(eval_result)
        print ('\n')

In [248]:
train(models)

Predictions of DecisionTree on J
Precision : 92.86% (52/56)
Recall : 88.14% (52/59)
F1 : 90.43%
False positives : 4 (out of 56 positive predictions)
False negatives : 7 (out of 61 negative predictions)


Predictions of SVM on J
Precision : 100.0% (25/25)
Recall : 42.37% (25/59)
F1 : 59.52%
False positives : 0 (out of 25 positive predictions)
False negatives : 34 (out of 92 negative predictions)


Predictions of RF on J
Precision : 88.24% (45/51)
Recall : 76.27% (45/59)
F1 : 81.82%
False positives : 6 (out of 51 positive predictions)
False negatives : 14 (out of 66 negative predictions)


Predictions of LogReg on J
Precision : 92.73% (51/55)
Recall : 86.44% (51/59)
F1 : 89.47%
False positives : 4 (out of 55 positive predictions)
False negatives : 8 (out of 62 negative predictions)


Predictions of LinReg on J
Precision : 94.44% (51/54)
Recall : 86.44% (51/59)
F1 : 90.27%
False positives : 3 (out of 54 positive predictions)
False negatives : 8 (out of 63 negative predictions)


Predictio