In [2]:
import os
import pandas as pd
import random
import numpy as np
import operator
import math

In [3]:
movies_dataset_path = 'movies_metadata.csv'
cols = ['genres', 'imdb_id', 'title', 'production_companies', 'production_countries']

# Read movies_metadata.csv & chosen columns & drop NA
movies_df = pd.read_csv(movies_dataset_path, usecols=cols).dropna()

# Drop rows whose 'genres' == [] or 'production_companies' == [] or 'production_countries' == []
movies_df = movies_df[(movies_df['genres'] != '[]')  & \
                        (movies_df['production_companies'] != '[]') & \
                        (movies_df['production_countries'] != '[]')]

In [4]:
# Choose 200 random rows
random_movies_df = movies_df.sample(n=200).reset_index(drop=True)

# Make 2 seperate KBs from random_movies_df
kb1 = random_movies_df.sample(n=120)
kb2 = random_movies_df.sample(n=120)

In [5]:
random_movies_df.head()

Unnamed: 0,genres,imdb_id,production_companies,production_countries,title
0,"[{'id': 878, 'name': 'Science Fiction'}, {'id'...",tt0070122,"[{'name': 'Toho Film (Eiga) Co. Ltd.', 'id': 6...","[{'iso_3166_1': 'JP', 'name': 'Japan'}]",Godzilla vs. Megalon
1,"[{'id': 18, 'name': 'Drama'}]",tt1850419,"[{'name': 'Mij Film Co.', 'id': 2440}, {'name'...","[{'iso_3166_1': 'TR', 'name': 'Turkey'}, {'iso...",Rhino Season
2,"[{'id': 878, 'name': 'Science Fiction'}, {'id'...",tt1817771,"[{'name': 'Columbia Pictures', 'id': 5}, {'nam...","[{'iso_3166_1': 'US', 'name': 'United States o...",Freaks of Nature
3,"[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...",tt0080166,"[{'name': 'Golden Harvest Company Ltd.', 'id':...","[{'iso_3166_1': 'HK', 'name': 'Hong Kong'}]",Knockabout
4,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",tt1326238,"[{'name': 'Virtual Audiovisuais', 'id': 16406}]","[{'iso_3166_1': 'PT', 'name': 'Portugal'}, {'i...",Backlight


In [6]:
kb1.head()

Unnamed: 0,genres,imdb_id,production_companies,production_countries,title
29,"[{'id': 12, 'name': 'Adventure'}, {'id': 35, '...",tt0060125,"[{'name': 'Fair Film', 'id': 1399}, {'name': '...","[{'iso_3166_1': 'ES', 'name': 'Spain'}, {'iso_...",For Love and Gold
149,"[{'id': 18, 'name': 'Drama'}, {'id': 35, 'name...",tt0273300,"[{'name': 'Eureka Pictures', 'id': 1002}, {'na...","[{'iso_3166_1': 'GB', 'name': 'United Kingdom'...",Jump Tomorrow
36,"[{'id': 10770, 'name': 'TV Movie'}, {'id': 18,...",tt3320502,"[{'name': 'Lighthouse Pictures', 'id': 6345}, ...","[{'iso_3166_1': 'US', 'name': 'United States o...",Let It Snow
135,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",tt0381681,"[{'name': 'Castle Rock Entertainment', 'id': 9...","[{'iso_3166_1': 'US', 'name': 'United States o...",Before Sunset
87,"[{'id': 18, 'name': 'Drama'}, {'id': 27, 'name...",tt0021884,"[{'name': 'Universal Pictures', 'id': 33}]","[{'iso_3166_1': 'US', 'name': 'United States o...",Frankenstein


In [7]:
kb2.head()

Unnamed: 0,genres,imdb_id,production_companies,production_countries,title
115,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",tt0097584,"[{'name': 'Mosfilm', 'id': 5120}]","[{'iso_3166_1': 'RU', 'name': 'Russia'}, {'iso...",Intergirl
59,"[{'id': 99, 'name': 'Documentary'}]",tt6146460,"[{'name': 'Radical Media', 'id': 11152}]","[{'iso_3166_1': 'US', 'name': 'United States o...",Hamilton's America
118,"[{'id': 18, 'name': 'Drama'}, {'id': 36, 'name...",tt0082095,"[{'name': 'Österreichischer Rundfunk (ORF)', '...","[{'iso_3166_1': 'AT', 'name': 'Austria'}, {'is...",The Boat Is Full
110,"[{'id': 35, 'name': 'Comedy'}, {'id': 10751, '...",tt0111470,"[{'name': 'Overseas FilmGroup', 'id': 888}, {'...","[{'iso_3166_1': 'US', 'name': 'United States o...",Trading Mom
147,"[{'id': 80, 'name': 'Crime'}, {'id': 18, 'name...",tt0061418,"[{'name': 'Tatira-Hiller Productions', 'id': 2...","[{'iso_3166_1': 'US', 'name': 'United States o...",Bonnie and Clyde


In [8]:
# Preprocessing: Make a string of list of dictionaries into list of names of features.
# E.g. 'genres' column output only contains 'Drama, Action, Documentary'
def dicts_list_to_str(df: pd.DataFrame, columns_to_modify):
    df_clone = df.copy()
    for index, row in df_clone.iterrows():
        for col in columns_to_modify:
            list_of_dicts = eval(row[col])
            if col == 'production_countries':
                col_elements = [col_dict['iso_3166_1'] for col_dict in list_of_dicts]
            else:
                col_elements = [col_dict['name'] for col_dict in list_of_dicts]
            col_str = ' '.join(col_elements)
            # Replace back to the column name
            row[col] = col_str
    
    return df_clone

In [9]:
columns_to_modify = ['genres', 'production_companies', 'production_countries']
kb1_modified = dicts_list_to_str(kb1, columns_to_modify)

In [10]:
kb1_modified.head()

Unnamed: 0,genres,imdb_id,production_companies,production_countries,title
29,Adventure Comedy,tt0060125,Fair Film Cecchi Gori Group,ES FR IT,For Love and Gold
149,Drama Comedy Romance,tt0273300,Eureka Pictures Filmfour,GB US,Jump Tomorrow
36,TV Movie Drama Romance Family,tt3320502,Lighthouse Pictures Craig Anderson Productions...,US,Let It Snow
135,Drama Romance,tt0381681,Castle Rock Entertainment Detour Film Producti...,US,Before Sunset
87,Drama Horror Science Fiction,tt0021884,Universal Pictures,US,Frankenstein


In [11]:
kb2_modified = dicts_list_to_str(kb2, columns_to_modify)

In [12]:
kb2_modified.head()

Unnamed: 0,genres,imdb_id,production_companies,production_countries,title
115,Drama Romance,tt0097584,Mosfilm,RU SE,Intergirl
59,Documentary,tt6146460,Radical Media,US,Hamilton's America
118,Drama History War,tt0082095,Österreichischer Rundfunk (ORF) Zweites Deutsc...,AT DE CH,The Boat Is Full
110,Comedy Family,tt0111470,Overseas FilmGroup First Look International Mo...,US,Trading Mom
147,Crime Drama,tt0061418,Tatira-Hiller Productions Warner Brothers/Seve...,US,Bonnie and Clyde


In [13]:
def modified_kb2(df: pd.DataFrame, randomly_removed_cols):
    df_clone = df.copy()
    for index, row in df_clone.iterrows():
        # For 'genres', 'production_countries', if there is only 1 genre, keep it. Otherwise, remove one of them
        for col in randomly_removed_cols:
            feature_list = row[col].split(' ')
            if len(feature_list) > 1:
                # Randomly remove 1 of them
                random.shuffle(feature_list)
                feature_list.pop()
                # Merge back as a string
                features_str = ' '.join(feature_list)
                row[col] = features_str
        
        
        # For 'title', make some changes so it looks like a typo
        title_list = row['title'].split(' ')
        # Make typo changes only with 'title' has more than 2 words
        if len(title_list) > 2:
            row['title'] = row['title'].replace('a', '4').replace('i', 'j')
    return df_clone

In [14]:
randomly_remove_cols = ['genres', 'production_countries']

kb2_processed = modified_kb2(kb2_modified, randomly_remove_cols)

In [15]:
# Modify KBs' attributes (column name) so that they are different, and easier for later blocking methods
def name_modification(Kb, appended_str:str):
    Kb_clone = Kb.copy()
    name_modification_dict = {}
    for col in Kb_clone.columns:
        name_modification_dict[col] = col + appended_str
    Kb_clone.rename(name_modification_dict, axis=1, inplace=True)

    return Kb_clone

In [16]:
kb1_processed = name_modification(kb1_modified, '_kb1')
kb2_processed_2 = name_modification(kb2_processed, '_kb2')

### Final KBs

kb1_processed & kb2_processed_2

In [17]:
# Final KBs
kb1_processed.head()

Unnamed: 0,genres_kb1,imdb_id_kb1,production_companies_kb1,production_countries_kb1,title_kb1
29,Adventure Comedy,tt0060125,Fair Film Cecchi Gori Group,ES FR IT,For Love and Gold
149,Drama Comedy Romance,tt0273300,Eureka Pictures Filmfour,GB US,Jump Tomorrow
36,TV Movie Drama Romance Family,tt3320502,Lighthouse Pictures Craig Anderson Productions...,US,Let It Snow
135,Drama Romance,tt0381681,Castle Rock Entertainment Detour Film Producti...,US,Before Sunset
87,Drama Horror Science Fiction,tt0021884,Universal Pictures,US,Frankenstein


In [18]:
kb2_processed_2.head()

Unnamed: 0,genres_kb2,imdb_id_kb2,production_companies_kb2,production_countries_kb2,title_kb2
115,Romance,tt0097584,Mosfilm,RU,Intergirl
59,Documentary,tt6146460,Radical Media,US,Hamilton's America
118,Drama History,tt0082095,Österreichischer Rundfunk (ORF) Zweites Deutsc...,CH AT,The Bo4t Is Full
110,Comedy,tt0111470,Overseas FilmGroup First Look International Mo...,US,Trading Mom
147,Drama,tt0061418,Tatira-Hiller Productions Warner Brothers/Seve...,US,Bonnje 4nd Clyde


# Token blocking

In [19]:
test_db1 = kb1_processed.copy()
test_db2 = kb2_processed_2.copy()

In [20]:
def tokens_extraction(Kb, appended_str):
    dict_kb = {}
    
    for idxRow, row_ in Kb.iterrows():
        for idxCol, col_ in row_.iteritems():
            col_tokens = col_.split(' ')
            
            for token in col_tokens:
                if token in dict_kb.keys():
                    dict_kb[token].append(str(idxRow) + appended_str)
                else:
                    dict_kb[token] = [str(idxRow) + appended_str]
                    
    # Make values in each key appears once
    for key, value in dict_kb.items():
        dict_kb[key] = list(set(value))
        
    return dict_kb

def token_blocking(Kb1, Kb2):
    
    dict_db1 = tokens_extraction(test_db1, '_kb1')
    dict_db2 = tokens_extraction(test_db2, '_kb2')
    
    blocks = {}

    for key in dict_db1.keys():
        if key in dict_db2.keys():
            rows_kb1 = dict_db1[key][:]
            rows_kb1.extend(dict_db2[key])
            blocks[key] = rows_kb1
            
    return blocks

In [21]:
token_blocks = token_blocking(test_db1, test_db2)

# Attribute Clustering Blocking

In [22]:
def get_attribute_tokens(Kb):
    attribute_token = {}
    # Loop through all attribute names
    for i in list(Kb):
        attribute_values = Kb[i].tolist()
        tokens_list = []
        # Loop each entry (row) in attribute_values
        for entry in attribute_values:
            tokens = entry.split(' ')
            tokens_list.extend(tokens)
        # Turn it into a set so that each token appears once
        tokens_set = list(set(tokens_list))
        attribute_token[i] = tokens_set
    
    return attribute_token

def get_links(ref_token_dict, target_token_dict):
    links = []
    standalone_attribute = []
    for attribute_ref in ref_token_dict.keys():
        link_exist = False
        # Compare with all attributes name in target_token_dict:
        for attribute_target in target_token_dict.keys():
            # Mutual tokens
            mutual_tokens = set(ref_token_dict[attribute_ref]).intersection(target_token_dict[attribute_target])
            # Total tokens
            tokens_ref = ref_token_dict[attribute_ref][:]
            tokens_ref.extend(target_token_dict[attribute_target])
            total_tokens = set(tokens_ref)
            # Add link if the Jaccard similarity score is > 0
            jaccard_similarity_score = len(mutual_tokens)/len(total_tokens)
            if jaccard_similarity_score > 0:
                links.append([attribute_ref, attribute_target])
                link_exist = True

        # This is when no link is added, so that the attribute_ref key does not exist yet
        if not link_exist:
            standalone_attribute.append(attribute_ref)
    
    return links, standalone_attribute

def transitive_closure(edges):
    clusters = []
    for edge in edges:
        u = edge[0]
        v = edge[1]
        if len(clusters) == 0:
            clusters.append({u,v}) 
        else:
            if any(not {u,v}.isdisjoint(c) for c in clusters):
                for c in clusters:
                    if (u in c) or (v in c):
                        c.add(u)
                        c.add(v)          
            else:
                clusters.append({u,v})
    return clusters

def attribute_clustering_blocking(kb1, kb2):
    # Get tokens of all attributes name in the Kb
    attribute_token_kb1 = get_attribute_tokens(kb1)
    attribute_token_kb2 = get_attribute_tokens(kb2)
    
    # Get links by Jaccard similarity
    links_by_kb1, standalone_attributes_1 = get_links(attribute_token_kb1, attribute_token_kb2)
    links_by_kb2, standalone_attributes_2 = get_links(attribute_token_kb2, attribute_token_kb1)
    
    links = links_by_kb1 + links_by_kb2
    standalone_attributes = standalone_attributes_1 + standalone_attributes_2
    
    clusters = transitive_closure(links)

    if len(standalone_attributes) != 0:
        clusters.append(set(standalone_attributes))
    
    return clusters

In [23]:
clusters = attribute_clustering_blocking(test_db1, test_db2)

In [24]:
def get_cluster_name(clusters, idxCol):
    for idx, c in enumerate(clusters):
        if idxCol in c:
            return 'c'+ str(idx)

In [25]:
test_db1 = kb1_processed.copy()
test_db2 = kb2_processed_2.copy()

dict_db1 = {}
dict_db2 = {}

# Get token of every KBs
for (idxRow1, row1), (idxRow2, row2) in zip(test_db1.iterrows(), test_db2.iterrows()):
    for (idxCol1, col1), (idxCol2, col2) in zip(row1.iteritems(), row2.iteritems()):
        col1_string = col1.split(' ')
        col2_string = col2.split(' ')
        
        # Add into the dictionaries, with key is the word token and the value is the rowIdx
        for token in col1_string:
            token = get_cluster_name(clusters, idxCol1) + '.' + token
            if token in dict_db1.keys():
                dict_db1[token].append(str(idxRow1) + '_kb1')
            else:
                dict_db1[token] = [str(idxRow1) + '_kb1']
                
        for token in col2_string:
            token = get_cluster_name(clusters, idxCol2) + '.' + token
            if token in dict_db2.keys():
                dict_db2[token].append(str(idxRow2) + '_kb2')
            else:
                dict_db2[token] = [str(idxRow2) + '_kb2']

# Make values in each key appears once
for key, values in dict_db1.items():
    dict_db1[key] = list(set(values))
    
for key, values in dict_db2.items():
    dict_db2[key] = list(set(values))

In [26]:
# Blocking
attribute_blocks = {}

for key in dict_db1.keys():
    if key in dict_db2.keys():
        rows_kb1 = dict_db1[key][:]
        rows_kb1.extend(dict_db2[key])
        attribute_blocks[key] = rows_kb1

## Blocking results

In [43]:
for i, block in enumerate(token_blocks):
    # Print 10 blocks for example
    if i < 10:
        print("{}:{}".format(block, token_blocks[block]))
    else:
        break

Adventure:['112_kb1', '6_kb1', '94_kb1', '106_kb1', '84_kb1', '132_kb1', '44_kb1', '64_kb1', '50_kb1', '107_kb1', '166_kb1', '123_kb1', '29_kb1', '8_kb1', '166_kb2', '3_kb2', '44_kb2', '112_kb2', '107_kb2', '132_kb2', '8_kb2', '7_kb2']
Comedy:['20_kb1', '149_kb1', '16_kb1', '132_kb1', '108_kb1', '190_kb1', '159_kb1', '65_kb1', '128_kb1', '189_kb1', '23_kb1', '17_kb1', '63_kb1', '178_kb1', '116_kb1', '191_kb1', '133_kb1', '95_kb1', '26_kb1', '13_kb1', '101_kb1', '162_kb1', '117_kb1', '148_kb1', '181_kb1', '110_kb1', '123_kb1', '70_kb1', '160_kb1', '29_kb1', '142_kb2', '78_kb2', '17_kb2', '63_kb2', '113_kb2', '133_kb2', '26_kb2', '160_kb2', '179_kb2', '149_kb2', '197_kb2', '86_kb2', '12_kb2', '13_kb2', '171_kb2', '148_kb2', '62_kb2', '49_kb2', '184_kb2', '30_kb2', '189_kb2', '102_kb2', '110_kb2']
Film:['108_kb1', '168_kb1', '50_kb1', '130_kb1', '83_kb1', '135_kb1', '190_kb1', '67_kb1', '58_kb1', '42_kb1', '61_kb1', '128_kb1', '23_kb1', '24_kb1', '166_kb1', '29_kb1', '1_kb1', '95_kb1', '1

In [45]:
for i, block in enumerate(attribute_blocks):
    # Print 10 blocks for example
    if i < 10:
        print("{}:{}".format(block, attribute_blocks[block]))
    else:
        break

c0.Adventure:['112_kb1', '6_kb1', '94_kb1', '106_kb1', '84_kb1', '132_kb1', '44_kb1', '64_kb1', '50_kb1', '107_kb1', '166_kb1', '123_kb1', '29_kb1', '8_kb1', '166_kb2', '3_kb2', '44_kb2', '112_kb2', '107_kb2', '132_kb2', '8_kb2', '7_kb2']
c0.Comedy:['20_kb1', '149_kb1', '16_kb1', '132_kb1', '108_kb1', '190_kb1', '159_kb1', '65_kb1', '128_kb1', '189_kb1', '23_kb1', '17_kb1', '63_kb1', '178_kb1', '116_kb1', '191_kb1', '133_kb1', '95_kb1', '26_kb1', '13_kb1', '101_kb1', '162_kb1', '117_kb1', '148_kb1', '181_kb1', '110_kb1', '123_kb1', '70_kb1', '160_kb1', '29_kb1', '142_kb2', '78_kb2', '17_kb2', '63_kb2', '113_kb2', '133_kb2', '26_kb2', '160_kb2', '179_kb2', '149_kb2', '197_kb2', '86_kb2', '12_kb2', '13_kb2', '171_kb2', '148_kb2', '62_kb2', '49_kb2', '184_kb2', '30_kb2', '189_kb2', '102_kb2', '110_kb2']
c0.Film:['108_kb1', '168_kb1', '50_kb1', '130_kb1', '83_kb1', '135_kb1', '190_kb1', '67_kb1', '58_kb1', '42_kb1', '61_kb1', '128_kb1', '23_kb1', '24_kb1', '166_kb1', '29_kb1', '1_kb1', '95

# Meta-Blocking

In [27]:
def create_entity_index(blocks, entities):
    entity_index = {}
    for entity in entities:
        entity_index[entity] = []
    for key, value in blocks.items():
        for entity in value:
            entity_index[entity].append(key)
    return entity_index

In [28]:
def getAllEntities(kb, kb_name):
    entities = [str(idxRow) + kb_name for idxRow, row_ in kb.iterrows()]
    return entities

In [29]:
all_entities = getAllEntities(test_db1, '_kb1')
all_entities.extend(getAllEntities(test_db2, '_kb2'))

In [30]:
entity_index_token_blocks = create_entity_index(token_blocks, all_entities)

In [31]:
entity_index_attribute_blocks = create_entity_index(attribute_blocks, all_entities)

## Jaccard scheme and the common blocks scheme

In [32]:
def Scheme(blocks_i, entity_i, blocks_j, entity_j, type_of_scheme="Common"):
    inter_blocks_ij = []
    for block in blocks_i:
        if entity_j in block and entity_i in block:
            inter_blocks_ij.append(block)
    if type_of_scheme == "Jaccard":
        return len(inter_blocks_ij) / (len(blocks_i) + len(blocks_j) - len(inter_blocks_ij))
    return len(inter_blocks_ij)

In [33]:
def make_graph(blocks, entity_index, type_of_scheme="Jaccard"):
    E = []
    V = set()
    W = []
    for block in blocks:
        ps_1 = [p for p in blocks[block] if p[-3:] == "kb1"]
        ps_2 = [p for p in blocks[block] if p[-3:] == "kb2"]
        for p1 in ps_1:
            V.add(p1)
            for p2 in ps_2:
                V.add(p2)
                blocks_1_id = entity_index[p1]
                blocks_2_id = entity_index[p2]
                blocks_1 = [blocks[id1] for id1 in blocks_1_id]
                blocks_2 = [blocks[id2] for id2 in blocks_2_id]
                weight = Scheme(blocks_1, p1, blocks_2, p2, type_of_scheme)
                E.append((p1,p2))
                W.append(weight)            
    norm_W = [(float(i)-min(W))/(max(W)-min(W)) for i in W]
    return V, E, norm_W

In [34]:
graph_token_jaccard = make_graph(token_blocks, entity_index_token_blocks)
graph_attribute_jaccard = make_graph(attribute_blocks, entity_index_attribute_blocks)
graph_token_common = make_graph(token_blocks, entity_index_token_blocks, type_of_scheme="Common")
graph_attribute_common = make_graph(attribute_blocks, entity_index_attribute_blocks, type_of_scheme="Common")

## Weight edge pruning and the cardinality node pruning methods

In [35]:
def WEP(graph):
    V = graph[0]
    E = graph[1]
    W = graph[2]
    W_min = np.mean(W)
    low_W_index = []
    for i in range(0,len(E)):
        if W[i] < W_min:
            low_W_index.append(i)
    E = np.delete(E, low_W_index, axis=0)       
    W = np.delete(W, low_W_index)
    return V, E, W

In [36]:
def CEP(graph, K):
    V = graph[0]
    E = graph[1]
    W = graph[2]
    sorted_stack = {}
    for i in range(0,len(E)):
        sorted_stack[i] = W[i]
        sorted_stack = {k: v for k, v in sorted(sorted_stack.items(), key=lambda item: item[1], reverse=True)}
        
        if K < len(sorted_stack):
            sorted_stack.popitem()
    retain_index = sorted_stack.keys()

    E = [E[i] for i in retain_index] 
    W = [W[i] for i in retain_index] 
    return V, E, W

## Meta-blocking results

In [37]:
def blocking_cardinality(blocks, entities_size):
    output = 0
    for block in blocks:
        output = output + len(block)
    return output / entities_size 

In [38]:
K_token = math.floor(blocking_cardinality(token_blocks.values(), 
                                          len(all_entities))*len(all_entities))/2

graph_token_jaccard_WEP = WEP(graph_token_jaccard)
graph_token_jaccard_CEP = CEP(graph_token_jaccard, K_token)

In [39]:
K_attribute = math.floor(blocking_cardinality(attribute_blocks.values(), 
                                              len(all_entities))*len(all_entities))/2

graph_attribute_jaccard_WEP = WEP(graph_attribute_jaccard)
graph_attribute_jaccard_CEP = CEP(graph_attribute_jaccard, K_attribute)

In [40]:
graph_token_common_WEP = WEP(graph_token_common)
graph_token_common_CEP = CEP(graph_token_common, K_token)

In [41]:
graph_attribute_common_WEP = WEP(graph_attribute_common)
graph_attribute_common_CEP = CEP(graph_attribute_common, K_attribute)