In [126]:
import ast
import os
import random

import numpy as np
import pandas as pd

In [2]:
movies_dataset_path = 'movies_metadata.csv'
cols = ['genres', 'imdb_id', 'title', 'production_companies', 'production_countries']

# Load only the chosen columns of movies_metadata.csv and discard rows with any missing value
movies_df = pd.read_csv(movies_dataset_path, usecols=cols).dropna()

# The list-valued columns are stored as strings, so an empty list is the literal text '[]'.
# Keep only the rows where genres, production companies and production countries are all non-empty.
has_genres = movies_df['genres'] != '[]'
has_companies = movies_df['production_companies'] != '[]'
has_countries = movies_df['production_countries'] != '[]'
movies_df = movies_df[has_genres & has_companies & has_countries]

In [3]:
# Fix the seeds so that Restart & Run All rebuilds exactly the same KBs
SEED = 42
random.seed(SEED)  # modified_kb2 draws its noise from the `random` module

# Choose 200 random rows
random_movies_df = movies_df.sample(n=200, random_state=SEED).reset_index(drop=True)

# Make 2 separate KBs from random_movies_df.
# Each KB holds 120 of the 200 rows, so the two KBs necessarily share some movies.
# Different seeds are used so that kb1 and kb2 are not the very same sample.
kb1 = random_movies_df.sample(n=120, random_state=SEED + 1)
kb2 = random_movies_df.sample(n=120, random_state=SEED + 2)

In [4]:
random_movies_df.head()

Unnamed: 0,genres,imdb_id,production_companies,production_countries,title
0,"[{'id': 28, 'name': 'Action'}, {'id': 27, 'nam...",tt0080827,"[{'name': 'Bo Ho Film Company Ltd.', 'id': 3106}]","[{'iso_3166_1': 'HK', 'name': 'Hong Kong'}]",Encounters of the Spooky Kind
1,"[{'id': 35, 'name': 'Comedy'}]",tt0069747,"[{'name': 'Les Films Pomereu', 'id': 459}, {'n...","[{'iso_3166_1': 'IT', 'name': 'Italy'}, {'iso_...",The Mad Adventures of Rabbi Jacob
2,"[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...",tt0177068,"[{'name': 'Shooting Gallery', 'id': 1596}, {'n...","[{'iso_3166_1': 'US', 'name': 'United States o...",Once in the Life
3,"[{'id': 18, 'name': 'Drama'}]",tt0014532,"[{'name': 'Paramount Pictures', 'id': 4}]","[{'iso_3166_1': 'US', 'name': 'United States o...",The Ten Commandments
4,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",tt0088944,[{'name': 'Twentieth Century Fox Film Corporat...,"[{'iso_3166_1': 'US', 'name': 'United States o...",Commando


In [5]:
kb1.head()

Unnamed: 0,genres,imdb_id,production_companies,production_countries,title
126,"[{'id': 18, 'name': 'Drama'}, {'id': 10402, 'n...",tt0079111,"[{'name': 'Albatros Filmproduktion', 'id': 8568}]","[{'iso_3166_1': 'IT', 'name': 'Italy'}]",Ernesto
37,"[{'id': 35, 'name': 'Comedy'}, {'id': 10749, '...",tt0023940,"[{'name': 'Paramount Pictures', 'id': 4}]","[{'iso_3166_1': 'US', 'name': 'United States o...",Design for Living
83,"[{'id': 35, 'name': 'Comedy'}, {'id': 10751, '...",tt0456047,"[{'name': 'Yeni Stüdyo', 'id': 16725}]","[{'iso_3166_1': 'TR', 'name': 'Turkey'}]",Bizim Aile
109,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",tt1985981,"[{'name': 'Eros International', 'id': 3653}]","[{'iso_3166_1': 'IN', 'name': 'India'}, {'iso_...",Desi Boyz
149,"[{'id': 9648, 'name': 'Mystery'}, {'id': 53, '...",tt0039288,"[{'name': 'Warner Bros.', 'id': 6194}, {'name'...","[{'iso_3166_1': 'US', 'name': 'United States o...",Cry Wolf


In [6]:
kb2.head()

Unnamed: 0,genres,imdb_id,production_companies,production_countries,title
145,"[{'id': 53, 'name': 'Thriller'}]",tt1084733,"[{'name': 'Art of War Films', 'id': 22218}, {'...","[{'iso_3166_1': 'US', 'name': 'United States o...",Necessary Evil
84,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",tt0086074,"[{'name': 'Vides Cinematografica', 'id': 12111}]","[{'iso_3166_1': 'IT', 'name': 'Italy'}]",Hearts and Armour
199,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",tt0046728,"[{'name': 'Metro-Goldwyn-Mayer (MGM)', 'id': 8...","[{'iso_3166_1': 'US', 'name': 'United States o...",Athena
85,"[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...",tt0109015,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'JP', 'name': 'Japan'}, {'iso_...",3 Ninjas Kick Back
18,"[{'id': 27, 'name': 'Horror'}]",tt1001333,"[{'name': 'Finney/Thompson Entertainment', 'id...","[{'iso_3166_1': 'US', 'name': 'United States o...",Carver


In [7]:
# Preprocessing: turn each stringified list of dictionaries into a space-separated string of feature names.
# E.g. the 'genres' value "[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name': 'Action'}]" becomes 'Drama Action'
def dicts_list_to_str(df: pd.DataFrame, columns_to_modify):
    """Flatten stringified lists of dictionaries into space-separated strings.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose ``columns_to_modify`` cells are strings holding the Python
        literal of a list of dictionaries.
    columns_to_modify : iterable of str
        Names of the columns to flatten.

    Returns
    -------
    pd.DataFrame
        A copy of ``df`` (the input is left untouched) in which every listed
        column holds the joined ``'iso_3166_1'`` values for
        ``'production_countries'`` and the joined ``'name'`` values for any
        other column.

    Raises
    ------
    ValueError, SyntaxError
        If a cell is not a valid Python literal.
    KeyError
        If a dictionary lacks the expected key.
    """
    df_clone = df.copy()
    for col in columns_to_modify:
        key = 'iso_3166_1' if col == 'production_countries' else 'name'
        # Assign the whole column at once: writing to the row yielded by iterrows()
        # only reaches the frame when that row happens to be a view.
        # ast.literal_eval parses literals only, so text coming from the CSV is never executed.
        df_clone[col] = [
            ' '.join(item[key] for item in ast.literal_eval(cell))
            for cell in df_clone[col]
        ]

    return df_clone

In [8]:
columns_to_modify = ['genres', 'production_companies', 'production_countries']
kb1_modified = dicts_list_to_str(kb1, columns_to_modify)

In [9]:
kb1_modified.head()

Unnamed: 0,genres,imdb_id,production_companies,production_countries,title
126,Drama Music Romance,tt0079111,Albatros Filmproduktion,IT,Ernesto
37,Comedy Romance,tt0023940,Paramount Pictures,US,Design for Living
83,Comedy Family Drama,tt0456047,Yeni Stüdyo,TR,Bizim Aile
109,Comedy Drama,tt1985981,Eros International,IN TH GB,Desi Boyz
149,Mystery Thriller,tt0039288,Warner Bros. Thomson Productions,US,Cry Wolf


In [10]:
kb2_modified = dicts_list_to_str(kb2, columns_to_modify)

In [11]:
kb2_modified.head()

Unnamed: 0,genres,imdb_id,production_companies,production_countries,title
145,Thriller,tt1084733,Art of War Films Lenz Films,US,Necessary Evil
84,Action Adventure Fantasy,tt0086074,Vides Cinematografica,IT,Hearts and Armour
199,Romance Comedy,tt0046728,Metro-Goldwyn-Mayer (MGM),US,Athena
85,Adventure Action Comedy Family,tt0109015,TriStar Pictures Sheen Productions Ben-Ami/Lee...,JP US,3 Ninjas Kick Back
18,Horror,tt1001333,Finney/Thompson Entertainment Two For Flinchin...,US,Carver


In [12]:
def modified_kb2(df: pd.DataFrame, randomly_removed_cols):
    """Return a noisy copy of ``df``: drop one feature per listed column and add title typos.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose ``randomly_removed_cols`` and ``'title'`` cells are strings.
    randomly_removed_cols : iterable of str
        Columns holding space-separated features. A cell with more than one
        feature is shuffled and loses one of them; a single feature is kept.

    Returns
    -------
    pd.DataFrame
        A copy of ``df`` (the input is left untouched). Titles with more than
        two words have every 'a' replaced by '4' and every 'i' by 'j'.

    Notes
    -----
    Randomness comes from the global ``random`` module; seed it beforehand for
    reproducible output.
    """
    df_clone = df.copy()
    randomly_removed_cols = list(randomly_removed_cols)

    # Work on plain lists and assign whole columns afterwards: writing to the row
    # yielded by iterrows() only reaches the frame when that row happens to be a view.
    touched_cols = list(dict.fromkeys([*randomly_removed_cols, 'title']))
    new_values = {col: df_clone[col].tolist() for col in touched_cols}

    for pos in range(len(df_clone)):
        # For 'genres', 'production_countries': a single feature is kept, otherwise one is removed
        for col in randomly_removed_cols:
            feature_list = new_values[col][pos].split(' ')
            if len(feature_list) > 1:
                # Randomly remove 1 of them
                random.shuffle(feature_list)
                feature_list.pop()
                # Merge back as a string
                new_values[col][pos] = ' '.join(feature_list)

        # For 'title', make some changes so it looks like a typo,
        # but only when the title has more than 2 words
        title = new_values['title'][pos]
        if len(title.split(' ')) > 2:
            new_values['title'][pos] = title.replace('a', '4').replace('i', 'j')

    for col in touched_cols:
        df_clone[col] = new_values[col]
    return df_clone

In [13]:
randomly_remove_cols = ['genres', 'production_countries']

kb2_processed = modified_kb2(kb2_modified, randomly_remove_cols)

In [14]:
# Modify KBs' attributes (column name) so that they are different, and easier for later blocking methods
def name_modification(Kb, appended_str:str):
    """Return a copy of ``Kb`` whose column names all end with ``appended_str``.

    The input frame is not modified; only the column labels of the copy change.
    """
    suffixed_names = {column: column + appended_str for column in Kb.columns}
    return Kb.copy().rename(columns=suffixed_names)

In [15]:
kb1_processed = name_modification(kb1_modified, '_kb1')
kb2_processed_2 = name_modification(kb2_processed, '_kb2')

### Final KBs

kb1_processed & kb2_processed_2

In [16]:
# Final KBs
kb1_processed.head()

Unnamed: 0,genres_kb1,imdb_id_kb1,production_companies_kb1,production_countries_kb1,title_kb1
126,Drama Music Romance,tt0079111,Albatros Filmproduktion,IT,Ernesto
37,Comedy Romance,tt0023940,Paramount Pictures,US,Design for Living
83,Comedy Family Drama,tt0456047,Yeni Stüdyo,TR,Bizim Aile
109,Comedy Drama,tt1985981,Eros International,IN TH GB,Desi Boyz
149,Mystery Thriller,tt0039288,Warner Bros. Thomson Productions,US,Cry Wolf


In [17]:
kb2_processed_2.head()

Unnamed: 0,genres_kb2,imdb_id_kb2,production_companies_kb2,production_countries_kb2,title_kb2
145,Thriller,tt1084733,Art of War Films Lenz Films,US,Necessary Evil
84,Adventure Fantasy,tt0086074,Vides Cinematografica,IT,He4rts 4nd Armour
199,Romance,tt0046728,Metro-Goldwyn-Mayer (MGM),US,Athena
85,Action Comedy Family,tt0109015,TriStar Pictures Sheen Productions Ben-Ami/Lee...,US,3 Njnj4s Kjck B4ck
18,Horror,tt1001333,Finney/Thompson Entertainment Two For Flinchin...,US,Carver


# Token blocking

In [56]:
test_db1 = kb1_processed.copy()
test_db2 = kb2_processed_2.copy()

In [53]:
def tokens_extraction(Kb, appended_str):
    """Map every whitespace token found in ``Kb`` to the rows that contain it.

    Parameters
    ----------
    Kb : pd.DataFrame
        Knowledge base whose cells are all strings.
    appended_str : str
        Suffix appended to each row label to build the row identifier
        (e.g. ``'_kb1'`` turns row 126 into ``'126_kb1'``).

    Returns
    -------
    dict
        ``{token: [row identifiers]}``; each identifier appears once per token,
        in order of first appearance.
    """
    dict_kb = {}

    for idxRow, row_ in Kb.iterrows():
        row_id = str(idxRow) + appended_str
        # Series.items() replaces iteritems(), which was removed in pandas 2.0
        for _, col_ in row_.items():
            for token in col_.split(' '):
                # The inner dict acts as an insertion-ordered set, so each row id
                # is stored once and the output order does not vary between runs.
                dict_kb.setdefault(token, {})[row_id] = None

    return {token: list(row_ids) for token, row_ids in dict_kb.items()}

def token_blocking(Kb1, Kb2):
    """Build token blocks from two knowledge bases.

    A block is created for every token that occurs in both ``Kb1`` and ``Kb2``.

    Parameters
    ----------
    Kb1, Kb2 : pd.DataFrame
        Knowledge bases whose cells are all strings.

    Returns
    -------
    dict
        ``{token: row identifiers}`` where the identifiers of ``Kb1`` (suffixed
        ``'_kb1'``) come first, followed by those of ``Kb2`` (suffixed ``'_kb2'``).
    """
    # Use the frames passed in as arguments rather than the notebook-level
    # test_db1 / test_db2 globals, so the function works for any pair of KBs.
    dict_db1 = tokens_extraction(Kb1, '_kb1')
    dict_db2 = tokens_extraction(Kb2, '_kb2')

    return {
        token: dict_db1[token] + dict_db2[token]
        for token in dict_db1
        if token in dict_db2
    }

In [55]:
token_blocks = token_blocking(test_db1, test_db2)

# Attribute Clustering Blocking

In [128]:
def get_attribute_tokens(Kb):
    """Collect, for every column of ``Kb``, the distinct space-separated tokens of its values.

    Returns a dict mapping each column name to a list in which every token
    appears once (the order of the list is not meaningful).
    """
    tokens_by_attribute = {}
    for attribute in Kb.columns:
        # A set comprehension keeps a single occurrence of each token
        distinct_tokens = {
            token
            for value in Kb[attribute].tolist()
            for token in value.split(' ')
        }
        tokens_by_attribute[attribute] = list(distinct_tokens)

    return tokens_by_attribute

# def get_links(ref_token_dict, target_token_dict):
#     links = {}
#     for attribute_ref in ref_token_dict.keys():
#         # Compare with all attributes name in target_token_dict:
#         for attribute_target in target_token_dict.keys():
#             # Mutual tokens
#             mutual_tokens = set(ref_token_dict[attribute_ref]).intersection(target_token_dict[attribute_target])
#             # Total tokens
#             tokens_ref = ref_token_dict[attribute_ref][:]
#             tokens_ref.extend(target_token_dict[attribute_target])
#             total_tokens = set(tokens_ref)
#             # Add link if the Jaccard similarity score is > 0
#             jaccard_similarity_score = len(mutual_tokens)/len(total_tokens)
#             if jaccard_similarity_score > 0:
#                 if attribute_ref in links.keys():
#                     links[attribute_ref].append(attribute_target)
#                 else:
#                     links[attribute_ref] = [attribute_target]
#         # This is when no link is added, so that the attribute_ref key does not exist yet
#         if attribute_ref not in links.keys():
#             links[attribute_ref] = []
    
#     return links

# def transitive_closure(links_1, links_2, initial_attribute='genres_kb1', visited=[]):
#     visited.append(initial_attribute)
#     similar_attributes_1 = links_1[initial_attribute][:]
#     for attribute_2 in similar_attributes_1:
#         if attribute_2 not in visited:
#             transitive_closure(links_2, links_1, attribute_2, visited)
#     return visited

def get_links(ref_token_dict, target_token_dict, threshold=0.0):
    """Link attributes of two KBs whose token sets are similar.

    Parameters
    ----------
    ref_token_dict : dict
        ``{attribute name: list of tokens}`` for the reference KB.
    target_token_dict : dict
        ``{attribute name: list of tokens}`` for the KB to compare against.
    threshold : float, optional
        A link is created when the Jaccard similarity of the two token sets is
        strictly greater than this value. The default ``0.0`` links any two
        attributes that share at least one token.

    Returns
    -------
    tuple
        ``(links, standalone_attribute)`` where ``links`` is a list of
        ``[attribute_ref, attribute_target]`` pairs and ``standalone_attribute``
        lists the reference attributes that received no link.
    """
    links = []
    standalone_attribute = []
    for attribute_ref, ref_tokens in ref_token_dict.items():
        ref_set = set(ref_tokens)
        link_exist = False
        # Compare with all attribute names in target_token_dict
        for attribute_target, target_tokens in target_token_dict.items():
            target_set = set(target_tokens)
            total_tokens = ref_set | target_set
            # Two attributes without any token cannot be similar; skipping them
            # also avoids dividing by zero below.
            if not total_tokens:
                continue
            mutual_tokens = ref_set & target_set
            jaccard_similarity_score = len(mutual_tokens) / len(total_tokens)
            if jaccard_similarity_score > threshold:
                links.append([attribute_ref, attribute_target])
                link_exist = True

        # The reference attribute matched nothing in the target KB
        if not link_exist:
            standalone_attribute.append(attribute_ref)

    return links, standalone_attribute

def transitive_closure(edges):
    """Group the nodes of ``edges`` into connected clusters.

    Parameters
    ----------
    edges : iterable
        Pairs (indexable as ``edge[0]`` and ``edge[1]``) of hashable nodes.

    Returns
    -------
    list of set
        Pairwise disjoint sets; two nodes are in the same set exactly when a
        chain of edges connects them. Clusters are ordered by the first edge
        that touched them.
    """
    clusters = []
    for edge in edges:
        pair = {edge[0], edge[1]}
        touched = [pos for pos, cluster in enumerate(clusters) if not cluster.isdisjoint(pair)]
        if not touched:
            clusters.append(pair)
            continue

        # Grow the first cluster reached by the edge...
        merged = clusters[touched[0]]
        merged |= pair
        # ...and fold every other cluster the edge reaches into it, so that an
        # edge bridging two clusters really joins them instead of leaving two
        # overlapping clusters. Popping from the end keeps earlier positions valid.
        for pos in reversed(touched[1:]):
            merged |= clusters.pop(pos)
    return clusters

def attribute_clustering_blocking(kb1, kb2):
    """Cluster the attributes (columns) of two KBs by the tokens they share.

    Attributes are linked through ``get_links`` in both directions, the links
    are grouped with ``transitive_closure``, and every attribute that received
    no link is gathered into one extra cluster appended at the end.

    Returns a list of sets of attribute names.
    """
    # Distinct tokens of every attribute, per KB
    tokens_kb1 = get_attribute_tokens(kb1)
    tokens_kb2 = get_attribute_tokens(kb2)

    # Similarity links, looked up from each KB's point of view
    links_from_kb1, unlinked_kb1 = get_links(tokens_kb1, tokens_kb2)
    links_from_kb2, unlinked_kb2 = get_links(tokens_kb2, tokens_kb1)

    # Connected attributes end up in the same cluster
    clusters = transitive_closure(links_from_kb1 + links_from_kb2)

    # Attributes without any link share a single catch-all cluster
    unlinked = unlinked_kb1 + unlinked_kb2
    if unlinked:
        clusters.append(set(unlinked))

    return clusters

In [129]:
clusters = attribute_clustering_blocking(test_db1, test_db2)
clusters

[{'genres_kb1',
  'genres_kb2',
  'production_companies_kb1',
  'production_companies_kb2',
  'title_kb1',
  'title_kb2'},
 {'imdb_id_kb1', 'imdb_id_kb2'},
 {'production_countries_kb1', 'production_countries_kb2'}]

In [146]:
def get_cluster_name(clusters, idxCol):
    """Return ``'c<position>'`` for the first cluster containing ``idxCol``.

    ``None`` is returned when no cluster contains it.
    """
    matching_names = (
        'c' + str(position)
        for position, members in enumerate(clusters)
        if idxCol in members
    )
    return next(matching_names, None)

In [148]:
test_db1 = kb1_processed.copy()
test_db2 = kb2_processed_2.copy()


def cluster_tokens_extraction(Kb, appended_str, attribute_clusters):
    """Map every cluster-qualified token of ``Kb`` to the rows that contain it.

    Each whitespace token is prefixed with the name of the attribute cluster its
    column belongs to (``get_cluster_name``) and a dot, e.g. ``'c0.Drama'``, so
    that equal tokens from unrelated attributes stay in different keys.

    Parameters
    ----------
    Kb : pd.DataFrame
        Knowledge base whose cells are all strings.
    appended_str : str
        Suffix appended to each row label to build the row identifier.
    attribute_clusters : list of set
        Attribute clusters; every column of ``Kb`` is expected to be in one of them.

    Returns
    -------
    dict
        ``{cluster-qualified token: [row identifiers]}``; each identifier
        appears once per key, in order of first appearance.
    """
    token_rows = {}
    for idxRow, row in Kb.iterrows():
        row_id = str(idxRow) + appended_str
        # Series.items() replaces iteritems(), which was removed in pandas 2.0
        for idxCol, col in row.items():
            prefix = get_cluster_name(attribute_clusters, idxCol) + '.'
            for token in col.split(' '):
                # The inner dict acts as an insertion-ordered set of row ids
                token_rows.setdefault(prefix + token, {})[row_id] = None
    return {key: list(row_ids) for key, row_ids in token_rows.items()}


# Get the tokens of each KB. The KBs are processed independently: walking them
# in lockstep with zip() would silently drop the extra rows (or columns) of the
# larger KB whenever their shapes differ.
dict_db1 = cluster_tokens_extraction(test_db1, '_kb1', clusters)
dict_db2 = cluster_tokens_extraction(test_db2, '_kb2', clusters)

In [149]:
# Blocking: every key present in both dictionaries becomes a block that lists
# the row ids of KB1 first, followed by the row ids of KB2
blocks = {
    token_key: dict_db1[token_key] + dict_db2[token_key]
    for token_key in dict_db1
    if token_key in dict_db2
}
blocks

{'c0.Drama': ['21_kb1',
  '71_kb1',
  '102_kb1',
  '161_kb1',
  '22_kb1',
  '97_kb1',
  '119_kb1',
  '178_kb1',
  '177_kb1',
  '158_kb1',
  '78_kb1',
  '168_kb1',
  '20_kb1',
  '49_kb1',
  '155_kb1',
  '165_kb1',
  '115_kb1',
  '107_kb1',
  '116_kb1',
  '183_kb1',
  '72_kb1',
  '61_kb1',
  '121_kb1',
  '81_kb1',
  '2_kb1',
  '6_kb1',
  '3_kb1',
  '114_kb1',
  '153_kb1',
  '57_kb1',
  '126_kb1',
  '194_kb1',
  '93_kb1',
  '129_kb1',
  '99_kb1',
  '189_kb1',
  '150_kb1',
  '39_kb1',
  '86_kb1',
  '134_kb1',
  '76_kb1',
  '52_kb1',
  '103_kb1',
  '7_kb1',
  '147_kb1',
  '74_kb1',
  '109_kb1',
  '27_kb1',
  '101_kb1',
  '50_kb1',
  '160_kb1',
  '188_kb1',
  '113_kb1',
  '83_kb1',
  '55_kb1',
  '9_kb1',
  '56_kb1',
  '30_kb1',
  '110_kb1',
  '32_kb1',
  '101_kb2',
  '126_kb2',
  '106_kb2',
  '94_kb2',
  '9_kb2',
  '55_kb2',
  '114_kb2',
  '121_kb2',
  '155_kb2',
  '131_kb2',
  '25_kb2',
  '170_kb2',
  '15_kb2',
  '24_kb2',
  '173_kb2',
  '2_kb2',
  '78_kb2',
  '153_kb2',
  '82_kb2',
  '66_k