In [1]:
import ast
import os
import random

import pandas as pd

In [2]:
# Input CSV and the subset of its columns that the rest of the notebook uses.
movies_dataset_path = 'movies_metadata.csv'
cols = ['genres', 'imdb_id', 'title', 'production_companies', 'production_countries']

# Load only the selected columns, then discard every row with a missing value.
movies_df = pd.read_csv(movies_dataset_path, usecols=cols)
movies_df = movies_df.dropna()

# Keep a row only when none of its list-like columns is the literal string '[]'.
has_genres = movies_df['genres'] != '[]'
has_companies = movies_df['production_companies'] != '[]'
has_countries = movies_df['production_countries'] != '[]'
movies_df = movies_df[has_genres & has_companies & has_countries]

In [3]:
# Fixed seed so that "Restart Kernel -> Run All" reproduces the same sample and KBs.
SEED = 42

# Choose 200 random rows
random_movies_df = movies_df.sample(n=200, random_state=SEED).reset_index(drop=True)

# Make 2 separate KBs from random_movies_df. Drawing 120 + 120 rows out of 200
# guarantees that the two KBs share at least 40 movies.
# Distinct seeds are required: reusing one seed would make kb1 and kb2 identical.
kb1 = random_movies_df.sample(n=120, random_state=SEED + 1)
kb2 = random_movies_df.sample(n=120, random_state=SEED + 2)

In [4]:
# Preview the first rows of the 200-movie sample (columns still hold raw list-of-dict strings).
random_movies_df.head()

Unnamed: 0,genres,imdb_id,production_companies,production_countries,title
0,"[{'id': 53, 'name': 'Thriller'}, {'id': 27, 'n...",tt1930294,[{'name': 'Submarine Entertainment Distributor...,"[{'iso_3166_1': 'US', 'name': 'United States o...",Black Rock
1,"[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name...",tt0092854,"[{'name': 'Balcor Film Investors', 'id': 2165}...","[{'iso_3166_1': 'US', 'name': 'United States o...",Death Before Dishonor
2,"[{'id': 18, 'name': 'Drama'}]",tt3973410,"[{'name': 'Wunderbar Films', 'id': 12466}, {'n...","[{'iso_3166_1': 'IN', 'name': 'India'}]",Kaaka Muttai
3,"[{'id': 35, 'name': 'Comedy'}, {'id': 80, 'nam...",tt0095593,"[{'name': 'Orion Pictures', 'id': 41}, {'name'...","[{'iso_3166_1': 'US', 'name': 'United States o...",Married to the Mob
4,"[{'id': 35, 'name': 'Comedy'}]",tt1082051,"[{'name': 'Epithète Films', 'id': 7286}]","[{'iso_3166_1': 'FR', 'name': 'France'}]","Musée haut, musée bas"


In [5]:
# Preview KB1; its index labels are row positions in random_movies_df.
kb1.head()

Unnamed: 0,genres,imdb_id,production_companies,production_countries,title
15,"[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...",tt0074108,"[{'name': 'Eichberg-Film', 'id': 1328}, {'name...","[{'iso_3166_1': 'DE', 'name': 'Germany'}, {'is...",Albino
144,"[{'id': 18, 'name': 'Drama'}, {'id': 10769, 'n...",tt0114134,"[{'name': 'Kasander & Wigman Productions', 'id...","[{'iso_3166_1': 'FR', 'name': 'France'}, {'iso...",The Pillow Book
124,"[{'id': 18, 'name': 'Drama'}, {'id': 35, 'name...",tt0323872,"[{'name': 'Svenska Filminstitutet', 'id': 2302...","[{'iso_3166_1': 'NO', 'name': 'Norway'}, {'iso...",Kitchen Stories
83,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",tt0115759,[{'name': 'Twentieth Century Fox Film Corporat...,"[{'iso_3166_1': 'US', 'name': 'United States o...",Broken Arrow
21,"[{'id': 35, 'name': 'Comedy'}, {'id': 878, 'na...",tt0069945,"[{'name': 'University of Southern California',...","[{'iso_3166_1': 'US', 'name': 'United States o...",Dark Star


In [6]:
# Preview KB2; its index labels are row positions in random_movies_df.
kb2.head()

Unnamed: 0,genres,imdb_id,production_companies,production_countries,title
59,"[{'id': 37, 'name': 'Western'}, {'id': 18, 'na...",tt0479537,"[{'name': 'Icon Productions', 'id': 152}]","[{'iso_3166_1': 'US', 'name': 'United States o...",Seraphim Falls
66,"[{'id': 12, 'name': 'Adventure'}]",tt0083741,"[{'name': 'CBS Fox Video', 'id': 38925}]","[{'iso_3166_1': 'US', 'name': 'United States o...",Cold River
75,"[{'id': 16, 'name': 'Animation'}, {'id': 12, '...",tt0282120,"[{'name': 'Paramount Pictures', 'id': 4}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",The Wild Thornberrys Movie
111,"[{'id': 80, 'name': 'Crime'}, {'id': 9648, 'na...",tt0034716,"[{'name': 'RKO Radio Pictures', 'id': 6}]","[{'iso_3166_1': 'US', 'name': 'United States o...",The Falcon Takes Over
101,"[{'id': 18, 'name': 'Drama'}]",tt0029769,"[{'name': 'Warner Bros.', 'id': 6194}]","[{'iso_3166_1': 'US', 'name': 'United States o...",White Bondage


In [7]:
# Preprocessing: Make a string of list of dictionaries into list of names of features.
# E.g. 'genres' column output only contains 'Drama, Action, Documentary'
def dicts_list_to_str(df: pd.DataFrame, colnames):
    """Flatten stringified lists of dicts into space-separated feature strings.

    Each cell of the selected columns must hold the textual form of a list of
    dicts, e.g. "[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name': 'Action'}]",
    which becomes 'Drama Action'. For the 'production_countries' column the
    'iso_3166_1' value of every dict is used instead of its 'name'.

    Args:
        df: Source frame. It is left untouched.
        colnames: Iterable of column names to convert.

    Returns:
        A copy of ``df`` in which the listed columns contain the joined strings.

    Raises:
        ValueError, SyntaxError: If a cell is not a valid Python literal.
        KeyError: If a column is missing or a dict lacks the expected key.
    """
    df_clone = df.copy()
    for col in colnames:
        key = 'iso_3166_1' if col == 'production_countries' else 'name'
        # Assign whole columns on the copy instead of writing to the Series
        # yielded by iterrows(): that Series may be a copy of the row, in which
        # case the writes are silently dropped.
        # ast.literal_eval only parses literals, so text coming from the CSV
        # cannot execute arbitrary code the way eval() could.
        df_clone[col] = df[col].map(
            lambda cell: ' '.join(item[key] for item in ast.literal_eval(cell))
        )

    return df_clone

In [8]:
# Columns whose cells are stringified lists of dicts and must be flattened to plain text.
columns_to_modify = ['genres', 'production_companies', 'production_countries']
kb1_modified = dicts_list_to_str(kb1, columns_to_modify)

In [9]:
# Check that the list-of-dict columns of KB1 are now space-separated strings.
kb1_modified.head()

Unnamed: 0,genres,imdb_id,production_companies,production_countries,title
15,Adventure Action,tt0074108,Eichberg-Film Lord Film Produktion,DE ZW GB ZA,Albino
144,Drama Foreign Romance,tt0114134,Kasander & Wigman Productions Alpha Films Nede...,FR LU GB,The Pillow Book
124,Drama Comedy,tt0323872,Svenska Filminstitutet BOB Film Sweden AB Bulb...,NO SE,Kitchen Stories
83,Action Adventure Drama Thriller,tt0115759,Twentieth Century Fox Film Corporation WCG Ent...,US,Broken Arrow
21,Comedy Science Fiction,tt0069945,University of Southern California Jack H. Harr...,US,Dark Star


In [10]:
# Flatten the same columns for KB2.
kb2_modified = dicts_list_to_str(kb2, columns_to_modify)

In [11]:
# Check that the list-of-dict columns of KB2 are now space-separated strings.
kb2_modified.head()

Unnamed: 0,genres,imdb_id,production_companies,production_countries,title
59,Western Drama,tt0479537,Icon Productions,US,Seraphim Falls
66,Adventure,tt0083741,CBS Fox Video,US,Cold River
75,Animation Adventure Family,tt0282120,Paramount Pictures Nickelodeon Movies Paramoun...,US,The Wild Thornberrys Movie
111,Crime Mystery,tt0034716,RKO Radio Pictures,US,The Falcon Takes Over
101,Drama,tt0029769,Warner Bros.,US,White Bondage


In [12]:
def modified_kb2(df: pd.DataFrame, randomly_removed_cols):
    """Return a noisy copy of ``df`` that simulates a second, imperfect KB.

    For every row:
      * in each column of ``randomly_removed_cols`` the space-separated tokens
        are shuffled and one is dropped, unless the cell holds a single token,
        in which case it is kept as is;
      * if 'title' has more than 2 space-separated words, every 'a' becomes '4'
        and every 'i' becomes 'j' to imitate typos.

    Randomness comes from the stdlib ``random`` module; seed it beforehand for
    reproducible output.

    Args:
        df: Frame whose cells in the affected columns are strings. Not modified.
        randomly_removed_cols: Column names from which one token is removed.

    Returns:
        The perturbed copy of ``df``.

    Raises:
        KeyError: If 'title' or one of ``randomly_removed_cols`` is missing
            (only when ``df`` has at least one row).
    """
    df_clone = df.copy()
    if df_clone.empty:
        return df_clone

    col_positions = [df_clone.columns.get_loc(col) for col in randomly_removed_cols]
    title_pos = df_clone.columns.get_loc('title')

    # Read and write through .iat on the copy. Writing to the Series yielded by
    # iterrows() is not guaranteed to reach the frame. Rows are visited in the
    # same row-major order as before so the sequence of random draws is unchanged.
    for row_pos in range(len(df_clone)):
        for col_pos in col_positions:
            feature_list = df_clone.iat[row_pos, col_pos].split(' ')
            if len(feature_list) > 1:
                # Randomly remove 1 of them
                random.shuffle(feature_list)
                feature_list.pop()
                # Merge back as a string
                df_clone.iat[row_pos, col_pos] = ' '.join(feature_list)

        # For 'title', make some changes so it looks like a typo,
        # but only when the title has more than 2 words.
        title = df_clone.iat[row_pos, title_pos]
        if len(title.split(' ')) > 2:
            df_clone.iat[row_pos, title_pos] = title.replace('a', '4').replace('i', 'j')
    return df_clone

In [13]:
# Columns from which modified_kb2 drops one random token per row.
randomly_remove_cols = ['genres', 'production_countries']

# modified_kb2 draws from the stdlib `random` module; seed it so that re-running
# the notebook yields the same perturbed KB2.
random.seed(42)
kb2_processed = modified_kb2(kb2_modified, randomly_remove_cols)

### Final KBs

In [14]:
# Final KBs: KB1 is the flattened frame without any perturbation.
kb1_modified.head()

Unnamed: 0,genres,imdb_id,production_companies,production_countries,title
15,Adventure Action,tt0074108,Eichberg-Film Lord Film Produktion,DE ZW GB ZA,Albino
144,Drama Foreign Romance,tt0114134,Kasander & Wigman Productions Alpha Films Nede...,FR LU GB,The Pillow Book
124,Drama Comedy,tt0323872,Svenska Filminstitutet BOB Film Sweden AB Bulb...,NO SE,Kitchen Stories
83,Action Adventure Drama Thriller,tt0115759,Twentieth Century Fox Film Corporation WCG Ent...,US,Broken Arrow
21,Comedy Science Fiction,tt0069945,University of Southern California Jack H. Harr...,US,Dark Star


In [15]:
# Final KB2: flattened, with one token removed from multi-token genres/countries and typos in long titles.
kb2_processed.head()

Unnamed: 0,genres,imdb_id,production_companies,production_countries,title
59,Western,tt0479537,Icon Productions,US,Seraphim Falls
66,Adventure,tt0083741,CBS Fox Video,US,Cold River
75,Family Adventure,tt0282120,Paramount Pictures Nickelodeon Movies Paramoun...,US,The Wjld Thornberrys Movje
111,Crime,tt0034716,RKO Radio Pictures,US,The F4lcon T4kes Over
101,Drama,tt0029769,Warner Bros.,US,White Bondage


### Token blocking

In [85]:
def build_token_index(df: pd.DataFrame, suffix: str) -> dict:
    """Map every space-separated token found in ``df`` to the rows containing it.

    Args:
        df: Frame whose cells are all strings.
        suffix: Text appended to each stringified index label to tell KBs
            apart, e.g. '_kb1'.

    Returns:
        ``{token: [row_id, ...]}`` where ``row_id`` is ``str(index_label) + suffix``.
        Each row id appears once per token, in first-seen order.
    """
    token_index = {}
    for row_label, row in df.iterrows():
        row_id = str(row_label) + suffix
        # Series.items() replaces iteritems(), which was removed in pandas 2.0.
        for _, cell in row.items():
            for token in cell.split(' '):
                # The inner dict serves as an insertion-ordered set of row ids.
                token_index.setdefault(token, {})[row_id] = None
    return {token: list(row_ids) for token, row_ids in token_index.items()}


test_db1 = kb1_modified.copy()
test_db2 = kb2_processed.copy()

# Get tokens of every KB. Each KB is indexed independently so that no rows are
# dropped when the KBs differ in size (zipping their row iterators would
# truncate to the shorter one).
dict_db1 = build_token_index(test_db1, '_kb1')
dict_db2 = build_token_index(test_db2, '_kb2')

In [86]:
# Inspect the KB1 token index: token -> list of '<row label>_kb1' ids.
dict_db1

{'Adventure': ['36_kb1',
  '151_kb1',
  '83_kb1',
  '85_kb1',
  '98_kb1',
  '57_kb1',
  '5_kb1',
  '15_kb1',
  '67_kb1'],
 'Action': ['36_kb1',
  '1_kb1',
  '71_kb1',
  '17_kb1',
  '115_kb1',
  '122_kb1',
  '151_kb1',
  '27_kb1',
  '81_kb1',
  '83_kb1',
  '85_kb1',
  '33_kb1',
  '46_kb1',
  '5_kb1',
  '15_kb1',
  '45_kb1',
  '93_kb1',
  '123_kb1'],
 'tt0074108': ['15_kb1'],
 'Eichberg-Film': ['15_kb1'],
 'Lord': ['15_kb1'],
 'Film': ['1_kb1',
  '171_kb1',
  '131_kb1',
  '144_kb1',
  '107_kb1',
  '28_kb1',
  '43_kb1',
  '89_kb1',
  '58_kb1',
  '140_kb1',
  '124_kb1',
  '146_kb1',
  '83_kb1',
  '73_kb1',
  '125_kb1',
  '149_kb1',
  '51_kb1',
  '23_kb1',
  '15_kb1',
  '183_kb1',
  '70_kb1',
  '33_kb1'],
 'Produktion': ['15_kb1', '125_kb1', '28_kb1', '140_kb1'],
 'DE': ['131_kb1',
  '24_kb1',
  '28_kb1',
  '146_kb1',
  '33_kb1',
  '73_kb1',
  '23_kb1',
  '15_kb1',
  '137_kb1',
  '125_kb1',
  '140_kb1'],
 'ZW': ['15_kb1'],
 'GB': ['84_kb1',
  '153_kb1',
  '120_kb1',
  '144_kb1',
  '107_kb1'

In [87]:
# Inspect the KB2 token index: token -> list of '<row label>_kb2' ids.
dict_db2

{'Western': ['69_kb2',
  '160_kb2',
  '59_kb2',
  '65_kb2',
  '127_kb2',
  '81_kb2',
  '45_kb2'],
 'tt0479537': ['59_kb2'],
 'Icon': ['59_kb2'],
 'Productions': ['172_kb2',
  '141_kb2',
  '26_kb2',
  '27_kb2',
  '29_kb2',
  '59_kb2',
  '68_kb2',
  '67_kb2',
  '83_kb2',
  '144_kb2',
  '49_kb2',
  '181_kb2',
  '180_kb2',
  '115_kb2',
  '105_kb2',
  '152_kb2',
  '74_kb2',
  '130_kb2',
  '91_kb2',
  '19_kb2',
  '65_kb2',
  '174_kb2',
  '79_kb2'],
 'US': ['172_kb2',
  '60_kb2',
  '141_kb2',
  '86_kb2',
  '113_kb2',
  '188_kb2',
  '26_kb2',
  '43_kb2',
  '8_kb2',
  '142_kb2',
  '166_kb2',
  '122_kb2',
  '72_kb2',
  '59_kb2',
  '68_kb2',
  '66_kb2',
  '127_kb2',
  '48_kb2',
  '42_kb2',
  '67_kb2',
  '151_kb2',
  '83_kb2',
  '46_kb2',
  '90_kb2',
  '93_kb2',
  '160_kb2',
  '164_kb2',
  '171_kb2',
  '197_kb2',
  '21_kb2',
  '183_kb2',
  '154_kb2',
  '147_kb2',
  '31_kb2',
  '108_kb2',
  '81_kb2',
  '158_kb2',
  '0_kb2',
  '182_kb2',
  '1_kb2',
  '79_kb2',
  '110_kb2',
  '185_kb2',
  '13_kb2',
 

In [90]:
# Blocking
blocks = {}

for key in dict_db1.keys():
    if key in dict_db2.keys():
        rows_kb1 = dict_db1[key]
        rows_kb1.extend(dict_db2[key])
        blocks[key] = rows_kb1

In [91]:
# Inspect the blocks: shared token -> KB1 row ids followed by KB2 row ids.
blocks

{'Adventure': ['36_kb1',
  '151_kb1',
  '83_kb1',
  '85_kb1',
  '98_kb1',
  '57_kb1',
  '5_kb1',
  '15_kb1',
  '67_kb1',
  '83_kb2',
  '75_kb2',
  '15_kb2',
  '63_kb2',
  '39_kb2',
  '36_kb2',
  '66_kb2',
  '158_kb2'],
 'Action': ['36_kb1',
  '1_kb1',
  '71_kb1',
  '17_kb1',
  '115_kb1',
  '122_kb1',
  '151_kb1',
  '27_kb1',
  '81_kb1',
  '83_kb1',
  '85_kb1',
  '33_kb1',
  '46_kb1',
  '5_kb1',
  '15_kb1',
  '45_kb1',
  '93_kb1',
  '123_kb1',
  '151_kb2',
  '46_kb2',
  '93_kb2',
  '162_kb2',
  '74_kb2',
  '27_kb2',
  '33_kb2',
  '8_kb2',
  '123_kb2',
  '122_kb2'],
 'tt0074108': ['15_kb1', '15_kb2'],
 'Eichberg-Film': ['15_kb1', '15_kb2'],
 'Lord': ['15_kb1', '15_kb2'],
 'Film': ['1_kb1',
  '171_kb1',
  '131_kb1',
  '144_kb1',
  '107_kb1',
  '28_kb1',
  '43_kb1',
  '89_kb1',
  '58_kb1',
  '140_kb1',
  '124_kb1',
  '146_kb1',
  '83_kb1',
  '73_kb1',
  '125_kb1',
  '149_kb1',
  '51_kb1',
  '23_kb1',
  '15_kb1',
  '183_kb1',
  '70_kb1',
  '33_kb1',
  '1_kb2',
  '58_kb2',
  '149_kb2',
  '83

### Attribute Clustering Blocking

In [92]:
# Tokens for each KBs
# KB1
attribute_token_kb1 = {}
for i in list(test_db1):
    attribute_list = test_db1[i].tolist()
    tokens_list = []
    for entry in attribute_list:
        tokens = entry.split(' ')
        tokens_list.extend(tokens)
    # Turn it into a set so that each token appears once
    tokens_list = list(set(tokens_list))
    attribute_token_kb1[i] = tokens_list

# KB2
attribute_token_kb2 = {}
for i in list(test_db2):
    attribute_list = test_db2[i].tolist()
    tokens_list = []
    for entry in attribute_list:
        tokens = entry.split(' ')
        tokens_list.extend(tokens)
    # Turn it into a set so that each token appears once
    tokens_list = list(set(tokens_list))
    attribute_token_kb2[i] = tokens_list

In [93]:
# Inspect the distinct tokens collected per attribute of KB1.
attribute_token_kb1

{'genres': ['Family',
  'Mystery',
  'Fiction',
  'Horror',
  'Music',
  'War',
  'TV',
  'Drama',
  'Movie',
  'Science',
  'Animation',
  'History',
  'Foreign',
  'Romance',
  'Thriller',
  'Crime',
  'Action',
  'Adventure',
  'Comedy',
  'Western',
  'Documentary',
  'Fantasy'],
 'imdb_id': ['tt0111386',
  'tt0295700',
  'tt4933782',
  'tt0036929',
  'tt3612032',
  'tt0815241',
  'tt0084725',
  'tt1674047',
  'tt1410272',
  'tt2034139',
  'tt0044085',
  'tt0286499',
  'tt0058500',
  'tt0053715',
  'tt0107157',
  'tt0323872',
  'tt0245837',
  'tt0081114',
  'tt0772178',
  'tt0034386',
  'tt0097236',
  'tt4631532',
  'tt0076760',
  'tt0270288',
  'tt0028402',
  'tt0095593',
  'tt0100680',
  'tt1045670',
  'tt0064177',
  'tt0069316',
  'tt5370828',
  'tt2559458',
  'tt0062865',
  'tt0223880',
  'tt1843840',
  'tt2070803',
  'tt0349169',
  'tt0018097',
  'tt0050870',
  'tt0331282',
  'tt0067384',
  'tt0115759',
  'tt0083789',
  'tt0018054',
  'tt0116016',
  'tt3345472',
  'tt3529010',

In [95]:
# Write a function for Jaccard similarities
for attribute_kb1, values in attribute_token_kb1.items():
    print(attribute_kb1)
    # Compare with all attributes name in attribute_token_kb2:
    for attribute_kb2 in attribute_token_kb2.keys():
        mutual_tokens = set(attribute_token_kb1[attribute_kb1]).intersection(attribute_token_kb2[attribute_kb2])
        attr_kb1 = values
        attr_kb1.extend(attribute_token_kb2[attribute_kb2])
        total_tokens = set(attr_kb1)
        # Jaccarcd similarity
        print(len(mutual_tokens)/len(total_tokens))

genres
0.9545454545454546
0.0
0.008368200836820083
0.0
0.017766497461928935
imdb_id
0.0
0.3882978723404255
0.005714285714285714
0.0
0.016766467065868262
production_companies
0.0027624309392265192
0.0
0.37919463087248323
0.0
0.02564102564102564
production_countries
0.0
0.0
0.005791505791505791
0.05009633911368015
0.017456359102244388
title
0.013888888888888888
0.0
0.020463847203274217
0.0
0.12927350427350429
