In [82]:
import ast
import os
import random

import pandas as pd

In [83]:
movies_dataset_path = 'movies_metadata.csv'
cols = ['genres', 'imdb_id', 'title', 'production_companies', 'production_countries']

# Load only the columns listed in `cols` and discard every row with a missing value
movies_df = pd.read_csv(movies_dataset_path, usecols=cols).dropna()

# Keep a row only if none of its list-like columns is the empty-list literal '[]'
movies_df = movies_df.loc[
    movies_df['genres'].ne('[]')
    & movies_df['production_companies'].ne('[]')
    & movies_df['production_countries'].ne('[]')
].reset_index(drop=True)

In [84]:
# Fixed seeds so that Restart & Run All draws the same rows every time.
# Each draw gets its own seed: reusing a single seed for kb1 and kb2 would
# select exactly the same 120 rows for both.
SEED = 42

# Choose 200 random rows
random_movies_df = movies_df.sample(n=200, random_state=SEED)

# Make 2 separate KBs from random_movies_df.
# Both are drawn from the same 200 rows, so they share at least 40 movies.
kb1 = random_movies_df.sample(n=120, random_state=SEED + 1)
kb2 = random_movies_df.sample(n=120, random_state=SEED + 2)

In [85]:
# Preview kb1 before preprocessing
kb1.head()

Unnamed: 0,genres,imdb_id,production_companies,production_countries,title
26088,"[{'id': 35, 'name': 'Comedy'}]",tt3115242,"[{'name': 'Canal+', 'id': 5358}, {'name': ""Mot...","[{'iso_3166_1': 'FR', 'name': 'France'}, {'iso...",The Conquerors
12287,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",tt0988045,"[{'name': 'Village Roadshow Pictures', 'id': 7...","[{'iso_3166_1': 'DE', 'name': 'Germany'}, {'is...",Sherlock Holmes
26222,"[{'id': 18, 'name': 'Drama'}]",tt4819804,"[{'name': 'Pica Pica Media Limited', 'id': 639...","[{'iso_3166_1': 'HK', 'name': 'Hong Kong'}]",Hong Kong Trilogy: Preschooled Preoccupied Pre...
20709,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",tt0056138,"[{'name': 'Mirisch Corporation, The', 'id': 13...","[{'iso_3166_1': 'US', 'name': 'United States o...",Kid Galahad
26318,"[{'id': 37, 'name': 'Western'}]",tt0044490,"[{'name': 'Warner Bros.', 'id': 6194}]","[{'iso_3166_1': 'US', 'name': 'United States o...",Cattle Town


In [86]:
# Preview kb2 before preprocessing
kb2.head()

Unnamed: 0,genres,imdb_id,production_companies,production_countries,title
19291,"[{'id': 35, 'name': 'Comedy'}, {'id': 27, 'nam...",tt3416742,"[{'name': 'Unison Films', 'id': 2372}, {'name'...","[{'iso_3166_1': 'NZ', 'name': 'New Zealand'}]",What We Do in the Shadows
28061,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",tt3613648,"[{'name': 'ARP Sélection', 'id': 189}, {'name'...","[{'iso_3166_1': 'FR', 'name': 'France'}]",Families
25146,"[{'id': 53, 'name': 'Thriller'}]",tt1877647,"[{'name': 'Chiller Films', 'id': 12671}]","[{'iso_3166_1': 'US', 'name': 'United States o...",Ghoul
22296,"[{'id': 18, 'name': 'Drama'}, {'id': 53, 'name...",tt0036886,"[{'name': 'Hunt Stromberg Productions', 'id': ...","[{'iso_3166_1': 'US', 'name': 'United States o...",Guest in the House
12287,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",tt0988045,"[{'name': 'Village Roadshow Pictures', 'id': 7...","[{'iso_3166_1': 'DE', 'name': 'Germany'}, {'is...",Sherlock Holmes


In [87]:
# Preprocessing: convert each stringified list of dictionaries into a space-separated string of feature names
# (ISO country codes for 'production_countries').
# E.g. a 'genres' cell becomes 'Drama Action Documentary'
def dicts_list_to_str(df: pd.DataFrame, colnames) -> pd.DataFrame:
    """Flatten stringified lists of dicts into space-separated feature strings.

    Every cell of the columns in ``colnames`` must hold the text of a Python
    list of dicts, e.g. ``"[{'id': 18, 'name': 'Drama'}]"``. Each cell is
    replaced by the space-joined values of one key taken from every dict:
    ``'iso_3166_1'`` for the ``'production_countries'`` column and ``'name'``
    for any other column.

    Note: values are joined with a single space, so a value that itself
    contains a space cannot be told apart from two values afterwards.

    Args:
        df: Source frame; it is not modified.
        colnames: Iterable of column names to flatten.

    Returns:
        A copy of ``df`` with the listed columns converted to plain strings.

    Raises:
        ValueError, SyntaxError: If a cell is not a valid Python literal.
        KeyError: If a column or the expected dict key is missing.
    """
    def _flatten(cell, key):
        # literal_eval only accepts Python literals, so text coming from the
        # CSV can never execute code the way eval() would allow.
        entries = ast.literal_eval(cell)
        return ' '.join(entry[key] for entry in entries)

    df_clone = df.copy()
    for col in colnames:
        key = 'iso_3166_1' if col == 'production_countries' else 'name'
        # Assign the whole column: writing to the `row` yielded by iterrows()
        # may only change a temporary copy and leave the frame untouched.
        df_clone[col] = [_flatten(cell, key) for cell in df_clone[col]]

    return df_clone

In [88]:
# Columns whose cells are stringified lists of dicts and need flattening
columns_to_modify = ['genres', 'production_companies', 'production_countries']
# dicts_list_to_str returns a modified copy; kb1 itself stays as sampled
kb1_modified = dicts_list_to_str(kb1, columns_to_modify)

In [89]:
# Check that the list-like columns of kb1 are now plain space-separated strings
kb1_modified.head()

Unnamed: 0,genres,imdb_id,production_companies,production_countries,title
26088,Comedy,tt3115242,Canal+ Moteur s'il vous plaît Iota Production,FR BE,The Conquerors
12287,Action Adventure Crime Mystery,tt0988045,Village Roadshow Pictures Silver Pictures Warn...,DE GB US,Sherlock Holmes
26222,Drama,tt4819804,Pica Pica Media Limited,HK,Hong Kong Trilogy: Preschooled Preoccupied Pre...
20709,Comedy Drama Music,tt0056138,"Mirisch Corporation, The",US,Kid Galahad
26318,Western,tt0044490,Warner Bros.,US,Cattle Town


In [90]:
# Apply the same flattening to kb2 (columns_to_modify is defined in the kb1 cell)
kb2_modified = dicts_list_to_str(kb2, columns_to_modify)

In [91]:
# Check that the list-like columns of kb2 are now plain space-separated strings
kb2_modified.head()

Unnamed: 0,genres,imdb_id,production_companies,production_countries,title
19291,Comedy Horror,tt3416742,Unison Films Defender Films Funny or Die,NZ,What We Do in the Shadows
28061,Comedy Drama,tt3613648,ARP Sélection Poisson Rouge Pictures,FR,Families
25146,Thriller,tt1877647,Chiller Films,US,Ghoul
22296,Drama Thriller,tt0036886,Hunt Stromberg Productions,US,Guest in the House
12287,Action Adventure Crime Mystery,tt0988045,Village Roadshow Pictures Silver Pictures Warn...,DE GB US,Sherlock Holmes


In [93]:
def modified_kb2(df: pd.DataFrame, randomly_removed_cols) -> pd.DataFrame:
    """Return a noisy copy of ``df``: drop one feature per cell and add title typos.

    For every column in ``randomly_removed_cols`` each cell is split on single
    spaces. A cell with more than one token is shuffled and loses one token
    (so the surviving tokens may also change order); a cell with a single
    token is kept as is. Because the split is on spaces, a multi-word value
    counts as several tokens.

    For the ``'title'`` column, titles with more than two space-separated
    words get every ``'a'`` replaced by ``'4'`` and every ``'i'`` by ``'j'``.

    Randomness comes from the global ``random`` module; call ``random.seed``
    beforehand for reproducible output.

    Args:
        df: Source frame with string-valued columns; it is not modified.
        randomly_removed_cols: Column names from which one feature is removed.

    Returns:
        A modified copy of ``df``.

    Raises:
        KeyError: If a listed column or ``'title'`` is missing.
    """
    def _drop_random_feature(features_str):
        feature_list = features_str.split(' ')
        if len(feature_list) > 1:
            # Shuffle, then drop the last element: removes one feature at random
            random.shuffle(feature_list)
            feature_list.pop()
        return ' '.join(feature_list)

    def _add_typos(title):
        # Only titles with more than 2 words are altered
        if len(title.split(' ')) > 2:
            return title.replace('a', '4').replace('i', 'j')
        return title

    df_clone = df.copy()

    # Write back through .iat: assigning to the `row` yielded by iterrows() may
    # only change a temporary copy and leave the frame untouched.
    # Rows are visited in order, columns within each row, so a seeded `random`
    # stream is consumed in the same order as a row-by-row loop.
    col_positions = [df_clone.columns.get_loc(col) for col in randomly_removed_cols]
    for row_pos in range(len(df_clone)):
        for col_pos in col_positions:
            df_clone.iat[row_pos, col_pos] = _drop_random_feature(df_clone.iat[row_pos, col_pos])

    df_clone['title'] = [_add_typos(title) for title in df_clone['title']]
    return df_clone

In [101]:
randomly_remove_cols = ['genres', 'production_countries']

# modified_kb2 draws from the global `random` module; seed it right before the
# call so that re-running this cell removes the same features every time.
random.seed(42)
kb2_processed = modified_kb2(kb2_modified, randomly_remove_cols)

### Final KBs

In [97]:
# Final KB 1: flattened columns, no noise added
kb1_modified.head()

Unnamed: 0,genres,imdb_id,production_companies,production_countries,title
26088,Comedy,tt3115242,Canal+ Moteur s'il vous plaît Iota Production,FR BE,The Conquerors
12287,Action Adventure Crime Mystery,tt0988045,Village Roadshow Pictures Silver Pictures Warn...,DE GB US,Sherlock Holmes
26222,Drama,tt4819804,Pica Pica Media Limited,HK,Hong Kong Trilogy: Preschooled Preoccupied Pre...
20709,Comedy Drama Music,tt0056138,"Mirisch Corporation, The",US,Kid Galahad
26318,Western,tt0044490,Warner Bros.,US,Cattle Town


In [100]:
# Final KB 2: flattened columns plus the noise added by modified_kb2
kb2_processed.head()

Unnamed: 0,genres,imdb_id,production_companies,production_countries,title
19291,Horror,tt3416742,Unison Films Defender Films Funny or Die,NZ,Wh4t We Do jn the Sh4dows
28061,Drama,tt3613648,ARP Sélection Poisson Rouge Pictures,FR,Families
25146,Thriller,tt1877647,Chiller Films,US,Ghoul
22296,Thriller,tt0036886,Hunt Stromberg Productions,US,Guest jn the House
12287,Crime Adventure Action,tt0988045,Village Roadshow Pictures Silver Pictures Warn...,US GB,Sherlock Holmes
