# Build map from MID to name using Freebase Dump

Parse `freebase-rdf-latest.gz` into a `mid_to_name` TSV file. The script takes 1 hour to run on a 2016 MacBook. The code is set up to play a sound when it finishes.

Download http://commondatastorage.googleapis.com/freebase-public/rdf/freebase-rdf-latest.gz beforehand!

In [8]:
import logging
import time
import os
import subprocess

from functools import partial

# Prefix shared by Freebase MID subjects; stripped from the subject when a row is written.
mid_url = '<http://rdf.freebase.com/ns/m.'
# Predicate that identifies a "name" triple.
name_attribute = '<http://rdf.freebase.com/ns/type.object.name>'
estimated_number_of_lines_to_read = 3129197591  # Used for progress
# Wall-clock reference for the "Elapsed" figure in the progress output.
start = time.time()
# Declared global in read_line; not otherwise used in the code visible here.
end = time.time()
# Running counters updated by read_line.
total_lines_read = 0
total_lines_wrote = 0
slimmed_file_path = '/Users/petrochuk/data/simple_qa/source/freebase_dump/mid_to_name_1.tsv'
# The dump is decoded as UTF-8 when read, so write it back as UTF-8 explicitly
# instead of relying on the platform's default locale encoding.
slimmed_file = open(slimmed_file_path, 'w', encoding='utf-8')
dump_path = '/Users/petrochuk/data/simple_qa/source/freebase_dump/freebase-rdf-latest.gz'

def iterate_gzip_fast(file_name, visitor):
    """ Reading GZIP fast
    https://stackoverflow.com/questions/36559626/python-vs-perl-performance-reading-a-gzipped-file

    Decompresses the file with an external `gzip` process and feeds every
    decompressed line (as bytes, including the trailing newline) to `visitor`.

    Args:
        file_name (str): gziped file to read
        visitor (func): func to call on every line of gzip read
    Raises:
        subprocess.CalledProcessError: if gzip exits with an error status, e.g.
            because `file_name` does not exist.
    """
    command = ['gzip', '-cdfq', file_name]
    process = subprocess.Popen(command, stdout=subprocess.PIPE)
    try:
        for line in process.stdout:
            visitor(line)
    except BaseException:
        # The visitor failed or the user interrupted the run: stop gzip so it
        # does not keep decompressing into a pipe nobody reads.
        process.kill()
        raise
    finally:
        # Always release the pipe and reap the child process.
        process.stdout.close()
        process.wait()
    # gzip documents exit status 2 as "warning"; the data was still produced,
    # so only other non-zero statuses are treated as failures.
    if process.returncode not in (0, 2):
        raise subprocess.CalledProcessError(process.returncode, command)

def sound_alarm():
    """ Announce out loud that the run is over.
    Runs the `say` command through the shell (a text-to-speech tool; presumably
    the macOS one, given the notebook was run on a MacBook).
    https://stackoverflow.com/questions/16573051/python-sound-alarm-when-code-finishes
    """
    announcement_command = 'say "Finished!"'
    os.system(announcement_command)

def write_row(file_, split):
    """ Write row to slimmed file like "01001t69\tAfter the Morning"
    Notes:
        Removes language tag (including regional ones such as @en-GB)
        Removes mid_url
        Removes the quotes surrounding the name
    Args:
        file_ (file): file to write line too after parsing
        split (list): line split \t in freebase-rdf-latest.gz like
        [<http://rdf.freebase.com/ns/m.011d7vyl>,<http://rdf.freebase.com/ns/type.object.name>,
        "H.M.K. Smith"@en,.]
    """
    mid = split[0].replace(mid_url, "").replace(">", "")
    literal = split[2]
    # Cut at the LAST '"@' so that only the trailing language tag is dropped:
    # a '"@en' sequence inside the name itself is kept, and a regional tag
    # such as '@en-GB' is removed completely instead of leaving '-GB' behind.
    quoted_name, tag_separator, _language = literal.rpartition('"@')
    if not tag_separator:
        # No language tag at all: just drop the closing quote, if any.
        quoted_name = literal[:-1] if literal.endswith('"') else literal
    # Drop the opening quote of the literal.
    name = quoted_name[1:] if quoted_name.startswith('"') else quoted_name
    file_.write(mid + "\t" + name + "\n")


def read_line(file_, line):
    """ Reads lines that defined english MID names and writes them to file_.

    A line is kept when its predicate is `name_attribute`, its subject contains
    `mid_url` and its object literal carries an English language tag (`@en` or
    a regional variant such as `@en-GB`). Every line, kept or not, is counted
    in `total_lines_read`; progress is printed once per 1000 rows written.

    Args:
        file_ (file): file to write line too after parsing
        line (bytes): utf-8 encoded line in freebase-rdf-latest.gz
    """
    global total_lines_read
    global total_lines_wrote

    fields = line.decode('utf-8').split("\t")
    is_english_name_attribute_triple = False
    # Guard on the field count so a malformed line is skipped instead of
    # raising IndexError in the middle of the run.
    if len(fields) >= 3 and fields[1] == name_attribute and mid_url in fields[0]:
        # Look at the language tag after the closing quote rather than searching
        # the whole literal: a non-English name that merely contains "@en" in
        # its text must not be accepted.
        _, tag_separator, language = fields[2].rpartition('"@')
        is_english_name_attribute_triple = bool(tag_separator) and (
            language == "en" or language.startswith("en-"))
    if is_english_name_attribute_triple:
        # Write to the file handed in by the caller, not to a module global.
        write_row(file_, fields)
        if total_lines_wrote % 1000 == 0:
            print('Slim Lines: %d - Done: %.2f%% - %.1fm Elapsed' % (total_lines_wrote,
                        (total_lines_read / float(estimated_number_of_lines_to_read)) * 100.0,
                        (time.time() - start) / 60.0))
        total_lines_wrote += 1
    total_lines_read += 1

# Stream the dump through read_line. Close the output file even when the run
# fails or is interrupted, so rows still sitting in the write buffer reach disk.
try:
    iterate_gzip_fast(dump_path, partial(read_line, slimmed_file))
finally:
    slimmed_file.close()
# Only reached when the whole dump was processed.
sound_alarm()

Slim Lines: 0 - Done: 0.22% - 0.2m Elapsed
Slim Lines: 1000 - Done: 0.22% - 0.2m Elapsed
Slim Lines: 2000 - Done: 0.22% - 0.2m Elapsed
Slim Lines: 3000 - Done: 0.22% - 0.2m Elapsed
Slim Lines: 4000 - Done: 0.23% - 0.2m Elapsed
Slim Lines: 5000 - Done: 0.23% - 0.2m Elapsed
Slim Lines: 6000 - Done: 0.23% - 0.2m Elapsed
Slim Lines: 7000 - Done: 0.23% - 0.2m Elapsed
Slim Lines: 8000 - Done: 0.23% - 0.2m Elapsed
Slim Lines: 9000 - Done: 0.23% - 0.2m Elapsed
Slim Lines: 10000 - Done: 0.23% - 0.2m Elapsed
Slim Lines: 11000 - Done: 0.23% - 0.2m Elapsed
Slim Lines: 12000 - Done: 0.23% - 0.2m Elapsed
Slim Lines: 13000 - Done: 0.23% - 0.2m Elapsed
Slim Lines: 14000 - Done: 0.24% - 0.2m Elapsed
Slim Lines: 15000 - Done: 0.24% - 0.2m Elapsed
Slim Lines: 16000 - Done: 0.24% - 0.2m Elapsed
Slim Lines: 17000 - Done: 0.24% - 0.2m Elapsed
Slim Lines: 18000 - Done: 0.24% - 0.2m Elapsed
Slim Lines: 19000 - Done: 0.24% - 0.2m Elapsed
Slim Lines: 20000 - Done: 0.24% - 0.2m Elapsed
Slim Lines: 21000 - Done: 

Slim Lines: 175000 - Done: 0.41% - 0.3m Elapsed
Slim Lines: 176000 - Done: 0.41% - 0.3m Elapsed
Slim Lines: 177000 - Done: 0.41% - 0.3m Elapsed
Slim Lines: 178000 - Done: 0.41% - 0.3m Elapsed
Slim Lines: 179000 - Done: 0.41% - 0.3m Elapsed
Slim Lines: 180000 - Done: 0.41% - 0.3m Elapsed
Slim Lines: 181000 - Done: 0.41% - 0.3m Elapsed
Slim Lines: 182000 - Done: 0.41% - 0.3m Elapsed
Slim Lines: 183000 - Done: 0.41% - 0.3m Elapsed
Slim Lines: 184000 - Done: 0.41% - 0.3m Elapsed
Slim Lines: 185000 - Done: 0.41% - 0.3m Elapsed
Slim Lines: 186000 - Done: 0.41% - 0.3m Elapsed
Slim Lines: 187000 - Done: 0.41% - 0.3m Elapsed
Slim Lines: 188000 - Done: 0.41% - 0.3m Elapsed
Slim Lines: 189000 - Done: 0.42% - 0.3m Elapsed
Slim Lines: 190000 - Done: 0.42% - 0.3m Elapsed
Slim Lines: 191000 - Done: 0.42% - 0.3m Elapsed
Slim Lines: 192000 - Done: 0.42% - 0.3m Elapsed
Slim Lines: 193000 - Done: 0.42% - 0.3m Elapsed
Slim Lines: 194000 - Done: 0.42% - 0.3m Elapsed
Slim Lines: 195000 - Done: 0.42% - 0.3m 

KeyboardInterrupt: 