# Full Freebase to DB

The dump of Freebase can be found here: https://developers.google.com/freebase

For background on the identifier namespaces used in Freebase (`m.` versus `g.` ids), see: https://stackoverflow.com/questions/24272840/freebase-g-vs-m-namespace-ids

In [2]:
import sys
sys.path.insert(0, '../../')
from scripts.utils.connect import get_connection 

# Open the database connection through the project's helper.
# NOTE(review): presumably a psycopg2 connection, since a later cell calls
# psycopg2.extras.execute_values on its cursor -- confirm in scripts/utils/connect.
connection = get_connection()
# One cursor, reused by every cell that follows.
cursor = connection.cursor()

# Relative path to the gzip-compressed Freebase RDF dump.
fb = '../../data/freebase-rdf-latest.gz'

In [None]:
# Create the table holding one Freebase triple per row: a surrogate serial
# key plus the subject, relation and object strings (all required).
# Column definitions are separated by exactly one comma; a doubled comma is
# rejected by the SQL parser and the table would never be created.
cursor.execute("""
    CREATE TABLE fb
        (id SERIAL PRIMARY KEY,
        subject varchar NOT NULL,
        relation varchar NOT NULL,
        object varchar NOT NULL);""")

In [None]:
from tqdm import tqdm_notebook
import subprocess
import random

# Row-buffer threshold: the loading loop compares the number of buffered rows
# against this value to decide when to flush them with one bulk INSERT.
chunk_size = 10000

    
def insert_chunk(rows):
    """Bulk-insert a batch of triples into the ``fb`` table.

    Args:
        rows: Sequence of ``(subject, relation, object)`` tuples.

    The statement is executed on the module-level ``cursor``; this function
    does not commit.
    """
    # Imported locally so the function does not depend on an earlier cell
    # having bound the name ``psycopg2`` in the notebook namespace.
    import psycopg2.extras

    insert_query = 'INSERT INTO fb (subject, relation, object) VALUES %s ON CONFLICT DO NOTHING;'
    psycopg2.extras.execute_values(
        cursor, insert_query, rows, template=None, page_size=100
    )
    
def is_relevant(split, all_mids):
    """Tell whether any field of a triple refers to a wanted Freebase mid.

    Args:
        split: Iterable of string fields taken from one dump line.
        all_mids: Container of mids (without the ``m.`` namespace prefix)
            that supports the ``in`` operator.

    Returns:
        True if at least one field contains the ``m.`` namespace URI and,
        once the angle brackets and that URI are removed, is found in
        ``all_mids``; False otherwise.
    """
    namespace = 'http://rdf.freebase.com/ns/m.'
    # Lazily reduce each field that mentions the namespace to its bare mid.
    bare_mids = (
        field.rstrip('>').lstrip('<').replace(namespace, '')
        for field in split
        if namespace in field
    )
    return any(candidate in all_mids for candidate in bare_mids)

# Stream the dump through an external ``gzip`` process, keep only the triples
# that mention a wanted mid, and bulk-insert them in batches.
rows = []
# Using Popen as a context manager closes the pipe and waits for the child
# process on exit, even if the loop raises part-way through.
with subprocess.Popen(['gzip', '-cdfq', fb], stdout=subprocess.PIPE) as gzip:
    for i, line in tqdm_notebook(enumerate(gzip.stdout), total=3130696870):
        line = line.decode('UTF-8')

        # Build Chunks
        # Only the first three tab-separated fields are kept.
        split = line.split('\t')[:3]
        # Explicit check instead of ``assert`` so it still runs under ``-O``.
        if len(split) != 3:
            raise ValueError(
                'Malformed row at line {}: {!r}'.format(i + 1, line))
        if is_relevant(split, all_mids):
            rows.append(tuple(split))

        # Insert Chunk
        # Flush as soon as the buffer holds ``chunk_size`` rows.
        if len(rows) >= chunk_size:
            insert_chunk(rows)
            rows = []

# Flush whatever is left over from the last, partially filled batch.
if rows:
    insert_chunk(rows)

# gzip exits with 0 on success and 2 on a mere warning; anything else means
# the dump was not fully decompressed and the load is incomplete.
if gzip.returncode not in (0, 2):
    raise subprocess.CalledProcessError(gzip.returncode, gzip.args)

In [None]:
# Commit the statements issued so far on this connection (table creation and
# bulk inserts). NOTE(review): assumes the connection is not in autocommit
# mode -- confirm against get_connection.
connection.commit()

In [None]:
# Add an index on fb.subject, then commit the index creation.
# This cell comes after the bulk load and its commit.
cursor.execute("""
    CREATE INDEX fb_subject_index ON fb(subject);""")
connection.commit()

In [None]:
# Release the database resources: the cursor first, then its connection.
cursor.close()
connection.close()