# FB2M & FB5M Subject MID to Subject Name

Using the full Freebase KG we query for subject aliases per every MID with relations like `type.object.name`, `common.topic.alias`, `rdf-schema#label` and `medicine.drug_formulation.brand_names`.

In [4]:
import sys
sys.path.insert(0, '../../')
from lib.data import FB5M_KG
from lib.data import FB2M_KG
from lib.data import FB2M_NAME_TABLE
from lib.data import FB5M_NAME_TABLE
from lib.connect import get_connection 

connection = get_connection()
cursor = connection.cursor()

tables = [(FB5M_KG, FB5M_NAME_TABLE), (FB2M_KG, FB2M_NAME_TABLE)]

In [5]:
def get_all_subject_mids(kg_table_name):
    """ Get all subject MIDs in the KG """
    all_subject_mids = set()
    cursor.execute("""SELECT subject_mid FROM %s""" % (kg_table_name,))
    for (mid,) in cursor.fetchall():
        all_subject_mids.add(mid)
    return all_subject_mids

In [6]:
for (_, alias_table_name) in tables:
    cursor.execute("""
        CREATE TABLE %s
            (mid varchar NOT NULL,
            alias varchar NOT NULL,
            PRIMARY KEY(mid, alias));""" % (alias_table_name,))

ERROR:root:An unexpected error occurred while tokenizing input
The following traceback may be corrupted or invalid
The error message is: ('EOF in multi-line string', (1, 37))



ProgrammingError: relation "fb_two_subject_name" already exists


In [None]:
import psycopg2
from tqdm import tqdm_notebook

chunk_size = 10000

def insert_chunk(rows, alias_table_name):
    insert_query = 'INSERT INTO ' + alias_table_name + ' (mid, alias) VALUES %s ON CONFLICT DO NOTHING;'
    psycopg2.extras.execute_values(
        cursor, insert_query, rows, template=None
    )
    
def get_aliases(mid):
    sql = """
        SELECT object
        FROM fb
        WHERE subject = '<http://rdf.freebase.com/ns/m.%s>'
        AND object LIKE '%%@en'
        AND relation IN ('<http://rdf.freebase.com/ns/type.object.name>',
                         '<http://rdf.freebase.com/ns/common.topic.alias>',
                         '<http://www.w3.org/2000/01/rdf-schema#label>',
                         '<http://rdf.freebase.com/ns/medicine.drug_formulation.brand_names>')
        """ % (mid,)
    cursor.execute(sql)
    aliases = [row[0].replace('@en', '').strip('"').lower() for row in cursor.fetchall()]
    # NOTE: Questions are all under 300 characters some aliases are not
    return set([a for a in aliases if len(a) < 300])

def get_replace_mid(mid):
    # MID may have been replaced if there are no aliases 
    # <http://rdf.freebase.com/ns/dataworld.gardening_hint.replaced_by>
    sql = """
        SELECT object
        FROM fb
        WHERE subject = '<http://rdf.freebase.com/ns/m.%s>'
        AND relation = '<http://rdf.freebase.com/ns/dataworld.gardening_hint.replaced_by>'
    """ % (mid,)
    cursor.execute(sql)
    mids = [row[0] for row in cursor.fetchall()]
    assert len(mids) <= 1
    if len(mids) == 1:
        return mids[0].replace('<http://rdf.freebase.com/ns/m.', '').rstrip('>')
    return None

for (kg_table_name, alias_table_name) in tables:
    all_mids = get_all_subject_mids(kg_table_name)
    print('Got %d mids' % len(all_mids))
    print('Sample:', list(all_mids)[:5])

    rows = []
    for mid in tqdm_notebook(all_mids):
        # Build Chunks
        aliases = get_aliases(mid)
        if len(aliases) == 0:
            other_mid = get_replace_mid(mid)
            if other_mid is not None:
                aliases = get_aliases(other_mid)

        rows.extend([tuple([mid, alias]) for alias in aliases])

        # Insert Chunk
        if len(rows) > chunk_size:
            insert_chunk(rows, alias_table_name)
            rows = []

    insert_chunk(rows, alias_table_name)

In [None]:
connection.commit()

Add indexes to the DB that will be useful in other notebooks.

In [None]:
for (_, alias_table_name) in tables:
    cursor.execute('CREATE INDEX %s_alias_index ON %s (alias);' % (alias_table_name, alias_table_name))
    cursor.execute('CREATE INDEX %s_mid_index ON %s (mid);' % (alias_table_name, alias_table_name))
    cursor.execute('CREATE INDEX %s_mid_alias_index ON %s (mid, alias);' % (alias_table_name, alias_table_name))
    connection.commit()

In [None]:
cursor.close()
connection.close()