In [33]:
import pandas as pd

# Flatten the raw annotation records into one row per query and save them as parquet.
queries = load_queries()

# Positions read from each record: 0 -> query id, 1 -> query text, 2 -> start node,
# 5 -> list of answer dicts (only the 'AnswerArgument' value of each is kept).
simple_queries = [
    {
        "query_id": record[0],
        "start_node": record[2],
        "query": record[1],
        "answers": [answer['AnswerArgument'] for answer in record[5]],
    }
    for record in queries
]

df = pd.DataFrame(simple_queries)
df.to_parquet("queries.parquet")

In [43]:
from collections import defaultdict
from gensim import models
from nltk import word_tokenize
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import json



# Directory that holds every input file used by this notebook. Change this one
# value to run the notebook from another machine or checkout.
DATA_DIR        = '/Users/hunjunsin/Desktop/Jun/Unsupervised/hw6'

# Individual input files, all resolved under DATA_DIR.
W2V_PATH        = f'{DATA_DIR}/emd.dat'
GRAPH_PATH      = f'{DATA_DIR}/graph.txt'
ANNOT_PATH      = f'{DATA_DIR}/annotations.txt'
NODE_LOOKUP_JS  = f'{DATA_DIR}/node_label_lookup.json'
QUERY_PARQUET   = f'{DATA_DIR}/queries.parquet'

# Loaded once at module level; get_rel_score_word2vecbase reads embeddings from it.
word2vec_model = models.Word2Vec.load(W2V_PATH)

def get_rel_score_word2vecbase(rel: str, query: str) -> float:
    """

    Score how well a graph relation matches a query using the word2vec embeddings.

    The relation is looked up under its 'ns:'-prefixed name. The query is lower-cased and tokenized, and the score
    is the mean cosine similarity between the relation embedding and each query token found in the vocabulary.

    Args:
        rel: Relation name, with or without the leading 'ns:'.
        query: Query text.

    Returns: Mean cosine similarity as a float, or 0.0 when the relation or every query token is missing from the
    model vocabulary.

    """
    vectors = word2vec_model.wv
    rel_key = rel if rel.startswith('ns:') else 'ns:' + rel

    # Unknown relation: there is nothing to compare the query against.
    if rel_key not in vectors:
        return 0.0

    token_vectors = []
    for token in word_tokenize(query.lower()):
        if token in vectors:
            token_vectors.append(vectors[token])

    # None of the query tokens are in the vocabulary.
    if len(token_vectors) == 0:
        return 0.0

    similarities = cosine_similarity(token_vectors, [vectors[rel_key]])
    return float(np.mean(similarities))


def load_node_label_lookup(filepath: str) -> dict:
    """

    Read the node lookup table from a json file.

    Args:
        filepath: Location of the json file holding the lookup table.

    Returns: The parsed json content, expected to be a dictionary of node ids to text descriptions.

    """
    # Read the raw bytes first; json.loads detects the text encoding itself.
    with open(filepath, 'rb') as lookup_file:
        raw_bytes = lookup_file.read()
    lookup_table = json.loads(raw_bytes)
    return lookup_table


def load_query_df(filepath: str) -> pd.DataFrame:
    """

    Read the simplified query table from a parquet file.

    The table is a flattened version of the original nested queries: each row carries the query text, the node to
    start the traversal from, and the expected answers for that query, so it can be processed row by row.

    Args:
        filepath: Location of the parquet file.

    Returns: Dataframe with one query per row.

    """
    query_table = pd.read_parquet(filepath)
    return query_table


# Function to load the graph from file
def load_graph(filepath: str = None) -> dict:
    """

    Load the graph from the given file.

    Every non-blank line of the file must be a Python literal sequence whose first three items are the source
    node_id, the relation, and the destination node_id.

    Args:
        filepath: Path to the graph file. When omitted, the module-level GRAPH_PATH is used.

    Returns: Graph, in form of node_id key, and nested list value. Nested list is adjacency list, with each list
    containing the relation, and destination node_id. The returned object is a defaultdict(list), so indexing a
    missing node_id with [] inserts an empty list; use .get() for read-only lookups.

    Raises:
        ValueError / SyntaxError: If a line is not a valid Python literal.

    """
    import ast  # local import so this function does not depend on the notebook's import cell

    if filepath is None:
        filepath = GRAPH_PATH

    graph = defaultdict(list)
    # The context manager closes the file even if a line fails to parse.
    with open(filepath) as graph_file:
        for raw_line in graph_file:
            # strip() instead of [:-1]: a final line without a trailing newline keeps its last character.
            raw_line = raw_line.strip()
            if not raw_line:
                continue
            # literal_eval only accepts literals, so file contents can no longer execute arbitrary code
            # (the previous eval() could).
            edge = ast.literal_eval(raw_line)
            graph[edge[0]].append([edge[1], edge[2]])
    return graph


# Function to load the queries from file
# Preparing the queries
def load_queries(filepath: str = None) -> list:
    """

    Load the original queries file. This format can be extremely confusing, for a simplified format use load_query_df.

    Every non-blank line of the file must be a Python literal; each parsed line becomes one entry of the result.

    Args:
        filepath: Path to the annotations file. When omitted, the module-level ANNOT_PATH is used.

    Returns: List with one parsed record per non-blank line, in file order.

    Raises:
        ValueError / SyntaxError: If a line is not a valid Python literal.

    """
    import ast  # local import so this function does not depend on the notebook's import cell

    if filepath is None:
        filepath = ANNOT_PATH

    queries = []
    # The context manager closes the file even if a line fails to parse.
    with open(filepath) as annotation_file:
        for raw_line in annotation_file:
            # strip() instead of [:-1]: a final line without a trailing newline keeps its last character.
            raw_line = raw_line.strip()
            if not raw_line:
                continue
            # literal_eval only accepts literals, so file contents can no longer execute arbitrary code
            # (the previous eval() could).
            queries.append(ast.literal_eval(raw_line))
    return queries

In [36]:
# Lookup table loaded from the node label json file.
lookup = load_node_label_lookup(NODE_LOOKUP_JS)

In [37]:
# Sanity check: display the label stored for one node id.
lookup['m.09c7w0']

'United States of America'

In [38]:
# Load the simplified query table and preview its first three rows.
# NOTE(review): the name `query` is rebound to a plain string by the evaluation loop in a later cell.
query = load_query_df(QUERY_PARQUET)
query.head(3)

Unnamed: 0,query_id,start_node,query,answers
0,1,m.09c7w0,what time zones are there in the us,"[m.027wj2_, m.027wjl3, m.02fqwt, m.02hcv8, m.0..."
1,2,m.09c7w0,what are major exports of the usa,"[m.015smg, m.03q9wp2, m.03qtd_n, m.03qtf10]"
2,3,m.07b_l,what time is right now in texas,"[m.02fqwt, m.02hczc]"


In [39]:
from collections import deque

def beam_bfs(query, start_node, graph, beam_width=10, max_depth=4, threshold=0.3, score_fn=None):
    """

    Beam-limited breadth-first traversal of the graph, guided by relation/query similarity.

    Starting from start_node, each depth expands the current frontier, drops every edge whose relation score is
    below threshold, scores each remaining path as the product of its relation scores, and keeps the beam_width
    best paths as the next frontier. Every node kept in the beam is reported as an answer.

    A node is expanded at most once. A node that re-enters the beam after being expanded (for example through an
    edge leading back to the start node) is still reported as an answer, but it is not expanded again.

    Args:
        query: Query text, passed unchanged to the scoring function.
        start_node: node_id the traversal starts from.
        graph: Mapping of node_id to a list of [relation, destination node_id] pairs; only .get() is used.
        beam_width: Number of best-scoring paths kept after each depth.
        max_depth: Maximum number of expansion rounds.
        threshold: Minimum relation score an edge needs to be followed.
        score_fn: Callable (relation, query) -> float. Defaults to get_rel_score_word2vecbase.

    Returns: Tuple (answers, path). answers is the set of node_ids kept in the beam at any depth. path is the list
    of (relation, node_id) hops of the highest-scoring kept path (the shallowest one on ties), or [] when nothing
    was kept.

    """
    if score_fn is None:
        score_fn = get_rel_score_word2vecbase

    # The query is fixed for the whole search, so each distinct relation only needs to be scored once.
    rel_scores = {}

    def score_relation(rel):
        if rel not in rel_scores:
            rel_scores[rel] = score_fn(rel, query)
        return rel_scores[rel]

    visited = set()
    answers = set()
    frontier = [(start_node, [], 1.0)]  # (current_node, path, score)
    best_path, best_score = [], float('-inf')

    for _ in range(max_depth):
        if not frontier:
            # Nothing left to expand; further rounds would be no-ops.
            break

        candidates = []
        for current_node, path, score in frontier:
            if current_node in visited:
                continue
            visited.add(current_node)

            for rel, neighbor in graph.get(current_node, []):
                rel_score = score_relation(rel)
                # Edges scoring below the threshold are neither answers nor expanded further.
                if rel_score < threshold:
                    continue
                candidates.append((neighbor, path + [(rel, neighbor)], score * rel_score))

        # Keep only the beam_width best candidates; the sort is stable, so ties keep discovery order.
        candidates.sort(key=lambda item: item[2], reverse=True)
        frontier = candidates[:beam_width]

        for node, node_path, node_score in frontier:
            answers.add(node)
            # Strict '>' keeps the earliest path on ties.
            if node_score > best_score:
                best_path, best_score = node_path, node_score

    return answers, best_path

In [40]:
from sklearn.metrics import f1_score

def evaluate(predicted, actual):
    """

    F1 score between a predicted and an expected collection of nodes.

    Both inputs are treated as sets, so duplicates and ordering are ignored. With TP = nodes in both,
    FP = predicted only and FN = expected only, the score is 2*TP / (2*TP + FP + FN).

    Args:
        predicted: Iterable of predicted node ids.
        actual: Iterable of expected node ids.

    Returns: F1 score as a float in [0, 1]. 0.0 when the two collections share no node, including when either or
    both are empty.

    """
    predicted_set = set(predicted)
    actual_set = set(actual)

    true_pos = len(predicted_set & actual_set)
    if true_pos == 0:
        # No overlap (or nothing to compare): the score is 0 and the division below is skipped.
        return 0.0

    false_pos = len(predicted_set) - true_pos
    false_neg = len(actual_set) - true_pos
    return 2 * true_pos / (2 * true_pos + false_pos + false_neg)

In [41]:
# Run the beam search for every query and report the mean F1 over all of them.
graph = load_graph()
df = load_query_df(QUERY_PARQUET)

f1_scores, paths = [], []
for query, start, answers in zip(df['query'], df['start_node'], df['answers']):
    gold_answers = set(answers)
    pred_answers, path = beam_bfs(query, start, graph)
    f1_scores.append(evaluate(pred_answers, gold_answers))
    paths.append(path)

print(f"Avg F1 Score: {np.mean(f1_scores):.4f}")
print(len(paths))

Avg F1 Score: 0.4104
56


In [47]:
# Inspect the outgoing [relation, destination node_id] edges stored for one node.
graph['m.01_d4']

[['location.citytown.postal_codes', 'm.07nqmhw'],
 ['location.citytown.postal_codes', 'm.07nqmhf'],
 ['location.citytown.postal_codes', 'm.01_6pxw'],
 ['location.location.contains', 'm.021czc'],
 ['location.citytown.postal_codes', 'm.07nqmj9'],
 ['travel.travel_destination.tourist_attractions', 'm.02vv_6w'],
 ['location.citytown.postal_codes', 'm.01_71cx'],
 ['location.citytown.postal_codes', 'm.07nqmfx'],
 ['location.citytown.postal_codes', 'm.01_6y3n'],
 ['location.citytown.postal_codes', 'm.07nqmj2'],
 ['location.citytown.postal_codes', 'm.01_6p6k'],
 ['location.citytown.postal_codes', 'm.0gs227r'],
 ['location.citytown.postal_codes', 'm.01_6s_6'],
 ['travel.travel_destination.tourist_attractions', 'm.02rvwrv'],
 ['location.location.contains', 'm.01ky66'],
 ['location.citytown.postal_codes', 'm.07nqmkm'],
 ['base.biblioness.bibs_location.country', 'm.09c7w0'],
 ['location.citytown.postal_codes', 'm.01_6qrp'],
 ['location.citytown.postal_codes', 'm.01_73_n'],
 ['location.citytown.pos