In [None]:
from glob import glob
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
import pickle
import os
def process_ann(sentence: str):
    result = {}
    for score, mention, entity_title, entity_id, uri in Annotate(sentence, theta=0.05).values():
        if entity_title in result:
            if len(result[entity_title]) < len(mention):
                result[entity_title] = mention
        else:
            result[entity_title] = mention
    return result.values()
save_dir = "./entities"
os.makedirs(save_dir, exist_ok=True)
for domain in ("rest", "laptop", "service", "device"):
    entities = []
    for file in glob(f"../data/{domain}.*.txt"):
        counts = sum(1 for _ in open(file))
        sentences = [line.split("***")[0] for line in open(file).read().splitlines()]

        with ThreadPoolExecutor(max_workers=100) as t:
            for future in tqdm(as_completed(
                [t.submit(process_ann, sentence) for sentence in sentences]),
                               total=counts,
                               desc=file):
                entities.extend(future.result())
    with open(os.path.join(save_dir, f"{domain}.pkl"), "wb") as f:
        pickle.dump(entities, f)

In [None]:
from wikidata.client import Client
import urllib
proxies = {"http": "http://127.0.0.1:7890", "https": "http://127.0.0.1:7890"}
# 设置代理
handler = urllib.request.ProxyHandler(proxies)
opener = urllib.request.build_opener(handler)
client = Client(opener=opener)


In [11]:
from tag_utils import Annotate, Annotation_mentions

txt = 'The wine list is excellent .'
obj = Annotation_mentions(txt)
for i in obj.keys():
    print(i + "  " + obj[i])
print("=" * 30)
obj = Annotate(txt, theta=0.05)
for k in obj:
    print(obj[k][2], " ---> ", obj[k][1], obj[k][0], obj[k][-1])


wine  0.12134100496768951
wine list  0.04098360612988472
Wine  --->  wine 0.29664087295532227 https://en.wikipedia.org/wiki/Wine
Wine list  --->  wine list 0.2564621865749359 https://en.wikipedia.org/wiki/Wine_list


In [None]:
from qwikidata.entity import WikidataItem, WikidataLexeme, WikidataProperty
from qwikidata.linked_data_interface import get_entity_dict_from_api

# create an item representing "Douglas Adams"
Q_DOUGLAS_ADAMS = "Q42"
q42_dict = get_entity_dict_from_api(Q_DOUGLAS_ADAMS)
q42 = WikidataItem(q42_dict)

# create a property representing "subclass of"
P_SUBCLASS_OF = "P279"
p279_dict = get_entity_dict_from_api(P_SUBCLASS_OF)
p279 = WikidataProperty(p279_dict)

# create a lexeme representing "bank"
L_BANK = "L3354"
l3354_dict = get_entity_dict_from_api(L_BANK)
l3354 = WikidataLexeme(l3354_dict)

In [1]:
from qwikidata.sparql import return_sparql_query_results, get_subclasses_of_item
# send any sparql query to the wikidata query service and get full result back
# here we use an example that counts the number of humans
qid = "Q730298"
sparql_query = """
SELECT ?pLabel WHERE {{
  wd:{} wdt:P279 ?p .
  SERVICE wikibase:label {{
    bd:serviceParam wikibase:language "en" .
   }}
}}
""".format(qid)
results = return_sparql_query_results(sparql_query)
uris = [binding["pLabel"]["value"] for binding in results["results"]["bindings"]]
uris

['Thai cuisine', 'noodle dish']

In [39]:
from qwikidata.sparql import get_subclasses_of_item
# use convenience function to get subclasses of an item as a list of item ids
Q_RIVER = "Q214276"
get_subclasses_of_item(Q_RIVER)

['Q214276', 'Q28850392', 'Q78982844', 'Q109294125']

In [None]:
from nltk.corpus import wordnet as wn

hyper = lambda s: s.hypernyms()
for domain in ("laptop",):
    for synset in wn.synsets(domain, pos=wn.NOUN):
        print(synset.hyponyms())
        print(list(synset.closure(hyper)))
    print('\n')

In [14]:
import requests
import tagme
for ann in tagme.annotate("MacBook Pro").get_annotations(0.1):
    # json = requests.get(
    # "https://en.wikipedia.org/w/api.php?action=query&format=json&prop=pageprops&titles=MacBook Pro"
    # ).json()
    # wikidata_id = json['query']['pages'][f'{ann.entity_id}']['pageprops']['wikibase_item']
    print(ann.uri(), ann.entity_id, ann.entity_title, ann.mention)

https://en.wikipedia.org/wiki/MacBook_Pro 3677824 MacBook Pro MacBook Pro


In [None]:
import sys
sys.path.append("../")
from eval import absa_evaluate, evaluate
from tag_utils import ot2bio_absa
text = [
    line.split("***")[0]
    for line in open("/root/graduation/processed/dp_tmp/laptop.train.txt").read().splitlines()
]
gold_Y = [
    ot2bio_absa(line.split("***")[1].split())
    for line in open("/root/graduation/processed/dp_tmp/laptop.train.txt").read().splitlines()
]

pred_Y = [
    ot2bio_absa(line.split("***")[-1].split())
    for line in open("/root/graduation/processed/dp_tmp/laptop.train.txt").read().splitlines()
]
pred_Y_ = []
gold_Y_ = []
for idx, (pred, gold) in enumerate(zip(pred_Y, gold_Y)):
    sentence = text[idx]
    # res = Annotate(sentence, theta=0.05)
    if not all(item == 'O' for item in pred):
        pred_Y_.append(pred)
        gold_Y_.append(gold)
absa_evaluate(pred_Y_, gold_Y_), evaluate(pred_Y_, gold_Y_)

In [3]:
import fasttext

model = fasttext.load_model('/root/autodl-tmp/cc.en.300.bin')



In [6]:
import pickle
from collections import Counter
from typing import List, Tuple
from nltk.corpus import stopwords
import nltk
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')

wnl = WordNetLemmatizer()
with open("./entities/rest.pkl", "rb") as f:
    entities: List[str] = pickle.load(f)
# remove stopwords
sets = stopwords.words('english')
# lemmatization
entities = [wnl.lemmatize(word, 'n').lower() for word in entities if word not in sets]
counters = Counter(entities)
sorted_entities: List[Tuple[str, int]] = sorted(filter(lambda item: item[1] >= 5, counters.items()),
                                                key=lambda item: item[1],
                                                reverse=True)

vec_dict = {}
for entity in sorted_entities:
    e = entity[0]
    vec_dict[e] = model.get_word_vector(e)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
sorted_entities[:10]

[('food', 559),
 ('service', 325),
 ('restaurant', 324),
 ('good', 236),
 ('time', 185),
 ('nice', 150),
 ("n't", 130),
 ('pizza', 128),
 ('sushi', 127),
 ('one', 105)]

In [19]:
from operator import itemgetter
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import torch
getter = itemgetter(*[entity for entity, _ in sorted_entities[:10]])
mean_vec = np.average(getter(vec_dict), axis=0, weights=[count for _, count in sorted_entities[:10]])
np.save("rest_mean_vec", mean_vec)
res = np.array([cosine_similarity(mean_vec.reshape(1, -1), vec_dict[k].reshape(1, -1)) for k in vec_dict]).squeeze()
topk = torch.topk(torch.from_numpy(res), res.shape[0]).indices.tolist()
itemgetter(*topk)(sorted_entities), sorted_entities

((('food', 559),
  ('restaurant', 324),
  ('meal', 49),
  ('sushi', 127),
  ('pizza', 128),
  ('good', 236),
  ('resturant', 5),
  ('tasty', 13),
  ('eat', 62),
  ('delicious', 21),
  ('dinner', 83),
  ('seafood', 29),
  ('cuisine', 11),
  ('lunch', 48),
  ('burger', 10),
  ('eating', 25),
  ('buffet', 7),
  ('beverage', 5),
  ('beer', 24),
  ('drink', 80),
  ('dessert', 62),
  ('eaten', 27),
  ('meat', 21),
  ('steak', 25),
  ('takeout', 5),
  ('diner', 8),
  ('vegetarian', 15),
  ('service', 325),
  ('taste', 37),
  ('cafe', 10),
  ('tasting', 5),
  ('waiter', 64),
  ('dining', 21),
  ('sandwich', 23),
  ('wine', 101),
  ('appetizer', 71),
  ('bistro', 15),
  ('dine', 5),
  ('soup', 21),
  ('dish', 91),
  ('ate', 18),
  ('nice', 150),
  ('sashimi', 23),
  ('ambience', 22),
  ('cook', 5),
  ('bread', 19),
  ('waitstaff', 11),
  ('brunch', 20),
  ('quality', 39),
  ('salad', 48),
  ('dinning', 5),
  ('pasta', 28),
  ('entree', 39),
  ('taco', 7),
  ('waitress', 39),
  ('lasagna', 6),
 

In [None]:
from sklearn.cluster import KMeans
import numpy as np
from sklearn.metrics import calinski_harabasz_score

max_score, clusters = 0, 0

X = np.array(list(vec_dict.values()))
for i in range(2, 100):
    kmeans = KMeans(n_clusters=i).fit(X, sample_weight=[entity[1] for entity in sorted_entities[::-1]])
    score = calinski_harabasz_score(X, kmeans.labels_)
    if score > max_score:
        max_score = score
        clusters = i
max_score, clusters

In [None]:
kmeans = KMeans(n_clusters=clusters).fit(X, sample_weight=[entity[1] for entity in sorted_entities[::-1]])

In [None]:
Counter(kmeans.labels_)

In [None]:
np.asarray(list(vec_dict.keys()))[kmeans.labels_ == 1]

In [18]:
from sklearn.metrics.pairwise import cosine_similarity

cosine_similarity(mean_vec.reshape(1, -1),
                  np.array(model.get_word_vector("the staff".lower())).reshape(1, -1))


array([[0.18013402]])