In [1]:
import tagme
import logging
import sys
import os.path

logger = logging.getLogger(__name__)
logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')

# TagMe "Authorization Token" (issued after registering for the service).
# It is read from the TAGME_GCUBE_TOKEN environment variable so that the secret
# is never committed together with the notebook.
_gcube_token = os.environ.get("TAGME_GCUBE_TOKEN")
if _gcube_token:
    tagme.GCUBE_TOKEN = _gcube_token
else:
    logger.warning("TAGME_GCUBE_TOKEN is not set; assign tagme.GCUBE_TOKEN before calling TagMe")


def Annotation_mentions(txt):
    """
    Find the spans of a text that TagMe recognises as possible Wikipedia concepts.

    Mentions that are ambiguous as Wikipedia concepts are included as well.

    :param txt: the text to analyse, ``str``
    :return: dict mapping each mention's surface text to its link probability.
             The probability is kept as the string printed after ``lp=`` in
             ``str(mention)``. An empty dict is returned when TagMe gives no response.
    """
    annotation_mentions = tagme.mentions(txt)
    if annotation_mentions is None:
        # NOTE(review): presumably TagMe returns None when the request fails - confirm.
        logger.error('no mentions response for %r', txt)
        return {}
    dic = dict()
    for mention in annotation_mentions.mentions:
        try:
            # str(mention) is parsed as "<text> [<begin>,<end>] lp=<probability>".
            # Splitting from the right keeps mention texts that themselves contain " [".
            rendered = str(mention)
            surface = rendered.rsplit(" [", 1)[0]
            dic[surface] = rendered.rsplit("] lp=", 1)[1]
        except Exception:
            # Lazy %-formatting: concatenating the mention object to a str raised TypeError.
            logger.error('error annotation_mention about %s', mention)
    return dic


def Annotate(txt, language="en", theta=0.1):
    """
    Link the mentions found in a text to Wikipedia entities using TagMe.

    :param txt: the text to analyse, ``str``
    :param language: language to use: "de" (German), "en" (English) or "it" (Italian);
                     defaults to "en"
    :param theta: score threshold in [0, 1]; the larger it is, the more reliable the
                  retained links are. Defaults to 0.1
    :return: dict keyed by the ``(begin, end)`` pair of each annotation, whose value is
             the tuple ``(score, mention, entity_title, entity_id, uri)``.
             An empty dict is returned when TagMe gives no response.
    """
    annotations = tagme.annotate(txt, lang=language)
    if annotations is None:
        # NOTE(review): presumably TagMe returns None when the request fails - confirm.
        logger.error('no annotation response for %r', txt)
        return {}
    dic = dict()
    for ann in annotations.get_annotations(theta):
        try:
            # Build the URI for the requested language rather than the library default.
            dic[(ann.begin, ann.end)] = (ann.score, ann.mention, ann.entity_title, ann.entity_id,
                                         ann.uri(language))
        except Exception:
            # Lazy %-formatting: concatenating the annotation object to a str raised TypeError.
            logger.error('error annotation about %s', ann)
    return dic

In [4]:
txt = "I have recently converted back to a mac and I could n't be happier !"
# Candidate mentions with their link probability (both are strings).
obj = Annotation_mentions(txt)
for i in obj.keys():
    print(i + "  " + obj[i])
print("=" * 30)
# Annotate() is keyed by (begin, end) and stores
# (score, mention, entity_title, entity_id, uri) tuples, so unpack the values
# instead of indexing the integer keys as if they were strings.
obj = Annotate(txt, theta=0.05)
for score, mention, entity_title, _entity_id, _uri in obj.values():
    print(mention + " ---> " + entity_title + "  " + str(score))

converted  0.007790769450366497
back  0.001090406090952456
mac  0.05660894885659218
n't  0.012315270490944386
happier  0.008940544910728931
converted ---> Christianization  0.06520336866378784
happier ---> Happiness  0.06577825546264648


In [3]:
from wikidata.client import Client
# Import the submodule explicitly: a bare `import urllib` does not load
# `urllib.request`, so the attribute access below only worked when some other
# module had already imported it.
import urllib.request
proxies = {"http": "http://127.0.0.1:7890", "https": "http://127.0.0.1:7890"}
# Send the Wikidata client's HTTP(S) traffic through the local proxy above.
handler = urllib.request.ProxyHandler(proxies)
opener = urllib.request.build_opener(handler)
client = Client(opener=opener)


In [None]:
from qwikidata.entity import WikidataItem, WikidataLexeme, WikidataProperty
from qwikidata.linked_data_interface import get_entity_dict_from_api

# Identifiers of the three example entities fetched below.
Q_DOUGLAS_ADAMS = "Q42"  # item: "Douglas Adams"
P_SUBCLASS_OF = "P279"  # property: "subclass of"
L_BANK = "L3354"  # lexeme: "bank"

# Item: download the entity dict, then wrap it in the item class.
q42_dict = get_entity_dict_from_api(Q_DOUGLAS_ADAMS)
q42 = WikidataItem(q42_dict)

# Property: same two steps with the property wrapper.
p279_dict = get_entity_dict_from_api(P_SUBCLASS_OF)
p279 = WikidataProperty(p279_dict)

# Lexeme: same two steps with the lexeme wrapper.
l3354_dict = get_entity_dict_from_api(L_BANK)
l3354 = WikidataLexeme(l3354_dict)

In [None]:
from qwikidata.sparql import return_sparql_query_results

# Send a SPARQL query to the Wikidata query service and display the full result.
# The example query counts the items matched by the `wdt:P31/wdt:P279*` path to
# `wd:Q5`; per the original note this is the number of humans.
sparql_query = """
SELECT (COUNT(?item) AS ?count)
WHERE {
        ?item wdt:P31/wdt:P279* wd:Q5 .
}
"""
return_sparql_query_results(sparql_query)

In [None]:
from qwikidata.sparql import get_subclasses_of_item
# Use the convenience function to get the subclasses of an item as a list of item ids.
# NOTE(review): Q214276 is assumed to be the "river" item, going by the constant's name - confirm.
Q_RIVER = "Q214276"
get_subclasses_of_item(Q_RIVER)

In [None]:
from nltk.corpus import wordnet as wn


def hyper(s):
    """Return the hypernyms of synset ``s``; passed to ``closure`` as the relation to follow."""
    return s.hypernyms()


# For every noun sense of each domain word, show its hyponyms and then the
# closure of its hypernyms.
for domain in ("laptop",):
    for synset in wn.synsets(domain, pos=wn.NOUN):
        print(synset.hyponyms())
        print(list(synset.closure(hyper)))
    print('\n')

In [None]:
import requests

WIKIPEDIA_API = "https://en.wikipedia.org/w/api.php"
for ann in tagme.annotate("MacBook Pro").get_annotations(0.1):
    # Query by the id of the entity TagMe actually linked. The previous URL always
    # asked for the hard-coded title "MacBook Pro" while indexing the result by
    # this annotation's id, which raised KeyError for any other entity.
    response = requests.get(
        WIKIPEDIA_API,
        params={
            "action": "query",
            "format": "json",
            "prop": "pageprops",
            "pageids": ann.entity_id,
        },
        timeout=10,
    )
    response.raise_for_status()
    page = response.json()['query']['pages'][f'{ann.entity_id}']
    # None when the page carries no linked Wikidata item.
    wikidata_id = page.get('pageprops', {}).get('wikibase_item')
    print(ann.uri(), ann.entity_id, ann.entity_title, wikidata_id)

In [None]:
from eval import absa_evaluate, evaluate
from utils import ot2bio_absa

# Read the file once and close it; it used to be opened three times and never closed.
with open("/root/graduation/processed1/dp_tmp/laptop.train.txt") as f:
    lines = f.read().splitlines()

# Each line is split on "***": the first field is the sentence, the second is
# used as the gold tag sequence and the last one as the predicted tag sequence.
fields = [line.split("***") for line in lines]
text = [parts[0] for parts in fields]
gold_Y = [ot2bio_absa(parts[1].split()) for parts in fields]
pred_Y = [ot2bio_absa(parts[-1].split()) for parts in fields]

# Keep only the sentences for which at least one predicted tag is not 'O'.
pred_Y_ = []
gold_Y_ = []
for pred, gold in zip(pred_Y, gold_Y):
    if not all(item == 'O' for item in pred):
        pred_Y_.append(pred)
        gold_Y_.append(gold)
print(absa_evaluate(pred_Y_, gold_Y_))
evaluate(pred_Y_, gold_Y_)

In [None]:
from glob import glob
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
import pickle

def process_ann(sentence: str):
    """
    Collect the entity mentions TagMe links in one sentence.

    When several mentions are linked to the same Wikipedia title, only the
    longest mention text is kept.

    :param sentence: the sentence to annotate
    :return: a view over the kept mention strings, one per linked Wikipedia title
    """
    result = {}
    # Annotate() yields {(begin, end): (score, mention, entity_title, entity_id, uri)}.
    # The mention and title live in the values; the keys are integer positions.
    for _score, mention, entity_title, _entity_id, _uri in Annotate(sentence, theta=0.05).values():
        if entity_title not in result or len(result[entity_title]) < len(mention):
            result[entity_title] = mention
    return result.values()
save_dir = "./entities"
os.makedirs(save_dir, exist_ok=True)
for domain in ("laptop",):
    entities = []
    for file in glob(f"./data/{domain}.*.txt"):
        # Read each file once and close it; the text before the first "***" is the sentence.
        with open(file) as f:
            sentences = [line.split("***")[0] for line in f.read().splitlines()]
        # Progress total taken from the submitted work itself, so it always
        # matches the number of futures.
        counts = len(sentences)

        # Annotate the sentences concurrently and gather the mentions as they finish.
        with ThreadPoolExecutor(max_workers=100) as t:
            futures = [t.submit(process_ann, sentence) for sentence in sentences]
            for future in tqdm(as_completed(futures), total=counts, desc=file):
                entities.extend(future.result())
    with open(os.path.join(save_dir, f"{domain}.pkl"), "wb") as f:
        pickle.dump(entities, f)

In [2]:
import fasttext

# Load the fastText binary model; later cells call `model.get_word_vector(...)`.
# NOTE(review): the path is absolute and machine-specific; the file name suggests
# the pre-trained 300-dimensional English vectors - confirm.
model = fasttext.load_model('/root/autodl-tmp/cc.en.300.bin')



In [5]:
from collections import Counter
from typing import List, Tuple
from nltk.corpus import stopwords
import nltk
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('wordnet')  # corpus used by WordNetLemmatizer

wnl = WordNetLemmatizer()
# NOTE(review): the extraction cell of this notebook writes
# os.path.join("./entities", f"{domain}.pkl"), not this path - confirm which file is meant.
# pickle.load can execute arbitrary code: only load files produced by this notebook.
with open("./laptop_entities.pkl", "rb") as f:
    entities: List[str] = pickle.load(f)
# remove stopwords (a set makes each membership test constant-time)
sets = set(stopwords.words('english'))
# Lower-case first, then filter and lemmatize. The stopword list is lower-case,
# so checking before lower-casing let capitalised stopwords through, and
# lemmatising capitalised plurals left them unreduced.
entities = [wnl.lemmatize(word, 'n') for word in map(str.lower, entities) if word not in sets]
counters = Counter(entities)
# Entities seen at least 5 times, most frequent first.
sorted_entities: List[Tuple[str, int]] = [(name, count) for name, count in counters.most_common()
                                          if count >= 5]

# One fastText vector per retained entity; spaces become "_" so the entity is
# looked up as a single token.
vec_dict = {}
for name, _count in sorted_entities:
    e = name.replace(" ", "_")
    vec_dict[e] = model.get_word_vector(e)

KeyboardInterrupt: 

In [None]:
# Number of distinct entity keys that received a vector.
len(vec_dict)

In [None]:
from sklearn.cluster import KMeans
import numpy as np
from sklearn.metrics import calinski_harabasz_score

max_score, clusters = 0, 0

X = np.array(list(vec_dict.values()))
# sample_weight[i] must describe row i of X. Rows follow the order of
# sorted_entities, so the counts are taken in that same order; reversing them
# paired each vector with another entity's count.
sample_weights = [count for _, count in sorted_entities]
# Pick the cluster count with the highest Calinski-Harabasz score.
# The score needs fewer clusters than samples, hence the cap on small inputs.
for i in range(2, min(100, len(X))):
    # Fixed seed so the search is reproducible from run to run.
    kmeans = KMeans(n_clusters=i, random_state=0).fit(X, sample_weight=sample_weights)
    score = calinski_harabasz_score(X, kmeans.labels_)
    if score > max_score:
        max_score = score
        clusters = i
max_score, clusters

In [None]:
# Final model with the selected number of clusters. The weights are taken in the
# order of sorted_entities, which is the row order of X (reversing them paired
# each vector with another entity's count); the seed is fixed for reproducibility.
kmeans = KMeans(n_clusters=clusters, random_state=0).fit(
    X, sample_weight=[count for _, count in sorted_entities])

In [None]:
# Number of entities assigned to each cluster label.
Counter(kmeans.labels_)

In [None]:
# Entity keys whose cluster label is 1.
np.array(list(vec_dict))[kmeans.labels_ == 1]

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Cosine similarity between the vectors of "laptop" and "laptops", each shaped
# as a single-row matrix as cosine_similarity expects.
cosine_similarity(*[
    np.array(model.get_word_vector(word)).reshape(1, -1)
    for word in ("laptop", "laptops")
])


In [None]:
# Print, for every cluster centre, the entity whose vector lies closest to it.
# The previous loop tested exact equality between an entity vector and a centre,
# and only advanced when matches happened in centre order, so it effectively
# never printed anything.
centers = kmeans.cluster_centers_
names = list(vec_dict)
vectors = np.array(list(vec_dict.values()))
for center in centers:
    nearest = np.linalg.norm(vectors - center, axis=1).argmin()
    print(names[nearest])


In [None]:
# Display the annotations kept for one example sentence at threshold 0.05.
Annotate(
    "I upgraded the memory and replaced the base Windows 7 Starter to Win 7 Home , and it runs just fine .",
    theta=0.05)
