In [1]:
from os.path import join

from elasticsearch import Elasticsearch
import bson
import requests
from tqdm import tqdm

import json
import logging

# Notebook-wide logger; basicConfig below sets the root logger to INFO.
# NOTE(review): the timestamped PUT/GET/POST request lines in the recorded cell
# outputs use this same format, so they are presumably third-party client logs
# surfaced by the INFO level - confirm before relying on it.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')  # include timestamp

In [3]:
class ESClient(object):
    """Small convenience wrapper around an ``Elasticsearch`` client for one paper index.

    Documents are written with a lower-cased ``title`` and ``abstract``, and every
    request uses a document type whose name equals the index name.
    """

    # NOTE(review): SECURITY - these fallbacks only exist so that a bare ``ESClient()``
    # keeps working as before. A host and password committed to a notebook should be
    # rotated and supplied through ES_HOST / ES_USER / ES_PASSWORD (or the constructor
    # arguments) instead, after which these two constants can be deleted.
    _LEGACY_HOST = '166.111.7.106'
    _LEGACY_AUTH = ('nekol', 'kegGER123')

    def __init__(self, hosts=None, port=9200, timeout=60, http_auth=None):
        """Build the client and make sure the index and its mapping are in place.

        Args:
            hosts: list of hosts handed to ``Elasticsearch``; when omitted, the
                ``ES_HOST`` environment variable or the legacy host is used.
            port: port handed to ``Elasticsearch``.
            timeout: timeout handed to ``Elasticsearch``.
            http_auth: ``(user, password)`` tuple; when omitted, ``ES_USER`` and
                ``ES_PASSWORD`` are read from the environment, falling back to the
                legacy credentials.
        """
        import os  # local import: the notebook's import cell only brings in os.path.join

        if hosts is None:
            hosts = [os.environ.get('ES_HOST', self._LEGACY_HOST)]
        if http_auth is None:
            http_auth = (os.environ.get('ES_USER', self._LEGACY_AUTH[0]),
                         os.environ.get('ES_PASSWORD', self._LEGACY_AUTH[1]))

        self.es = Elasticsearch(hosts=hosts, port=port, timeout=timeout,
                                http_auth=http_auth)
        self.index = 'cs_paper_full'
        self.initialize()

    @staticmethod
    def lower(string):
        """Return ``string`` lower-cased, or ``''`` when it is ``None`` or empty."""
        if not string:
            return ''
        else:
            return string.lower()

    def initialize(self):
        """Create the index (best effort) and register the ``title`` text mapping."""
        cur_prop = {'properties': {'title': {'type': 'text'}}}
        try:
            self.es.indices.create(index=self.index)
        except Exception as exc:
            # Creation stays best effort (the recorded run shows a 400 on this request),
            # but only ordinary errors are tolerated: KeyboardInterrupt / SystemExit
            # propagate, and the reason is logged rather than silently dropped.
            logging.getLogger(__name__).info(
                "index %r was not created (it may already exist): %s", self.index, exc)

        self.es.indices.put_mapping(index=self.index, doc_type=self.index, body=cur_prop)
        print("set up mapping...")

    def add_document(self, document: dict, index='cs_paper_full'):
        """Store ``document``'s lower-cased title and abstract under ``str(document['_id'])``."""
        data = {'title': self.lower(document.get('title')), 'abstract': self.lower(document.get('abstract'))}
        self.es.create(index=index, doc_type=index, id=str(document.get('_id')), body=data)

    def delete_document(self, index, _id):
        """Delete one document and print a notice when the response is not ``'deleted'``."""
        result = self.es.delete(index=index, doc_type=index, id=_id)
        if result.get('result') != 'deleted':
            print("deletion failed!")

    def check_id_exists(self, pid: str, index='cs_paper_full'):
        """Return whether a document with id ``pid`` exists in ``index``.

        ``doc_type`` is passed explicitly so that the lookup addresses the same
        index/type pair that ``add_document`` writes to.
        """
        return self.es.exists(index=index, doc_type=index, id=pid)

    def search(self, key_words: str, index='cs_paper_full'):
        """Run a ``match`` query on ``title`` and return the list of hit dicts.

        Returns an empty list when the response carries no ``hits`` section.
        """
        dsl = {'query': {'match': {"title": key_words}}}
        result = self.es.search(index=index, doc_type=index, body=dsl)
        return (result.get('hits') or {}).get('hits') or []


def each_chunk(stream, separator):
    """Lazily split the text read from ``stream`` on ``separator``.

    The stream is consumed 4096 characters at a time. Every piece that is
    followed by ``separator`` is yielded with a ``'}'`` appended; whatever is
    left once the stream is exhausted (possibly the empty string) is yielded
    last, unchanged.
    """
    pending = ''
    data = stream.read(4096)
    while data:
        pending += data
        # Peel off one complete piece at a time; the unpacking fails with
        # ValueError as soon as no further separator can be split off.
        while True:
            try:
                head, pending = pending.split(separator, 1)
            except ValueError:
                break
            yield head + '}'
        data = stream.read(4096)
    # End of stream: hand back the unterminated tail as-is.
    yield pending


# Shared client used by the cells below. Constructing it already talks to the
# cluster, because ESClient.__init__ calls initialize().
es = ESClient()

2022-11-23 22:18:55,871 PUT http://166.111.7.106:9200/cs_paper_full [status:400 request:0.005s]
2022-11-23 22:18:55,879 PUT http://166.111.7.106:9200/cs_paper_full/_mapping/cs_paper_full [status:200 request:0.008s]


set up mapping...


In [5]:
# Smoke test: run a title search and inspect one of the returned hits.
res = es.search(key_words='machine learning')
print(len(res))
# NOTE(review): index 8 is hard-coded; this raises IndexError when fewer than
# nine hits come back.
source = res[8]['_source']
print(source)
title = source['title']
print(title)

2022-11-23 22:19:15,088 GET http://166.111.7.106:9200/cs_paper_full/cs_paper_full/_search [status:200 request:0.048s]


10
{'title': 'machine learning for hybrid machine translation', 'abstract': 'we describe a substitution-based system for hybrid machine translation (mt that has been extended with machine learning components controlling its phrase selection. the approach is based on a rule-based mt (rbmt system which creates template translations. based on the rule-based generation parse tree and target-to-target alignments, we identify the set of "interesting" translation candidates from one or more translation engines which could be substituted into our translation templates. the substitution process is either controlled by the output from a binary classifier trained on feature vectors from the different mt engines, or it is depending on weights for the decision factors, which have been tuned using mert. we are able to observe improvements in terms of bleu scores over a baseline version of the hybrid system.'}
machine learning for hybrid machine translation


In [48]:
# Counters and result store that the evidence-collection cell works with.
hit_number = 0
hit_rank = 0
hit_all = 0

# NOTE: despite its name, term2id maps a numeric term id -> term string
# (first and second tab-separated column of the .terms file).
term2id = {}
i = 0

with open('./computer_science.terms') as f:
    for line in f:
        if not line.strip():
            continue  # tolerate blank lines, e.g. a trailing newline at end of file
        res_ = line.split('\t')
        term2id[int(res_[0])] = res_[1].replace('\n', '')

# concept maps a child term -> list of its parent terms. Each .taxo line holds
# the parent id in the first column and the child id in the second.
concept = {}

with open('./computer_science.taxo') as f:
    for line in f:
        if not line.strip():
            continue
        res_ = line.split('\t')

        parent = term2id[int(res_[0])]
        child = term2id[int(res_[1])]
        concept.setdefault(child, []).append(parent)

print("size of concept is {}".format(len(concept)))

# Total number of (child, parent) pairs. Kept in its own name instead of `all`,
# which would shadow the built-in all() for every later cell in the kernel.
edge_total = sum(len(parents) for parents in concept.values())

# The label repeats the one above, but this figure is the pair total, not the key count.
print("size of concept is {}".format(edge_total))

save_json_edge = {}

size of concept is 29483
size of concept is 46248


In [49]:
# For every (child, parent) pair, search papers by the child term and keep the
# titles of hits whose title or abstract also mentions the parent term.

# Start from a clean slate so that re-running this cell neither double-counts
# hits nor mixes in evidence from a previous run.
hit_number = 0
hit_all = 0
save_json_edge = {}

for key in concept:
    parent_list = concept[key]
    res = es.search(key_words=key)
    for parent in parent_list:
        for res_ in res:
            paper = res_['_source']
            if parent in paper['title'] or parent in paper['abstract']:
                hit_number += 1
                hit_all += 1
                # Accumulate under the same composite key that is stored. Testing
                # membership with `key` alone made later titles overwrite earlier
                # ones and could raise KeyError on append.
                save_json_edge.setdefault(key + parent, []).append(paper['title'])

print(hit_number)

with open('./Edge.json', 'w') as f:
    json.dump(save_json_edge, f)

2022-08-01 19:25:39,652 POST http://166.111.7.106:9200/cs_paper_full/cs_paper_full/_search [status:200 request:0.005s]
2022-08-01 19:25:39,654 POST http://166.111.7.106:9200/cs_paper_full/cs_paper_full/_search [status:200 request:0.002s]
2022-08-01 19:25:39,660 POST http://166.111.7.106:9200/cs_paper_full/cs_paper_full/_search [status:200 request:0.005s]
2022-08-01 19:25:39,663 POST http://166.111.7.106:9200/cs_paper_full/cs_paper_full/_search [status:200 request:0.002s]
2022-08-01 19:25:39,668 POST http://166.111.7.106:9200/cs_paper_full/cs_paper_full/_search [status:200 request:0.005s]
2022-08-01 19:25:39,676 POST http://166.111.7.106:9200/cs_paper_full/cs_paper_full/_search [status:200 request:0.008s]
2022-08-01 19:25:39,679 POST http://166.111.7.106:9200/cs_paper_full/cs_paper_full/_search [status:200 request:0.003s]
2022-08-01 19:25:39,682 POST http://166.111.7.106:9200/cs_paper_full/cs_paper_full/_search [status:200 request:0.002s]
2022-08-01 19:25:39,686 POST http://166.111.7.10

44703


In [32]:
# Spot check one id: its term string, then the parent terms recorded for that term.
print(term2id[31972630])
print(concept[term2id[31972630]])

computer vision
['computer science']


In [54]:
# Number of distinct keys collected in save_json_edge.
print(len(save_json_edge))
# NOTE(review): this looks up the empty string as a child term. It raises
# KeyError unless some term loaded from the terms file has a blank name, so a
# successful lookup here points at an input row worth checking.
print(concept[''])

12124
['web service', 'cluster analysis', 'java']
