In [27]:
# docker run -p 9200:9200 -p 9300:9300 -e "discovery.type=single-node" docker.elastic.co/elasticsearch/elasticsearch:7.8.0

## Crawl 

In [1]:
import scrapy
from scrapy.crawler import CrawlerProcess

from paper_crawler.paper_crawler.spiders.semanticscholar import SemanticscholarSpider

In [2]:
# Run the Semantic Scholar spider in-process and export what it scrapes.
process = CrawlerProcess(
    settings={
        # Export every scraped item to papers.json in JSON format.
        "FEEDS": {"papers.json": {"format": "json"}},
        # Turn Scrapy's logging off for this run.
        "LOG_ENABLED": False
        #     "LOG_LEVEL": 'INFO',
    }
)
# max_papers is passed through to the spider (presumably the crawl limit — verify in the spider).
process.crawl(SemanticscholarSpider, max_papers=200)
# NOTE(review): start() is expected to block until the crawl finishes, and the
# underlying reactor presumably cannot be restarted in the same kernel — confirm.
process.start()

## Load Crawled Data

In [1]:
import json

# Load the crawler's JSON export. The encoding is pinned to UTF-8 so the result
# does not depend on the platform's default locale encoding.
with open("papers.json", encoding="utf-8") as f:
    items = json.load(f)

In [2]:
items[0]

{'id': 'f90720ed12e045ac84beb94c27271d6fb8ad48cf',
 'title': 'The Lottery Ticket Hypothesis: Training Pruned Neural Networks',
 'abstract': 'Recent work on neural network pruning indicates that, at training time, neural networks need to be significantly larger in size than is necessary to represent the eventual functions that they learn. This paper articulates a new hypothesis to explain this phenomenon. This conjecture, which we term the "lottery ticket hypothesis," proposes that successful training depends on lucky random initialization of a smaller subcomponent of the network. Larger networks have more of these "lottery tickets…\xa0',
 'date': '2018',
 'references': ['34f25a8704614163c4095b3ee2fc969b60de4698',
  '1ff9a37d766e3a4f39757f5e1b235a42dacf18ff',
  'b0bd441a0cc04cdd0d0e469fe4c5184ee148a97d',
  'cc46229a7c47f485e090857cbab6e6bf68c09811',
  '642d0f49b7826adcf986616f4af77e736229990f',
  '049fd80f52c0b1fa4d532945d95a24734b62bdf3',
  '2dfef5635c8c44431ca3576081e6cfe6d65d4862',
 

In [3]:
len(items)

201

## Elasticsearch 

In [4]:
from elasticsearch import Elasticsearch

### Connect 

In [6]:
# Client for the Elasticsearch node on localhost:9200 (presumably the container
# from the docker command at the top of the notebook — confirm).
es = Elasticsearch(hosts=[{"host": "localhost", "port": 9200}])

### Clear previous index 

In [7]:
# Drop any index left over from an earlier run. With ignore=404 a missing index
# yields the "no such index" response (shown in the output) instead of an exception.
es.indices.delete("paper-index", ignore=404)

{'error': {'root_cause': [{'type': 'index_not_found_exception',
    'reason': 'no such index [paper-index]',
    'resource.type': 'index_or_alias',
    'resource.id': 'paper-index',
    'index_uuid': '_na_',
    'index': 'paper-index'}],
  'type': 'index_not_found_exception',
  'reason': 'no such index [paper-index]',
  'resource.type': 'index_or_alias',
  'resource.id': 'paper-index',
  'index_uuid': '_na_',
  'index': 'paper-index'},
 'status': 404}

### Create new index 

In [8]:
# Create the empty index. ignore=400 tolerates a 400 response
# (presumably "index already exists" — confirm).
es.indices.create(index="paper-index", ignore=400)

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'paper-index'}

### Create persistence layer 

In [9]:
from elasticsearch_dsl import (
    Document,
    Date,
    Nested,
    Boolean,
    analyzer,
    InnerDoc,
    Completion,
    Keyword,
    Text,
    Integer,
    Float,
)

In [10]:
class Paper(Document):
    """Elasticsearch document for one crawled paper, stored in "paper-index"."""
    # Full-text title with an additional "raw" keyword sub-field.
    title = Text(fields={"raw": Keyword()})
    # Publication year; the crawled items carry it as a string such as '2018'.
    date = Integer()
    abstract = Text()
    authors = Text()
    # IDs of the papers listed under "references" in the crawled item.
    references = Text()
    # PageRank score; inserted as 1.0 and overwritten later in the notebook.
    page_rank = Float()

    class Index:
        # Index that Paper.init() / save() / get() operate on.
        name = "paper-index"

In [11]:
# create the mappings in Elasticsearch
Paper.init(using=es)

### Insert items 

In [12]:
from elasticsearch import TransportError
from elasticsearch.helpers import bulk

In [13]:
# https://elasticsearch-py.readthedocs.io/en/master/helpers.html#bulk-helpers
# https://www.elastic.co/guide/en/elasticsearch/reference/master/docs-bulk.html

def gendata(papers=None):
    """Yield one bulk "index" action per crawled paper.

    Args:
        papers: Iterable of paper dicts, each with at least an "id" key.
            Defaults to the notebook-level ``items`` list.

    Yields:
        dict: ``_index`` / ``_id`` metadata, a default ``page_rank`` of 1.0 and
        every key of the paper itself (the paper's own keys win on a clash).

    Note:
        A missing, empty or non-numeric "date" is removed from the paper dict
        in place, as before: later cells build ``Paper`` documents from the
        same dicts and ``date`` is mapped as an integer.
    """
    if papers is None:
        papers = items
    for item in papers:
        date = item.get("date")
        # str.isdigit() is False for "", so one check covers empty and malformed dates.
        if date is None or not str(date).isdigit():
            # pop() with a default keeps the cell re-runnable; the previous
            # `del item["date"]` raised KeyError once the key was already gone.
            item.pop("date", None)
        yield {
            "_index": "paper-index",
            "_id": item["id"],
            "page_rank": 1.0,
            **item,
        }

bulk(es, gendata())

(499, [])

In [13]:
# Index every crawled paper through the Paper document class, one save() call
# per paper, seeding page_rank with 1.0.
for crawled in items:
    Paper(meta={"id": crawled["id"]}, page_rank=1.0, **crawled).save(using=es)

### Functions for inserting items and clearing index

In [231]:
def gen_utils(host=None):
    """Build index helpers bound to a single Elasticsearch connection.

    Args:
        host: Connection dict such as ``{"host": "localhost", "port": 9200}``.
            Defaults to that local node when omitted.

    Returns:
        tuple: ``(clear_index, insert_items)`` closures sharing one client.
    """
    if host is None:
        host = {"host": "localhost", "port": 9200}
    # Fix: connect to the requested host. The argument used to be resolved and
    # then ignored in favour of a hard-coded localhost dict.
    es = Elasticsearch(hosts=[host])

    def clear_index():
        """Drop "paper-index" (a missing index is tolerated) and recreate it empty."""
        es.indices.delete("paper-index", ignore=404)
        es.indices.create(index="paper-index", ignore=400)

    def insert_items(items):
        """Save each paper dict as a ``Paper`` document keyed by its "id".

        NOTE(review): unlike the other insertion cells, no default page_rank is
        set here — confirm whether that is intended.
        """
        for item in items:
            paper = Paper(meta={"id": item["id"]}, **item)
            paper.save(using=es)

    return clear_index, insert_items

In [66]:
clear, insert = gen_utils()

In [67]:
clear()

In [68]:
insert(items)

## Calculating page rank

In [14]:
import numpy as np

In [15]:
all_papers = list(items)

In [16]:
# Paper IDs in sorted order; a paper's position here is its row/column in the link matrix.
all_ids = sorted(paper["id"] for paper in all_papers)

In [17]:
p_matrix = np.zeros((len(all_ids), len(all_ids)))

In [18]:
# Reverse lookup: paper ID -> its position in all_ids.
id_loc = {paper_id: position for position, paper_id in enumerate(all_ids)}

In [19]:
# Mark p_matrix[i, j] = 1 when paper i lists paper j among its references.
# References to papers outside the crawled set have no column and are skipped.
for row, citing_id in enumerate(all_ids):
    cited_ids = Paper.get(id=citing_id, using=es).references
    if cited_ids is None:
        continue
    for cited_id in cited_ids:
        col = id_loc.get(cited_id)
        if col is not None:
            p_matrix[row, col] = 1

In [20]:
alpha = 0.1  # weight of the uniform-jump term in the transition matrix
N = len(all_ids)  # number of papers
v = np.ones((1, N))  # row vector of ones; v / N is the uniform distribution over papers

In [21]:
row_sums = np.sum(p_matrix, axis=1, keepdims=True)

In [22]:
# Turn the 0/1 link matrix into a transition matrix whose rows each sum to 1.
# first part is for rows having nonzero elements: follow one of the paper's
#   links with total weight (1 - alpha), plus a uniform jump with weight alpha.
# second part is for dead-ends (rows with no links): uniform jump only.
# The `+ np.logical_not(row_sums > 0) * 1` in the denominator turns a zero row
# sum into 1, so dead-end rows never divide by zero.
p_matrix = ((row_sums > 0) * 1) * (
    (1 - alpha) * p_matrix / (row_sums + np.logical_not(row_sums > 0) * 1)
    + alpha * v / N
) + (np.logical_not(row_sums > 0) * 1) * v / N

In [23]:
x0 = np.ones((1, N)) / N

In [24]:
# Power iteration: keep multiplying the current distribution by the transition
# matrix until two successive distributions agree within the tolerance.
next_state = x0 @ p_matrix
while not np.allclose(next_state, x0, rtol=0.0001):
    x0 = next_state
    next_state = x0 @ p_matrix

In [25]:
np.sum(next_state)

0.9999999999999996

In [26]:
# Write each paper's PageRank score back to its Elasticsearch document.
for index, paper_id in enumerate(all_ids):
    paper = Paper.get(id=paper_id, using=es)
    # update() already persists the partial change, so the full save() that
    # used to follow it was a redundant second round trip per paper.
    # float() hands the client a plain Python float instead of a numpy scalar.
    paper.update(page_rank=float(next_state[0, index]), using=es)

In [None]:
def gendata(paper_ids=None, page_ranks=None):
    """Yield one bulk "update" action per paper carrying its PageRank score.

    Args:
        paper_ids: Paper IDs, ordered like the columns of ``page_ranks``.
            Defaults to the notebook-level ``all_ids``.
        page_ranks: 2-D array indexed as ``page_ranks[0, i]`` for the i-th ID.
            Defaults to the notebook-level ``next_state``.

    Yields:
        dict: an ``update`` action whose partial document sets ``page_rank``.
    """
    if paper_ids is None:
        paper_ids = all_ids
    if page_ranks is None:
        page_ranks = next_state
    for index, paper_id in enumerate(paper_ids):
        yield {
            "_index": "paper-index",
            # Fix: use the ID this score was computed for. The score at `index`
            # belongs to the sorted ID list, not to items[index], so the old
            # lookup attached ranks to the wrong papers.
            "_id": paper_id,
            # float() yields a plain Python float rather than a numpy scalar.
            "_source": {"doc": {"page_rank": float(page_ranks[0, index])}},
            "_op_type": "update",
        }

bulk(es, gendata())

In [307]:
# 429
# NOTE(review): "429" presumably refers to an HTTP 429 (Too Many Requests)
# response seen while experimenting here — confirm.
# Hand-built bulk request body: for each paper, one "update" action line
# followed by the partial document that sets page_rank.
body = ""
for index, paper_id in enumerate(all_ids):
    body += f'''{{ "update" : {{"_id" : "{paper_id}", "_index" : "paper-index"}} }}
{{ "doc" : {{"page_rank" : {next_state[0, index]} }}}}
'''
    # NOTE(review): this break leaves the loop after the first paper, so only
    # one update is sent — looks like a debugging leftover; confirm.
    break
body += ""
res = es.bulk(body)
res

## Search 

In [184]:
from elasticsearch_dsl import Search

In [185]:
title_search = "lottery"
abstract_search = "language"

In [269]:
# Score papers with a function_score query: fixed weights for a title match,
# an abstract match and a recent publication year, plus a PageRank-based factor.
# Fix: the search terms now come from title_search / abstract_search (defined
# in the cell above) instead of being hard-coded a second time.
res = es.search(
    index="paper-index",
    body={
        "query": {
            # https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-function-score-query.html#function-weight
            "function_score": {
                "functions": [
                    {"filter": {"match": {"title": title_search}}, "weight": 20},
                    {
                        "filter": {"match": {"abstract": abstract_search}},
                        "weight": 40,
                    },
                    # https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-range-query.html
                    {
                        "filter": {"range": {"date": {"gte": 2018}}},
                        "weight": 10,
                    },
                    # https://www.elastic.co/guide/en/elasticsearch/reference/current/static-scoring-signals.html
                    # https://www.elastic.co/guide/en/elasticsearch/reference/7.x/query-dsl-rank-feature-query.html#rank-feature-query-saturation
                    {
                        "script_score": {
                            "script": {
                                "source": "_score * saturation(doc['page_rank'].value, 10)"
                            }
                        }
                    },
                ]
            }
        }
    },
)

In [270]:
for hit in res["hits"]["hits"]:
    print(hit["_score"])

36.363636
36.363636
36.363636
36.363636
18.181818
3.6363637
3.6363637
3.6363637
3.6363637
3.6363637


In [271]:
res["hits"]["hits"][0]

{'_index': 'paper-index',
 '_type': '_doc',
 '_id': 'df2b0e26d0599ce3e70df8a9da02e51594e0e992',
 '_score': 36.363636,
 '_source': {'page_rank': 1.0,
  'id': 'df2b0e26d0599ce3e70df8a9da02e51594e0e992',
  'title': 'BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding',
  'abstract': 'We introduce a new language representation model called BERT, which stands for Bidirectional Encoder Representations from Transformers. It obtains new state-of-the-art results on eleven natural language processing tasks, including pushing the GLUE score to 80.5% (7.7% point absolute improvement), MultiNLI accuracy to 86.7% (4.6% absolute improvement), SQuAD v1.1 question answering Test F1 to 93.2 (1.5 point absolute improvement) and SQuAD v2.0 Test F1 to 83.1 (5.1 point absolute improvement).',
  'date': '2019',
  'references': ['0bb4cadc80c0afaf29c57518dc9c06f8fcfa5f38',
   '0c47cad9729c38d9db1f75491b1ee4bd883a5d4e',
   '204e3073870fae3d05bcbc2f6a8e263d9b72e776',
   '8c1b00128e74

In [16]:
s = Search(using=es, index="paper-index").query("match", title="the lottery")

In [189]:
# Look a paper up by its stored "id" field.
# NOTE(review): this rebinds `s`, so the title search built in the previous
# cell is never executed; the loop below shows no output for this ID — confirm
# whether the value is a valid paper ID.
s = Search(using=es, index="paper-index").query("match", id="323694313")

In [190]:
response = s.execute()

In [191]:
for hit in response:
    print(hit.meta.score, hit.title)

In [192]:
# by calling .search we get back a standard Search object
s = Paper.search(using=es)
# the search is already limited to the index and doc_type of our document
s = s.query("match", title="the lottery")

In [193]:
response = s.execute()

In [194]:
for hit in response:
    print(hit.meta.score, hit.title, hit.page_rank)

8.631361 The Lottery Ticket Hypothesis: Training Pruned Neural Networks 1.0
1.8374821 The projection of the retina in the cat. 1.0
1.7945862 The projection of the lateral geniculate nucleus upon the cortex in the cat 1.0
1.7697966 THE EFFECTS OF SPATIAL SUMMATION IN THE RETINA ON THE EXCITATION OF THE FIBERS OF THE OPTIC NERVE 1.0
1.752861 The Representation of the Visual Field on the Calcarine Cortex 1.0
1.752861 The projection of the retina in the lateral geniculate body 1.0
1.752861 The Spectral Properties of the Visual Receptors of the Cat 1.0
1.752861 The spatial selectivity of the visual cells of the cat. 1.0
1.752861 THE FUNCTION OF THE CALLOSAL CONNECTIONS OF THE VISUAL CORTEX. 1.0
1.752861 The cytoarchitectonic organization of the spinal cord in the cat. 1.0
