In [1]:
import scrapy
from scrapy.crawler import CrawlerProcess

from researchgate.researchgate.spiders.paper import PaperSpider

In [3]:
# Run the PaperSpider in-process and export every scraped item to items.json.
# "overwrite": True makes a re-run replace the feed file; without it a local
# file feed is appended to, which leaves two JSON arrays in one file and
# breaks the later json.load().
# NOTE(review): "overwrite" needs a Scrapy release that supports that feed
# option — confirm the installed version.
process = CrawlerProcess(
    settings={
        "FEEDS": {"items.json": {"format": "json", "overwrite": True}},
        # Logging is silenced; to see crawl progress instead, drop this entry
        # and set "LOG_LEVEL": "INFO".
        "LOG_ENABLED": False,
    }
)
process.crawl(PaperSpider)
# NOTE(review): start() is expected to block until the crawl finishes and may
# not be callable twice in the same kernel — confirm against the Scrapy docs.
process.start()

In [221]:
import json

# Read the feed file produced by the crawl cell back into Python objects.
with open("items.json") as feed_file:
    items = json.loads(feed_file.read())

In [222]:
# Peek at the first scraped record to see which fields each item carries.
items[0]

{'id': '323694313',
 'title': 'The Lottery Ticket Hypothesis: Training Pruned Neural Networks',
 'abstract': 'Recent work on neural network pruning indicates that, at training time, neural networks need to be significantly larger in size than is necessary to represent the eventual functions that they learn. This paper articulates a new hypothesis to explain this phenomenon. This conjecture, which we term the "lottery ticket hypothesis," proposes that successful training depends on lucky random initialization of a smaller subcomponent of the network. Larger networks have more of these "lottery tickets," meaning they are more likely to luck out with a subcomponent initialized in a configuration amenable to successful optimization. This paper conducts a series of experiments with XOR and MNIST that support the lottery ticket hypothesis. In particular, we identify these fortuitously-initialized subcomponents by pruning low-magnitude weights from trained networks. We then demonstrate that t

## Elasticsearch 

In [223]:
from elasticsearch import Elasticsearch

### Connect 

In [224]:
# Client for the Elasticsearch instance listening on localhost:9200; every
# later cell passes it explicitly via `using=es`.
es_node = {"host": "localhost", "port": 9200}
es = Elasticsearch(hosts=[es_node])

### Clear previous index 

In [225]:
# Drop any index left over from a previous run. The index name is passed by
# keyword, matching the create call below (newer clients only accept keyword
# arguments). ignore=404 keeps the call from raising when the index does not
# exist yet.
es.indices.delete(index="paper-index", ignore=404)

{'acknowledged': True}

### Create new index 

In [226]:
# Create the (empty) index. ignore=400 stops the client from raising on an
# HTTP 400 response — presumably meant for the "index already exists" case;
# NOTE(review): it also hides any other 400 error, confirm that is acceptable.
es.indices.create(index="paper-index", ignore=400)

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'paper-index'}

### Create persistence layer

In [227]:
from elasticsearch_dsl import (
    Document,
    Date,
    Nested,
    Boolean,
    analyzer,
    InnerDoc,
    Completion,
    Keyword,
    Text,
    Integer,
    Float,
)

In [228]:
class Paper(Document):
    """Elasticsearch document describing one scraped paper.

    Instances are stored in the "paper-index" index. The notebook indexes each
    paper under its scraped id (passed as ``meta={"id": ...}``) and later
    overwrites ``page_rank`` with the computed score.
    """

    # Full-text searchable title, plus an exact-match "raw" keyword sub-field.
    title = Text(fields={"raw": Keyword()})
    # Publication year stored as an integer (the sample output shows 2019).
    date = Integer()
    abstract = Text()
    authors = Text()
    # Ids of the cited papers. They are identifiers, not prose, so they are
    # mapped as keywords; multi=True makes a document without references
    # expose an empty list instead of None, so iterating over
    # `paper.references` is always safe.
    references = Keyword(multi=True)
    # Ranking score; the notebook seeds it with 1.0 and later stores PageRank.
    page_rank = Float()

    class Index:
        name = "paper-index"

In [229]:
# Create the mappings declared on Paper in Elasticsearch, using the `es`
# client. Run this after (re)creating the index and before inserting items.
Paper.init(using=es)

### Insert items 

In [230]:
# Index every scraped record under its own id, seeding page_rank with 1.0
# until the real score is computed further down.
for record in items:
    Paper(meta={"id": record["id"]}, page_rank=1.0, **record).save(using=es)

### Functions for inserting items and clearing the index

In [231]:
def gen_utils(host=None):
    """Build index-maintenance helpers bound to one Elasticsearch node.

    Args:
        host: Connection dict such as ``{"host": "localhost", "port": 9200}``.
            Defaults to the local node on port 9200 when omitted.

    Returns:
        A ``(clear_index, insert_items)`` tuple of closures sharing one client:
        ``clear_index()`` drops and recreates "paper-index";
        ``insert_items(items)`` indexes each item as a ``Paper`` keyed by
        ``item["id"]``.
    """
    if host is None:
        host = {"host": "localhost", "port": 9200}
    # Connect to the requested host (previously the argument was ignored and
    # the client always pointed at localhost).
    es = Elasticsearch(hosts=[host])

    def clear_index():
        """Delete "paper-index" if it exists, then create it again empty."""
        es.indices.delete(index="paper-index", ignore=404)
        es.indices.create(index="paper-index", ignore=400)

    def insert_items(items):
        """Index every item, seeding page_rank with 1.0 when it is absent."""
        for item in items:
            # Same default as the notebook's direct insert loop; an explicit
            # page_rank in the item wins because it is unpacked last.
            fields = {"page_rank": 1.0, **item}
            paper = Paper(meta={"id": item["id"]}, **fields)
            paper.save(using=es)

    return clear_index, insert_items

In [66]:
# Build the helper pair against the default host; gen_utils returns
# (clear_index, insert_items) in that order.
clear, insert = gen_utils()

In [67]:
# Destructive: deletes "paper-index" and recreates it empty.
clear()

In [68]:
# Index every scraped item through the helper returned by gen_utils.
insert(items)

## Calculating page rank

In [232]:
import numpy as np

In [261]:
# Shallow copy of the scraped items, used as the node set of the citation graph.
all_papers = [*items]

In [263]:
# Sorted paper ids; the position of an id in this list is its row/column
# in the matrices built below.
all_ids = sorted(entry["id"] for entry in all_papers)

In [264]:
# Square matrix with one row and one column per paper, initially all zeros.
paper_count = len(all_ids)
p_matrix = np.zeros((paper_count, paper_count))

In [265]:
# Map each paper id to its position in all_ids (its matrix row/column).
id_loc = {known_id: position for position, known_id in enumerate(all_ids)}

In [266]:
# Fill the citation adjacency matrix: p_matrix[i, j] = 1 when paper i lists
# paper j among its references. References to ids that are not in the crawled
# set (absent from id_loc) are skipped.
for row, citing_id in enumerate(all_ids):
    citing_paper = Paper.get(id=citing_id, using=es)
    for cited_id in citing_paper.references:
        if cited_id in id_loc:
            p_matrix[row, id_loc[cited_id]] = 1

In [267]:
# PageRank parameters.
N = len(all_ids)  # number of papers (graph nodes)
alpha = 0.1  # weight of the uniform term mixed into every non-empty row
v = np.ones(shape=(1, N))  # row vector of ones used to build uniform rows

In [269]:
# Out-degree of every paper, kept as an (N, 1) column so it broadcasts over rows.
row_sums = p_matrix.sum(axis=1, keepdims=True)

In [270]:
# Turn the adjacency matrix into a row-stochastic transition matrix.
# Rows with at least one outgoing reference: normalise the row, scale it by
# (1 - alpha) and add the uniform term alpha / N.
# Dead-end rows (no outgoing reference): use the uniform distribution 1 / N.
has_links = row_sums > 0
# Adding the inverted mask puts a 1 in the denominator of dead-end rows, so the
# division below never hits zero; those rows are replaced by v / N anyway.
linked_rows = (1 - alpha) * p_matrix / (row_sums + ~has_links) + alpha * v / N
p_matrix = np.where(has_links, linked_rows, v / N)

In [273]:
# Start the power iteration from the uniform distribution over all papers.
x0 = np.full((1, N), 1 / N)

In [274]:
# Power iteration: keep multiplying the state by the transition matrix until
# two consecutive states agree within the relative tolerance. The loop is
# capped so that a matrix which never converges cannot hang the kernel.
max_power_iterations = 10000
for _ in range(max_power_iterations):
    next_state = x0 @ p_matrix
    if np.allclose(next_state, x0, rtol=0.0001):
        break
    x0 = next_state
else:
    raise RuntimeError(
        "PageRank power iteration did not converge within "
        + str(max_power_iterations)
        + " iterations"
    )

In [275]:
# Converged state from the power iteration: a (1, N) row vector with one
# score per paper, in the order of all_ids.
next_state

array([[0.0281928 , 0.04087966, 0.0281928 , 0.1900929 , 0.04658897,
        0.04087966, 0.08599568, 0.0277268 , 0.0281928 , 0.02562982,
        0.0277268 , 0.0277268 , 0.07035748, 0.0281928 , 0.06474251,
        0.02562982, 0.04020395, 0.02562982, 0.0277268 , 0.0277268 ,
        0.0360449 , 0.0277268 , 0.0281928 ]])

In [276]:
# Write each paper's score back to Elasticsearch.
for index, paper_id in enumerate(all_ids):
    paper = Paper.get(id=paper_id, using=es)
    # update() sends a partial update and refreshes the local instance, so no
    # extra save() (a full re-index of the document) is needed afterwards.
    # float() turns the numpy scalar into a plain Python float for serialisation.
    paper.update(using=es, page_rank=float(next_state[0, index]))

## Search 

In [277]:
from elasticsearch_dsl import Search

In [278]:
# Search terms for the weighted query: one to match in paper titles and one
# to match in abstracts.
title_search = "lottery"
abstract_search = "language"

In [287]:
# Weighted search over the paper index. Each function contributes when its
# filter matches: title term (weight 20), abstract term (weight 40), papers
# dated 2018 or later (weight 10), plus a script that scales the score by a
# saturation of the stored page_rank.
# The terms come from title_search / abstract_search (defined in the cell
# above) instead of being repeated here as literals.
res = es.search(
    index="paper-index",
    body={
        "query": {
            # https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-function-score-query.html#function-weight
            "function_score": {
                "functions": [
                    {
                        "filter": {"match": {"title": title_search}},
                        "weight": 20,
                    },
                    {
                        "filter": {"match": {"abstract": abstract_search}},
                        "weight": 40,
                    },
                    # https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-range-query.html
                    {
                        "filter": {"range": {"date": {"gte": 2018}}},
                        "weight": 10,
                    },
                    # https://www.elastic.co/guide/en/elasticsearch/reference/current/static-scoring-signals.html
                    # https://www.elastic.co/guide/en/elasticsearch/reference/7.x/query-dsl-rank-feature-query.html#rank-feature-query-saturation
                    {
                        "script_score": {
                            "script": {
                                "source": "_score * saturation(doc['page_rank'].value, 10)"
                            }
                        }
                    },
                ]
            }
        }
    },
)

In [288]:
# Print the relevance score of every returned hit, best first.
scored_hits = res["hits"]["hits"]
for scored_hit in scored_hits:
    print(scored_hit["_score"])

1.1060055
1.0225718
0.5112859
0.34104985
0.27946368
0.25730416
0.1854917
0.16285291
0.040042963
0.035915446


In [289]:
# Inspect the first hit in full (index metadata, score and stored _source).
res["hits"]["hits"][0]

{'_index': 'paper-index',
 '_type': '_doc',
 '_id': '335685749',
 '_score': 1.1060055,
 '_source': {'page_rank': 0.027726804663415785,
  'id': '335685749',
  'title': 'Character-Level Language Modeling with Deeper Self-Attention',
  'abstract': 'LSTMs and other RNN variants have shown strong performance on character-level language modeling. These models are typically trained using truncated backpropagation through time, and it is common to assume that their success stems from their ability to remember long-term contexts. In this paper, we show that a deep (64-layer) transformer model (Vaswani et al. 2017) with fixed context outperforms RNN variants by a large margin, achieving state of the art on two popular benchmarks: 1.13 bits per character on text8 and 1.06 on enwik8. To get good results at this depth, we show that it is important to add auxiliary losses, both at intermediate network layers and intermediate sequence positions.',
  'date': 2019,
  'references': ['334116459',
   '325

In [16]:
# Search object scoped to the paper index, matching on the title text.
s = Search(using=es, index="paper-index")
s = s.query("match", title="the lottery")

In [132]:
# Alternative lookup by the stored id field; this rebinds `s`, so whichever of
# the two cells ran last decides what the execute cell below runs.
s = Search(using=es, index="paper-index")
s = s.query("match", id="323694313")

In [17]:
# Send the current search `s` to Elasticsearch and keep the response.
response = s.execute()

In [18]:
# Show the score and title of every hit in the response.
for match in response:
    print(match.meta.score, match.title)

5.3779216 The Lottery Ticket Hypothesis: Training Pruned Neural Networks


In [290]:
# Paper.search() returns a standard Search object that is already limited to
# the document's index and doc_type, so only the query has to be added.
s = Paper.search(using=es).query("match", title="the lottery")

In [291]:
# Run the Paper-scoped search and keep the response for the loop below.
response = s.execute()

In [292]:
# Show score, title and the stored page rank of every hit.
for match in response:
    print(match.meta.score, match.title, match.page_rank)

5.6907296 The Lottery Ticket Hypothesis: Training Pruned Neural Networks 0.025629817277203964
