In [27]:
# docker run -p 9200:9200 -p 9300:9300 -e "discovery.type=single-node" docker.elastic.co/elasticsearch/elasticsearch:7.8.0

In [1]:
import numpy as np

## Crawl 

In [4]:
import scrapy
from scrapy.crawler import CrawlerProcess

from paper_crawler.paper_crawler.spiders.semanticscholar import (
    SemanticscholarSpider,
)

In [5]:
process = CrawlerProcess(
    settings={
        "FEEDS": {"papers.json": {"format": "json"}},
        "LOG_ENABLED": False
        #     "LOG_LEVEL": 'INFO',
    }
)
process.crawl(SemanticscholarSpider, max_papers=500)
process.start()

## Load Crawled Data

In [2]:
import json

with open("papers.json") as f:
    items = json.load(f)

In [3]:
items[0]

{'id': 'f90720ed12e045ac84beb94c27271d6fb8ad48cf',
 'title': 'The Lottery Ticket Hypothesis: Training Pruned Neural Networks',
 'abstract': 'Recent work on neural network pruning indicates that, at training time, neural networks need to be significantly larger in size than is necessary to represent the eventual functions that they learn. This paper articulates a new hypothesis to explain this phenomenon. This conjecture, which we term the "lottery ticket hypothesis," proposes that successful training depends on lucky random initialization of a smaller subcomponent of the network. Larger networks have more of these "lottery tickets…\xa0',
 'date': '2018',
 'references': ['34f25a8704614163c4095b3ee2fc969b60de4698',
  '1ff9a37d766e3a4f39757f5e1b235a42dacf18ff',
  'b0bd441a0cc04cdd0d0e469fe4c5184ee148a97d',
  'cc46229a7c47f485e090857cbab6e6bf68c09811',
  '642d0f49b7826adcf986616f4af77e736229990f',
  '049fd80f52c0b1fa4d532945d95a24734b62bdf3',
  '2dfef5635c8c44431ca3576081e6cfe6d65d4862',
 

In [4]:
len(items)

503

## Elasticsearch 

In [5]:
from elasticsearch import Elasticsearch

### Connect 

In [8]:
es = Elasticsearch(hosts=[{"host": "localhost", "port": 9200}])

### Clear previous index 

In [11]:
es.indices.delete("paper-index", ignore=404)

{'error': {'root_cause': [{'type': 'index_not_found_exception',
    'reason': 'no such index [paper-index]',
    'resource.type': 'index_or_alias',
    'resource.id': 'paper-index',
    'index_uuid': '_na_',
    'index': 'paper-index'}],
  'type': 'index_not_found_exception',
  'reason': 'no such index [paper-index]',
  'resource.type': 'index_or_alias',
  'resource.id': 'paper-index',
  'index_uuid': '_na_',
  'index': 'paper-index'},
 'status': 404}

### Create new index 

In [9]:
es.indices.create(index="paper-index", ignore=400)

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'paper-index'}

### Create persistent layer 

In [10]:
from elasticsearch_dsl import (
    Document,
    Date,
    Nested,
    Boolean,
    analyzer,
    InnerDoc,
    Completion,
    Keyword,
    Text,
    Integer,
    Float,
)

In [11]:
class Paper(Document):
    title = Text(fields={"raw": Keyword()})
    date = Integer()
    abstract = Text()
    authors = Text()
    references = Text()
    page_rank = Float()

    class Index:
        name = "paper-index"

In [12]:
# create the mappings in Elasticsearch
Paper.init(using=es)

### Insert items 

In [13]:
from elasticsearch import TransportError
from elasticsearch.helpers import bulk

#### Solution #1

In [14]:
# https://elasticsearch-py.readthedocs.io/en/master/helpers.html#bulk-helpers
# https://www.elastic.co/guide/en/elasticsearch/reference/master/docs-bulk.html


def gendata():
    for idx, item in enumerate(items):
        if item["date"] == "" or not item["date"].isdigit():
            del item["date"]
        yield {
            "_index": "paper-index",
            "_id": item["id"],
            "page_rank": 1.0,
            **item,
        }


bulk(es, gendata())

(503, [])

#### Solution #2

In [None]:
for item in items:
    paper = Paper(meta={"id": item["id"]}, page_rank=1.0, **item)
    paper.save(using=es)

### Funcions for inserting items and clearing index

In [231]:
def gen_utils(host=None):
    if host is None:
        host = {"host": "localhost", "port": 9200}
    es = Elasticsearch(hosts=[{"host": "localhost", "port": 9200}])

    def clear_index():
        es.indices.delete("paper-index", ignore=404)
        es.indices.create(index="paper-index", ignore=400)

    def insert_items(items):
        for item in items:
            paper = Paper(meta={"id": item["id"]}, **item)
            paper.save(using=es)

    return clear_index, insert_items

In [66]:
clear, insert = gen_utils()

In [67]:
clear()

In [68]:
insert(items)

## Calculating page rank

In [15]:
import numpy as np

In [16]:
all_papers = list(items)

In [17]:
all_ids = sorted(list(map(lambda x: x["id"], all_papers)))

In [18]:
p_matrix = np.zeros((len(all_ids), len(all_ids)))

In [19]:
id_loc = dict()
for index, paper_id in enumerate(all_ids):
    id_loc[paper_id] = index

In [20]:
for index, paper_id in enumerate(all_ids):
    paper = Paper.get(id=paper_id, using=es)
    if paper.references is not None:
        for reference_id in paper.references:
            try:
                p_matrix[index, id_loc[reference_id]] = 1
            except KeyError:
                continue

In [21]:
alpha = 0.1
N = len(all_ids)
v = np.ones((1, N))

In [22]:
row_sums = np.sum(p_matrix, axis=1, keepdims=True)

In [23]:
# first part is for rows having nonzero elements
# second part is for dead-ends
p_matrix = ((row_sums > 0) * 1) * (
    (1 - alpha) * p_matrix / (row_sums + np.logical_not(row_sums > 0) * 1)
    + alpha * v / N
) + (np.logical_not(row_sums > 0) * 1) * v / N

In [24]:
x0 = np.ones((1, N)) / N

In [25]:
while True:
    next_state = x0 @ p_matrix
    if np.allclose(next_state, x0, rtol=0.0001):
        break
    x0 = next_state

In [None]:
for index, paper_id in enumerate(all_ids):
    paper = Paper.get(id=paper_id, using=es)
    paper.update(page_rank=next_state[0, index], using=es)
    paper.save(using=es)

#### Solution #2 

In [26]:
def gendata():
    for index, paper_id in enumerate(all_ids):
        yield {
            "_index": "paper-index",
            "_id": items[index]["id"],
            "_source": {"doc": {"page_rank": next_state[0, index]}},
            "_op_type": "update",
        }


bulk(es, gendata())

(503, [])

#### Solution #3

In [307]:
# 429
body = ""
for index, paper_id in enumerate(all_ids):
    body += f"""{{ "update" : {{"_id" : "{paper_id}", "_index" : "paper-index"}} }}
{{ "doc" : {{"page_rank" : {next_state[0, index]} }}}}
"""
    break
body += ""
res = es.bulk(body)
res

## Search 

In [27]:
from elasticsearch_dsl import Search

In [62]:
title_search = "classification"
abstract_search = "neural"
year_search = 2000
title_weight = 50
abstract_weight = 40
year_weight = 5

In [63]:
def search(
    title_search: str,
    abstract_search: str,
    year_search: int,
    title_weight: float = 20,
    abstract_weight: float = 10,
    year_weight: float = 5,
    apply_page_rank: bool = True,
):
    if apply_page_rank:
        return es.search(
            index="paper-index",
            body={
                "query": {
                    # https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-function-score-query.html#function-weight
                    "function_score": {
                        "functions": [
                            {
                                "filter": {"match": {"title": title_search}},
                                "weight": title_weight,
                            },
                            {
                                "filter": {
                                    "match": {"abstract": abstract_search}
                                },
                                "weight": abstract_weight,
                            },
                            # https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-range-query.html
                            {
                                "filter": {
                                    "range": {"date": {"gte": year_search}}
                                },
                                "weight": year_weight,
                            },
                            # https://www.elastic.co/guide/en/elasticsearch/reference/current/static-scoring-signals.html
                            # https://www.elastic.co/guide/en/elasticsearch/reference/7.x/query-dsl-rank-feature-query.html#rank-feature-query-saturation
                            {
                                "script_score": {
                                    "script": {
                                        #                                 "source": "_score * saturation(doc['page_rank'].value, 10)"
                                        "source": "_score * doc['page_rank'].value"
                                    }
                                }
                            },
                        ]
                    }
                }
            },
        )
    return es.search(
        index="paper-index",
        body={
            "query": {
                # https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-function-score-query.html#function-weight
                "function_score": {
                    "functions": [
                        {
                            "filter": {"match": {"title": title_search}},
                            "weight": title_weight,
                        },
                        {
                            "filter": {"match": {"abstract": abstract_search}},
                            "weight": abstract_weight,
                        },
                        # https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-range-query.html
                        {
                            "filter": {
                                "range": {"date": {"gte": year_search}}
                            },
                            "weight": year_weight,
                        },
                    ]
                }
            }
        },
    )

In [64]:
res = search(title_search, abstract_search, year_search, apply_page_rank=True)
for hit in res["hits"]["hits"]:
    print(f'{hit["_source"]["title"]}: {hit["_score"]}')

Large-Margin Classification in Infinite Neural Networks: 5.6436925
ImageNet Classification with Deep Convolutional Neural Networks: 2.4965262
High-Performance Neural Networks for Visual Object Classification: 1.8588557
Training CNNs with Low-Rank Filters for Efficient Image Classification: 0.8912874
Character-level Convolutional Networks for Text Classification: 0.7963235
Some Improvements on Deep Convolutional Neural Network Based Image Classification: 0.782767
Convolutional Neural Networks for Sentence Classification: 0.7670854
SVM-KNN: Discriminative Nearest Neighbor Classification for Visual Category Recognition: 0.7627265
Delving Deep into Rectifiers: Surpassing Human-Level Performance on ImageNet Classification: 0.75370294
Visualizing and Understanding Neural Models in NLP: 0.52961636


In [65]:
res = search(title_search, abstract_search, year_search, apply_page_rank=False)
for hit in res["hits"]["hits"]:
    print(f'{hit["_source"]["title"]}: {hit["_score"]}')

Training CNNs with Low-Rank Filters for Efficient Image Classification: 1000.0
Character-level Convolutional Networks for Text Classification: 1000.0
Large-Margin Classification in Infinite Neural Networks: 1000.0
Convolutional Neural Networks for Sentence Classification: 1000.0
Some Improvements on Deep Convolutional Neural Network Based Image Classification: 1000.0
Delving Deep into Rectifiers: Surpassing Human-Level Performance on ImageNet Classification: 1000.0
High-Performance Neural Networks for Visual Object Classification: 1000.0
ImageNet Classification with Deep Convolutional Neural Networks: 1000.0
Approximation algorithms for classification problems with pairwise relationships: metric labeling and Markov random fields: 100.0
Part-based statistical models for object classification and detection: 100.0


In [46]:
# With page rank
res = es.search(
    index="paper-index",
    body={
        "query": {
            # https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-function-score-query.html#function-weight
            "function_score": {
                "functions": [
                    {
                        "filter": {"match": {"title": title_search}},
                        "weight": title_weight,
                    },
                    {
                        "filter": {"match": {"abstract": abstract_search}},
                        "weight": abstract_weight,
                    },
                    # https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-range-query.html
                    {
                        "filter": {"range": {"date": {"gte": year_search}}},
                        "weight": year_weight,
                    },
                    # https://www.elastic.co/guide/en/elasticsearch/reference/current/static-scoring-signals.html
                    # https://www.elastic.co/guide/en/elasticsearch/reference/7.x/query-dsl-rank-feature-query.html#rank-feature-query-saturation
                    {
                        "script_score": {
                            "script": {
                                #                                 "source": "_score * saturation(doc['page_rank'].value, 10)"
                                "source": "_score * doc['page_rank'].value"
                            }
                        }
                    },
                ]
            }
        }
    },
)

In [47]:
for hit in res["hits"]["hits"]:
    print(f'{hit["_source"]["title"]}: {hit["_score"]}')

Visual Referring Expression Recognition: What Do Systems Actually Learn?: 0.29866457
Memory Architectures in Recurrent Neural Network Language Models: 0.27257547
Towards a Unified Natural Language Inference Framework to Evaluate Sentence Representations: 0.19155078
Dissecting Contextual Word Embeddings: Architecture and Representation: 0.15634124
The Lottery Ticket Hypothesis: Training Pruned Neural Networks: 0.13396016
Natural Language Inference over Interaction Space: 0.12812993
BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding: 0.10748777
Attention-Based Convolutional Neural Network for Machine Comprehension: 0.097398795
A Broad-Coverage Challenge Corpus for Sentence Understanding through Inference: 0.09511792
Annotation Artifacts in Natural Language Inference Data: 0.09371895


In [38]:
# Without page rank
res = es.search(
    index="paper-index",
    body={
        "query": {
            # https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-function-score-query.html#function-weight
            "function_score": {
                "functions": [
                    {
                        "filter": {"match": {"title": title_search}},
                        "weight": title_weight,
                    },
                    {
                        "filter": {"match": {"abstract": abstract_search}},
                        "weight": abstract_weight,
                    },
                    # https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-range-query.html
                    {
                        "filter": {"range": {"date": {"gte": year_search}}},
                        "weight": year_weight,
                    },
                ]
            }
        }
    },
)

In [39]:
for hit in res["hits"]["hits"]:
    print(f'{hit["_source"]["title"]}: {hit["_score"]}')

The Lottery Ticket Hypothesis: Training Pruned Neural Networks: 100.0
BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding: 50.0
Character-Level Language Modeling with Deeper Self-Attention: 50.0
U-Net: Machine Reading Comprehension with Unanswerable Questions: 50.0
GLUE: A Multi-Task Benchmark and Analysis Platform for Natural Language Understanding: 50.0
Memory Architectures in Recurrent Neural Network Language Models: 50.0
Annotation Artifacts in Natural Language Inference Data: 50.0
Transforming Question Answering Datasets Into Natural Language Inference Datasets: 50.0
Visual Referring Expression Recognition: What Do Systems Actually Learn?: 50.0
Visual Dialog: 50.0


In [38]:
s = Search(using=es, index="paper-index").query("match", title="the lottery")

In [189]:
s = Search(using=es, index="paper-index").query("match", id="323694313")

In [39]:
response = s.execute()

In [40]:
for hit in response:
    print(hit.meta.score, hit.title)

7.1711254 The Lottery Ticket Hypothesis: Training Pruned Neural Networks
2.6979883 The dropout learning algorithm
2.5801163 Return of the Devil in the Details: Delving Deep into Convolutional Nets
2.3857658 Exploring the Limits of Language Modeling
2.2552712 Distilling the Knowledge in a Neural Network
2.2552712 Rethinking the Inception Architecture for Computer Vision
2.2552712 Compressing Neural Networks with the Hashing Trick
2.1383119 MaskGAN: Better Text Generation via Filling in the ______
2.1383119 Improving the speed of neural networks on CPUs
2.032885 Addressing the Rare Word Problem in Neural Machine Translation


In [41]:
# by calling .search we get back a standard Search object
s = Paper.search(using=es)
# the search is already limited to the index and doc_type of our document
s = s.query("match", title="the lottery")

In [42]:
response = s.execute()

In [43]:
for hit in response:
    print(hit.meta.score, hit.title, hit.page_rank)

7.1711254 The Lottery Ticket Hypothesis: Training Pruned Neural Networks 0.0016542321229066994
2.6979883 The dropout learning algorithm 0.0019053149224319082
2.5801163 Return of the Devil in the Details: Delving Deep into Convolutional Nets 0.005560533383657462
2.3857658 Exploring the Limits of Language Modeling 0.002667048414064376
2.2552712 Distilling the Knowledge in a Neural Network 0.00414401990427915
2.2552712 Rethinking the Inception Architecture for Computer Vision 0.001837508565113488
2.2552712 Compressing Neural Networks with the Hashing Trick 0.0023534950296675962
2.1383119 MaskGAN: Better Text Generation via Filling in the ______ 0.0018031145099593263
2.1383119 Improving the speed of neural networks on CPUs 0.010104847145196385
2.032885 Addressing the Rare Word Problem in Neural Machine Translation 0.004836191907258115


### HITS 

In [30]:
from bidict import bidict

In [31]:
papers = items
# Give each author an id and store them in this dict
author_ids = bidict()
id_counter = 0
for paper in papers:
    paper_authors = paper["authors"]
    for author in paper_authors:
        if author not in author_ids:
            author_ids[author] = id_counter
            id_counter += 1

# Map each paper to it's authors' ids
paper_authors_dict = dict()
for paper in papers:
    paper_id = paper["id"]
    paper_authors = paper["authors"]
    paper_authors_dict[paper_id] = []
    for author in paper_authors:
        author_id = author_ids[author]
        paper_authors_dict[paper_id].append(author_id)

# Map each author to the authors he/she has referenced
author_references = dict()
for paper in papers:
    paper_authors = paper["authors"]
    paper_references = paper["references"]
    references = []
    for reference_id in paper_references:
        if reference_id in paper_authors_dict:
            references += paper_authors_dict[reference_id]
    for author in paper_authors:
        author_id = author_ids[author]
        if author_id not in author_references:
            author_references[author_id] = []
        author_references[author_id] += references

In [32]:
num_authors = len(author_ids)

In [33]:
connectivity_matrix = np.zeros((num_authors, num_authors))
for author, references in author_references.items():
    for reference in references:
        connectivity_matrix[author, reference] = 1

In [34]:
a = np.ones(num_authors)
h = np.ones(num_authors)

In [35]:
for _ in range(5):
    for i in range(num_authors):
        h[i] = np.sum(connectivity_matrix[i, :] * a)
    for i in range(num_authors):
        a[i] = np.sum(connectivity_matrix[:, i].T * h)
    a /= np.sum(a)
    h /= np.sum(h)

In [38]:
best_authors = []
for i in np.argpartition(a, -4)[-4:]:
    best_authors.append((author_ids.inverse[i], a[i]))

In [43]:
print("Best Authors:")
best_authors = sorted(best_authors, key=lambda x: -x[1])
for author, authority in best_authors:
    print(f"{author} : {authority}")

Best Authors:
Ilya Sutskever : 0.038691023767089466
Geoffrey E. Hinton : 0.03239437930182725
Alex Krizhevsky : 0.028018939726845844
Yoshua Bengio : 0.02182020882571343


## Ranking SVM 

In [1]:
with open("./MIR_Phase3/data/train.txt", "r") as f:
    train_contents = f.read()
with open("./MIR_Phase3/data/vali.txt", "r") as f:
    val_contents = f.read()
with open("./MIR_Phase3/data/test.txt", "r") as f:
    test_contents = f.read()

In [2]:
from dataclasses import dataclass
import numpy as np

@dataclass
class QueryResult:
    qid: int
    doc_id: str
    relevance: int
    feature_vector: np.array

In [35]:
from typing import Dict, List

def parse_data(data: str) -> Dict[int, List[QueryResult]]:
    query_to_result_dict = dict()
    for line in data.split('\n'):
        splitted_line = line.split()
        try:
            relevance = int(splitted_line[0])
            qid = int(line.split()[1][4:])
            doc_id = line.split()[50]
            feature_vector = []
            for feature in line.split()[2:48]:
                feature_vector.append(float(feature.split(':')[1]))
            if qid not in query_to_result_dict:
                query_to_result_dict[qid] = []
            query_to_result_dict[qid].append(QueryResult(qid, doc_id, relevance, np.array(feature_vector)))
        except IndexError:
            pass
    return query_to_result_dict

In [29]:
train_query_to_result_dict = parse_data(train_contents)
val_query_to_result_dict = parse_data(val_contents)
test_query_to_result_dict = parse_data(test_contents)

In [30]:
POSITIVE_CLASS = 1
NEGATIVE_CLASS = -1

In [40]:
def prepare_data_for_model(query_to_result_dict: Dict[int, List[QueryResult]]):
    items = []
    for qid, results in query_to_result_dict.items():
        # Sort in decresing order
        results = sorted(results, key=lambda x: -x.relevance)
        num_results = len(results)
        for idx, result in enumerate(results):
            for other_result in results[idx+1:-1]:
                if result.relevance > other_result.relevance:
                    positive_vector = result.feature_vector - other_result.feature_vector
                    items.append((POSITIVE_CLASS, positive_vector))
                    items.append((NEGATIVE_CLASS, -positive_vector))
    return items

In [43]:
train_data  = prepare_data_for_model(train_query_to_result_dict)
val_data = prepare_data_for_model(val_query_to_result_dict)