## ArXivNLP

In [1]:
#!pip install arxiv



In [3]:
#!pip install neo4j

In [None]:
#!pip install feedparser

In [None]:
# tqdm is for visualising the progress
#!pip install tqdm

In [1]:
#import tqdm

In [27]:
import arxiv

In [28]:
import urllib

In [29]:
# Base URL of the arXiv export API query endpoint.
# NOTE(review): no other cell visible in this notebook references base_url;
# confirm it is still needed.
base_url = 'http://export.arxiv.org/api/query?'

In [30]:
# Build the arXiv search: the 10 most recently submitted matches.
# The query is passed as plain text. The arxiv client URL-encodes the request
# parameters itself, so pre-quoting the query would double-encode the space
# ("%20" would be sent as "%2520").
# Alternative query restricted via a field prefix: "all:artificial intelligence"
search = arxiv.Search(
    query="Artificial Intelligence",
    id_list=[],
    max_results=10,
    sort_by=arxiv.SortCriterion.SubmittedDate,
    sort_order=arxiv.SortOrder.Descending,
)


In [31]:
# Read arXiv: flatten every search result into node records (articles,
# authors, links, categories) and edge records (article -> author / link /
# category) that are later turned into DataFrames.
articles = []
author_list = []
cat_list = []
link_list = []
article_to_author_list = []
article_to_link_list = []
article_to_cat_list = []
# Running counters: each advances once per appended record, so a name that
# occurs under several articles is appended several times with different ids.
aut_id = cat_id = lin_id = 0

for result in search.results():
    art_id = result.entry_id
    articles.append({
        "artId": art_id,
        "title": result.title,
        "summary": result.summary,
        "prim_cat": result.primary_category,
        "pdf": result.pdf_url,
    })

    # Authors of this article.
    for author in result.authors:
        aut_name = str(author)
        author_list.append({"authorId": aut_id, "authorName": aut_name})
        article_to_author_list.append({"articleId": art_id, "authorName": aut_name})
        aut_id += 1

    # Links attached to this article.
    for link in result.links:
        lin_name = str(link)
        link_list.append({"linkId": lin_id, "linkName": lin_name})
        article_to_link_list.append({"articleId": art_id, "linkName": lin_name})
        lin_id += 1

    # Categories this article is filed under.
    for category in result.categories:
        cat_name = str(category)
        cat_list.append({"catId": cat_id, "catName": cat_name})
        article_to_cat_list.append({"articleId": art_id, "catName": cat_name})
        cat_id += 1
    


In [32]:
article_to_cat_list


[{'articleId': 'http://arxiv.org/abs/2303.16203v1', 'catName': 'cs.LG'},
 {'articleId': 'http://arxiv.org/abs/2303.16203v1', 'catName': 'cs.AI'},
 {'articleId': 'http://arxiv.org/abs/2303.16203v1', 'catName': 'cs.CV'},
 {'articleId': 'http://arxiv.org/abs/2303.16203v1', 'catName': 'cs.NE'},
 {'articleId': 'http://arxiv.org/abs/2303.16203v1', 'catName': 'cs.RO'},
 {'articleId': 'http://arxiv.org/abs/2303.16201v1', 'catName': 'cs.CV'},
 {'articleId': 'http://arxiv.org/abs/2303.16201v1', 'catName': 'cs.AI'},
 {'articleId': 'http://arxiv.org/abs/2303.16201v1', 'catName': 'cs.LG'},
 {'articleId': 'http://arxiv.org/abs/2303.16199v1', 'catName': 'cs.CV'},
 {'articleId': 'http://arxiv.org/abs/2303.16199v1', 'catName': 'cs.AI'},
 {'articleId': 'http://arxiv.org/abs/2303.16199v1', 'catName': 'cs.CL'},
 {'articleId': 'http://arxiv.org/abs/2303.16199v1', 'catName': 'cs.LG'},
 {'articleId': 'http://arxiv.org/abs/2303.16199v1', 'catName': 'cs.MM'},
 {'articleId': 'http://arxiv.org/abs/2303.16200v1',

In [33]:
import pandas as pd

# Article table, one row per search result.
# The column names are given to the constructor instead of being assigned
# afterwards: assigning five names to the column-less frame produced by an
# empty `articles` list raises a length-mismatch ValueError, whereas this form
# yields an empty frame that still has the expected columns.
dfa = pd.DataFrame(articles, columns=['artId', 'title', 'summary', 'prim_cat', 'pdf'])

dfa.head(3)

Unnamed: 0,artId,title,summary,prim_cat,pdf
0,http://arxiv.org/abs/2303.16203v1,Your Diffusion Model is Secretly a Zero-Shot C...,The recent wave of large-scale text-to-image d...,cs.LG,http://arxiv.org/pdf/2303.16203v1
1,http://arxiv.org/abs/2303.16201v1,ASIC: Aligning Sparse in-the-wild Image Collec...,We present a method for joint alignment of spa...,cs.CV,http://arxiv.org/pdf/2303.16201v1
2,http://arxiv.org/abs/2303.16199v1,LLaMA-Adapter: Efficient Fine-tuning of Langua...,"We present LLaMA-Adapter, a lightweight adapti...",cs.CV,http://arxiv.org/pdf/2303.16199v1


In [34]:
len(dfa)

10

## Data Cleaning

In [35]:
# One row per distinct category name; drop_duplicates keeps the first
# occurrence, so the surviving catId is the first one recorded for that name.
categories = (
    pd.DataFrame(cat_list)
    .rename(columns={'catName': 'category'})
    .drop_duplicates(subset=['category'])
)
categories

Unnamed: 0,catId,category
0,0,cs.LG
1,1,cs.AI
2,2,cs.CV
3,3,cs.NE
4,4,cs.RO
10,10,cs.CL
12,12,cs.MM
13,13,cs.CY
24,24,cs.SE
25,25,cs.HC


In [36]:
# Link table: one row per distinct link; the first linkId recorded for a link
# is the one that survives de-duplication.
links = (
    pd.DataFrame(link_list)
    .rename(columns={'linkName': 'link'})
    .drop_duplicates(subset=['link'])
)


In [37]:
# Author table: one row per distinct author name; the first authorId recorded
# for a name is the one that survives de-duplication.
authors = (
    pd.DataFrame(author_list)
    .rename(columns={'authorName': 'author'})
    .drop_duplicates(subset=['author'])
)

In [38]:
len(authors)

47

## Connect to Neo4j

In [39]:
from neo4j import GraphDatabase

In [40]:
import time

### Connection Class

In [41]:
class Neo4jConnection:
    """Small convenience wrapper around a Neo4j driver.

    Driver creation and query failures are reported with ``print`` rather
    than raised, so callers must be prepared for ``query`` to return ``None``.
    """

    def __init__(self, uri, user, pwd):
        """Store the connection settings and try to create the driver.

        If driver creation raises, the error is printed and the driver stays
        ``None``; a later ``query`` call then fails its assertion.
        """
        self.__uri, self.__user, self.__pwd = uri, user, pwd
        self.__driver = None
        try:
            self.__driver = GraphDatabase.driver(uri, auth=(user, pwd))
        except Exception as e:
            print("Failed to create the driver:", e)

    def query(self, query, parameters=None, db=None):
        """Run ``query`` in a fresh session and return its records as a list.

        Args:
            query: statement text handed to ``session.run``.
            parameters: optional parameters handed to ``session.run``.
            db: optional database name; when ``None`` the session is opened
                without a ``database`` argument.

        Returns:
            The list of records, or ``None`` when opening the session or
            running the query raised (the error is printed).

        Raises:
            AssertionError: if the driver was never created.
        """
        assert self.__driver is not None, "Driver not initialized!"
        session = None
        records = None
        try:
            if db is None:
                session = self.__driver.session()
            else:
                session = self.__driver.session(database=db)
            records = list(session.run(query, parameters))
        except Exception as e:
            print("Query failed:", e)
        finally:
            # Close the session whether or not the query succeeded.
            if session is not None:
                session.close()
        return records

    def close(self):
        """Close the driver if one was created; otherwise do nothing."""
        driver = self.__driver
        if driver is None:
            return
        driver.close()
            

In [42]:
# Connect to Neo4j over Bolt on localhost.
# NOTE(review): the user name and password are hard-coded in the notebook;
# consider reading them from the environment or prompting for them before
# this notebook is shared.
conn = Neo4jConnection(uri="bolt://localhost:7687", user="neo4j", pwd="arxivtest")

In [43]:
# Send query: create the ArxivTest database, replacing it if it already exists
# (the statement is CREATE OR REPLACE, not CREATE ... IF NOT EXISTS).
# NOTE(review): the other conn.query calls in this notebook do not pass `db=`,
# so they presumably run against the server's default database rather than
# ArxivTest - confirm which database is intended.
conn.query("CREATE OR REPLACE DATABASE ArxivTest")

[]

### Helper Functions

Each helper adds nodes built from the relevant DataFrame columns, which are passed to Cypher as the `$rows` parameter. <br>
`UNWIND` expands that list into one row per record, and each row is then merged into the Neo4j database.

### No Batch

In [44]:
# Adds Category nodes to Neo4j, keyed by category name.
def add_categories(categories):
    """Merge one ``Category`` node per row of ``categories``.

    The node is matched on ``category`` alone and ``id`` is set only when the
    node is created. Merging on both properties would try to create a second
    node for an existing category whenever it arrives with a different
    ``catId`` (the ids are running counters, so they change between runs).

    Args:
        categories: DataFrame with ``catId`` and ``category`` columns; it is
            converted to a list of record dicts for the ``$rows`` parameter.

    Returns:
        Whatever ``conn.query`` returns for the statement (its ``total``
        column is the number of rows processed).
    """
    query = '''UNWIND $rows AS row
               MERGE (c:Category {category: row.category})
               ON CREATE SET c.id = row.catId
               RETURN count(*) as total
    '''
    return conn.query(query, parameters={'rows': categories.to_dict('records')})

In [45]:
# Adds Author nodes to Neo4j, keyed by author name.
def add_authors(authors):
    """Merge one ``Author`` node per row of ``authors``.

    The node is matched on ``name`` alone and ``id`` is set only when the
    node is created. Merging on both properties would try to create a second
    node for an existing author whenever the name arrives with a different
    ``authorId`` (the ids are running counters, so they change between runs).

    Args:
        authors: DataFrame with ``authorId`` and ``author`` columns; it is
            converted to a list of record dicts for the ``$rows`` parameter.

    Returns:
        Whatever ``conn.query`` returns for the statement (its ``total``
        column is the number of rows processed).
    """
    query = '''UNWIND $rows AS row
        MERGE (a:Author {name: row.author})
        ON CREATE SET a.id = row.authorId
        RETURN count(*) as total
    '''
    return conn.query(query, parameters={'rows': authors.to_dict('records')})

In [48]:
# Adds Link nodes to Neo4j, keyed by the link itself.
def add_links(links):
    """Merge one ``Link`` node per row of ``links``.

    The node is matched on ``link`` alone and ``id`` is set only when the
    node is created. Merging on both properties would try to create a second
    node for an existing link whenever it arrives with a different ``linkId``
    (the ids are running counters, so they change between runs).

    Args:
        links: DataFrame with ``linkId`` and ``link`` columns; it is
            converted to a list of record dicts for the ``$rows`` parameter.

    Returns:
        Whatever ``conn.query`` returns for the statement (its ``total``
        column is the number of rows processed).
    """
    query = '''UNWIND $rows AS row
               MERGE (l:Link {link: row.link})
               ON CREATE SET l.id = row.linkId
               RETURN count(*) as total
    '''
    return conn.query(query, parameters={'rows': links.to_dict('records')})

In [49]:
# Adds Article nodes to Neo4j, keyed by the arXiv entry id.
def add_papers(rows):
    """Merge one ``Article`` node per row of ``rows``.

    The node is matched on ``id`` alone; title, summary, primary category and
    PDF URL are set only when the node is created. Merging on every property
    would create a second ``Article`` with the same id whenever any of those
    fields differs from what is already stored.

    Args:
        rows: DataFrame with ``artId``, ``title``, ``summary``, ``prim_cat``
            and ``pdf`` columns; it is converted to a list of record dicts
            for the ``$rows`` parameter.

    Returns:
        Whatever ``conn.query`` returns for the statement (its ``total``
        column is the number of rows processed).
    """
    query = '''
            UNWIND $rows as row
            MERGE (p:Article {id: row.artId})
            ON CREATE SET p.title = row.title,
                          p.summary = row.summary,
                          p.prim_cat = row.prim_cat,
                          p.pdf = row.pdf
            RETURN count(*) as total
    '''
    return conn.query(query, parameters={'rows': rows.to_dict('records')})
        

In [50]:
# Add Author -[:CREATED]-> Article relationships
def add_rels_auth(rows, batch_size=500):
        """Merge a CREATED relationship from each author to their article.

        Args:
            rows: records whose ``articleId`` and ``authorName`` fields are
                read by the query; passed through to ``insert_data``.
            batch_size: forwarded to ``insert_data``.

        Returns:
            Whatever ``insert_data`` returns. Note that this query reports
            ``count(distinct p)``, i.e. distinct articles matched, not the
            number of relationships.
        """
        query = '''
           UNWIND $rows as row     
           MATCH (a:Author)
           MATCH (p:Article)
           WHERE p.id = row.articleId AND a.name = row.authorName
           MERGE (a)-[:CREATED]->(p)
           RETURN count(distinct p) as total
        '''
        # Delegate to insert_data so the rows are sent batch_size at a time.
        return insert_data(query, rows, batch_size)

In [51]:
# Add Article -[:REFERED]-> Link relationships
def add_rels_links(rows, batch_size=500):
        """Merge a REFERED relationship from each article to each of its links.

        Args:
            rows: records whose ``articleId`` and ``linkName`` fields are
                read by the query; passed through to ``insert_data``.
            batch_size: forwarded to ``insert_data``.

        Returns:
            Whatever ``insert_data`` returns; the query's ``total`` is
            ``count(*)`` over the matched rows.
        """
        query = '''
           UNWIND $rows as row     
           MATCH (l:Link)
           MATCH (p:Article)
           WHERE p.id = row.articleId AND l.link = row.linkName
           MERGE (p)-[:REFERED]->(l)
           RETURN count (*) as total
        '''
        # Delegate to insert_data so the rows are sent batch_size at a time.
        return insert_data(query, rows, batch_size)

In [52]:
# Add Article -[:BELONGS]-> Category relationships
def add_rels_cats(rows, batch_size=500):
        """Merge a BELONGS relationship from each article to each of its categories.

        Args:
            rows: records whose ``articleId`` and ``catName`` fields are
                read by the query; passed through to ``insert_data``.
            batch_size: forwarded to ``insert_data``.

        Returns:
            Whatever ``insert_data`` returns; the query's ``total`` is
            ``count(*)`` over the matched rows.
        """
        query = '''
           UNWIND $rows as row 
           MATCH (p:Article)
           MATCH (c:Category)
           WHERE p.id = row.articleId AND c.category = row.catName
           MERGE (p)-[:BELONGS]->(c)
           RETURN count(*) as total
        '''
        # Delegate to insert_data so the rows are sent batch_size at a time.
        return insert_data(query, rows, batch_size)
    


In [53]:
# Function for updating a Neo4j database in batch mode
def insert_data(query, rows, batch_size=500):
    """Run ``query`` over ``rows`` in consecutive batches and report progress.

    Each batch is sliced from ``rows``, converted to a list of record dicts
    and sent as the ``$rows`` parameter. The ``total`` column of the first
    returned record is accumulated across batches, and a progress dict is
    printed after every batch.

    Args:
        query: statement that consumes ``$rows`` and returns a ``total`` column.
        rows: DataFrame of records to send.
        batch_size: maximum number of rows per query; must be at least 1.

    Returns:
        ``{"total", "batches", "time"}`` for the last batch, or ``None`` when
        ``rows`` is empty.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1 (the batching loop
            could never advance).
        RuntimeError: if a batch yields no result. ``conn.query`` returns
            ``None`` after a failed query, which would otherwise surface as
            an opaque ``TypeError`` when the result is indexed.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1, got %r" % (batch_size,))

    total = 0
    start = time.time()
    result = None

    for batch, offset in enumerate(range(0, len(rows), batch_size), start=1):
        chunk = rows[offset:offset + batch_size]
        res = conn.query(query, parameters={'rows': chunk.to_dict('records')})
        if not res:
            # Stop instead of silently skipping rows: the caller needs to know
            # which batch was not written.
            raise RuntimeError(
                "Batch %d (rows %d-%d) returned no result; see the query error above"
                % (batch, offset, offset + len(chunk) - 1)
            )
        total += res[0]['total']
        result = {"total": total, "batches": batch, "time": time.time() - start}
        print(result)

    return result

In [54]:
# Create constraints guaranteeing the uniqueness of each node.
# The article constraint targets the :Article label, which is the label used
# by the insert and relationship queries in this notebook; a constraint on
# :Paper would never apply to those nodes. It gets its own name so that a
# previously created `papers` constraint does not make IF NOT EXISTS skip it.
conn.query('CREATE CONSTRAINT articles IF NOT EXISTS FOR (p:Article) REQUIRE p.id IS UNIQUE')
conn.query('CREATE CONSTRAINT authors IF NOT EXISTS FOR (a:Author) REQUIRE a.name IS UNIQUE')
conn.query('CREATE CONSTRAINT categories IF NOT EXISTS FOR (c:Category) REQUIRE c.category IS UNIQUE')
conn.query('CREATE CONSTRAINT links IF NOT EXISTS FOR (l:Link) REQUIRE l.link IS UNIQUE')

[]

## Insert in Neo4j

### Add Articles and Categories

In [55]:
# Create articles
add_papers(dfa)

[<Record total=10>]

In [56]:
# Create each of the nodes
add_categories(categories)

[<Record total=10>]

In [57]:
atoc = pd.DataFrame(article_to_cat_list)


In [58]:
add_rels_cats(atoc)

{'total': 32, 'batches': 1, 'time': 0.2026968002319336}


{'total': 32, 'batches': 1, 'time': 0.2026968002319336}

In [38]:
# atoc

### Add Authors

In [39]:
add_authors(authors)

[<Record total=47>]

In [40]:
atoa = pd.DataFrame(article_to_author_list)

In [41]:
add_rels_auth(atoa)

{'total': 10, 'batches': 1, 'time': 0.160736083984375}


{'total': 10, 'batches': 1, 'time': 0.160736083984375}

### Add Links

In [42]:
add_links(links)

[<Record total=21>]

In [43]:
atol = pd.DataFrame(article_to_link_list)

In [44]:
add_rels_links(atol)

{'total': 21, 'batches': 1, 'time': 0.12908601760864258}


{'total': 21, 'batches': 1, 'time': 0.12908601760864258}

In [45]:
# Test it: number of incoming BELONGS relationships per category (top 20).
# OPTIONAL MATCH + count(r) replaces SIZE(<pattern>), which Neo4j 5 no longer
# accepts; this form also runs on 4.x, and categories without any article
# still appear with an inDegree of 0.
query_string = '''
    MATCH (c:Category)
    OPTIONAL MATCH ()-[r:BELONGS]->(c)
    RETURN c.category, count(r) AS inDegree
    ORDER BY inDegree DESC LIMIT 20
'''

... to be continued ...