In [2]:
import os
import platform
from datetime import datetime
import time

import pandas as pd
import yaml
from IPython.display import display, Markdown
from git import Repo as RepoLoader  # gitpython
from selenium.webdriver import Chrome

import papers_analyser.db as db
from papers_analyser import scrape_papers_with_code as navigator
from papers_analyser.paper import Paper, get_repo, scrape_paper

%load_ext autoreload
%autoreload 2

## Load driver

In [2]:
driverPath = os.path.abspath("") + '/drivers/' + platform.system()
#open(driverPath)
driver = Chrome(driverPath + "/chromedriver")

## Connect to DB

In [82]:
connection = db.create_connection( os.path.abspath("") + "/paper.db")
db.create_table_if_not_exist(connection)

2.6.0


## Github API auth-token

In [3]:
auth_token = yaml.load(open("config.yaml", "r"))["auth_token"]

  """Entry point for launching an IPython kernel.


## Load index page
Infinite scroll down all the way

In [4]:
index_page = navigator.get_paper_index_page(driver, datetime.strptime("2020-05-01", "%Y-%m-%d"))

2020-05-20 00:00:00
2020-05-18 00:00:00
2020-05-18 00:00:00
2020-05-17 00:00:00
2020-05-16 00:00:00
2020-05-15 00:00:00
2020-05-15 00:00:00
2020-05-14 00:00:00
2020-05-13 00:00:00
2020-05-13 00:00:00
2020-05-12 00:00:00
2020-05-12 00:00:00
2020-05-11 00:00:00
2020-05-10 00:00:00
2020-05-09 00:00:00
2020-05-08 00:00:00
2020-05-08 00:00:00
2020-05-08 00:00:00


## Get paper links

In [7]:
input_path = os.path.abspath("") + "/input"

There are two options:
1. Parse the paperswithcode.com index page and save to csv.

In [16]:
paper_links = list(set(navigator.get_papers(index_page)))

In [17]:
links_df = pd.DataFrame(paper_links, columns=["url"])
links_df.to_csv(input_path + "/paper_urls_" + str(datetime.now().strftime("%Y-%m-%d")) + ".csv",index=False)

2. Load Csvs to get all paper links that have been persisted.

In [85]:
all_files = os.listdir(input_path)
link_csvs = []

for filename in all_files:
    df = pd.read_csv(input_path + "/" + filename, index_col=None, header=0)
    link_csvs.append(df)

frame = pd.concat(link_csvs, axis=0, ignore_index=True).drop_duplicates()

paper_links = frame["url"].tolist()

# Scrape papers
Scraping papers on Github and Paperswithcode.com. Only query those, that have not been inserted into the db.

In [86]:
papers_persisted = db.get_papers(connection)
paper_links = [url for url in paper_links if url not in papers_persisted]

Seperation of `repos` and `papers` because weird behaviour of paperswithcode.com: Sometimes mixing on-site links with github links, which makes absolutely no sense.
Get auth-token from your github profile under personal tokens.

In [87]:
repos = list()
papers = list()

In [88]:
for paper_link in paper_links:
    if  paper_link.startswith("/paper/") :
        paper= scrape_paper(paper_link)
        paper.scrape_repos(auth_token=auth_token)
        papers.append(paper)
    elif"github" in paper_link:
        repos.append(get_repo(paper_link,auth_token=auth_token))
        

## Take a look on README files

In [89]:
readmes = [paper.repo[0].readme for paper in papers]

TypeError: 'Paper' object is not subscriptable

In [None]:
display(Markdown(readmes[4]))

# Save to database

In [90]:
for paper in papers:
    db.insert_paper(connection,paper)

connection.commit()

In [28]:
connection.close()

## Clone repo

In [36]:
dir = driverPath = os.path.abspath("") + "/repos"
for paper in papers:
    paper_path = dir + "/" + paper.title.replace( ":","")
    if not os.path.exists(paper_path):
        os.mkdir(paper_path)
        
    for repo in paper.repo:
        repo_path = paper_path+ repo.repo_name.replace("/","_")
        
        if not os.path.exists(repo_path):
            os.mkdir(repo_path)
            
        RepoLoader.clone_from(repo.clone_url,repo_path)
    

