
Commit

preprints.org added
OmnesRes committed Aug 11, 2016
1 parent 7f0bb50 commit e207406
Showing 13 changed files with 376 additions and 8 deletions.
6 changes: 5 additions & 1 deletion author_dict.py
@@ -44,7 +44,11 @@
    for author in i[1]:
        pub_authors.append(author)


f=open(r'C:\Users\Jordan Anaya\Desktop\prepub\preprints\preprints.txt')
preprints=[eval(i.strip()) for i in f]
for i in preprints:
    for author in i[1]:
        pub_authors.append(author)


def middle_function(name):
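The block added to author_dict.py above parses each line of preprints.txt with eval(), which will execute arbitrary Python if the file is ever malformed or tampered with. A minimal sketch of the same read using ast.literal_eval, which only accepts literals, under the assumption that each line is the str() of a plain list as written by preprints/preprints.py below:

import ast

f = open(r'C:\Users\Jordan Anaya\Desktop\prepub\preprints\preprints.txt')
##each line holds [title, authors, date, abstract, link, tags, affiliations]
preprints = [ast.literal_eval(line.strip()) for line in f]
f.close()
for record in preprints:
    for author in record[1]:
        pub_authors.append(author)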
2 changes: 1 addition & 1 deletion papers/name_first.py

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion papers/name_last.py

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion papers/unique_first.py

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion papers/unique_last.py

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions preprints/error_log/for_github.txt
@@ -0,0 +1 @@
github doesn't like empty folders
83 changes: 83 additions & 0 deletions preprints/preprints.py
@@ -0,0 +1,83 @@
import time
import requests
import unicodedata
import re
from bs4 import BeautifulSoup
titles=[]
authors=[]
dates=[]
abstracts=[]
links=[]
tags=[]
author_aff=[]

##get all articles

categories=['biology','medicine_pharmacology','life_sciences']

base='http://preprints.org/subject/browse/'

for cat in categories:
    index=1
    print cat
    templinks=[]
    tempdates=[]
    tempabstracts=[]
    tempauthors=[]
    temptags=[]
    while True:
        print index
        r=requests.get(base+cat+'?page_num='+str(index)+'&page_length=100')
        print r.ok
        count=0
        soup=BeautifulSoup(r.content,'html.parser')
        for i in soup.find_all("a", {'class':'title'}):
            templinks.append(i.get('href'))
        ##drop the last element with class 'title', which is not an article link
        templinks=templinks[:-1]
        for i in soup.find_all('div',{'class','search-content-box'}):
            tempdate=i.find('div',{'class','show-for-large-up'}).text.strip().split('Online: ')[1].split(' (')[0].strip().split()
            tempdates.append(tempdate[1]+' '+tempdate[0]+' '+tempdate[2])
            count+=1
        for i in soup.find_all('div',{'class','abstract-content'}):
            tempabstracts.append(i.text)
        for i in soup.find_all('div',{'class','search-content-box-author'}):
            tempauthors.append([unicodedata.normalize('NFKD',j.text).encode('ascii','ignore') for j in i.find_all('a')])
        for i in soup.find_all('div',{'class','search-content-box'}):
            temptags.append([i.find_all('div')[4].text.strip().split(', ')[1].split(';')[0].strip()])
        ##a full page has 100 articles, so fewer than 100 means this was the last page
        if count==100:
            index+=1
        else:
            break

    ##visit each article page and only keep preprints labeled "Article" or "Review"
    for i,j,k,l,m in zip(templinks,tempdates,tempabstracts,tempauthors,temptags):
        if 'manuscript' in i:
            print i
            r=requests.get('http://preprints.org'+i)
            soup=BeautifulSoup(r.content,'html.parser')
            if soup.find('span',{'class','type-span'}).text in ['Review','Article']:
                dates.append(j)
                titles.append(soup.find('h1').text.strip())
                abstracts.append(k)
                links.append(i)
                authors.append(l)
                temp_aff=[]
                for aff in soup.find('div',{'class','manuscript-affiliations'}).find_all('li'):
                    temp_aff.append(aff.text)
                author_aff.append(temp_aff)
                tags.append(m)
            else:
                pass


if len(titles)==len(authors)==len(dates)==len(abstracts)==len(links)==len(tags)==len(author_aff):
    f=open('preprints.txt','w')
    for title,author,date,abstract,link,tag,author_af in zip(titles,authors,dates,abstracts,links,tags,author_aff):
        f.write(str([title,author,date,abstract,link,tag,author_af]))
        f.write('\n')
    f.close()
else:
    print 'error'
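Each line that preprints.py writes to preprints.txt is the str() of a seven-element list, and the other scripts in this commit index into it by position. A purely hypothetical example line (made-up values; only the field order is taken from the f.write() call above):

##[title, authors, date, abstract, link, tags, affiliations]
['Example Title', ['Jane A Doe', 'John Smith'], 'August 10 2016', 'Example abstract.', '/manuscript/201608.0001/v1', ['Bioinformatics'], ['1. Example University, Department of Biology']]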

23 changes: 23 additions & 0 deletions preprints/preprints.txt

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions preprints/update_log/for_github.txt
@@ -0,0 +1 @@
github doesn't like empty folders
81 changes: 81 additions & 0 deletions preprints_populate.py
@@ -0,0 +1,81 @@
import os
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'mysite.settings')

import django
django.setup()

from papers.models import Article
from papers.models import Tag
from papers.models import Affiliation
from papers.models import Author
from datetime import date as dt



f=open(r'C:\Users\Jordan Anaya\Desktop\prepub\preprints\preprints.txt')
preprints=[eval(i.strip()) for i in f]

##work on dates
date_dict={"January":1,"February":2,"March":3,"April":4,"May":5,"June":6,\
"July":7,"August":8,"September":9,"October":10,"November":11,"December":12,
"Jan":1,"Feb":2,"Mar":3,"Apr":4,"May":5,"Jun":6,\
"Jul":7,"Aug":8,"Sep":9,"Oct":10,"Nov":11,"Dec":12}



for i in preprints:
    paper=Article(title=i[0],abstract=i[3],link='https://www.preprints.org'+i[4])
    temp=i[2].split()
    paper.pub_date=dt(int(temp[2]),date_dict[temp[0]],int(temp[1]))
    paper.save()
    temp=[]
    for author in i[1]:
        ##strip punctuation and any Jr/Sr suffix from the author name
        name=author.replace(',','').replace('.','')
        if name[:3].lower()=='jr ':
            name=name[3:]
        if name[-3:].lower()==' jr':
            name=name[:-3]
        if name[:3].lower()=='sr ':
            name=name[3:]
        if name[-3:].lower()==' sr':
            name=name[:-3]
        first_name=name.split()[0]
        last_name=name.split()[-1]
        if len(name.split())==2:
            middle_name=''
        else:
            middle_name=name.replace(first_name+' ','').replace(' '+last_name,'').strip()
        if middle_name!='':
            temp.append(first_name+' '+middle_name+' '+last_name)
        else:
            temp.append(first_name+' '+last_name)
        ##reuse the Author if it already exists, otherwise create it
        try:
            auth=Author.objects.get(first=first_name,middle=middle_name,last=last_name)
            paper.authors.add(auth)
        except:
            auth=Author.objects.create(first=first_name,middle=middle_name,last=last_name)
            paper.authors.add(auth)
    paper.author_list=str(temp)
    for affiliation in i[-1]:
        try:
            aff=Affiliation.objects.get(name=affiliation)
            paper.affiliations.add(aff)
        except:
            aff=Affiliation.objects.create(name=affiliation)
            paper.affiliations.add(aff)
    for t in i[-2]:
        try:
            tag=Tag.objects.get(name=t)
            paper.tags.add(tag)
        except:
            tag=Tag.objects.create(name=t)
            paper.tags.add(tag)
    paper.save()
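The try/except blocks above hand-roll a get-or-create pattern, and the bare except will also hide unrelated database errors. Django's ORM provides get_or_create for exactly this; a minimal sketch of the tag loop rewritten with it (same Tag model and name field as above):

for t in i[-2]:
    ##get_or_create returns (object, created_flag); only the object is needed here
    tag, _ = Tag.objects.get_or_create(name=t)
    paper.tags.add(tag)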








4 changes: 3 additions & 1 deletion templates/help.html
@@ -122,6 +122,8 @@ <h1>PrePubMed Help</h1>
<p>PrePubMed currently indexes tens of thousands of articles from the preprint servers arXiv q-bio, Figshare, PeerJ Preprints, and bioRxiv, along with articles at F1000Research that are undergoing post-publication peer review and free form articles from The Winnower. New articles are indexed daily. The goal of PrePubMed is to provide a means to search for articles that are not indexed by PubMed, which could be due to the article being "unpublished", or due to the delay between "publication" and indexing by PubMed. With that said, PrePubMed does not remove articles once they are indexed by PubMed in order to give preprinted articles as much exposure as possible. A video showing how to search with PrePubMed is available on <a href="https://www.youtube.com/watch?v=a_HoqWqUJ7Y">youtube</a>.</p>
<h2>A note on Figshare</h2>
<p>Most articles in Figshare are simply supplementary information to published articles, or are articles that have already undergone peer-review. In addition, many articles on Figshare are not related to science and there seems to be absolutely zero quality control for what gets posted. Because of this I have numerous filters that articles from Figshare must pass in order to get indexed. Articles from the other preprint servers do not undergo any filtering since those servers screen articles before allowing them to be posted. On top of this, the Figshare API only lets me access the thousand most recently modified articles. As a result, if you have an article on Figshare it is unlikely to be in PrePubMed. If you have an article which might not be suitable for a traditional preprint server such as arXiv, PeerJ Preprints, or bioRxiv, and still want your article indexed by PrePubMed I would recommend submitting it to The Winnower instead of Figshare.</p>
<h2>A note on preprints.org</h2>
<p>This is a new preprint server that aims to host preprints from a wide range of disciplines. Currently I have decided to index the Biology, Life Sciences, and Medicine & Pharmacology subjects. This server also hosts a variety of different types of articles: Article, Review, Conference Paper, Data Descriptor, Brief Report, Case Report, Communication, Short Note, Technical Note, Hypothesis. It is unclear to me what the quality of some of these types of articles will be, so I am currently only indexing the preprints labeled "Article" or "Review".</p>
<h2>A note on The Winnower</h2>
<p>After much thought I have decided to index all articles on The Winnower with the "paper" designation. Articles on The Winnower are not preprints and are closer to blog posts. Although PrePubMed is meant for articles that will eventually be indexed by PubMed, I support nontraditional forms of communicating work. I normally only index biology related articles, for example only the q-bio section of arXiv and only certain categories of Figshare, but to support The Winnower and its mission I have indexed all of their categories, including Reddit AMAs.</p>
<p>How will this affect your searches? It likely won't. The Winnower does not have abstracts for its articles, so your search terms will only be searching against titles from The Winnower. As a result, it is unlikely you will be seeing blog posts show up in your RSS feeds instead of preprints. And if you do happen to get a blog post from The Winnower, because your search matched the title it might be something you want to check out.
@@ -149,7 +151,7 @@ <h3>How to search for authors</h3>
<h3>How to search by journal</h3>
<p>You can't. PrePubMed is journal agnostic and I believe that where your article is published should not impact viewership. However, there does appear to be differences in the quality of the preprint servers with regards to indexing information such as author names or ensuring that an article isn't duplicated. I want the information in PrePubMed to approach the accuracy of PubMed and will be contacting the preprint servers to work towards this goal. If one preprint server is clearly the best I will consider endorsing its use.</p>
<h3>How to search by subject area</h3>
<p>You kind of can. When you perform a search I provide the list of the subject areas associated with each article, and you can click on them to perform a search for that exact subject area. The problem is that there is not a consistent subject area system among the six journals that PrePubMed indexes. As a result, clicking on a tag for Bioinformatics may not return all articles related to bioinformatics. Because of this, I do not provide the ability to perform a custom search with subject areas.</p>
<p>You kind of can. When you perform a search I provide the list of the subject areas associated with each article, and you can click on them to perform a search for that exact subject area. The problem is that there is not a consistent subject area system among the seven journals that PrePubMed indexes. As a result, clicking on a tag for Bioinformatics may not return all articles related to bioinformatics. Because of this, I do not provide the ability to perform a custom search with subject areas.</p>
<h3>How to search by affiliation</h3>
<p>You can search for affiliation with the advanced search option. Note that Figshare does not list affiliations for authors so a search for an affiliation will not return any Figshare preprints. See <a href="#advanced">using advanced search</a> for more details.</p>
<h3>What do I do about duplicated or questionable articles?</h3>
8 changes: 6 additions & 2 deletions templates/home.html
@@ -134,7 +134,7 @@
</div>
<div class="col-sm-9" style="background-color:#141927; color:white;padding:0; height:180px">
<div id='grad' style="margin:0;font-size:20px;padding-left:10px"><b>PrePubMed</b></div>
<p style="font-size:20px;padding:10px">PrePubMed indexes preprints from arXiv q-bio, PeerJ Preprints, Figshare, bioRxiv, F1000Research, and The Winnower. Articles are not stored on PrePubMed, but you will be linked to the article at the respective site.</p>
<p style="font-size:20px;padding:10px">PrePubMed indexes preprints from arXiv q-bio, PeerJ Preprints, Figshare, bioRxiv, F1000Research, preprints.org, and The Winnower. Articles are not stored on PrePubMed, but you will be linked to the article at the respective site.</p>
</div>
<div class="col-sm-12" style="font-size:16px; text-align:left; margin-top:10px">
<div class="col-sm-4">
@@ -170,10 +170,14 @@ <h2>PrePubMed Tools</h2>
</p>
</div>
</div>
<div class="col-sm-12" style="font-size:16px; text-align:center; margin-top:20px">
<div class="col-sm-12" style="font-size:18px; text-align:left; margin-top:20px">
<h2>Some Tips!</h2>
All terms are combined with AND logic, so don't bother typing "AND" or "OR"
<br>
<br>
Don't try to search for affiliations in the default search box, use <a href="/advanced_search/">Advanced Search</a> instead
<br>
<br>
Site is free to use and <a href="https://github.com/OmnesRes/prepub">open source</a>, send questions to omnesresnetwork at gmail.com
<br>
<br>
