preprints.org added

OmnesRes · Aug 11, 2016 · e207406 · e207406
1 parent 7f0bb50
commit e207406
Show file tree

Hide file tree

Showing 13 changed files with 376 additions and 8 deletions.
diff --git a/author_dict.py b/author_dict.py
@@ -44,7 +44,11 @@
     for author in i[1]:
         pub_authors.append(author)
 
-
+f=open(r'C:\Users\Jordan Anaya\Desktop\prepub\preprints\preprints.txt')  # hard-coded absolute path to the scraper's output; NOTE(review): the handle is not closed in the visible lines
+preprints=[eval(i.strip()) for i in f]  # one record per line, rebuilt with eval; NOTE(review): eval executes whatever the file contains, so the file must be trusted
+for i in preprints:  # same accumulation pattern as the loop that ends just above this hunk
+    for author in i[1]:  # element 1 of a record is iterated as its author list
+        pub_authors.append(author)  # pub_authors is defined outside the visible hunk
 
 
 def middle_function(name):

diff --git a/papers/name_first.py b/papers/name_first.py
diff --git a/papers/name_last.py b/papers/name_last.py
diff --git a/papers/unique_first.py b/papers/unique_first.py
diff --git a/papers/unique_last.py b/papers/unique_last.py
diff --git a/preprints/error_log/for_github.txt b/preprints/error_log/for_github.txt
@@ -0,0 +1 @@
+github doesn't like empty folders
diff --git a/preprints/preprints.py b/preprints/preprints.py
@@ -0,0 +1,83 @@
+import time
+import requests
+import unicodedata
+import re
+from bs4 import BeautifulSoup
+titles=[]  # these seven lists are filled together, one entry each per accepted preprint, and length-checked before writing
+authors=[]  # list of author-name lists
+dates=[]  # date strings as rebuilt by the listing scrape
+abstracts=[]  # abstract text taken from the listing page
+links=[]  # hrefs as found on the listing page
+tags=[]  # one single-element tag list per preprint
+author_aff=[]  # list of affiliation-text lists taken from the article page
+
+##get all articles
+
+categories=['biology','medicine_pharmacology','life_sciences']  # subject names appended to the browse URL
+
+base='http://preprints.org/subject/browse/'  # listing URL prefix; category and paging query string are appended per request
+
+for cat in categories:  # scrape one subject category at a time
+    index=1  # page number sent as page_num
+    print cat
+    templinks=[]  # per-category listing data, accumulated across all pages of this category
+    tempdates=[]
+    tempabstracts=[]
+    tempauthors=[]
+    temptags=[]
+    while True:  # keep paging until a page yields a dated-entry count other than 100
+        print index
+        r=requests.get(base+cat+'?page_num='+str(index)+'&page_length=100')  # asks for 100 entries per page
+        print r.ok  # only printed; NOTE(review): a failed request is not otherwise handled
+        count=0  # number of dated entries found on this page
+        soup=BeautifulSoup(r.content,'html.parser')
+        for i in soup.find_all("a", {'class':'title'}):  # anchors with class "title" supply the hrefs
+            templinks.append(i.get('href'))
+        templinks=templinks[:-1]  # drops the last collected link after every page; presumably the final "title" anchor is not an article - TODO confirm
+        for i in soup.find_all('div',{'class','search-content-box'}):  # NOTE(review): attrs here is a set literal, not a {'class': ...} dict; bs4 appears to treat non-dict attrs as a class filter - confirm
+            tempdate=i.find('div',{'class','show-for-large-up'}).text.strip().split('Online: ')[1].split(' (')[0].strip().split()  # text after 'Online: ' and before ' (', split on whitespace
+            tempdates.append(tempdate[1]+' '+tempdate[0]+' '+tempdate[2])  # reorder fields as second, first, third; preprints_populate.py reads the result as month, day, year
+            count+=1
+        for i in soup.find_all('div',{'class','abstract-content'}):
+            tempabstracts.append(i.text)  # abstract text is stored unstripped
+        for i in soup.find_all('div',{'class','search-content-box-author'}):
+            tempauthors.append([unicodedata.normalize('NFKD',j.text).encode('ascii','ignore') for j in i.find_all('a')])  # NFKD-normalize each author link's text and drop any non-ASCII characters
+        for i in soup.find_all('div',{'class','search-content-box'}):
+            temptags.append([i.find_all('div')[4].text.strip().split(', ')[1].split(';')[0].strip()])  # single tag: second ', '-separated piece of the fifth nested div's text, cut at the first ';'
+        if count==100:  # a full page, so request the next one
+            index+=1
+        else:
+            break
+
+    for i,j,k,l,m in zip(templinks,tempdates,tempabstracts,tempauthors,temptags):  # NOTE(review): zip stops at the shortest list and the five lists are not checked for equal length here
+        if 'manuscript' in i:  # only hrefs containing 'manuscript' are followed
+            print i
+            r=requests.get('http://preprints.org'+i)  # fetch the individual article page
+            soup=BeautifulSoup(r.content,'html.parser')
+            if soup.find('span',{'class','type-span'}).text in ['Review','Article']:  # keep only these two article types
+                dates.append(j)
+                titles.append(soup.find('h1').text.strip())  # title comes from the first h1 of the article page
+                abstracts.append(k)
+                links.append(i)
+                authors.append(l)
+                temp_aff=[]  # affiliation texts for this article
+                for aff in soup.find('div',{'class','manuscript-affiliations'}).find_all('li'):
+                    temp_aff.append(aff.text)
+                author_aff.append(temp_aff)
+                tags.append(m)
+
+
+            else:  # every other article type is skipped
+                pass
+
+
+
+if len(titles)==len(authors)==len(dates)==len(abstracts)==len(links)==len(tags)==len(author_aff):  # write only if all seven result lists have the same length
+    f=open('preprints.txt','w')  # relative path, so the file lands in the current working directory
+    for title,author,date,abstract,link,tag,author_af in zip(titles,authors,dates,abstracts,links,tags,author_aff):
+        f.write(str([title,author,date,abstract,link,tag,author_af]))  # one str()-formatted list per record; other scripts in this commit read it back with eval
+        f.write('\n')
+    f.close()
+else:
+    print 'error'  # nothing is written when the lengths disagree
+
diff --git a/preprints/preprints.txt b/preprints/preprints.txt
diff --git a/preprints/update_log/for_github.txt b/preprints/update_log/for_github.txt
@@ -0,0 +1 @@
+github doesn't like empty folders
diff --git a/preprints_populate.py b/preprints_populate.py
@@ -0,0 +1,81 @@
+import os
+os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'mysite.settings')
+
+import django
+django.setup()
+
+from papers.models import Article
+from papers.models import Tag
+from papers.models import Affiliation
+from papers.models import Author
+from datetime import date as dt
+
+
+
+f=open(r'C:\Users\Jordan Anaya\Desktop\prepub\preprints\preprints.txt')  # hard-coded absolute path to the scraper's output; NOTE(review): the handle is not closed in this script
+preprints=[eval(i.strip()) for i in f]  # one record per line, rebuilt with eval; NOTE(review): eval executes whatever the file contains, so the file must be trusted
+
+##work on dates
+date_dict={"January":1,"February":2,"March":3,"April":4,"May":5,"June":6,\
+           "July":7,"August":8,"September":9,"October":10,"November":11,"December":12,  # full month names map to month numbers
+           "Jan":1,"Feb":2,"Mar":3,"Apr":4,"May":5,"Jun":6,\
+           "Jul":7,"Aug":8,"Sep":9,"Oct":10,"Nov":11,"Dec":12}  # three-letter abbreviations; "May" repeats the earlier key with the same value
+
+
+
+for i in preprints:  # record layout as written by preprints.py: [title, authors, date, abstract, link, tags, affiliations]
+    paper=Article(title=i[0],abstract=i[3],link='https://www.preprints.org'+i[4])  # the stored link is prefixed with the site address
+    temp=i[2].split()  # date string split into the fields read below as month, day, year
+    paper.pub_date=dt(int(temp[2]),date_dict[temp[0]],int(temp[1]))  # date(year, month, day); an unknown month name raises KeyError
+    paper.save()  # saved before the related objects are added; presumably the relation adds need a stored row - confirm against the models
+    temp=[]  # reused: from here on collects the cleaned author names
+    for author in i[1]:
+        name=author.replace(',','').replace('.','')  # remove all commas and periods
+        if name[:3].lower()=='jr ':  # strip a leading or trailing jr / sr, case-insensitively
+            name=name[3:]
+        if name[-3:].lower()==' jr':
+            name=name[:-3]
+        if name[:3].lower()=='sr ':
+            name=name[3:]
+        if name[-3:].lower()==' sr':
+            name=name[:-3]
+        first_name=name.split()[0]  # first whitespace-separated token; an empty name raises IndexError here
+        last_name=name.split()[-1]  # last whitespace-separated token
+        if len(name.split())==2:
+            middle_name=''
+        else:
+            middle_name=name.replace(first_name+' ','').replace(' '+last_name,'').strip()  # NOTE(review): replace removes every occurrence, and a one-word name ends up with middle_name equal to that word
+        if middle_name!='':
+            temp.append(first_name+' '+middle_name+' '+last_name)
+        else:
+            temp.append(first_name+' '+last_name)
+        try:  # reuse a matching Author row if the lookup succeeds
+            auth=Author.objects.get(first=first_name,middle=middle_name,last=last_name)
+            paper.authors.add(auth)
+        except:  # NOTE(review): bare except - any failure in the try body, not only a missing row, leads to a create
+            auth=Author.objects.create(first=first_name,middle=middle_name,last=last_name)
+            paper.authors.add(auth)
+    paper.author_list=str(temp)  # str() of the cleaned-name list
+    for affiliation in i[-1]:  # last element of the record: affiliation texts
+        try:
+            aff=Affiliation.objects.get(name=affiliation)
+            paper.affiliations.add(aff)
+        except:  # NOTE(review): bare except, same pattern as for authors
+            aff=Affiliation.objects.create(name=affiliation)
+            paper.affiliations.add(aff)
+    for t in i[-2]:  # second-to-last element of the record: tag list
+        try:
+            tag=Tag.objects.get(name=t)
+            paper.tags.add(tag)
+        except:  # NOTE(review): bare except, same pattern as for authors
+            tag=Tag.objects.create(name=t)
+            paper.tags.add(tag)
+    paper.save()  # second save, after author_list was assigned above
+
+
+
+
+
+
+
+
diff --git a/templates/help.html b/templates/help.html
@@ -122,6 +122,8 @@ <h1>PrePubMed Help</h1>
             <p>PrePubMed currently indexes tens of thousands of articles from the preprint servers arXiv q-bio, Figshare, PeerJ Preprints, and bioRxiv, along with articles at F1000Research that are undergoing post-publication peer review and free form articles from The Winnower.  New articles are indexed daily. The goal of PrePubMed is to provide a means to search for articles that are not indexed by PubMed, which could be due to the article being "unpublished", or due to the delay between "publication" and indexing by PubMed.  With that said, PrePubMed does not remove articles once they are indexed by PubMed in order to give preprinted articles as much exposure as possible.  A video showing how to search with PrePubMed is available on <a href="https://www.youtube.com/watch?v=a_HoqWqUJ7Y">youtube</a>.</p>
             <h2>A note on Figshare</h2>
             <p>Most articles in Figshare are simply supplementary information to published articles, or are articles that have already undergone peer-review. In addition, many articles on Figshare are not related to science and there seems to be absolutely zero quality control for what gets posted. Because of this I have numerous filters that articles from Figshare must pass in order to get indexed.  Articles from the other preprint servers do not undergo any filtering since those servers screen articles before allowing them to be posted.  On top of this, the Figshare API only lets me access the thousand most recently modified articles.  As a result, if you have an article on Figshare it is unlikely to be in PrePubMed.  If you have an article which might not be suitable for a traditional preprint server such as arXiv, PeerJ Preprints, or bioRxiv, and still want your article indexed by PrePubMed I would recommend submitting it to The Winnower instead of Figshare.</p>
+            <h2>A note on preprints.org</h2>
+            <p>This is a new preprint server that aims to host preprints from a wide range of disciplines.  Currently I have decided to index the Biology, Life Sciences, and Medicine &amp; Pharmacology subjects.  This server also hosts a variety of different types of articles: Article, Review, Conference Paper, Data Descriptor, Brief Report, Case Report, Communication, Short Note, Technical Note, Hypothesis.  It is unclear to me what the quality of some of these types of articles will be, so I am currently only indexing the preprints labeled "Article" or "Review".</p>
             <h2>A note on The Winnower</h2>
             <p>After much thought I have decided to index all articles on The Winnower with the "paper" designation.  Articles on The Winnower are not preprints and are closer to blog posts.  Although PrePubMed is meant for articles that will eventually be indexed by PubMed, I support nontraditional forms of communicating work.  I normally only index biology related articles, for example only the q-bio section of arXiv and only certain categories of Figshare, but to support The Winnower and its mission I have indexed all of their categories, including Reddit AMAs.</p>
             <p>How will this affect your searches? It likely won't. The Winnower does not have abstracts for its articles, so your search terms will only be searching against titles from The Winnower.  As a result, it is unlikely you will be seeing blog posts show up in your RSS feeds instead of preprints.  And if you do happen to get a blog post from The Winnower, because your search matched the title it might be something you want to check out.
@@ -149,7 +151,7 @@ <h3>How to search for authors</h3>
             <h3>How to search by journal</h3>
             <p>You can't.  PrePubMed is journal agnostic and I believe that where your article is published should not impact viewership. However, there does appear to be differences in the quality of the preprint servers with regards to indexing information such as author names or ensuring that an article isn't duplicated.  I want the information in PrePubMed to approach the accuracy of PubMed and will be contacting the preprint servers to work towards this goal.  If one preprint server is clearly the best I will consider endorsing its use.</p>     
             <h3>How to search by subject area</h3>
-            <p>You kind of can.  When you perform a search I provide the list of the subject areas associated with each article, and you can click on them to perform a search for that exact subject area.  The problem is that there is not a consistent subject area system among the six journals that PrePubMed indexes.  As a result, clicking on a tag for Bioinformatics may not return all articles related to bioinformatics.  Because of this, I do not provide the ability to perform a custom search with subject areas.</p>
+            <p>You kind of can.  When you perform a search I provide the list of the subject areas associated with each article, and you can click on them to perform a search for that exact subject area.  The problem is that there is not a consistent subject area system among the seven journals that PrePubMed indexes.  As a result, clicking on a tag for Bioinformatics may not return all articles related to bioinformatics.  Because of this, I do not provide the ability to perform a custom search with subject areas.</p>
             <h3>How to search by affiliation</h3>
             <p>You can search for affiliation with the advanced search option.  Note that Figshare does not list affiliations for authors so a search for an affiliation will not return any Figshare preprints.  See <a href="#advanced">using advanced search</a> for more details.</p>
             <h3>What do I do about duplicated or questionable articles?</h3>

diff --git a/templates/home.html b/templates/home.html
@@ -134,7 +134,7 @@
             </div>
             <div class="col-sm-9" style="background-color:#141927; color:white;padding:0; height:180px">
                 <div id='grad' style="margin:0;font-size:20px;padding-left:10px"><b>PrePubMed</b></div>
-                <p style="font-size:20px;padding:10px">PrePubMed indexes preprints from arXiv q-bio, PeerJ Preprints, Figshare, bioRxiv, F1000Research, and The Winnower. Articles are not stored on PrePubMed, but you will be linked to the article at the respective site.</p>
+                <p style="font-size:20px;padding:10px">PrePubMed indexes preprints from arXiv q-bio, PeerJ Preprints, Figshare, bioRxiv, F1000Research, preprints.org, and The Winnower. Articles are not stored on PrePubMed, but you will be linked to the article at the respective site.</p>
             </div>
             <div class="col-sm-12" style="font-size:16px; text-align:left; margin-top:10px">
                 <div class="col-sm-4">
@@ -170,10 +170,14 @@ <h2>PrePubMed Tools</h2>
                     </p>
                 </div>
             </div>
-            <div class="col-sm-12" style="font-size:16px; text-align:center; margin-top:20px">
+            <div class="col-sm-12" style="font-size:18px; text-align:left; margin-top:20px">
+                <h2>Some Tips!</h2>
                 All terms are combined with AND logic, so don't bother typing "AND" or "OR"
                 <br>
                 <br>
+                Don't try to search for affiliations in the default search box; use <a href="/advanced_search/">Advanced Search</a> instead
+                <br>
+                <br>
                 Site is free to use and <a href="https://github.com/OmnesRes/prepub">open source</a>, send questions to omnesresnetwork at gmail.com
                 <br>
                 <br>