In [1]:
import requests
from bs4 import BeautifulSoup
import csv
import time
import json
import re
import pandas as pd

### Collect Conventry Papers

In [2]:
# function to scrape coventry publications page

def scrapPapers(start_page = 1, page_limit = 200):

    page = start_page
    url = f"https://pureportal.coventry.ac.uk/en/publications/?format=&page={page}"

    papers = []

    while page < page_limit:

        try:
            pageSource = requests.get(url).text
            soup = BeautifulSoup(pageSource, "html.parser")
            paperLists = soup.select(".list-result-item")

            if len(paperLists) == 0:
                break

            for paper in paperLists:
                paperInfo = {}
                paperInfo['link'] = paper.select_one('h3.title a')['href']
                paperInfo['title'] = paper.select_one('h3.title a').text

                journal = paper.select_one('a', attrs = {'rel' : 'Journal'})
                paperInfo['journal'] = journal.text
                paperInfo['journalLink'] = journal['href']
                cols = ['date', 'volume', 'pages', 'numberofpages', 'type_classification']

                for x in cols:
                    try:
                        paperInfo[x] = paper.select_one(f'span.{x}').text

                        if x == 'numberofpages':
                            paperInfo[x] = int(paperInfo[x][:-2])
                        elif x == 'pages':
                            paperInfo[x] = paperInfo[x][3:]
                        elif x == 'volume':
                            paperInfo[x] = int(paperInfo[x])

                    except:
                        pass
                
                papers.append(paperInfo)

            print(f"Finished {page} ")
            
            page += 1
            url = f"https://pureportal.coventry.ac.uk/en/publications/?format=&page={page}"

        except: 
            break

    return papers

def getAuthorsAndOtherDocumentInformation(paperInfo):
    
    source = requests.get(paperInfo['link']).text
    paperSoup = BeautifulSoup(source, "html.parser")
    
    if paperSoup.select_one("div.doi a") is not None:
        paperInfo['doi'] = paperSoup.select_one("div.doi a")['href']
    
    persons = paperSoup.select_one("p.relations.persons")
    
    if persons is not None:
        paperInfo['authors'] = list(map(
                lambda x : x.strip(), 
                persons.text.split(','))
            )
    
    paperInfo['tags'] = [span.text for span in 
            paperSoup.select("li.userdefined-keyword")]

    paperInfo['coventryAuthors'] = [a['href'] for a in 
            persons.select('a', attrs = { 'rel' : 'Person'})]
    
    abstract = paperSoup.select_one(".rendering_researchoutput_abstractportal")

    paperInfo['abstract'] = None

    if abstract:
        paperInfo['abstract'] = abstract.text
    
    # return paperInfo

In [3]:
papers = scrapPapers(start_page = 1)
len(papers)

Finished 1 
Finished 2 
Finished 3 
Finished 4 
Finished 5 
Finished 6 
Finished 7 
Finished 8 
Finished 9 
Finished 10 
Finished 11 
Finished 12 
Finished 13 
Finished 14 
Finished 15 
Finished 16 
Finished 17 
Finished 18 
Finished 19 
Finished 20 
Finished 21 
Finished 22 
Finished 23 
Finished 24 
Finished 25 
Finished 26 
Finished 27 
Finished 28 
Finished 29 
Finished 30 
Finished 31 
Finished 32 
Finished 33 
Finished 34 
Finished 35 
Finished 36 
Finished 37 
Finished 38 
Finished 39 
Finished 40 
Finished 41 
Finished 42 
Finished 43 
Finished 44 
Finished 45 
Finished 46 
Finished 47 
Finished 48 
Finished 49 
Finished 50 
Finished 51 
Finished 52 
Finished 53 
Finished 54 
Finished 55 
Finished 56 
Finished 57 
Finished 58 
Finished 59 
Finished 60 
Finished 61 
Finished 62 
Finished 63 
Finished 64 
Finished 65 
Finished 66 
Finished 67 
Finished 68 
Finished 69 
Finished 70 
Finished 71 
Finished 72 
Finished 73 
Finished 74 
Finished 75 
Finished 76 
Finished 77 
Finished

5057

In [4]:
with open("papers.json", "w") as f:
    f.write(json.dumps(papers))

In [5]:
getAuthorsAndOtherDocumentInformation(papers[0])
papers[0]

{'link': 'https://pureportal.coventry.ac.uk/en/publications/association-of-minimally-processed-and-ultra-processed-food-daily',
 'title': 'Association of minimally processed and ultra-processed food daily consumption with obesity in overweight adults:  a cross-sectional study',
 'journal': 'Association of minimally processed and ultra-processed food daily consumption with obesity in overweight adults:  a cross-sectional study',
 'journalLink': 'https://pureportal.coventry.ac.uk/en/publications/association-of-minimally-processed-and-ultra-processed-food-daily',
 'date': '15 Feb 2023',
 'volume': '(In-Press)',
 'numberofpages': 30,
 'type_classification': 'Article',
 'doi': 'https://doi.org/10.20960/nh.04270',
 'authors': ['Leonardo V Silva',
  'Pedro Pugliesi Abdalla',
  'Lucimere Bohn',
  'Rafael Gavassa de Araújo',
  'Daniel de Freitas Batalhão',
  'Ana Cláudia Rossini Venturini',
  'Anderson Dos Santos Carvalho',
  'Michael Duncan',
  'Jorge Mota',
  'Dalmo Roberto Lopes Machado'],
 

In [6]:
import threading, subprocess, time
 
num_threads = 20
lock = threading.Lock()
last = time.time()

def scrapePapersParallel(papers, start, end):
    '''
        in the range [start, end)
    '''

    last = time.time()

    for i in range(start, end):
        try:
            getAuthorsAndOtherDocumentInformation(papers[i])

            if i % 100 == 0:
                print(f"Finished {i}")
        except:
            print(f"Failed Index {i}")

        try:
            now = time.time()

            # write to file here synchronize against threads
            if threading.current_thread().name == "Thread-0":
                if (now - last) >= 60:
                    lock.acquire()
                    last = now

                    with open("./papers1.json", "w") as f:
                        f.write(json.dumps(papers))
                    
                    print(f"Wrote to file")
                    lock.release()
        except:
            print("Error writing to file")
            
    print(f"Finished thread {threading.current_thread().name}")


In [7]:
blockSizes = len(papers) // num_threads

startInds = [i * blockSizes for i in range(0, num_threads)]
startInds.append(len(papers))
print(startInds)

threads = []

for ind in range(num_threads):
    new = threading.Thread(
        target=scrapePapersParallel,
        args=(papers, startInds[ind], startInds[ind + 1]),
        name=f"Thread-{ind}"
    )

    threads.append(new)
    new.start()

over_threads = iter(threads)
curr_th = next(over_threads)

while True:
    curr_th.join()
    if curr_th.is_alive():
        continue
    try:
        curr_th = next(over_threads)
    except StopIteration:
        break

[0, 252, 504, 756, 1008, 1260, 1512, 1764, 2016, 2268, 2520, 2772, 3024, 3276, 3528, 3780, 4032, 4284, 4536, 4788, 5057]
Finished 0
Finished 4800
Finished 4300
Finished 3800
Wrote to file
Finished 3300
Finished 2800
Finished 2300
Finished 1800
Finished 1300
Finished 300
Finished 800
Wrote to file
Finished 4600
Finished 4100
Finished 3600
Finished 3100
Finished 2600
Wrote to file
Finished 2100
Finished 1600
Finished 1100
Finished 600
Finished 100
Finished 4900
Wrote to file
Finished 3900
Finished 4400
Finished 3400
Finished 2400
Finished 2900
Finished 1900
Finished 1400
Finished 400
Finished 900
Wrote to file
Finished 4700
Finished 4200
Finished 3700
Finished 3200
Wrote to file
Finished 2700
Finished 2200
Finished 1700
Finished 1200
Finished 700
Finished 200
Wrote to file
Finished 4000
Finished 5000
Finished 4500
Finished 3500
Finished 3000
Finished 2500
Wrote to file
Finished 2000
Finished 1500
Finished thread Thread-15
Finished 500
Finished thread Thread-16
Finished thread Thread-5
Fi

In [8]:
with open("papers.json", "w") as f:
    f.write(json.dumps(papers))
  

In [9]:
with open("papers.json", "r") as f:
   papers = json.loads(f.read())

papers[200]

{'link': 'https://pureportal.coventry.ac.uk/en/publications/from-anglophone-problem-to-anglophone-conflict-in-cameroon-assess',
 'title': 'From ‘Anglophone Problem’ to ‘Anglophone Conflict’ in Cameroon: Assessing Prospects for Peace',
 'journal': 'From ‘Anglophone Problem’ to ‘Anglophone Conflict’ in Cameroon: Assessing Prospects for Peace',
 'journalLink': 'https://pureportal.coventry.ac.uk/en/publications/from-anglophone-problem-to-anglophone-conflict-in-cameroon-assess',
 'date': 'Apr 2023',
 'volume': 58,
 'pages': '89-105',
 'numberofpages': 17,
 'type_classification': 'Article',
 'doi': 'https://doi.org/10.1177/00020397231155244',
 'authors': ['Maurice Beseng', 'Gordon Crawford', 'Nancy Annan'],
 'tags': ['Cameroon',
  'Anglophone problem',
  'Anglophone conflict',
  'identity',
  'conflict resolution'],
 'coventryAuthors': ['https://pureportal.coventry.ac.uk/en/persons/gordon-crawford',
  'https://pureportal.coventry.ac.uk/en/persons/nancy-annan-2'],
 'abstract': 'Since 2017, an

In [10]:
papersDf = pd.read_json("./papers.json")
# papersDf

# export as csv
papersDf.to_csv("./papers.csv", index=False)

### Collect Author Profiles

In [11]:
def getProfileURLorNone(url):

    if "no-content" in url:
        return None
    
    pattern = r"^(\/[^?]+)"
    path_match = re.match(pattern, url)
    path = None

    if path_match:
        path = path_match.group(1)

    return path    

def scrapeAuthors(start_page = 1, page_limit = 200):

    page = start_page
    url = f"https://pureportal.coventry.ac.uk/en/persons/?format=&page={page}"

    authors = []

    while page < page_limit:

        try:
            pageSource = requests.get(url).text

            soup = BeautifulSoup(pageSource, "html.parser")
            
            authorList = soup.select("li.grid-result-item div.result-container")

            if len(authorList) == 0:
                break

            for author in authorList:

                try:
                    authorInfo = {}

                    authorInfo['picUrl'] = getProfileURLorNone(
                            author.select_one("img")['src']
                    )

                    if authorInfo['picUrl'] is not None:
                        authorInfo['picUrl'] = 'https://pureportal.coventry.ac.uk/' + authorInfo['picUrl'] 

                    name = author.select_one("a", attrs = { 'rel' : 'Person'})

                    authorInfo['name'] = name.text
                    authorInfo['profileLink'] = name['href']

                    dept = author.select_one(".relations.organisations a", 
                            attrs = { 'rel' : 'Organisation'})

                    authorInfo['department'] = dept.text
                    authorInfo['deptLink'] = dept['href']
                    
                    authors.append(authorInfo)
                except:
                    pass

            print(f"Finished {page} ")
            
            page += 1
            url = f"https://pureportal.coventry.ac.uk/en/persons/?format=&page={page}"
        except: 
            break

    return authors

In [12]:
authors = scrapeAuthors()

print(f"Scraped {len(authors)} authors")

Finished 1 
Finished 2 
Finished 3 
Finished 4 
Finished 5 
Finished 6 
Finished 7 
Finished 8 
Finished 9 
Finished 10 
Finished 11 
Finished 12 
Finished 13 
Finished 14 
Finished 15 
Finished 16 
Finished 17 
Finished 18 
Finished 19 
Finished 20 
Finished 21 
Finished 22 
Finished 23 
Finished 24 
Finished 25 
Finished 26 
Finished 27 
Finished 28 
Finished 29 
Finished 30 
Finished 31 
Finished 32 
Finished 33 
Finished 34 
Finished 35 
Finished 36 
Finished 37 
Finished 38 
Finished 39 
Finished 40 
Finished 41 
Finished 42 
Finished 43 
Finished 44 
Finished 45 
Finished 46 
Scraped 2026 authors


In [13]:
with open("authors.json", "w") as f:
    f.write(json.dumps(authors))

## Insert into database


In [14]:
authorsDf = pd.read_json("./authors.json")
# papersDf

# export as csv
authorsDf.to_csv("./authors.csv", index=False)