In [1]:
#import relevant libraries
#requests to send HTTP GET requests
#csv to write the scraped data to a csv-file
#re to extract the maximum number of pages with a regular expression
#BeautifulSoup to parse the HTML and scrape data
import requests
import csv
import re
from bs4 import BeautifulSoup

#fetch the first result list once to find out how many iterations the first for-loop needs (the max amount of project-lists with 50 projects each)
#future work: kinda redundant to scrape the same URL twice (initial_URL and listURL). There should be a better solution
initial_URL = "https://gepris.dfg.de/gepris/OCTOPUS?beginOfFunding=&bewilligungsStatus=&bundesland=DEU%23&context=projekt&einrichtungsart=-1&fachgebiet=%23&findButton=historyCall&gefoerdertIn=&ggsHunderter=0&hitsPerPage=50&index=0&nurProjekteMitAB=false&oldGgsHunderter=0&oldfachgebiet=%23&pemu=%23&task=doKatalog&teilprojekte=true&zk_transferprojekt=false"
#timeout in seconds: without it a stalled connection would block the notebook forever
website = requests.get(initial_URL, timeout=30)
#stop with a clear HTTP error instead of trying to parse an error page
website.raise_for_status()
#convert to beautifulsoup-object
results = BeautifulSoup(website.content, 'html.parser')
#get the number of pages that need to be extracted.
#the number that we want is saved as '2.751'. To get rid of the dot and convert it into an integer-value, the digit groups are extracted individually as a list, joined and converted to an integer value
result_info = results.find("span", {"id": "result-info"})
page_count_element = result_info.find("strong") if result_info is not None else None
if page_count_element is None:
    #the page layout is not what this notebook expects; say so explicitly instead of failing on None
    raise ValueError("Could not find the number of pages (span#result-info strong) in the GEPRIS result list")
number_pages_to_extract = int(''.join(re.findall(r'\d+', page_count_element.get_text())))
#the keys that we are interested in to extract. Needs the exact name of the possible keys from a project page (Example of a project page: https://gepris.dfg.de/gepris/projekt/268931)
keys_to_extract = ["Fachliche Zuordnung", "Förderung", "Webseite", "DFG-Verfahren"]
#create empty list to fill it with data (one dictionary per project)
data = []

#seconds to wait for a response; without a timeout a stalled connection would block the loop forever
REQUEST_TIMEOUT_SECONDS = 30


def split_funding_period(foerderung_string):
    """Split the value of "Förderung" into start year ("von") and end year ("bis").

    There are three possible forms of the value:
      1: it contains the word "bis"  -> start year and end year are both given
      2: it contains the word "in"   -> start year and end year are the same
      3: it contains the word "seit" -> only the start year is given, there is no end year

    The years are located with a regular expression and the keywords are matched
    as whole words, so the result does not depend on fixed character positions.

    Returns a dict with the keys "von" and/or "bis". The dict is empty if the
    string contains no four-digit year or none of the three keywords.
    """
    years = re.findall(r'\b\d{4}\b', foerderung_string)
    if not years:
        return {}
    words = foerderung_string.split()
    if "bis" in words:
        return {"von": years[0], "bis": years[-1]}
    if "in" in words:
        return {"von": years[-1], "bis": years[-1]}
    if "seit" in words:
        return {"von": years[-1]}
    return {}


#outer loop: one iteration per result-list page (50 projects per page)
for x in range(number_pages_to_extract):

    listURL = "https://gepris.dfg.de/gepris/OCTOPUS?beginOfFunding=&bewilligungsStatus=&bundesland=DEU%23&context=projekt&einrichtungsart=-1&fachgebiet=%23&findButton=historyCall&gefoerdertIn=&ggsHunderter=0&hitsPerPage=50&index="+str(x*50)+"&nurProjekteMitAB=false&oldGgsHunderter=0&oldfachgebiet=%23&pemu=%23&task=doKatalog&teilprojekte=true&zk_transferprojekt=false"
    #the list page gets its own names so that it is not overwritten by the project pages fetched below
    list_page = requests.get(listURL, timeout=REQUEST_TIMEOUT_SECONDS)
    #stop with a clear HTTP error instead of trying to parse an error page
    list_page.raise_for_status()
    list_soup = BeautifulSoup(list_page.content, 'html.parser')

    #get GEPRIS Project ID to get the URL of a specific project-page of the project-list
    projects = list_soup.find_all("div", class_="results")

    #inner loop: one iteration per project of the current result-list page
    for project in projects:
        URL = "https://gepris.dfg.de" + project.find('a').get('href')
        print(str(x) + " : " + URL)

        website = requests.get(URL, timeout=REQUEST_TIMEOUT_SECONDS)
        website.raise_for_status()
        results = BeautifulSoup(website.content, 'html.parser')

        #list that contains html-code for every value that can be extracted from a project-page
        values = results.find_all('span', class_='value')
        #list that contains all names (keys) that can be extracted from a project-page
        names = results.find_all('span', class_='name')
        #empty dict to store relevant information about a project
        projectdict = {}

        projectdict["GEPRIS Project ID"] = URL.rsplit('/', 1)[-1]
        # .text to get only the information without html-code
        # .join(.split) to collapse all runs of whitespace in the information into single spaces
        projectdict["Projekttitel"] = " ".join(results.find('h1', class_='facelift').text.split())
        projectdict["Wikidata Description"] = "GEPRIS Projekt"
        projectdict["DFG-Webseite"] = URL

        #names and values are paired by their position on the page
        #zip stops at the shorter list, so a surplus value without a name cannot raise an IndexError
        for name_span, value_span in zip(names, values):
            #get the key out of html-construct
            key = " ".join(name_span.text.split())
            #check if the key should be added to dict
            if key not in keys_to_extract:
                continue
            text = " ".join(value_span.text.split())
            if key == "Förderung":
                #"Förderung" is never stored under its own key, only as "von" and "bis"
                projectdict.update(split_funding_period(text))
            elif text == 'Zur Homepage':
                #If the string is "Zur Homepage" (get to website), don't extract the string, but the href-attribute
                projectdict[key] = value_span.find('a').get('href')
            else:
                projectdict[key] = text
        data.append(projectdict)

0 : https://gepris.dfg.de/gepris/projekt/268853
0 : https://gepris.dfg.de/gepris/projekt/268879
0 : https://gepris.dfg.de/gepris/projekt/268931
0 : https://gepris.dfg.de/gepris/projekt/269007
0 : https://gepris.dfg.de/gepris/projekt/269105
0 : https://gepris.dfg.de/gepris/projekt/269145
0 : https://gepris.dfg.de/gepris/projekt/269237
0 : https://gepris.dfg.de/gepris/projekt/269289
0 : https://gepris.dfg.de/gepris/projekt/269303
0 : https://gepris.dfg.de/gepris/projekt/269347
0 : https://gepris.dfg.de/gepris/projekt/269379
0 : https://gepris.dfg.de/gepris/projekt/269443
0 : https://gepris.dfg.de/gepris/projekt/269589
0 : https://gepris.dfg.de/gepris/projekt/269673
0 : https://gepris.dfg.de/gepris/projekt/269869
0 : https://gepris.dfg.de/gepris/projekt/269889
0 : https://gepris.dfg.de/gepris/projekt/269933
0 : https://gepris.dfg.de/gepris/projekt/270059
0 : https://gepris.dfg.de/gepris/projekt/270081
0 : https://gepris.dfg.de/gepris/projekt/270091
0 : https://gepris.dfg.de/gepris/projekt

KeyboardInterrupt: 

In [32]:
#save the scraped projects as gepris_projects_wikidata.csv
#the with-statement closes the file even if writing a row fails
#newline='' is what the csv module expects for files it writes to; the line ending is then exactly the lineterminator below on every platform
with open('gepris_projects_wikidata.csv', 'w', encoding='utf-8', newline='') as myFile:
    #fieldnames defines the column order of the csv-file
    writer = csv.DictWriter(
        myFile,
        lineterminator='\n',
        fieldnames=['GEPRIS Project ID', 'Projekttitel', 'Wikidata Description', 'DFG-Webseite', 'Fachliche Zuordnung', 'von', 'bis', 'Webseite', 'DFG-Verfahren'],
    )
    writer.writeheader()
    writer.writerows(data)