## [Academic Occurrence](https://github.com/Pold87/academic-keyword-occurrence)

In [1]:
# By: Volker Strobel, improved by Patrick Hofmann
from bs4 import BeautifulSoup
from urllib.request import Request, build_opener, HTTPCookieProcessor
from urllib.parse import urlencode
from http.cookiejar import MozillaCookieJar
import re, time, sys, urllib

In [2]:
# Characters kept when the search term is embedded in the CSV file name;
# every other character (quotes, slashes, punctuation, ...) is dropped.
_FILENAME_SAFE_CHARS = frozenset(
    "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 "
)

# Up to three digit groups, each optionally followed by one arbitrary
# character (e.g. a thousands separator), terminated by whitespace.
_RESULT_COUNT_RE = re.compile(r'(\d+).?(\d+)?.?(\d+)?\s')

_USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.109 Safari/537.36'


def _strip_unsafe_filename_chars(text):
    """Return *text* with every character outside the whitelist removed."""
    return "".join(ch for ch in text if ch in _FILENAME_SAFE_CHARS)


def _get_num_results(search_term, start_date, end_date):
    """
    Query Google Scholar and return the reported number of results.

    Sends one HTTP request for ``search_term`` restricted to the years
    ``start_date``..``end_date`` and parses the ``gs_ab_md`` div of the
    response (the 'About x results (y sec)' line).

    Returns the count as a string of digits ('0' when the div contains no
    number), or None when the div is missing from the page. Network errors
    raised by urllib propagate to the caller.
    """
    query_params = {'q': search_term, 'as_ylo': start_date, 'as_yhi': end_date}
    url = "https://scholar.google.com/scholar?as_vis=1&hl=en&as_sdt=1,5&" + urlencode(query_params)
    request = Request(url=url, headers={'User-Agent': _USER_AGENT})

    # Close the HTTP response deterministically once the payload is read.
    with build_opener().open(request) as response:
        html = response.read()

    soup = BeautifulSoup(html, 'html.parser')
    div_results = soup.find("div", {"id": "gs_ab_md"})
    if div_results is None:
        return None

    matches = _RESULT_COUNT_RE.findall(div_results.text)
    if not matches:
        return '0'
    # Join the captured digit groups, e.g. ('1', '230', '') -> '1230'.
    return ''.join(matches[0])


def get_academic_occurence(search_term, start_date, end_date):
    """
    Write the yearly Google Scholar result counts for a search term to a CSV.

    For every year from ``start_date`` to ``end_date`` (inclusive) one request
    is sent to Google Scholar; each 'year,results' row is printed and written
    to ``occurence_for_<term>_<start_date>_<end_date>.csv`` in the current
    directory, where <term> is ``search_term`` reduced to ASCII letters,
    digits and spaces.

    Quoted search terms are supported, e.g.:
        get_academic_occurence('"Global Outlook Digital Humanities"', 2010, 2012)

    If a response lacks the result-count element, a rate-limit hint is printed
    and the loop stops; rows written so far are kept.
    """
    file_name = 'occurence_for_{}_{}_{}.csv'.format(
        _strip_unsafe_filename_chars(search_term), start_date, end_date)

    # The context manager closes the file even if a request raises.
    with open(file_name, 'w') as fp:
        fp.write("year,results\n")
        print("year,results")

        for date in range(start_date, end_date + 1):
            num_results = _get_num_results(search_term, date, date)
            if num_results is None:
                print("It seems that you made to many requests to Google Scholar. Please wait a couple of hours and try again.")
                break
            year_results = "{0},{1}".format(date, num_results)
            print(year_results)
            fp.write(year_results + '\n')
            # Pause between requests to avoid hammering the server.
            time.sleep(0.8)

In [4]:
# Example run: yearly result counts for the quoted phrase, 2010 through 2013.
get_academic_occurence(
    search_term='"Global Outlook Digital Humanities"',
    start_date=2010,
    end_date=2013,
)

year,results
2010,0
2011,1
2012,1
2013,6
