In [1]:
from selenium import webdriver
import pandas as pd 
import numpy as np 
import time
import datetime
import subprocess
import re
import os 
import requests

In [2]:
keywords = ["wikipedia"]
journals = ['information systems research', 'mis quarterly', 'journal of management information systems',
 'journal of the association for information systems', 'management science', 'operational research']  # source: "MISQ"
# journals = ['information systems research']
# authors = ["Kuznets"]  # author:Kuznets
fpath = "/Users/Nico/test/test_googlecrawer"

In [3]:
alias = {'information systems research':"ISR", 'mis quarterly':'MISQ', 'journal of management information systems':"JMIS",
         'journal of the association for information systems':"JAIS", 'management science':'MS', 'operational research':"OR"}

In [7]:
# options = webdriver.ChromeOptions()
# # options.add_argument('headless')

# options.binary_location = '/usr/local/bin/chromedriver'
# options.add_argument('headless')
# options.add_argument("--no-sandbox");
# options.add_argument("--disable-dev-shm-usage")

In [4]:
def getbibTeX(article, driver):
    bib = article.find_element_by_css_selector("div[class=gs_fl]").find_element_by_css_selector("a[class=gs_or_cit\ gs_nph]")
    bib.click()
    time.sleep(2)  ## sleep wait for the ajax to load 
    driver.find_element_by_css_selector("div[id=gs_citi]").find_element_by_css_selector("a[class=gs_citi]").click()
    bib_text = driver.find_element_by_tag_name("body").text.replace("\n", "")
    driver.back()
    driver.find_element_by_css_selector("span[class=gs_ico]").click()
    return bib_text

In [130]:
getbibTeX(article[2], driver)

"@article{birdsall1997asset,  title={Asset inequality matters: an assessment of the World Bank's approach to poverty reduction},  author={Birdsall, Nancy and Londo{\\~n}o, Juan Luis},  journal={The American Economic Review},  volume={87},  number={2},  pages={32--37},  year={1997},  publisher={JSTOR}}"

In [148]:
def getInfo(article, driver):
    default = {"title": "NA", "author": "NA", "journal": "NA", "year":"NA"}
    default['title'] = article.find_element_by_class_name("gs_rt").text.lower()
    infobox = article.find_element_by_class_name("gs_a").text
    default['author'], default['journal'], default['year'] = parse(infobox)
    return default

In [149]:
getInfo(article[0], driver)

{'title': 'economic growth and income inequality',
 'author': 's kuznets',
 'journal': 'the american economic review',
 'year': '1955'}

In [131]:
def getPdf(article, driver):
    try:
        tmp = article.find_element_by_css_selector("div[class=gs_or_ggsm")
        pdf_link = tmp.find_element_by_tag_name("a").get_attribute("href")
    except:
        pdf_link = "NA"
    return pdf_link

In [138]:
getPdf(article[1], driver)

'http://www.diva-portal.org/smash/get/diva2:338002/FULLTEXT01.pdf'

In [5]:
def downloadPdf(output, link):
    response = requests.get(link)
    with open(output, 'wb') as f:
        f.write(response.content)

In [6]:
def parse(infobox):
    infobox = infobox.lower().split("-")
    infobox = [c.strip() for c in infobox]
    author = infobox[0].split(",")[0]
    journal = infobox[1].split(",")[0]
    year = infobox[1].split(",")[1].strip()
    return author, journal, year

In [7]:
# for test
search_keyword = '''online communities'''
driver = webdriver.Chrome()
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
driver.get('https://scholar.google.com/')
input_element = driver.find_element_by_name("q")
input_element.clear()
input_element.send_keys(search_keyword)
input_element.submit()
elements = driver.find_elements_by_css_selector("div[class=gs_r\ gs_or\ gs_scl]")

In [33]:
elements[3].find_element_by_class_name("gs_fl").text

'[PDF] researchgate.net\nFull View'

In [4]:
class Article:
    def __init__(self, keywords, target_journal, folder):
        self.keywords = keywords
        self.target_journal = target_journal
        self.output_folder = folder
        self.create_folder()
        self.total_article = {}
    
    def create_folder(self):
        self.output_fpath = "/".join([self.output_folder, self.keywords, self.target_journal])
        if not os.path.exists(self.output_fpath):
            os.makedirs(self.output_fpath)
            print('created folder {0}'.format(self.output_fpath))
    
    def getInfo(self, article, driver):
        default = {"title": "NA", "author": "NA", "journal": "NA", "year":"NA", "log": "NA"}
        default['title'] = article.find_element_by_class_name("gs_rt").text.lower()
        default['title'] = re.sub("[^a-z0-9 ]", "", default['title'])
        infobox = article.find_element_by_class_name("gs_a").text
        default['author'], default['journal'], default['year'] = parse(infobox)
        return default
    
    def getPdf(self, article, driver):
        tmp = article.find_element_by_css_selector("div[class=gs_or_ggsm")
        pdf_link = tmp.find_element_by_tag_name("a").get_attribute("href")
        return pdf_link
    
    def getFileName(self, alias=alias):
        by = ["author", "year", "title", "journal"]
        if alias:
            if self.info['journal'] in alias.keys():
                self.info['journal-short'] = alias[self.info['journal']]
                by = ["author", "year", "title", "journal-short"]
        if len(self.info['title'].split(" ")) > 10:
            self.info['title-short'] = " ".join(self.info['title'].split(" ")[:10])
            if "journal-short" in self.info.keys():
                by = ["author", "year", "title-short", "journal-short"]
            else:
                by = ["author", "year", "title-short", "journal"]
        filename = "-".join([self.info[c] for c in by]) + ".pdf"
        return filename
    
    def fit(self, article, driver, num):
        try:
            self.info = self.getInfo(article, driver)
        except:
            print("article info parse error!")
            self.info = None
        if self.info:
            try:
                self.pdf = self.getPdf(article, driver)
            except:
                self.info['log'] = "pdf missing"
                self.pdf = None
            if self.pdf:
                self.filename = self.getFileName()
                output = self.output_fpath + "/" + self.filename
                try:
                    downloadPdf(output, self.pdf)
                except:
                    self.log = self.info['log'] + "||| pdf download error"
        self.total_article[num] = self.info
#         if hasattr(self, 'log'):
#             now = datetime.datetime.now()
#             logfile_path = self.output_fpath + "/" + "log_{0}.txt".format(now.strftime("%m-%d-%Y"))
#             if not os.path.exists(logfile_path):
#                 subprocess.call("touch {0}".format(logfile_path), shell=True)
#             with open(logfile_path, "a") as f:
#                 f.writelines(self.log)

In [33]:
def run(keywords, journals, recursive = 6):
    driver = webdriver.Chrome()
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    driver.get('https://scholar.google.com/')
    for i in keywords:
        for j in journals:
            cnt = 1
            articles = Article(i, j, fpath)
            search_keyword = " ".join([i.lower(), '''source:"{}"'''.format(j.lower())])
            print("current search key: {0}".format(search_keyword))
            input_element = driver.find_element_by_name("q")
            input_element.clear()
            input_element.send_keys(search_keyword)
            input_element.submit()
            time.sleep(2)
            for n in range(recursive):
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                elements = driver.find_elements_by_css_selector("div[class=gs_r\ gs_or\ gs_scl]")
                for e in elements:
                    try:
                        articles.fit(e, driver, cnt)
                    except:
                        print("page {} number {} parse error!".format(n, cnt))
                    cnt += 1
                try:
                    driver.find_element_by_css_selector("span[class=gs_ico\ gs_ico_nav_next]").click()
                    time.sleep(5)
                except:
                    pass
            log = pd.DataFrame(articles.total_article).T
            now = datetime.datetime.now()
            log.to_csv(articles.output_fpath+"/"+"logfile_{}.txt".format(now.strftime("%m-%d-%Y")), sep="\t")
    driver.quit()

In [93]:
# article = driver.find_element_by_css_selector("div[class^=gs_ri")

In [34]:
journals = ['journal of management information systems',
 'journal of the association for information systems', 'management science', 'operational research']

In [35]:
run(keywords, journals, recursive=6)

current search key: wikipedia source:"journal of management information systems"
created folder /Users/Nico/test/test_googlecrawer/wikipedia/journal of the association for information systems
current search key: wikipedia source:"journal of the association for information systems"
article info parse error!
article info parse error!
created folder /Users/Nico/test/test_googlecrawer/wikipedia/management science
current search key: wikipedia source:"management science"
article info parse error!
article info parse error!
created folder /Users/Nico/test/test_googlecrawer/wikipedia/operational research
current search key: wikipedia source:"operational research"
article info parse error!
article info parse error!
article info parse error!
article info parse error!
article info parse error!
