In [1]:
queries = ['A New Livestream Retail Analytics Framework to Assess the Sales Impact of Emotional Displays']
API_KEY = '<670656238a7ae259c899c553e69a1807>'

def get_url(url):
    payload = {'api_key': API_KEY, 'url': url, 'country_code': 'us'}
    proxy_url = 'http://api.scraperapi.com/?' + urlencode(payload)
    return proxy_url

In [2]:
# import argparse
import csv
# import datetime
# import difflib
# import os
# import pprint
import re
import time
# import timeit
# import warnings
from time import sleep

# import matplotlib.pyplot as plt
# import matplotlib.ticker as ticker
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

# from sklearn.linear_model import LinearRegression


def make_url(keyword, conf, author, year, paper_id=None):
    """make url for search papers
    normal search (keyword, conf, author, year) or target search (paper_id)
    :param keyword: str or None
    :param conf: str or None, conference information
    :param author: str or None, author information
    :param year: int or None, published year
    :param paper_id: None or int, paper information
    :return: url
    """
    assert (
        keyword is not None
        or conf is not None
        or author is not None
        or year is not None
        or paper_id is not None
    ), "KeywordNotFoundError"
    url = "https://scholar.google.co.jp/scholar?"
    if paper_id is not None:
        url += f"&cites={paper_id}"
    else:
        url += "&as_sdt=0%2C5"
        if keyword is not None:
            url += f"&as_q={'%20'.join(keyword.split())}"
        else:
            url += "&as_q="
        if conf is not None:
            url += f"&as_publication={'%20'.join(conf.split())}"
        if author is not None:
            author = "+".join(author.split())
            url += f"&as_sauthors={'%20'.join(author.split())}"
        if year is not None:
            url += f"&as_ylo={year}"
    return url


def get_snippet(soup):
    """obtain snippet from soup
    :param soup: parsed html by BeautifulSoup
    :return: snippet_list
    """
    tags = soup.find_all("div", {"class": "gs_rs"})
    snippet_list = [tags[i].text for i in range(len(tags))]
    return snippet_list


def get_title_and_url(soup):
    """obtain title and url from soup
    :param soup: parsed html by BeautifulSoup
    :return: title_list, url_list
    """
    tags1 = soup.find_all("h3", {"class": "gs_rt"})
    title_list = []
    url_list = []
    for tag1 in tags1:
        # タイトル取得
        # PDF, 書籍, B, HTML, 引用, Cのタグを除去
        title = re.sub(r"\[(PDF|書籍|B|HTML|引用|C)\]", "", tag1.text)
        # 空白区切りを廃止
        title = "_".join(title.split(" "))
        if title[0] == "_":
            title = title[1:]
        title_list.append(title)

        # url取得
        try:
            url = tag1.select("a")[0].get("href")
            url_list.append(url)
        except IndexError:
            url_list.append(None)
    return title_list, url_list


def get_writer_and_year(soup):
    """obtain writer(author) and year from soup
    :param soup: parsed html by BeautifulSoup
    :return: writer_list, year_list
    """
    tags2 = soup.find_all("div", {"class": "gs_a"})
    writer_list = []
    year_list = []
    for tag2 in tags2:
        # 著者取得
        """
        writer = tag2.text
        writer = re.sub(r"\d", "", writer)
        for char in range(0, len(writer)):
            if writer[char] == "-":
                writer = writer[2 : char - 1]
                break
        """
        writer = tag2.text.split("\xa0- ")[0]
        writer_list.append(writer)

        # 論文発行年取得
        year = tag2.text
        year = re.sub(r"\D", "", year)
        # yearが5桁以上だった場合の例外処理
        if len(year) > 4:
            year_list.append(year[len(year) - 4 : len(year)])
        else:
            year_list.append(year)
    return writer_list, year_list


def get_citations(soup):
    """obtain number of citations from soup
    :param soup: parsed html by BeautifulSoup
    :return: ci_num_list
    """
    tags3 = soup.find_all(text=re.compile("被引用"))
    ci_num_list = []
    for tag3 in tags3:
        # 被引用数取得
        citation = tag3.replace("被引用", "")[1:3]
        ci_num_list.append(int(citation))
    return ci_num_list


def get_id(soup):
    """obtain paper id from soup
    :param soup: parsed html by BeautifulSoup
    :return: ci_num_list
    """
    tags4 = soup.find_all("div", {"class": "gs_fl"})
    p_id_list = []
    for tag4 in tags4:
        # 論文ID取得
        try:
            elem = tag4.find_all("a")[2]["href"]
            a = 15
            while True:
                if elem[a] == "&":
                    break
                a += 1
            p_id_list.append(elem[15:a])
        except:
            print("")
    return p_id_list

def year_list_to_cite_years(year_list,p_year):
    """convert year_list into cite_years
    :param year_list,p_year:
    :return: cite_years
    """
    year_list_int = []
    for s in year_list:
        try:
            year_list_int.append(int(s))
        except:
            pass
    y = [p_year+i for i in range(2021 - p_year + 1)]
    cite_years = [0 for _ in range(2021 - p_year + 1)]
    for year in year_list_int:
        if year >= p_year and year <= 2021:
            cite_years[year - p_year] += 1
    list_return = [y, cite_years]
#    cite_years = pd.DataFrame(cite_years,
#                       index=y,
#                       columns=['total'])
#    cite_years  = cite_years.T
    return list_return

def grep_candidate_papers(url):
    """scrape first 10 papers and choose one
    :param url:
    :return: target paper information (title, writer, year, citations, url, paper_id, snippet)
    """
    html_doc = requests.get(url).text
    soup = BeautifulSoup(html_doc, "html.parser")

    title_list, url_list = get_title_and_url(soup)
    writer_list, year_list = get_writer_and_year(soup)
    ci_num_list = get_citations(soup)
    print('ci_num_list',ci_num_list)
    p_id_list = get_id(soup)
    snippet_list = get_snippet(soup)

    for i in range(len(title_list)):
        print("-" * 20)
        print(f"paper number: {str(i)}")
        print(f"paper title: {title_list[i]}")
        print(f"published year: {year_list[i]}")

        # print(f"citations: {ci_num_list}")
        print(f"citations: {ci_num_list[i]}")

    target_paper_num = -1
    while target_paper_num < 0 or target_paper_num >= len(title_list):
        target_paper_num = int(input("Select paper number: "))
        if target_paper_num < 0 or target_paper_num >= len(title_list):
            print("Index out of range! Please re-enter")

    target_paper = {
            "title": title_list[target_paper_num],
        "writer": writer_list[target_paper_num],
        "year": year_list[target_paper_num],
        "citations": ci_num_list[target_paper_num],
        "url": url_list[target_paper_num],
        "paper_id": p_id_list[target_paper_num],
        "snippet": snippet_list[target_paper_num],
    }
    return target_paper


def scraping_papers(url):
    """scrape 100 papers
    :param url: target url
    :return: title_list, url_list, writer_list, year_list, ci_num_list, p_id_list, snippet_list
    """
    url_each = url.split("&")
    url_each[0] = url_each[0] + "start={}"
    url_base = "&".join(url_each)

    title_list = []
    url_list = []
    writer_list = []
    year_list = []
    ci_num_list = []
    p_id_list = []
    snippet_list = []

    for page in range(0, 100, 10):
        print("Loading next {} results".format(page + 10))
        url_tmp = url_base.format(page)
        html_doc = requests.get(url_tmp).text
        soup = BeautifulSoup(html_doc, "html.parser")

        title_list_tmp, url_list_tmp = get_title_and_url(soup)
        writer_list_tmp, year_list_tmp = get_writer_and_year(soup)
        ci_num_list_tmp = get_citations(soup)
        p_id_list_tmp = get_id(soup)
        snippet_list_tmp = get_snippet(soup)

        title_list.extend(title_list_tmp)
        url_list.extend(url_list_tmp)
        writer_list.extend(writer_list_tmp)
        year_list.extend(year_list_tmp)
        ci_num_list.extend(ci_num_list_tmp)
        p_id_list.extend(p_id_list_tmp)
        snippet_list.extend(snippet_list_tmp)

        sleep(np.random.randint(5, 10))
    return (
        title_list,
        url_list,
        writer_list,
        year_list,
        ci_num_list,
        p_id_list,
        snippet_list,
    )


def write_csv(
    conf,
    title_list,
    url_list,
    writer_list,
    year_list,
    ci_num_list,
    p_id_list,
    snippet_list,
):
    """write csv
    :param conf, title_list, url_list, writer_list, year_list, ci_num_list, snippet_list:
    :return:
    """
    labels = [
        "conference",
        "title",
        "writer",
        "year",
        "citations",
        "url",
        "paper ID",
        "snippet",
    ]
    path = "data/conf_csv/" + conf + ".csv"
    with open(path, "w") as f:
        csv_writer = csv.writer(f)
        csv_writer.writerow(labels)
        for title, url, writer, year, ci_num, p_id, snippet in zip(
            title_list,
            url_list,
            writer_list,
            year_list,
            ci_num_list,
            p_id_list,
            snippet_list,
        ):
            csv_writer.writerow([conf, title, writer, year, ci_num, url, p_id, snippet])


if __name__ == "__main__":
    #conf = "ICASSP"
    conf = 'arxiv'
    keyword = "A New Livestream Retail Analytics Framework to Assess the Sales Impact of Emotional Displays"
    year = "2018"
    url = make_url(keyword=keyword, conf=None, author=None, year=None)
    print(f"url: {url}")

    # select target paper
    target_paper = grep_candidate_papers(url)
    print(f"target paper: {target_paper}")

    # create paper list about target paper's citation
    url_cite = make_url(
        keyword=None, conf=None, author=None, year=None, paper_id=target_paper["paper_id"]
    )
    (
        title_list,
        url_list,
        writer_list,
        year_list,
        ci_num_list,
        p_id_list,
        snippet_list,
    ) = scraping_papers(url_cite)

    cite_year = year_list_to_cite_years(year_list,int(target_paper['year']))
    print(cite_year)

url: https://scholar.google.co.jp/scholar?&as_sdt=0%2C5&as_q=A%20New%20Livestream%20Retail%20Analytics%20Framework%20to%20Assess%20the%20Sales%20Impact%20of%20Emotional%20Displays
ci_num_list []


# Have a try

In [1]:
import csv
import re
import time
from time import sleep
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
def make_url(keyword, conf, author, year, paper_id=None):
    """make url for search papers
    normal search (keyword, conf, author, year) or target search (paper_id)
    :param keyword: str or None
    :param conf: str or None, conference information
    :param author: str or None, author information
    :param year: int or None, published year
    :param paper_id: None or int, paper information
    :return: url
    """
    assert (
        keyword is not None
        or conf is not None
        or author is not None
        or year is not None
        or paper_id is not None
    ), "KeywordNotFoundError"
    url = "https://scholar.google.co.jp/scholar?"
    if paper_id is not None:
        url += f"&cites={paper_id}"
    else:
        url += "&as_sdt=0%2C5"
        if keyword is not None:
            url += f"&as_q={'%20'.join(keyword.split())}"
        else:
            url += "&as_q="
        if conf is not None:
            url += f"&as_publication={'%20'.join(conf.split())}"
        if author is not None:
            author = "+".join(author.split())
            url += f"&as_sauthors={'%20'.join(author.split())}"
        if year is not None:
            url += f"&as_ylo={year}"
    return url

In [3]:
def get_citations(soup):
    """obtain number of citations from soup
    :param soup: parsed html by BeautifulSoup
    :return: ci_num_list
    """
    tags3 = soup.find_all(text=re.compile("被引用"))
    ci_num_list = []
    for tag3 in tags3:
        # 被引用数取得
        citation = tag3.replace("被引用", "")[1:3]
        ci_num_list.append(int(citation))
    return ci_num_list

In [4]:
df = pd.read_csv('combine.csv')
for i in df['Title']:
    print(i)

A Configuration Theory Assessment of Marketing Organization Fit with Business Strategy and Its Relationship with Marketing Performance
A Configurational Perspective on Key Account Management
A Customer Lifetime Value Framework for Customer
Selection and Resource Allocation Strategy
A Customer Relationship Management Roadmap: What Is
Known, Potential Pitfalls, and Where to Go
A Longitudinal Study of Complaining Customers' Evaluations of Multiple Service Failures and Recovery Efforts
A Marketing Perspective on Mergers and Acquisitions: How
Marketing Integration Affects Postmerger Performance
A Strategic Framework for Customer Relationship Management
Actualizing Innovation Effort: The Impact of Market
Knowledge Diffusion in a Dynamic System of Competition
An Empirical Analysis of the Determinants of Retail
Margins: The Role of Store-Brand Share
An Exploratory Study of the Introduction of Online
Reverse Auctions
An Investigation into the Antecedents of Organizational
Participation in Busin

In [6]:
url_lst = []
for i in df['Title']:
    keyword = i
    url = make_url(keyword=keyword, conf=None, author=None, year=None)
    url_lst.append(url)
    # print(url)
    # html_doc = requests.get(url).text
    # soup = BeautifulSoup(html_doc, "html.parser")
    # print(soup.get_text())

    # citation_number = soup.get_text().split('Cited by ')[1].split()[0]
    # print(citation_number)

    # data = [i,get_citations(soup)]
    # print(data)
    # citation_lst.append(data)
    # sleep(np.random.randint(20, 30))

'https://scholar.google.co.jp/scholar?&as_sdt=0%2C5&as_q=A%20Configuration%20Theory%20Assessment%20of%20Marketing%20Organization%20Fit%20with%20Business%20Strategy%20and%20Its%20Relationship%20with%20Marketing%20Performance'

In [69]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import pandas as pd
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
driver = webdriver.Chrome(ChromeDriverManager().install())


  driver = webdriver.Chrome(ChromeDriverManager().install())


In [None]:
path = '/Users/richardpang/.wdm/drivers/chromedriver/mac64/106.0.5249/chromedriver'
perl -pi -e 's/cdc_/dog_/g' /Users/richardpang/.wdm/drivers/chromedriver/mac64/106.0.5249/chromedriver

In [70]:
import random
from selenium.webdriver.common.by import By
citation_lst = []
for i in range(len(url_lst)):
    driver.get(url_lst[i])
    # txt = driver.find_element_by_class_name("gs_bdy")
    # txt = driver.find_element_by_xpath("//div[@class='gs_bdy']").text
    # salaries = driver.find_elements_by_xpath('//div[@id="gs_bdy"]')
    element = driver.find_elements(By.TAG_NAME, 'div')
    element = driver.find_elements(By.CLASS_NAME, 'gs_fl')
    for e in element:
        if '被引用' in e.text:
            citation_number = e.text.split('被引用 ')[1].split()[0]
            # print(citation_number)
            citation_lst.append(int(citation_number))
            print(i,int(citation_number),df['Title'][i])
    # print(citation_lst)

# print(driver.find_element(By.XPATH, "//span[@class='gs_bdy']").text)
# n = random.randint(0,9)
# userPos = 'psr'+str(n)
# WebDriverWait(browser, maxTimeout).until(EC.element_to_be_clickable((By.ID, str(userPos)))).click()
# xpath="//div[@id='{}']//span[@id='countLabel']/following-sibling::span".format(userPos)
# userName = WebDriverWait(browser, maxTimeout).until(EC.visibility_of_element_located((By.XAPTH,xpath))).text
# print(userName)
# userName = WebDriverWait(browser, maxTimeout).until(EC.visibility_of_element_located((By.XAPTH,xpath))).get_attribute(textContent)
# print(userName)

WebDriverException: Message: unknown error: cannot determine loading status
from unknown error: cannot determine loading status
from target frame detached
  (Session info: chrome=106.0.5249.103)
Stacktrace:
0   chromedriver                        0x0000000103d2e598 chromedriver + 4404632
1   chromedriver                        0x0000000103cb5fa3 chromedriver + 3911587
2   chromedriver                        0x0000000103962d20 chromedriver + 425248
3   chromedriver                        0x000000010394f4cf chromedriver + 345295
4   chromedriver                        0x000000010394e4ca chromedriver + 341194
5   chromedriver                        0x000000010394e9bc chromedriver + 342460
6   chromedriver                        0x000000010395c3aa chromedriver + 398250
7   chromedriver                        0x000000010395001d chromedriver + 348189
8   chromedriver                        0x00000001039511b0 chromedriver + 352688
9   chromedriver                        0x00000001039502d1 chromedriver + 348881
10  chromedriver                        0x000000010394f717 chromedriver + 345879
11  chromedriver                        0x000000010394f572 chromedriver + 345458
12  chromedriver                        0x000000010394e4ca chromedriver + 341194
13  chromedriver                        0x000000010394e9bc chromedriver + 342460
14  chromedriver                        0x000000010395ac52 chromedriver + 392274
15  chromedriver                        0x000000010395b8a2 chromedriver + 395426
16  chromedriver                        0x000000010396aade chromedriver + 457438
17  chromedriver                        0x00000001039d0216 chromedriver + 872982
18  chromedriver                        0x00000001039b8bd3 chromedriver + 777171
19  chromedriver                        0x000000010398dced chromedriver + 601325
20  chromedriver                        0x000000010398ee51 chromedriver + 605777
21  chromedriver                        0x0000000103cfe0ce chromedriver + 4206798
22  chromedriver                        0x0000000103d02797 chromedriver + 4224919
23  chromedriver                        0x0000000103d09a4f chromedriver + 4254287
24  chromedriver                        0x0000000103d034da chromedriver + 4228314
25  chromedriver                        0x0000000103cdc23f chromedriver + 4067903
26  chromedriver                        0x0000000103d206a8 chromedriver + 4347560
27  chromedriver                        0x0000000103d20816 chromedriver + 4347926
28  chromedriver                        0x0000000103d358fe chromedriver + 4434174
29  libsystem_pthread.dylib             0x00007fff206408fc _pthread_start + 224
30  libsystem_pthread.dylib             0x00007fff2063c443 thread_start + 15


In [56]:
citation = pd.DataFrame(citation_lst)

Unnamed: 0.1,Unnamed: 0,Number,Word Count,Title,Abstract,Body
0,0,1,87143,A Configuration Theory Assessment of Marketing...,Theory posits that organizing marketing activi...,Most businesses find it easier to formulate st...
1,1,2,114558,A Configurational Perspective on Key Account M...,Most firms struggle with the challenge of mana...,Many companies today are faced with powerful a...
2,2,3,98109,A Customer Lifetime Value Framework for Custom...,The authors evaluate the usefulness of custome...,Customer lifetime value (CLV) is rapidly gaini...
3,3,4,69190,A Customer Relationship Management Roadmap: Wh...,The goal of this preface is to describe how th...,--Ruth N. BoltonThis article introduces the te...
4,4,5,75450,A Longitudinal Study of Complaining Customers'...,The authors report a repeated measures field s...,Firms can affect customer evaluations when the...
...,...,...,...,...,...,...
364,364,78,86018,Traveling with Companions: The Social Customer...,When customers journey from a need to a purcha...,Many consumer decisions do not occur in isolat...
365,365,79,64222,Virtual Reality in New Product Development: In...,This investigation examines how consumer durab...,Introducing a new consumer durable product to ...
366,366,80,79481,When Algorithms Fail: Consumers' Responses to ...,"Algorithms, increasingly used by brands, somet...",Given the explosive growth in the volume of da...
367,367,81,53687,"When to Use Markets, Lines, and Lotteries: How...","When allocating scarce goods and services, fir...","When allocating scarce goods and services, the..."


In [59]:
df['Title'][0]

'A Configuration Theory Assessment of Marketing Organization Fit with Business Strategy and Its Relationship with Marketing Performance'