In [175]:
#----- LIBRARIES -----
import os
import time
from urllib.parse import urlsplit, parse_qsl

import pandas as pd
import requests
from serpapi import GoogleSearch

#----- STATIC -----
# The SerpApi key is a secret: it is read from the SERP_API_KEY environment
# variable so that it never ends up in the notebook file or its history.
# A key that was previously committed here should be revoked and replaced
# (https://serpapi.com/manage-api-key).
SERP_API_KEY = os.environ.get("SERP_API_KEY", "")
if not SERP_API_KEY:
    print("WARNING: SERP_API_KEY is not set - export it before running any SerpApi call.")

# Google Scholar query shared by every search issued from this notebook.
SEARCH_QUERY = '("ability bias" OR "intelligence bias") AND ("private returns") AND ("income" OR "earnings") AND ("schooling" OR "education")'

In [185]:
def conduct_search(start_from = 0):
  '''Build the Google Scholar search for SEARCH_QUERY.

  start_from -- number of leading results to skip (sent as the "start" parameter).

  Returns the GoogleSearch object created from the request parameters; this
  function itself does not read any results from it.
  '''
  print("extracting organic results..")

  # Optional year filters ("as_ylo" / "as_yhi") are intentionally not sent.
  request_settings = dict(
    api_key=SERP_API_KEY,     # https://serpapi.com/manage-api-key
    engine="google_scholar",
    q=SEARCH_QUERY,           # search query
    hl="en",                  # language
    start=start_from,
  )
  return GoogleSearch(request_settings)

def _int_or_none(value):
  '''Return value converted to int, or None when it is missing or not numeric.
  '''
  try:
    return int(value)
  except (TypeError, ValueError):
    return None


def organic_results(search_results:list):
  '''Flatten Google Scholar search pages into one list of per-study records.

  search_results -- list of result-page dicts. Each page may carry
    "search_information" -> "page_number" (pages without it are treated as
    page 1) and "organic_results" (a list of result dicts). Pages without
    organic results are skipped.

  Returns a list of dicts, one per organic result, with the keys page_number,
  position, result_type, title, link, result_id, publication_info_summary,
  snippet, cited_by_count, cited_by_link, cited_by_id, total_versions,
  all_versions_link, all_versions_id, file_format, file_title and file_link.
  Optional fields that are absent from a result are stored as None.

  Raises KeyError when a result lacks "position", "title", "result_id" or
  "publication_info" -> "summary".
  '''
  print("Extracting all organic data...")
  organic_results_data = []

  # The pages are walked back to front, i.e. in the opposite order of the input list.
  for results in reversed(search_results):
    page_nr = (results.get("search_information") or {}).get("page_number", 1)

    for result in results.get("organic_results") or []:
      inline_links = result.get("inline_links") or {}
      cited_by = inline_links.get("cited_by") or {}
      versions = inline_links.get("versions") or {}

      # Only the first attached resource (if any) is recorded.
      resources = result.get("resources") or []
      first_resource = resources[0] if resources and isinstance(resources[0], dict) else {}

      organic_results_data.append({
        "page_number": page_nr,
        "position": result["position"] + 1,  # stored shifted by one relative to the raw value
        "result_type": result.get("type"),
        "title": result["title"],
        "link": result.get("link"),
        "result_id": result["result_id"],
        "publication_info_summary": result["publication_info"]["summary"],
        "snippet": result.get("snippet"),
        "cited_by_count": _int_or_none(cited_by.get("total")),
        "cited_by_link": cited_by.get("link"),
        "cited_by_id": cited_by.get("cites_id"),
        "total_versions": _int_or_none(versions.get("total")),
        "all_versions_link": versions.get("link"),
        "all_versions_id": versions.get("cluster_id"),
        "file_format": first_resource.get("file_format"),
        "file_title": first_resource.get("title"),
        "file_link": first_resource.get("link"),
      })

  print("Extracted all organic data successfully.")
  return organic_results_data


def cite_results(citation_df):
  '''Fetch a citation snippet for every study in citation_df.

  citation_df -- pandas data frame with the columns "title", "link" and
    "result_id". Double quotes inside result_id (added for Excel) are stripped
    before the id is sent to the "google_scholar_cite" engine.

  Returns a list of dicts with the keys organic_result_title,
  organic_result_link, citation_title and citation_snippet. The snippet is
  taken from the third entry of the returned "citations" list (the Chicago
  style); it is "FAILED_TO_RETRIEVE" when the result id is not a string
  (e.g. an empty Excel cell read as NaN) or when the response has no such entry.
  '''
  print("extracting cite results..")

  failed = "FAILED_TO_RETRIEVE"
  citation_results = []

  for _, row in citation_df.iterrows():
    cite_title = row["title"]
    cite_link = row["link"]
    raw_id = row["result_id"]
    cite_snippet = failed

    # Rows without a usable id are reported as failed without calling the API.
    if isinstance(raw_id, str):
      cite_id = raw_id.replace("\"", "")  # Remove extra double quotes, which are necessary in excel

      params = {
        "api_key": SERP_API_KEY,
        "engine": "google_scholar_cite",
        "q": cite_id
      }
      results = GoogleSearch(params).get_dict()

      try:
        cite_snippet = results["citations"][2]["snippet"]  # Chicago citation
      except (KeyError, IndexError, TypeError):
        # Missing "citations", fewer than three styles, or a malformed entry.
        cite_snippet = failed

    citation_results.append({
      "organic_result_title": cite_title,
      "organic_result_link": cite_link,
      "citation_title": cite_title,
      "citation_snippet": cite_snippet
    })

  return citation_results

def frame_to_csv(df, name):
  '''Store data as data/<name>.csv (UTF-8, without the index column).

  df   -- anything pandas.DataFrame accepts, e.g. a list of record dicts.
  name -- file name without the ".csv" extension.

  The target directory is created when it does not exist yet. Returns None.
  '''
  target = f"data/{name}.csv"
  os.makedirs(os.path.dirname(target), exist_ok=True)
  pd.DataFrame(df).to_csv(target, encoding="utf-8", index=False)
  return None

In [20]:
# One-off step: ids of SerpApi searches whose JSON results are downloaded by the loop below
# (each id is requested from https://serpapi.com/searches/<id>).
search_ids = ['63d3f4d3f716eea2a73edb87', '63d3f4d01988e57397ea5f0c', '63d3f4cfd737d7f56356c6d3', '63d3f4ce7d171af45fc78ea4', '63d3f4cb3a218a3adefc640c', '63d3f4ca2c0844023a1870fc', '63d3f4c95fc493b2fc548b73', '63d3f4c89c84acc407de231a', '63d3f4c7c90e0a26f8d20d50', '63d3f4c649ecdb1c6b10a763', '63d3f4c4c47d3c991ca2ebda', '63d3f4c33a218a3b8819b3fd', '63d3f4c2303eb1d9579c8c93', '63d3f4c17f8361fec4fab5b5', '63d3f4be1988e5730214cf9c', '63d3f4bb11c88158ac7ce94d', '63d3f4bab7b1cc85416bed9e', '63d3f4b9ada54bb50bc2b725', '63d3f4b896f5d7dc33f3ef6d', '63d3f4b70b50621d23abb75e', '63d3f4b6d3aba21ff96af4b3', '63d3f4b4477c0efcad0ae3fe', '63d3f4b3e135084c19136ed5', '63d3f4b1d04d6ddb54964a56', '63d3f4b0f55d777a7a284e24', '63d3f4aff26ac6eb06ea1a41', '63d3f4ad979054b2fd2f092d', '63d3f4a7c47d3c988534c007', '63d3f4a6c56d933e01a0bbe8', '63d3f4a5c573d5ffd1461ea6', '63d3f4a458762b06b3bb1c69', '63d3f4a38ccee0dd1268a2a9', '63d3f4a294fb03609e53d68f', '63d3f4a1629a01b487a44b2c', '63d3f49f1988e57397ea5f0b', '63d3f49d7690dc9a4ac5710c', '63d3f49c8ccee0dda7b5a521', '63d3f49be815af0fe064b81d', '63d3f499ce87f85947b38fdb', '63d3f49896f5d7dc33f3ef65', '63d3f49249ecdb1bd018ee85', '63d3f490eb690f555c5845f6', '63d3f48f5119a69ae076d50e', '63d3f48a49ecdb1bd018ee84', '63d3f485ca968f407d20ffe5', '63d3f48379627c57bc8a522e', '63d3f481116a55631dc54f85', '63d3f480a1b42355d071be77', '63d3f47f9b647223b5f8e64a', '63d3f47eedb0343cedbe19b3', '63d3f47ddefa130f9325627b', '63d3f47cd04d6ddbee2af67f', '63d3f47bc504e907ea7c99ac', '63d3f47ad5ecf4552236a189', '63d3f478797ac6adbb1bf800', '63d3f477c7ad419a373b4c39', '63d3f4754d443d7812267e19', '63d3f47418ca86a7ab2cc077']
# Download the stored JSON of every search listed in search_ids.
all_data = []

for search_id in search_ids:  # not named `id`, which would shadow the builtin
    response = requests.get(
        f"https://serpapi.com/searches/{search_id}",
        params={"api_key": SERP_API_KEY},  # requests builds and encodes the query string
        timeout=30,                        # do not hang forever on a stalled connection
    )
    response.raise_for_status()            # stop on HTTP errors instead of storing an error page
    data = response.json()
    all_data.append(data)
    time.sleep(1)                          # pause between consecutive requests


In [55]:
# Get the organic data of all the search results
# Flatten the downloaded pages into per-study records and store them as data/org_data_1.csv
org_data = organic_results(search_results=all_data)
frame_to_csv(df=org_data, name="org_data_1")

Extracting all organic data...
Extracted all organic data successfully.


  values = values.astype(str)


##### Read the Excel file with the clean, raw organic *(lol)* data, extract the result IDs, and call **SerpApi** to get their citations

In [180]:
# Read the Excel sheet and keep only the 3 columns needed for the citation lookup.
# Studies are processed in batches of 100 rows; `round` selects the batch (1-based).
# NOTE: `round` shadows the builtin of the same name; the name is kept because the
# export cell at the end of the notebook reads it.
round = 3
skip_rows = (round - 1) * 100
org_data_red = pd.read_excel('data/Query_data_clean.xlsx', sheet_name='raw_data')
cit_df = org_data_red[['title', 'link', 'result_id']].iloc[skip_rows:skip_rows + 100]

In [177]:
# Query Google Scholar (through SerpApi) for the citations of the selected batch.
# Every row with a usable id triggers one search request.
org_data = cite_results(citation_df=cit_df)

extracting cite results..
https://serpapi.com/search
https://serpapi.com/search
https://serpapi.com/search
https://serpapi.com/search
https://serpapi.com/search
https://serpapi.com/search
https://serpapi.com/search
https://serpapi.com/search
https://serpapi.com/search
https://serpapi.com/search
https://serpapi.com/search
https://serpapi.com/search
https://serpapi.com/search
https://serpapi.com/search
https://serpapi.com/search
https://serpapi.com/search
https://serpapi.com/search
https://serpapi.com/search
https://serpapi.com/search
https://serpapi.com/search
https://serpapi.com/search
https://serpapi.com/search
https://serpapi.com/search
https://serpapi.com/search
https://serpapi.com/search
https://serpapi.com/search
https://serpapi.com/search
https://serpapi.com/search
https://serpapi.com/search
https://serpapi.com/search
https://serpapi.com/search
https://serpapi.com/search
https://serpapi.com/search
https://serpapi.com/search
https://serpapi.com/search
https://serpapi.com/search
ht

AttributeError: 'float' object has no attribute 'replace'

In [None]:
# Store the citations of the current round both as CSV and as a plain-text table.
citations_frame = pd.DataFrame(org_data)
citations_frame.to_csv(f"data/citations{round}.csv", index=False)
citations_frame.to_string(f"data/citations{round}.txt", index=False)