# Scrape Google Scholar Entries 

In [1]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import quote_plus
import time
import random
import json
import pandas as pd
import plotly

### 1. Constants

In [4]:
DEBUG = False
#RAW_SEARCH_STRING = '(transformer OR llm) AND (extreme OR low bit OR 8-bit OR 4-bit OR 2-bit OR 1-bit) AND "quantization function"'
RAW_SEARCH_STRING = 'transformer AND ("2-bit" OR "2bit" OR "2 bit" OR "1-bit" OR "1bit" OR "1 bit") AND ("post-training" OR "post training") AND "quantization" AND "function" -"image compression"'
SEARCH_STRING = quote_plus(RAW_SEARCH_STRING)
FROM_YEAR = "as_ylo=2022"
SEARCH_STRING


'transformer+AND+%28%222-bit%22+OR+%222bit%22+OR+%222+bit%22+OR+%221-bit%22+OR+%221bit%22+OR+%221+bit%22%29+AND+%28%22post-training%22+OR+%22post+training%22%29+AND+%22quantization%22+AND+%22function%22+-%22image+compression%22'

In [61]:
url = f"https://www.google.com/scholar?q={SEARCH_STRING}&hl=de&{FROM_YEAR}"
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.361681261652"
}
response = requests.get(url, headers=headers, verify=False)
soup = BeautifulSoup(response.text, 'html.parser')
soup







<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">

<html dir="LTR">
<head><meta content="text/html; charset=utf-8" http-equiv="content-type"/><meta content="initial-scale=1" name="viewport"/><title>https://www.google.com/scholar?q=transformer+AND+%28%222-bit%22+OR+%222bit%22+OR+%222+bit%22+OR+%221-bit%22+OR+%221bit%22+OR+%221+bit%22%29+AND+%28%22post-training%22+OR+%22post+training%22%29+AND+%22quantization%22+AND+%22function%22+-%22image+compression%22&amp;hl=de&amp;as_ylo=2022</title></head>
<body onload="e=document.getElementById('captcha');if(e){e.focus();} if(solveSimpleChallenge) {solveSimpleChallenge(,);}" style="font-family: arial, sans-serif; background-color: #fff; color: #000; padding:20px; font-size:18px; overscroll-behavior:contain;">
<div style="max-width:400px;">
<hr noshade="" size="1" style="color:#ccc; background-color:#ccc;"/><br/>
<form action="index" id="captcha-form" method="post">
<noscript>
<div style="font-size:13px;">
Um fortzufahren, musst du Ja

### 2. Function definition for downloading and iterating through scholar pages

In [16]:
def downloadScholarDataPage(start_from: int = 0):
    try:
        url = f"https://www.google.com/scholar?start={start_from}&q={SEARCH_STRING}&hl=de&{FROM_YEAR}"
        headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.361681261652"
        }
        response = requests.get(url, headers=headers, verify=False)
        soup = BeautifulSoup(response.text, 'html.parser')
        
        print("Download from URL:", url)
        try:
            total_hits = soup.select(".gs_ab_mdw")[-1].text.split(" ")[-4] #example how this string looks: Seite 17 von ungefähr 1.692 Ergebnissen (0,05 Sek.)
            
        except Exception as e: 
            print("Couldn't retrieve number of results. Due to error:", e)
            total_hits = None

    except Exception as e:
        print("Couldn't retrieve website due to error:", e)
    return total_hits, soup


In [7]:
def getScholarData(soup: BeautifulSoup):
    scholar_results = []
    items_skipped = 0

    for el in soup.select(".gs_scl"):
        try:
            if DEBUG:
                print("title:", el.select(".gs_rt")[0].text)
                print("title_link:", el.select(".gs_rt a")[0]["href"])
                print("id:", el.select(".gs_rt a")[0]["id"])
                print("cited_by_count:", el.select(".gs_nph+ a")[0].text.split("Zitiert von: ")[-1])

            cite_count = el.select(".gs_nph+ a")[0].text
            if "ähnliche artikel" in cite_count.lower():
                cite_count = 0
            elif "zitiert von" in cite_count.lower():
                cite_count = int(cite_count.split("Zitiert von: ")[-1])

            scholar_results.append({
                "title": el.select(".gs_rt")[0].text,
                "title_link": el.select(".gs_rt a")[0]["href"],
                "id": el.select(".gs_rt a")[0]["id"],
                #"displayed_link": el.select(".gs_a")[0].text,
                #"snippet": el.select(".gs_rs")[0].text.replace("\n", ""),
                "cited_by_count": cite_count,
                #"cited_link": "https://scholar.google.com" + el.select(".gs_nph+ a")[0]["href"],
                #"versions_count": el.select("a~ a+ .gs_nph")[0].text,
                #"versions_link": "https://scholar.google.com" + el.select("a~ a+ .gs_nph")[0]["href"] if el.select("a~ a+ .gs_nph")[0].text else "",
            })
        except Exception as e:
            items_skipped += 1
            print("Couldn't append item. Due to error: ", e)
    for i in range(len(scholar_results)):
        scholar_results[i] = {key: value for key, value in scholar_results[i].items() if value != "" and value is not None}

    print(scholar_results)
    return items_skipped, scholar_results


In [12]:
def iterateScholarPages(start: int = 0):
    start_from = start
    total_items_skipped = 0
    total_results = []
    try:
        max_item, soup = downloadScholarDataPage(start_from)
        print("Max items:", max_item)
        items_skipped, scholar_results = getScholarData(soup)
        if scholar_results is []:
            print("Too many requests were sent and google sends no proper results")
            print(f"Arived at '0' results out of approximately '0' max results")
            return total_items_skipped, total_results

        total_items_skipped += items_skipped
        total_results.extend(scholar_results)
        
        # remove all "." from the number and turn it into an integer
        max_item = int("".join(max_item.split("."))) 
        while start_from < max_item:
            print(f"Currently at '{start_from+1}' out of '{max_item}' results")
            random_sleep = random.random()*1.5
            time.sleep(random_sleep)
            
        
            start_from += 10
            _, soup = downloadScholarDataPage(start_from)

            items_skipped, scholar_results = getScholarData(soup)
            if scholar_results is []:
                print("Too many requests were sent and google sends no proper results")
                print(f"Arived at '{start_from-9}' results out of approximately '{max_item}' max results")
                return total_items_skipped, total_results
                
            total_items_skipped += items_skipped
            total_results.extend(scholar_results)

        return total_items_skipped, total_results
    
    except Exception as e:
        if e == KeyboardInterrupt:
            return total_items_skipped, total_results
        else:
            print("Error occured:", e)
            return total_items_skipped, total_results

### 3. Download Data

In [28]:
res = iterateScholarPages(631)
skipped, total_results = res



Download from URL: https://www.google.com/scholar?start=631&q=transformer+AND+%28%222-bit%22+OR+%222bit%22+OR+%222+bit%22+OR+%221-bit%22+OR+%221bit%22+OR+%221+bit%22%29+AND+%28%22post-training%22+OR+%22post+training%22%29+AND+%22quantization%22+AND+%22function%22+-%22image+compression%22&hl=de&as_ylo=2022
Max items: 688
Couldn't append item. Due to error:  list index out of range
[{'title': 'Training with Mixed-Precision Floating-Point Assignments', 'title_link': 'https://arxiv.org/abs/2301.13464', 'id': 'uPAAE9UaYvcJ', 'cited_by_count': 2}, {'title': 'Point cloud based semantic segmentation for catenary systems using deep learning: Compressibility of a PointNet++ network', 'title_link': 'http://essay.utwente.nl/92901/', 'id': 'nQx0Vr6qTlQJ', 'cited_by_count': 1}, {'title': 'Towards Efficient Edge Intelligence with In-Sensor and Neuromorphic Computing: Algorithm-Hardware Co-Design', 'title_link': 'https://search.proquest.com/openview/e79564c23f94637851dc65eb792decec/1?pq-origsite=gscho



Download from URL: https://www.google.com/scholar?start=641&q=transformer+AND+%28%222-bit%22+OR+%222bit%22+OR+%222+bit%22+OR+%221-bit%22+OR+%221bit%22+OR+%221+bit%22%29+AND+%28%22post-training%22+OR+%22post+training%22%29+AND+%22quantization%22+AND+%22function%22+-%22image+compression%22&hl=de&as_ylo=2022
Couldn't append item. Due to error:  list index out of range
Couldn't append item. Due to error:  list index out of range
[{'title': 'CFSP: An Efficient Structured Pruning Framework for LLMs with Coarse-to-Fine Activation Information', 'title_link': 'https://arxiv.org/abs/2409.13199', 'id': 'uaHy3r0GVwEJ', 'cited_by_count': 0}, {'title': '[PDF][PDF] Implementation of Machine Learning Algorithms on Ultra-Low-Power Hardware for In-Sensor Inference', 'title_link': 'https://tesidottorato.depositolegale.it/bitstream/20.500.14242/156910/1/conv_tesi-final.pdf', 'id': 'dMmPaT08ClQJ', 'cited_by_count': 0}, {'title': 'MoE-Pruner: Pruning Mixture-of-Experts Large Language Model using the Hints f



Download from URL: https://www.google.com/scholar?start=651&q=transformer+AND+%28%222-bit%22+OR+%222bit%22+OR+%222+bit%22+OR+%221-bit%22+OR+%221bit%22+OR+%221+bit%22%29+AND+%28%22post-training%22+OR+%22post+training%22%29+AND+%22quantization%22+AND+%22function%22+-%22image+compression%22&hl=de&as_ylo=2022
Couldn't append item. Due to error:  list index out of range
[{'title': 'Update compression for deep neural networks on the edge', 'title_link': 'https://openaccess.thecvf.com/content/CVPR2022W/MobileAI/html/Chen_Update_Compression_for_Deep_Neural_Networks_on_the_Edge_CVPRW_2022_paper.html', 'id': 'hEaGT1012BYJ', 'cited_by_count': 16}, {'title': '[PDF][PDF] Evaluation of the DL accelerator designs', 'title_link': 'https://vedliot.eu/wp-content/uploads/2024/05/VEDLIoT_Deliverable_D3.3_v1.1_submitted.pdf', 'id': 'kqqREk2VTqYJ', 'cited_by_count': 0}, {'title': 'HadSkip: Homotopic and Adaptive Layer Skipping of Pre-trained Language Models for Efficient Inference', 'title_link': 'https://a



Download from URL: https://www.google.com/scholar?start=661&q=transformer+AND+%28%222-bit%22+OR+%222bit%22+OR+%222+bit%22+OR+%221-bit%22+OR+%221bit%22+OR+%221+bit%22%29+AND+%28%22post-training%22+OR+%22post+training%22%29+AND+%22quantization%22+AND+%22function%22+-%22image+compression%22&hl=de&as_ylo=2022
Couldn't append item. Due to error:  list index out of range
[{'title': 'Boosted dynamic neural networks', 'title_link': 'https://ojs.aaai.org/index.php/AAAI/article/view/26302', 'id': 'mOUt69Abtu0J', 'cited_by_count': 13}, {'title': 'The Evolution of Mixture of Experts: A Survey from Basics to Breakthroughs', 'title_link': 'https://www.preprints.org/manuscript/202408.0583', 'id': 'JHBZ0S25LvsJ', 'cited_by_count': 0}, {'title': 'Investigations into Ultra-Low-Power Underwater Imaging', 'title_link': 'https://dspace.mit.edu/handle/1721.1/152645', 'id': 'AYwujT4-pNgJ', 'cited_by_count': 0}, {'title': 'On-Chip DNN Training for Direct Feedback Alignment in FeFET', 'title_link': 'https://li



Download from URL: https://www.google.com/scholar?start=671&q=transformer+AND+%28%222-bit%22+OR+%222bit%22+OR+%222+bit%22+OR+%221-bit%22+OR+%221bit%22+OR+%221+bit%22%29+AND+%28%22post-training%22+OR+%22post+training%22%29+AND+%22quantization%22+AND+%22function%22+-%22image+compression%22&hl=de&as_ylo=2022
Couldn't append item. Due to error:  list index out of range
[{'title': 'High-efficiency Compressor Trees for Latest AMD FPGAs', 'title_link': 'https://dl.acm.org/doi/abs/10.1145/3645097', 'id': 'zOtKKcFor1IJ', 'cited_by_count': 0}, {'title': ': Confidence Calibration Model Cascade for Inference-Efficient Cross-Lingual Natural Language Understanding', 'title_link': 'https://arxiv.org/abs/2402.15991', 'id': 'O3rXMDBh-x0J', 'cited_by_count': 0}, {'title': 'Power-Efficient Machine Learning-Based Hardware Architectures for Biomedical Applications', 'title_link': 'https://search.proquest.com/openview/8f3221666e8cc75be537c6243324852e/1?pq-origsite=gscholar&cbl=18750&diss=y', 'id': 'LQ0qKJbj



Download from URL: https://www.google.com/scholar?start=681&q=transformer+AND+%28%222-bit%22+OR+%222bit%22+OR+%222+bit%22+OR+%221-bit%22+OR+%221bit%22+OR+%221+bit%22%29+AND+%28%22post-training%22+OR+%22post+training%22%29+AND+%22quantization%22+AND+%22function%22+-%22image+compression%22&hl=de&as_ylo=2022
Couldn't append item. Due to error:  list index out of range
[{'title': 'Environment-aware knowledge distillation for improved resource-constrained edge speech recognition.', 'title_link': 'https://espace.inrs.ca/id/eprint/15682/', 'id': '3Vds-2mjXl0J', 'cited_by_count': 0}, {'title': 'Agile and Efficient Inference of Quantized Neural Networks', 'title_link': 'https://www.research-collection.ethz.ch/handle/20.500.11850/675547', 'id': 'rPWn_VOIrp8J', 'cited_by_count': 0}, {'title': '[PDF][PDF] Αριστοτέλειο Πανεπιστή ιο Θεσσαλονίκης', 'title_link': 'https://ikee.lib.auth.gr/record/354668/files/manuscript_18042024.pdf', 'id': 'I2G4NMsdqvQJ', 'cited_by_count': 0}, {'title': '[PDF][PDF] 高效



Download from URL: https://www.google.com/scholar?start=691&q=transformer+AND+%28%222-bit%22+OR+%222bit%22+OR+%222+bit%22+OR+%221-bit%22+OR+%221bit%22+OR+%221+bit%22%29+AND+%28%22post-training%22+OR+%22post+training%22%29+AND+%22quantization%22+AND+%22function%22+-%22image+compression%22&hl=de&as_ylo=2022
Couldn't append item. Due to error:  list index out of range
[]


In [29]:
print(f"""###Statistics###
Google Results: {skipped+ len(total_results)}
No #citations: {skipped}
Papers with citations: {len(total_results)}""")

###Statistics###
Google Results: 64
No #citations: 8
Papers with citations: 56


### 4. Persist or load Data

In [5]:
# load data from disk
def load_from_disk(file_name: str):
    #FILE = "%28transformer+OR+llm%29+AND+%28extreme+OR+low+bit+OR+8-bit+OR+4-bit+OR+2-bit+OR+1-bit%29+AND+%22quantization+function%22__1729496820.4837933"
    with open(file_name, "r") as f:
        skipped, total_results = json.load(f)
    return skipped, total_results

In [6]:
# save data to disk
def save_to_disk(total_results: dict, skipped: int, append_previous_results: str = None, name: str = RAW_SEARCH_STRING):
    total_results_combined = total_results
    skipped_combined = skipped

    if append_previous_results is not None:
        skipped_tmp, total_results_tmp = load_from_disk(append_previous_results)
        skipped_combined += skipped_tmp
        # append the newer entries to the end
        total_results_tmp.extend(total_results_combined)
        total_results_combined = total_results_tmp

    with open(f"{name}__{len(total_results_combined)}of680__{time.time()}", "w") as f:
        json.dump([skipped_combined, total_results_combined], f)

In [30]:
save_to_disk(total_results, skipped, append_previous_results='transformer AND ("2-bit" OR "2bit" OR "2 bit" OR "1-bit" OR "1bit" OR "1 bit") AND ("post-training" OR "post training") AND "quantization" AND "function" -"image compression"__647of680__1729862542.3874462')

In [7]:
skipped, total_results = load_from_disk('transformer AND ("2-bit" OR "2bit" OR "2 bit" OR "1-bit" OR "1bit" OR "1 bit") AND ("post-training" OR "post training") AND "quantization" AND "function" -"image compression"__703of680__1729863140.3934956')

### 5. Postprocess results

In [8]:
def remove_duplicate_entries(results: list):
    assert results[0].get("title") is not None, "The provided data must be a list of dictionaries that must contain the key 'title'"
    seen = []
    unique = []
    for item in results:
        if item["title"] in seen:
            continue
        elif item["title"] is None:
            print(f"Warning: item '{item}' does not contain a title")
        else:
            seen.append(item["title"])
            unique.append(item)
    print(f"Reduced original results from length: {len(results)} to unique items: {len(unique)}")
    return unique

In [9]:
def filter_results(results: dict, min_citations: int = 50, title_must_contain: str = None):
    assert "cited_by_count" in results[0].keys(), "The file cannot be filtered, because not all entries contain a value for the key 'cited_by_count'"
    output_results = []
    for elem in results:
        if elem["cited_by_count"] >= min_citations:
            if title_must_contain is None:
                output_results.append(elem)
            elif title_must_contain.lower() in elem["title"].lower():
                output_results.append(elem)
    return output_results

In [10]:
deduplicated_results = remove_duplicate_entries(total_results)

Reduced original results from length: 703 to unique items: 633


In [11]:
min_citations = 20
filtered_results = filter_results(deduplicated_results, min_citations)
len(filtered_results)

64

In [52]:
filtered_results

[{'title': 'Zeroquant: Efficient and affordable post-training quantization for large-scale transformers',
  'title_link': 'https://proceedings.neurips.cc/paper_files/paper/2022/hash/adf7fa39d65e2983d724ff7da57f00ac-Abstract-Conference.html',
  'id': 'M1H30TzgocoJ',
  'cited_by_count': 289},
 {'title': 'I-vit: Integer-only quantization for efficient vision transformer inference',
  'title_link': 'http://openaccess.thecvf.com/content/ICCV2023/html/Li_I-ViT_Integer-only_Quantization_for_Efficient_Vision_Transformer_Inference_ICCV_2023_paper.html',
  'id': 'wA-a8sZKDhMJ',
  'cited_by_count': 73},
 {'title': 'Q-vit: Accurate and fully quantized low-bit vision transformer',
  'title_link': 'https://proceedings.neurips.cc/paper_files/paper/2022/hash/deb921bff461a7b0a5c344a4871e7101-Abstract-Conference.html',
  'id': 'pYjd1jUZ5TYJ',
  'cited_by_count': 73},
 {'title': 'Mr. biq: Post-training non-uniform quantization based on minimizing the reconstruction error',
  'title_link': 'http://openacc

### 6. Write to Excel

In [54]:
# save pandas dataframe to create excel
filtered_df = pd.DataFrame(filtered_results)

In [55]:
filtered_df.to_excel(f"{RAW_SEARCH_STRING}__min_citations_{min_citations}__{time.time()}.xlsx")

### 7. Visualize results

In [18]:
# visualize results
import plotly.express as px
from plotly import graph_objects as go
#title = '''(transformer OR llm) AND 
#(extreme OR low bit OR 8-bit OR 4-bit OR 2-bit OR 1-bit) 
#AND "quantization function"'''
fig = go.Figure(
    go.Funnel(
        x=[757, #total
           688, #After 2022
           len(filtered_results), # > 20 citations
           50, #No quantization function =14
           46, # No performance tests =4
           43, #Scalability not proven >7B =3 
           35, #No open-source code =8
           15 #Not applicable =20
           ], 
        y=["Total Search Results", "After 2022", ">20 Citations", "No Quantization Function", "No Performance Tests", "No Scalability Test >7B Parameters", "No Open-Source Code", "Not Applicable for Ternary or PTQ"], 
        textinfo = "value+percent initial",
        marker={"color": "black"}
    )
)
fig.update_layout({"plot_bgcolor": "white"})
fig.show()

In [92]:
new_excel = pd.read_excel("transformer-and-2-bit-etc/min_citations_20__1729864347.3632069.xlsx.xlsx", header=0)
old_excel = pd.read_excel('(transformer OR llm) AND (extreme OR low bit OR 8-bit OR 4-bit OR 2-bit OR 1-bit) AND "quantization function"__1729502211.5505743.xlsx', sheet_name="Sheet1", header=0)

In [94]:
# reset index so it aligns with the respective row count
new_excel = new_excel.reset_index()
old_excel = old_excel.reset_index()

In [125]:
new_excel.columns

Index(['index', 'Unnamed: 0', 'title', 'title_link', 'id', 'cited_by_count',
       'Exclusion', 'Reason', 'Notes', 'Quantization Function'],
      dtype='object')

In [126]:
old_excel.columns

Index(['index', 'Unnamed: 0', 'title', 'title_link', 'id', 'cited_by_count',
       'Exclusion', 'Reason', 'Notes', 'Quantization Function'],
      dtype='object')

In [75]:
new_excel.loc[0] = new_excel.loc[1]

In [128]:
new_excel.loc[2][1:]

Unnamed: 0                                                               0
title                    Q-vit: Accurate and fully quantized low-bit vi...
title_link               https://proceedings.neurips.cc/paper_files/pap...
id                                                            pYjd1jUZ5TYJ
cited_by_count                                                          73
Exclusion                                                               No
Reason                                                                 NaN
Notes                                                                  NaN
Quantization Function                                                  NaN
Name: 2, dtype: object

In [123]:
a = old_excel[old_excel["title"] == 'Q-vit: Accurate and fully quantized low-bit vision transformer']
len(a)

1

In [130]:
for idx, new_row in new_excel.iterrows():
    # search if the title exists in the old excel and insert the row of the old excel
    new_title = new_row["title"]
    old_row = old_excel[old_excel["title"] == new_title]
    
    if len(old_row) == 1: #if there was a match
        print(old_row["title"])
        # try:
        #     new_excel.loc[idx] = old_row.loc[0][1:]
        # except Exception as e:
        #     print(f"Couldnt append row {old_row} because of error: {e}")

0    Q-vit: Accurate and fully quantized low-bit vi...
Name: title, dtype: object
17    Towards efficient post-training quantization o...
Name: title, dtype: object
8    Zeroquant-v2: Exploring post-training quantiza...
Name: title, dtype: object
5    Ant: Exploiting adaptive numerical data type f...
Name: title, dtype: object
14    Loftq: Lora-fine-tuning-aware quantization for...
Name: title, dtype: object
30    A survey of techniques for optimizing transfor...
Name: title, dtype: object
7    The era of 1-bit llms: All large language mode...
Name: title, dtype: object
23    Revisiting the parameter efficiency of adapter...
Name: title, dtype: object
12    Accurate lora-finetuning quantization of llms ...
Name: title, dtype: object
34    Bibert: Accurate fully binarized bert
Name: title, dtype: object
20    With shared microexponents, a little shifting ...
Name: title, dtype: object
54    Fast: Dnn training under variable precision bl...
Name: title, dtype: object
27    A survey of qu