In [1]:
import os
import time
from datetime import datetime
from urllib.parse import quote

import pandas as pd
import requests
from requests_html import HTML

In [2]:
# Filesystem layout: exports go into a "data" folder located beside the
# current working directory (i.e. inside its parent directory).
cwd = os.getcwd()
BASE_DIR = os.path.dirname(cwd)
DATA_DIR = os.path.join(BASE_DIR, "data")
os.makedirs(DATA_DIR, exist_ok=True)  # no error when the folder already exists

# Timestamp captured once when this cell runs (day/month/year, 24-hour clock);
# extract_to_py() embeds it in every generated module.
now = datetime.now()
timestamps = now.strftime("%d/%m/%Y %H:%M:%S")

In [3]:
# Default listing for ad-hoc checks: questions tagged "python" on the
# "Unanswered" tab.
tag = "python"
query_filter = "Unanswered"
base_url = "https://stackoverflow.com/questions/tagged/"
url = "".join([base_url, tag, "?tab=", query_filter])

In [4]:
def clean_scraped_data(text, keyname=None):
    """Normalise the raw text of a scraped element according to its field.

    Args:
        text (str): Text content of the scraped element.
        keyname (str | None): Field the text belongs to. ``"votes"`` and
            ``"tags"`` get special handling; every other value (including
            ``"summary"`` and ``None``) leaves the text untouched.

    Returns:
        str: For ``"votes"``, the text without its ``"\\nvotes"`` /
        ``"\\nvote"`` unit label; for ``"tags"``, the newline-separated tags
        joined with ``", "``; otherwise ``text`` unchanged.
    """
    if keyname == "votes":
        # The unit label is singular for a count of one ("1\nvote"), which the
        # plural-only replacement used to leave in place. Strip the plural
        # first so no stray "s" is left behind.
        return text.replace("\nvotes", "").replace("\nvote", "")
    if keyname == "tags":
        return text.replace("\n", ", ")
    return text

In [5]:
def parse_tagged_page(html):
    """Extract one record per question from a tag listing page.

    Args:
        html: Parsed page exposing ``find(selector)``; in this notebook it is
            a ``requests_html.HTML`` instance.

    Returns:
        list[dict]: One dict per ``.s-post-summary`` element, with the keys
        ``question``, ``links``, ``summary``, ``answers``, ``votes`` and
        ``tags`` (in that order). Text fields are passed through
        ``clean_scraped_data``; a field whose element is missing is stored
        as ``None``. ``answers`` is the integer ``0`` when the summary has no
        ``has-answers`` element, otherwise the counter text (a string).

    Raises:
        KeyError, IndexError: If a summary element has no ``id`` attribute
            or the id has fewer than three ``-``-separated parts.
    """
    count_selector = ".s-post-summary--stats-item-number"
    # (output key, CSS selector looked up inside each question summary)
    fields = [
        ("question", ".s-link"),
        ("summary", ".s-post-summary--content-excerpt"),
        ("answers", ".s-post-summary--stats-item.has-answers"),
        # First counter inside the summary; presumably the vote count --
        # TODO confirm against the current page markup.
        ("votes", count_selector),
        ("tags", ".tags"),
    ]

    datas = []
    for q_el in html.find(".s-post-summary"):
        # The numeric question id is the third "-"-separated part of the
        # element id; it is computed once per question, not once per field.
        question_id = q_el.attrs["id"].split("-")[2]
        link = f"https://stackoverflow.com/questions/{question_id}"

        question_data = {}
        for keyname, selector in fields:
            sub_el = q_el.find(selector, first=True)
            if keyname == "answers":
                if sub_el is None:
                    question_data["answers"] = 0
                else:
                    count_el = sub_el.find(count_selector, first=True)
                    # A missing counter is recorded as unknown, not as a crash.
                    question_data["answers"] = None if count_el is None else count_el.text
            else:
                if sub_el is None:
                    # e.g. a question without an excerpt: keep the row.
                    question_data[keyname] = None
                else:
                    question_data[keyname] = clean_scraped_data(sub_el.text, keyname=keyname)
                # Inserted right after the first text field so the column
                # order (question, links, summary, ...) stays as before.
                question_data.setdefault("links", link)

        datas.append(question_data)

    return datas

In [6]:
def extract_data_from_url(url, timeout=10):
    """Download a listing page and parse its questions.

    Args:
        url (str): Address of the page to fetch.
        timeout (float | tuple): Seconds to wait for the server, forwarded to
            ``requests.get``. Without it a stalled connection would block the
            notebook indefinitely.

    Returns:
        list[dict]: Records produced by ``parse_tagged_page``, or an empty
        list when the response status is not in the 2xx range.

    Raises:
        requests.RequestException: Connection failures and timeouts are not
            caught here and propagate to the caller.
    """
    r = requests.get(url, timeout=timeout)
    # The former range(200, 299) check wrongly rejected status 299.
    if not 200 <= r.status_code < 300:
        return []
    return parse_tagged_page(HTML(html=r.text))

In [7]:
def scrape_tag(tag="python", query_filter="Unanswered", max_pages=1, delay=1.2):
    """Scrape the first ``max_pages`` listing pages of a tag.

    Args:
        tag (str): Tag name, e.g. ``"python"`` or ``"c#"``.
        query_filter (str): Value of the ``tab`` query parameter.
        max_pages (int): Number of pages to fetch, starting at page 1.
        delay (float): Seconds to sleep after each request.

    Returns:
        list[dict]: Records from all fetched pages, in page order.
    """
    base_url = "https://stackoverflow.com/questions/tagged/"
    # Percent-encode both values: an unescaped "#" would start a URL fragment
    # and "+" would be read as a space, so "c#" / "c++" fetched the wrong page.
    tag_path = quote(tag, safe="")
    tab = quote(query_filter, safe="")

    datas = []
    for page in range(1, max_pages + 1):
        url = f"{base_url}{tag_path}?tab={tab}&page={page}"
        print(url)
        datas.extend(extract_data_from_url(url))
        # Sleep after every request, the last one included, so back-to-back
        # calls for different tags stay spaced out as well.
        time.sleep(delay)

    return datas

In [8]:
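# Example call, kept commented out. The stray `pagesize=5` argument was removed
# because scrape_tag() has no such parameter and the call would raise TypeError.
# scrape_tag(tag=tag, max_pages=1)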

In [9]:
def extract_to_csv(df, output):
    """Write ``df`` to the file ``output`` as CSV, leaving out the index column."""
    csv_options = {"index": False}
    df.to_csv(output, **csv_options)

In [10]:
def extract_to_py(df, output):
    """Dump ``df`` as a Python source file that defines a ``questions`` list.

    The file contains a single assignment, ``questions = [...]``. The first
    list item is ``{"timestamps": <module-level timestamps value>}`` and the
    remaining items are the rows of ``df`` as dicts.

    Args:
        df (pandas.DataFrame): Rows to export.
        output (str): Path of the file to create or overwrite (UTF-8).
    """
    def _literal_safe(value):
        # NaN is the only float not equal to itself. Its repr ("nan") is not
        # a Python literal, so a generated module containing it would fail
        # with NameError on import; write None instead.
        if isinstance(value, float) and value != value:
            return None
        return value

    arr = [
        {key: _literal_safe(value) for key, value in row.items()}
        for row in df.to_dict("records")
    ]
    arr.insert(0, {"timestamps": timestamps})
    # The context manager closes the handle even if the write fails; plain
    # "w" is enough because the file is never read back here.
    with open(output, "w", encoding="utf-8") as py_file:
        py_file.write(f"questions = {str(arr)}")

In [11]:
def extract_data(tags=("python",), max_pages=1):
    """Scrape each tag and export the results to ``DATA_DIR``.

    For every tag, ``stackoverflow_<tag>.csv`` and ``stackoverflow_<tag>.py``
    are written to ``DATA_DIR``.

    Args:
        tags (Iterable[str] | str): Tags to scrape. A single tag may be given
            as a plain string.
        max_pages (int): Number of listing pages to fetch per tag.

    Returns:
        str: Always ``"Done"``.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(tags, str):
        tags = [tags]

    for tag in tags:
        df = pd.DataFrame(scrape_tag(tag=tag, max_pages=max_pages))
        csv_path = os.path.join(DATA_DIR, f"stackoverflow_{tag}.csv")
        py_path = os.path.join(DATA_DIR, f"stackoverflow_{tag}.py")

        extract_to_csv(df, csv_path)
        extract_to_py(df, py_path)

    return "Done"

In [12]:
# Scrape one listing page per tag; for each tag extract_data() writes
# stackoverflow_<tag>.csv and stackoverflow_<tag>.py into DATA_DIR.
extract_data(tags=["python", "javascript"], max_pages=1)

https://stackoverflow.com/questions/tagged/python?tab=Unanswered&page=1
https://stackoverflow.com/questions/tagged/javascript?tab=Unanswered&page=1


'Done'