In [None]:
import json
import os
import re
import time

import requests
from bs4 import BeautifulSoup

In [None]:
# Stack Apps API key sent with every request. Prefer supplying it through the
# STACK_APPS_API_KEY environment variable so the real key never has to be
# written into the notebook; the original placeholder is kept as the fallback.
API_KEY = os.environ.get("STACK_APPS_API_KEY") or "STACK_APPS_API"

In [None]:
# Make sure the dataset directory is present, then create the JSONL output
# file (or truncate an existing one) so it starts out empty.
os.makedirs('./stackoverflow_dataset', exist_ok=True)

file_path = './stackoverflow_dataset/finetune_data_summarized.jsonl'
open(file_path, 'w').close()

In [None]:
def _is_javascript_block(*tags):
    """Return True when any of the given tags carries a JavaScript class hint."""
    js_tokens = {
        "javascript", "js",
        "lang-js", "lang-javascript",
        "language-js", "language-javascript",
    }
    for tag in tags:
        classes = tag.get('class') or []
        # Depending on the parser, ``class`` may come back as a single string
        # instead of a list of tokens.
        if isinstance(classes, str):
            classes = classes.split()
        if any(token.lower() in js_tokens for token in classes):
            return True
    return False


def html_to_markdown_with_code(html):
    """Convert an HTML fragment to plain text with Markdown-style code markup.

    Every ``<pre>`` that contains a ``<code>`` child becomes a fenced code
    block, every other ``<code>`` element becomes an inline backtick span, and
    all remaining markup is dropped so that only its text is kept.

    Args:
        html: HTML source to convert.

    Returns:
        The converted text with trailing whitespace removed from each line,
        runs of blank lines collapsed to a single blank line, and no leading
        or trailing whitespace.
    """
    soup = BeautifulSoup(html, 'html.parser')

    for pre in soup.find_all('pre'):
        code = pre.code
        if code:
            # The language hint may sit on either the <pre> or the <code> tag.
            lang = "js" if _is_javascript_block(pre, code) else ""
            block = code.get_text()
            markdown_block = f"\n```{lang}\n{block.strip()}\n```\n"
            pre.replace_with(markdown_block)

    # Code blocks were replaced above, so only inline <code> is left here.
    for code in soup.find_all('code'):
        if code.parent.name != 'pre':
            inline = code.get_text()
            code.replace_with(f"`{inline.strip()}`")

    text = soup.get_text(separator="\n")
    # Strip trailing whitespace *before* collapsing blank lines; otherwise
    # whitespace-only lines hide consecutive newlines from the regex and the
    # output keeps long runs of empty lines.
    text = '\n'.join(line.rstrip() for line in text.splitlines())
    text = re.sub(r'\n{3,}', '\n\n', text)

    return text.strip()


In [None]:
# Stack Exchange API endpoint and the site whose questions are queried.
API_BASE = "https://api.stackexchange.com/2.3"
SITE = "stackoverflow"

# Number of questions requested per page.
PAGE_SIZE = 15

# Tags to crawl; questions and their answers are collected for each tag in turn.
TAG = [
    "reactjs",
    "next.js",
    "vue.js",
    "frontend",
    "css",
    "html",
]

In [None]:
def get_questions_with_accepted_answer(tag, page=1):
    """Fetch one page of the highest-voted questions for a tag.

    Requests ``/questions`` from the Stack Exchange API (sorted by votes,
    descending, ``PAGE_SIZE`` items per page, bodies included) and keeps only
    the questions that carry an ``accepted_answer_id``.

    Args:
        tag: Tag to filter on; sent as the ``tagged`` parameter.
        page: Page number to request.

    Returns:
        A ``(questions, has_more)`` tuple: the question dicts that have an
        accepted answer, and the ``has_more`` flag of the response (``False``
        when the field is absent).

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    url = f"{API_BASE}/questions"
    params = {
        "order": "desc",
        "sort": "votes",
        "site": SITE,
        "tagged": tag,
        "filter": "withbody",
        "pagesize": PAGE_SIZE,
        "page": page,
        "key": API_KEY
    }
    # Without a timeout a stalled connection would block the crawl forever.
    resp = requests.get(url, params=params, timeout=30)
    data = resp.json()

    # API-level failures arrive as a JSON payload with an ``error_id``; report
    # them instead of letting them pass as an empty page.
    if "error_id" in data:
        print(
            f"Stack Exchange API error {data.get('error_id')} for tag {tag!r}, "
            f"page {page}: {data.get('error_message')}"
        )

    # When the API asks the client to back off, wait that many seconds before
    # the next call to the same method.
    backoff = data.get("backoff")
    if backoff:
        time.sleep(backoff)

    questions = [
        q for q in data.get('items', [])
        if q.get('accepted_answer_id')
    ]
    return questions, data.get("has_more", False)

def get_top_answer_body(question_id):
    """Return the highest-voted answer to a question as Markdown-style text.

    Answers are requested sorted by votes in descending order and only the
    first one is used. Note that this is the top-voted answer, which is not
    necessarily the accepted one.

    Args:
        question_id: Id of the question whose answers are requested.

    Returns:
        The answer body converted by ``html_to_markdown_with_code``, or
        ``None`` when the response contains no answers.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    url = f"{API_BASE}/questions/{question_id}/answers"
    params = {
        "order": "desc",
        "sort": "votes",
        "site": SITE,
        "filter": "withbody",
        # Only the first answer is used, so do not download a whole page.
        "pagesize": 1,
        "key": API_KEY
    }
    # Without a timeout a stalled connection would block the crawl forever.
    resp = requests.get(url, params=params, timeout=30)
    data = resp.json()

    # API-level failures arrive as a JSON payload with an ``error_id``; report
    # them instead of letting them pass as "no answers".
    if "error_id" in data:
        print(
            f"Stack Exchange API error {data.get('error_id')} for question "
            f"{question_id}: {data.get('error_message')}"
        )

    # When the API asks the client to back off, wait that many seconds before
    # the next call to the same method.
    backoff = data.get("backoff")
    if backoff:
        time.sleep(backoff)

    answers = data.get('items')
    if answers:
        return html_to_markdown_with_code(answers[0]['body'])
    return None

In [None]:
def main(max_pages=50):
    """Crawl every tag in ``TAG`` and append question/answer pairs to the dataset.

    For each tag, pages of questions are requested until the API reports that
    no more pages exist or ``max_pages`` pages have been fetched. Each question
    (title plus body) is paired with its top-voted answer and appended to the
    JSONL output file as a two-message chat record.

    Args:
        max_pages: Maximum number of pages to request per tag.
    """
    output_path = "./stackoverflow_dataset/finetune_data_summarized.jsonl"
    results = []

    # Open the output once for the whole crawl instead of once per record.
    with open(output_path, "a", encoding="utf-8") as f:
        for tag in TAG:
            for page in range(1, max_pages + 1):
                print("TAG:", tag, "Page:", page)
                questions, has_more = get_questions_with_accepted_answer(tag, page)
                for q in questions:
                    title = html_to_markdown_with_code(q["title"])
                    body = html_to_markdown_with_code(q["body"])
                    question = title + "\n" + body
                    top_answer = get_top_answer_body(q["question_id"])
                    # A record without assistant content is useless for
                    # fine-tuning, so skip questions whose answer is missing.
                    if not top_answer:
                        continue
                    result = {
                        'messages': [
                            {"role": "user", "content": question},
                            {"role": "assistant", "content": top_answer}
                        ]
                    }
                    results.append(result)
                    f.write(json.dumps(result, ensure_ascii=False) + "\n")
                    # Flush per record so an interrupted crawl keeps its progress.
                    f.flush()
                # Stop paging as soon as the API reports there is nothing left.
                if not has_more:
                    break

    print(f"Collected {len(results)} samples from Stack Overflow.")
    print(f"Data saved to {output_path}")

In [11]:
# Run the crawl; every page of questions and every question's answers is fetched with a live HTTP request.
main()

TAG: reactjs Page: 1
TAG: reactjs Page: 2
TAG: reactjs Page: 3
TAG: reactjs Page: 4
TAG: reactjs Page: 5
TAG: reactjs Page: 6
TAG: reactjs Page: 7
TAG: reactjs Page: 8
TAG: reactjs Page: 9
TAG: reactjs Page: 10
TAG: reactjs Page: 11
TAG: reactjs Page: 12
TAG: reactjs Page: 13
TAG: reactjs Page: 14
TAG: reactjs Page: 15
TAG: reactjs Page: 16
TAG: reactjs Page: 17
TAG: reactjs Page: 18
TAG: reactjs Page: 19
TAG: reactjs Page: 20
TAG: reactjs Page: 21
TAG: reactjs Page: 22
TAG: reactjs Page: 23
TAG: reactjs Page: 24
TAG: reactjs Page: 25
TAG: reactjs Page: 26
TAG: reactjs Page: 27
TAG: reactjs Page: 28
TAG: reactjs Page: 29
TAG: reactjs Page: 30
TAG: reactjs Page: 31
TAG: reactjs Page: 32
TAG: reactjs Page: 33
TAG: reactjs Page: 34
TAG: reactjs Page: 35
TAG: reactjs Page: 36
TAG: reactjs Page: 37
TAG: reactjs Page: 38
TAG: reactjs Page: 39
TAG: reactjs Page: 40
TAG: reactjs Page: 41
TAG: reactjs Page: 42
TAG: reactjs Page: 43
TAG: reactjs Page: 44
TAG: reactjs Page: 45
TAG: reactjs Page: 

In [None]:
input_file_path = './stackoverflow_dataset/finetune_data_summarized.jsonl'
output_file_path = './stackoverflow_dataset/finetune_data_summarized_dedup.jsonl'

# The de-duplicated file needs its parent directory to exist.
os.makedirs(os.path.dirname(output_file_path), exist_ok=True)

# Parse every JSONL line into a Python object.
with open(input_file_path, 'r', encoding='utf-8') as f:
    data = list(map(json.loads, f))

# Keep the first occurrence of each record. Records are compared through
# their key-sorted JSON serialisation, so key order does not matter.
seen = set()
deduped_data = []
for entry in data:
    entry_str = json.dumps(entry, sort_keys=True)
    if entry_str in seen:
        continue
    seen.add(entry_str)
    deduped_data.append(entry)

# Write the surviving records back out, one JSON document per line.
with open(output_file_path, 'w', encoding='utf-8') as f:
    for entry in deduped_data:
        line = json.dumps(entry, ensure_ascii=False)
        f.write(line + '\n')