In [28]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

In [29]:

def extract_problem_info(html_content):
    """
    Extract the problem statement, examples, and constraints from problem HTML.

    Parameters
    ----------
    html_content : str or None
        HTML markup of a problem description. ``None`` or an empty string is
        treated as "no content" and yields empty results instead of raising.

    Returns
    -------
    tuple of (str, list of str, list of str)
        ``(problem_text, examples, constraints)`` where

        * ``problem_text`` is the text of the first ``<p>`` element, or ``""``
          when the markup contains no ``<p>`` at all;
        * ``examples`` holds the text of every ``<p class="example">`` element
          followed by the text of every ``<pre>`` element;
        * ``constraints`` holds the whitespace-stripped text of each ``<li>``
          inside the ``<ul>`` siblings that follow the ``<p>`` whose string is
          exactly ``Constraints:`` (empty when no such paragraph exists).
    """
    # Nothing to parse: return the same shape callers unpack, just empty.
    if not html_content:
        return "", [], []

    soup = BeautifulSoup(html_content, 'html.parser')

    # find() returns None when there is no <p>; guard before calling get_text().
    first_paragraph = soup.find('p')
    problem_text = first_paragraph.get_text() if first_paragraph is not None else ""

    # Examples: paragraphs tagged with the "example" class first, then <pre> blocks.
    examples = [tag.get_text() for tag in soup.find_all('p', class_='example')]
    examples.extend(tag.get_text() for tag in soup.find_all('pre'))

    # Constraints: list items of every <ul> sibling after the "Constraints:" paragraph.
    constraints = []
    constraints_header = soup.find('p', string='Constraints:')
    if constraints_header is not None:
        for bullet_list in constraints_header.find_next_siblings('ul'):
            constraints.extend(
                item.get_text(strip=True) for item in bullet_list.find_all('li')
            )

    return problem_text, examples, constraints


In [30]:
def get_solution_url(title_slug):
    """
    Build the LeetCode solution-page URL for a problem.

    ``title_slug`` is inserted between the ``/problems/`` prefix and the
    ``/solution/`` suffix; the resulting URL string is returned.
    """
    url_template = "https://leetcode.com/problems/{}/solution/"
    return url_template.format(title_slug)

In [31]:
def scrape_questions_list(timeout=30):
    """
    Scrape the LeetCode question list and save it to ``leetcode_questions.csv``.

    One GraphQL ``problemsetQuestionList`` query (limit 10000, no filters) is
    posted to ``https://leetcode.com/graphql``. The resulting table keeps the
    question QID, title, titleSlug, difficulty, acceptance rate, the paid-only
    flag, the topic tags (flattened to a list of slugs) and the HTML question
    body (``content``). Rows containing any missing value are dropped, then
    ``problem_text``, ``examples`` and ``constraints`` columns are derived from
    ``content`` with ``extract_problem_info``. The table is written to
    ``leetcode_questions.csv`` in the current working directory; nothing is
    returned.

    Parameters
    ----------
    timeout : float, optional
        Seconds to wait for the HTTP response before giving up (default 30).
        Without a timeout the request could block indefinitely.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.Timeout
        If no response arrives within ``timeout`` seconds.
    """
    print("Scraping questions list ... ", end="")
    data = {
        "query": """query problemsetQuestionList($categorySlug: String, $limit: Int, $skip: Int, $filters: QuestionListFilterInput) {
                problemsetQuestionList: questionList(
                    categorySlug: $categorySlug
                    limit: $limit
                    skip: $skip
                    filters: $filters
                ) {
                    total: totalNum
                    questions: data {
                        acceptanceRate: acRate
                        difficulty
                        QID: questionFrontendId
                        paidOnly: isPaidOnly
                        title
                        titleSlug
                        topicTags {
                            slug
                        }
                        content
                    }
                }
            }
        """,
        "variables": {
            "categorySlug": "",
            "skip": 0,
            "limit": 10000,
            "filters": {},
        },
    }

    response = requests.post("https://leetcode.com/graphql", json=data, timeout=timeout)
    # Fail loudly on an HTTP error instead of trying to parse an error page as JSON.
    response.raise_for_status()
    payload = response.json()

    selected_columns = [
        "QID",
        "title",
        "titleSlug",
        "difficulty",
        "acceptanceRate",
        "paidOnly",
        "topicTags",
        "content",
    ]
    questions = (
        pd.json_normalize(payload["data"]["problemsetQuestionList"]["questions"])[
            selected_columns
        ]
        # Each topicTags cell is a list of {"slug": ...} dicts; keep only the slugs.
        .assign(
            topicTags=lambda frame: frame["topicTags"].apply(
                lambda tags: [tag["slug"] for tag in tags]
            )
        )
        # Drop rows with any missing value (e.g. no question body to parse).
        .dropna()
    )

    # Parse every question body once, then attach the three derived columns in
    # a single step. The Series carry the frame's index so values stay aligned
    # with their rows after dropna(), and dtype=object keeps list cells intact.
    parsed = [extract_problem_info(html) for html in questions["content"]]
    derived_columns = {
        name: pd.Series(
            [fields[position] for fields in parsed],
            index=questions.index,
            dtype=object,
        )
        for position, name in enumerate(("problem_text", "examples", "constraints"))
    }
    questions = questions.assign(**derived_columns)

    print("Done")

    # Save to CSV
    questions.to_csv("leetcode_questions.csv", index=False)
    print("Data saved to leetcode_questions.csv")

In [32]:
# Run the scraper: queries leetcode.com over the network and writes
# leetcode_questions.csv to the current working directory.
scrape_questions_list()

Scraping questions list ... Done
Data saved to leetcode_questions.csv
