In [2]:
import time  # For handling time-related functions
import re  # For regular expressions
import numpy as np  # For numerical operations
import pandas as pd  # For data manipulation and analysis
from bs4 import BeautifulSoup  # For web scraping
from selenium import webdriver  # For browser automation
from selenium.webdriver.chrome.service import Service  # For configuring the ChromeDriver service

In [3]:
def get_page_source(url, delay=10):
    """Open ``url`` in a new Chrome session and return the page parsed as soup.

    Parameters
    ----------
    url : str
        Address handed to ``driver.get``.
    delay : int or float, default 10
        Seconds to sleep after navigation so the page has time to load
        before its source is captured.

    Returns
    -------
    bs4.BeautifulSoup
        ``driver.page_source`` parsed with the ``'html.parser'`` backend.

    Notes
    -----
    The browser session is always shut down, including when navigation or
    parsing raises, so a failed page does not leave a Chrome process behind.
    """
    # Chrome flags: ignore certificate errors and start with a maximized window
    options = webdriver.ChromeOptions()
    options.add_argument('--ignore-certificate-errors')
    options.add_argument('--start-maximized')

    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url)

        # Fixed wait for the page to load (adjust ``delay`` as needed)
        time.sleep(delay)

        # Parse while the session is still alive; page_source is read from the driver
        return BeautifulSoup(driver.page_source, 'html.parser')
    finally:
        # quit() ends the whole WebDriver session; close() only closes the
        # current window. Running it in ``finally`` covers the error paths too.
        driver.quit()

In [12]:
def get_titles_URLs(page_source, first_page=False):
    """Collect the text and ``href`` of every problem link on a listing page.

    Parameters
    ----------
    page_source : bs4.BeautifulSoup
        Parsed listing page; only its ``find_all`` method is used.
    first_page : bool, default False
        When true, the first matching link is dropped before extraction.

    Returns
    -------
    tuple[list, list]
        ``(titles, problems_url)`` — link texts and their ``href`` values,
        in document order and of equal length.
    """
    # The two class strings differ only by the trailing 'opacity-60'
    link_classes = [
        'h-5 hover:text-blue-s dark:hover:text-dark-blue-s',
        'h-5 hover:text-blue-s dark:hover:text-dark-blue-s opacity-60',
    ]
    anchors = page_source.find_all('a', href=True, class_=link_classes)

    # Skip the leading match on the first page only
    if first_page:
        anchors = anchors[1:]

    titles, problems_url = [], []
    for anchor in anchors:
        titles.append(anchor.text)
        problems_url.append(anchor['href'])

    return titles, problems_url

In [4]:
def get_acceptances_difficulties(page_source, first_page=False):
    """Split the listing page's cell labels into acceptance rates and difficulties.

    Every ``div`` with class ``'mx-2 flex items-center py-[11px]'`` is
    inspected; the stripped text of its first ``span`` (if any, and if
    non-empty) is classified by whether it ends with ``'%'``.

    Parameters
    ----------
    page_source : bs4.BeautifulSoup
        Parsed listing page; only its ``find_all`` method is used.
    first_page : bool, default False
        When true, the first entry of *each* returned list is dropped.

    Returns
    -------
    tuple[list[str], list[str]]
        ``(acceptances, difficulties)`` in document order.
    """
    acceptances, difficulties = [], []

    for cell in page_source.find_all('div', class_='mx-2 flex items-center py-[11px]'):
        span = cell.find('span')
        if not span:
            # Cell without a span contributes nothing
            continue

        label = span.text.strip()
        if not label:
            # Whitespace-only labels are ignored
            continue

        # Labels ending in '%' are acceptance rates; everything else is a difficulty
        if label.endswith('%'):
            acceptances.append(label)
        else:
            difficulties.append(label)

    skip = 1 if first_page else 0
    return acceptances[skip:], difficulties[skip:]

In [13]:
def get_single_page_df(url, first_page=False):
    """Scrape one listing page into a DataFrame of problems.

    Parameters
    ----------
    url : str
        Listing-page address, forwarded to ``get_page_source``.
    first_page : bool, default False
        Forwarded to both extraction helpers.

    Returns
    -------
    pandas.DataFrame
        Columns ``'title'``, ``'problem_URL'`` and ``'difficulty'``.
        The three extracted lists must have equal length or the DataFrame
        constructor raises.
    """
    soup = get_page_source(url)

    titles, problems_url = get_titles_URLs(soup, first_page)
    # Acceptance rates come back from the helper but are not stored in the frame
    _acceptances, difficulties = get_acceptances_difficulties(soup, first_page)

    return pd.DataFrame({
        'title': titles,
        'problem_URL': problems_url,
        'difficulty': difficulties,
    })

In [10]:
def scrape(start=1, end=100, file_name='scrape.csv'):
    """Scrape listing pages ``start``..``end`` (inclusive) into one DataFrame and CSV.

    Parameters
    ----------
    start : int, default 1
        First page number to request.
    end : int, default 100
        Last page number to request (inclusive).
    file_name : str, default 'scrape.csv'
        Path the combined frame is written to, without the index.

    Returns
    -------
    pandas.DataFrame
        All pages concatenated with a fresh integer index. For an empty
        page range (``end < start``) an empty frame with the columns
        ``'title'``, ``'problem_URL'``, ``'difficulty'`` is returned and
        written, instead of ``pd.concat`` raising on an empty list.

    Notes
    -----
    NOTE(review): a later cell in this notebook defines another function
    also named ``scrape``; once that cell runs, this definition is shadowed.
    """
    page_frames = []

    for page in range(start, end + 1):
        url = 'https://leetcode.com/problemset/all/?page=' + str(page)

        # Only the very first request of a run that starts at page 1 is
        # flagged as the first page
        is_first_page = (start == 1 and page == start)
        page_frames.append(get_single_page_df(url, is_first_page))

    if page_frames:
        result = pd.concat(page_frames, ignore_index=True)
    else:
        # Nothing was requested: keep the schema produced by get_single_page_df
        result = pd.DataFrame(columns=['title', 'problem_URL', 'difficulty'])

    result.to_csv(path_or_buf=file_name, index=False)

    return result

Running Web-Scraping Process:

In [15]:
# Scrape listing pages 1 through 67 (inclusive) and save the combined rows to scrape.csv
df = scrape(start=1, end=67, file_name='scrape.csv')

50 50 50
50 50 50
50 50 50
50 50 50
50 50 50
50 50 50
50 50 50
50 50 50
50 50 50
50 50 50
50 50 50
50 50 50
50 50 50
50 50 50
50 50 50
50 50 50
50 50 50
50 50 50
50 50 50
50 50 50
50 50 50
50 50 50
50 50 50
50 50 50
50 50 50
50 50 50
50 50 50
50 50 50
50 50 50
50 50 50
50 50 50
50 50 50
50 50 50
50 50 50
50 50 50
50 50 50
50 50 50
50 50 50
50 50 50
50 50 50
50 50 50
50 50 50
50 50 50
50 50 50
50 50 50
50 50 50
50 50 50
50 50 50
50 50 50
50 50 50
50 50 50
50 50 50
50 50 50
50 50 50
50 50 50
50 50 50
50 50 50
50 50 50
50 50 50
50 50 50
50 50 50
50 50 50
50 50 50
50 50 50
50 50 50
50 50 50
3 3 3


In [16]:
# Turn the site-relative hrefs into absolute URLs. Values that already carry
# the host are left untouched, so re-running this cell does not prepend
# "https://leetcode.com" a second time.
df['problem_URL'] = df['problem_URL'].apply(
    lambda href: href if href.startswith('https://leetcode.com') else f"https://leetcode.com{href}"
)

In [17]:
# Plain assignment: df1 is a second name for the same DataFrame object, not a copy
df1 = df
# Last expression of the cell, so the frame is rendered as the cell output
df1

Unnamed: 0,title,problem_URL,difficulty
0,1. Two Sum,https://leetcode.com/problems/two-sum,Easy
1,2. Add Two Numbers,https://leetcode.com/problems/add-two-numbers,Medium
2,3. Longest Substring Without Repeating Characters,https://leetcode.com/problems/longest-substrin...,Medium
3,4. Median of Two Sorted Arrays,https://leetcode.com/problems/median-of-two-so...,Hard
4,5. Longest Palindromic Substring,https://leetcode.com/problems/longest-palindro...,Medium
...,...,...,...
3298,3299. Sum of Consecutive Subsequences,https://leetcode.com/problems/sum-of-consecuti...,Hard
3299,3300. Minimum Element After Replacement With D...,https://leetcode.com/problems/minimum-element-...,Easy
3300,3301. Maximize the Total Height of Unique Towers,https://leetcode.com/problems/maximize-the-tot...,Medium
3301,3302. Find the Lexicographically Smallest Vali...,https://leetcode.com/problems/find-the-lexicog...,Medium


In [18]:
def get_topic_tags(page_source):
    """Return the problem page's topic tags as one quoted, comma-separated string.

    Parameters
    ----------
    page_source : bs4.BeautifulSoup
        Parsed problem page; only its ``find_all`` method is used.

    Returns
    -------
    str
        Each matching link's text wrapped in single quotes and joined with
        ``', '`` (for example ``"'Array', 'Hash Table'"``); an empty string when
        no link matches.
    """
    tag_links = page_source.find_all(
        'a',
        class_='mr-4 rounded-xl px-2 py-1 text-xs transition-colors text-label-2 dark:text-dark-label-2 hover:text-label-2 dark:hover:text-dark-label-2 bg-fill-3 dark:bg-dark-fill-3 hover:bg-fill-2 dark:hover:bg-dark-fill-2',
    )

    quoted_tags = [f"'{link.text}'" for link in tag_links]
    return ', '.join(quoted_tags)

In [19]:
def get_similar_questions(page_source):
    """Return the titles of the similar-question links as one comma-separated string.

    Parameters
    ----------
    page_source : bs4.BeautifulSoup
        Parsed problem page; only its ``find_all`` method is used.

    Returns
    -------
    str
        The text of every matching link joined with ``', '`` (unquoted);
        an empty string when no link matches.
    """
    question_links = page_source.find_all(
        'a',
        class_='text-sm font-medium transition-none text-label-1 dark:text-dark-label-1 hover:text-blue-s dark:hover:text-dark-blue-s',
    )

    question_titles = [f"{link.text}" for link in question_links]
    return ', '.join(question_titles)

In [20]:
def get_is_premium(page_source):
    """Report whether the page contains the ``div`` used here as the premium marker.

    Parameters
    ----------
    page_source : bs4.BeautifulSoup
        Parsed problem page; only its ``find`` method is used.

    Returns
    -------
    str
        The string ``'True'`` when the marker element is found, otherwise
        the string ``'False'`` (strings, not booleans).
    """
    marker = page_source.find(
        'div',
        class_='text-md mb-6 text-center text-label-2 dark:text-dark-label-2',
    )

    if marker:
        return 'True'
    return 'False'

In [21]:
def scrape(df, file_name='scrape2.csv'):
    """Visit every URL in ``df['problem_URL']`` and save per-problem details to a CSV.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'problem_URL'`` column of URLs to open.
    file_name : str, default 'scrape2.csv'
        Path the collected rows are written to, without the index.

    Returns
    -------
    pandas.DataFrame
        One row per *non-premium* problem with the columns ``'is_premium'``,
        ``'topic_tags'`` and ``'similar_questions'``. The frame is empty
        (but still has these columns) when no page qualifies.

    Notes
    -----
    * Pages for which ``get_is_premium`` returns anything other than
      ``'False'`` are skipped, so the output can have fewer rows than ``df``.
      NOTE(review): the rows carry no URL or title, so they cannot be matched
      back to ``df`` once a page has been skipped — confirm this is intended.
    * The single shared browser session is always shut down, including when a
      page fails; in that case the exception propagates and nothing is written.
    * This definition reuses the name of the earlier listing-page ``scrape``
      in this notebook and shadows it once this cell runs.
    """
    links = df['problem_URL']

    # Chrome flags: ignore certificate errors and start with a maximized window
    options = webdriver.ChromeOptions()
    options.add_argument('--ignore-certificate-errors')
    options.add_argument('--start-maximized')

    # Plain dict records; the DataFrame is built once after the loop
    records = []

    # One browser session is reused for every link
    driver = webdriver.Chrome(options=options)
    try:
        for link in links:
            driver.get(link)

            # Fixed wait for the page to load (adjust as needed)
            time.sleep(10)

            page_source = BeautifulSoup(driver.page_source, 'html.parser')

            is_premium = get_is_premium(page_source)
            if is_premium != 'False':
                # Premium pages are skipped entirely
                continue

            records.append({
                'is_premium': is_premium,
                'topic_tags': get_topic_tags(page_source),
                'similar_questions': get_similar_questions(page_source),
            })
    finally:
        # End the WebDriver session even if a page raised
        driver.quit()

    # Explicit columns keep the schema stable when ``records`` is empty
    details = pd.DataFrame(
        records,
        columns=['is_premium', 'topic_tags', 'similar_questions'],
    )

    details.to_csv(path_or_buf=file_name, index=False)

    return details

In [None]:
# Scrape the detail page of every problem in df; the rows are written to problems.csv
scrape(df, file_name='problems.csv')
# Displays the listing frame that was passed in, not the rows written to problems.csv
df