In [1]:
# Cell 1 - Install all required libraries
!pip install selenium webdriver-manager pandas beautifulsoup4 tqdm


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m26.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
# Cell 2 - Imports and browser setup
import pandas as pd
import time
import json
from tqdm import tqdm
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

def get_driver():
    options = Options()
    options.add_argument("--headless")           # run without opening browser window
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument("--window-size=1920,1080")
    # Spoof a real user agent so GFG doesn't block the headless browser
    options.add_argument(
        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    )
    driver = webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=options
    )
    return driver

print("Driver setup ready.")

Driver setup ready.


In [3]:
# Cell 3 - Load your CSV
df = pd.read_csv("Result-2.csv")
print("Loaded shape:", df.shape)
print("Columns:", df.columns.tolist())

# Preview rows that have a GFG URL
gfg_df = df[df['gfg-url'].notna() & (df['gfg-url'] != '')].copy()
print(f"\nRows with GFG URL: {len(gfg_df)}")
gfg_df[['title', 'gfg-url']].head(5)


Loaded shape: (3234, 10)
Columns: ['merge_key', 'id', 'leetcode-url', 'title', 'coding-ninjas-url', 'gfg-url', 'difficulty', 'companies', 'slug', 'topicTags']

Rows with GFG URL: 41


Unnamed: 0,title,gfg-url
0,01 Matrix,https://www.geeksforgeeks.org/problems/distanc...
2,132 Pattern,https://www.geeksforgeeks.org/problems/find-13...
3,2 Keys Keyboard,https://www.geeksforgeeks.org/problems/special...
5,3Sum,https://www.geeksforgeeks.org/problems/triplet...
6,3Sum Closest,https://www.geeksforgeeks.org/problems/3-sum-c...


In [4]:
# Cell 4 - GFG tag scraper function
def scrape_gfg_tags(driver, url, wait_seconds=8):
    """
    Visits a GFG problem URL and extracts:
    - company_tags_gfg: list of company tags from GFG
    - topic_tags_gfg:   list of topic tags from GFG
    Returns a dict with both lists, or empty lists on failure.
    """
    result = {"company_tags_gfg": [], "topic_tags_gfg": []}
    
    try:
        driver.get(url)
        
        # Wait until the tags container is present in the DOM
        WebDriverWait(driver, wait_seconds).until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, "div.problems_accordion_tags_container__zk2Um")
            )
        )
        
        # Extra buffer for JS to fully render tag labels
        time.sleep(2)
        
        soup = BeautifulSoup(driver.page_source, "html.parser")
        
        # Find the main tags container
        container = soup.find("div", class_="problems_accordion_tags_container__zk2Um")
        if not container:
            return result
        
        # Each accordion block is: div.problems_accordion_tags__JJ2DX
        accordion_blocks = container.find_all("div", class_="problems_accordion_tags__JJ2DX")
        
        for block in accordion_blocks:
            # Get the header title text (Company Tags / Topic Tags / etc.)
            title_el = block.find("strong")
            if not title_el:
                continue
            section_title = title_el.get_text(strip=True)
            
            # Extract all tag labels from this section
            tag_links = block.find_all("a", class_="problems_tag_label__A4Ism")
            tags = [a.get_text(strip=True) for a in tag_links]
            
            if section_title == "Company Tags":
                result["company_tags_gfg"] = tags
            elif section_title == "Topic Tags":
                result["topic_tags_gfg"] = tags
                
    except Exception as e:
        print(f"  ERROR on {url}: {e}")
    
    return result

print("Scraper function defined.")

Scraper function defined.


In [5]:
# Cell 5 - Run scraper over all GFG URLs with checkpoint save
CHECKPOINT_FILE = "gfg_tags_checkpoint.json"
OUTPUT_FILE     = "Result-2-with-gfg-tags.csv"
DELAY_BETWEEN   = 3   # seconds between requests (be polite to GFG servers)

# Load existing checkpoint if available (lets you resume if it crashes)
try:
    with open(CHECKPOINT_FILE, "r") as f:
        checkpoint = json.load(f)
    print(f"Loaded checkpoint: {len(checkpoint)} rows already processed.")
except FileNotFoundError:
    checkpoint = {}
    print("No checkpoint found — starting fresh.")

# Only process rows not already in checkpoint
remaining = gfg_df[~gfg_df['slug'].astype(str).isin(checkpoint.keys())]
print(f"Rows remaining to scrape: {len(remaining)}")

# Start driver
driver = get_driver()

try:
    for _, row in tqdm(remaining.iterrows(), total=len(remaining), desc="Scraping GFG"):
        slug = str(row['slug'])
        url  = str(row['gfg-url'])
        
        tags = scrape_gfg_tags(driver, url)
        checkpoint[slug] = tags
        
        # Save checkpoint every 10 rows
        if len(checkpoint) % 10 == 0:
            with open(CHECKPOINT_FILE, "w") as f:
                json.dump(checkpoint, f)
        
        time.sleep(DELAY_BETWEEN)
        
finally:
    driver.quit()
    # Final checkpoint save
    with open(CHECKPOINT_FILE, "w") as f:
        json.dump(checkpoint, f)
    print(f"\nDone! Checkpoint saved with {len(checkpoint)} entries.")

Loaded checkpoint: 20 rows already processed.
Rows remaining to scrape: 41


Scraping GFG:  10%|▉         | 4/41 [00:32<04:41,  7.61s/it]

  ERROR on https://www.geeksforgeeks.org/problems/3-sum-closest/1, https://www.geeksforgeeks.org/problems/three-sum-closest/1: Message: 
Stacktrace:
0   chromedriver                        0x00000001012efd84 cxxbridge1$str$ptr + 3127864
1   chromedriver                        0x00000001012e8174 cxxbridge1$str$ptr + 3096104
2   chromedriver                        0x0000000100dc59f4 _RNvCsdExgN8vFLbb_7___rustc35___rust_no_alloc_shim_is_unstable_v2 + 75356
3   chromedriver                        0x0000000100e0df84 _RNvCsdExgN8vFLbb_7___rustc35___rust_no_alloc_shim_is_unstable_v2 + 371692
4   chromedriver                        0x0000000100e4cb8c _RNvCsdExgN8vFLbb_7___rustc35___rust_no_alloc_shim_is_unstable_v2 + 628724
5   chromedriver                        0x0000000100e022bc _RNvCsdExgN8vFLbb_7___rustc35___rust_no_alloc_shim_is_unstable_v2 + 323364
6   chromedriver                        0x00000001012ae8a8 cxxbridge1$str$ptr + 2860380
7   chromedriver                        0x0000000101

Scraping GFG:  78%|███████▊  | 32/41 [03:43<01:01,  6.79s/it]

  ERROR on https://www.geeksforgeeks.org/problems/minimum-steps-required--170647/1, https://www.geeksforgeeks.org/problems/geeks-and-the-string--170645/1: Message: 
Stacktrace:
0   chromedriver                        0x00000001012efd84 cxxbridge1$str$ptr + 3127864
1   chromedriver                        0x00000001012e8174 cxxbridge1$str$ptr + 3096104
2   chromedriver                        0x0000000100dc59f4 _RNvCsdExgN8vFLbb_7___rustc35___rust_no_alloc_shim_is_unstable_v2 + 75356
3   chromedriver                        0x0000000100e0df84 _RNvCsdExgN8vFLbb_7___rustc35___rust_no_alloc_shim_is_unstable_v2 + 371692
4   chromedriver                        0x0000000100e4cb8c _RNvCsdExgN8vFLbb_7___rustc35___rust_no_alloc_shim_is_unstable_v2 + 628724
5   chromedriver                        0x0000000100e022bc _RNvCsdExgN8vFLbb_7___rustc35___rust_no_alloc_shim_is_unstable_v2 + 323364
6   chromedriver                        0x00000001012ae8a8 cxxbridge1$str$ptr + 2860380
7   chromedriver        

Scraping GFG:  88%|████████▊ | 36/41 [04:14<00:34,  7.00s/it]

  ERROR on https://www.geeksforgeeks.org/problems/subset-sum-problem2014/1, https://www.geeksforgeeks.org/problems/partitions-with-given-difference/1: Message: 
Stacktrace:
0   chromedriver                        0x00000001012efd84 cxxbridge1$str$ptr + 3127864
1   chromedriver                        0x00000001012e8174 cxxbridge1$str$ptr + 3096104
2   chromedriver                        0x0000000100dc59f4 _RNvCsdExgN8vFLbb_7___rustc35___rust_no_alloc_shim_is_unstable_v2 + 75356
3   chromedriver                        0x0000000100e0df84 _RNvCsdExgN8vFLbb_7___rustc35___rust_no_alloc_shim_is_unstable_v2 + 371692
4   chromedriver                        0x0000000100e4cb8c _RNvCsdExgN8vFLbb_7___rustc35___rust_no_alloc_shim_is_unstable_v2 + 628724
5   chromedriver                        0x0000000100e022bc _RNvCsdExgN8vFLbb_7___rustc35___rust_no_alloc_shim_is_unstable_v2 + 323364
6   chromedriver                        0x00000001012ae8a8 cxxbridge1$str$ptr + 2860380
7   chromedriver            

Scraping GFG: 100%|██████████| 41/41 [04:54<00:00,  7.18s/it]



Done! Checkpoint saved with 61 entries.


In [8]:
# Cell 6 - Merge GFG tags back into the main dataframe and save

def get_from_checkpoint(slug, field):
    entry = checkpoint.get(str(slug), {})
    tags  = entry.get(field, [])
    return json.dumps(tags) if tags else None  # store as JSON string in CSV

# Add new columns to original df
df['company_tags_gfg'] = df['slug'].apply(
    lambda s: get_from_checkpoint(s, 'company_tags_gfg')
)
df['topic_tags_gfg'] = df['slug'].apply(
    lambda s: get_from_checkpoint(s, 'topic_tags_gfg')
)

# Save enriched CSV
df.to_csv(OUTPUT_FILE, index=False)
print(f"Saved enriched CSV to: {OUTPUT_FILE}")
print(df[['title', 'topicTags', 'topic_tags_gfg', 'company_tags_gfg']].dropna(subset=['topic_tags_gfg']).head(10))

Saved enriched CSV to: Result-2-with-gfg-tags.csv
                    title                                          topicTags  \
0               01 Matrix  [{'name': 'Array', 'slug': 'array'}, {'name': ...   
2             132 Pattern  [{'name': 'Array', 'slug': 'array'}, {'name': ...   
3         2 Keys Keyboard  [{'name': 'Math', 'slug': 'math'}, {'name': 'D...   
5                    3Sum  [{'name': 'Array', 'slug': 'array'}, {'name': ...   
7            3Sum Smaller  [{'name': 'Array', 'slug': 'array'}, {'name': ...   
9         4 Keys Keyboard  [{'name': 'Math', 'slug': 'math'}, {'name': 'D...   
10                   4Sum  [{'name': 'Array', 'slug': 'array'}, {'name': ...   
11                4Sum II  [{'name': 'Array', 'slug': 'array'}, {'name': ...   
14         Accounts Merge  [{'name': 'Array', 'slug': 'array'}, {'name': ...   
17  Activity Participants         [{'name': 'Database', 'slug': 'database'}]   

                                       topic_tags_gfg  \
0   ["Matrix

In [7]:
# Cell 7 - Quick stats and sample check
enriched = pd.read_csv(OUTPUT_FILE)

total_gfg         = enriched['gfg-url'].notna().sum()
got_topic_tags    = enriched['topic_tags_gfg'].notna().sum()
got_company_tags  = enriched['company_tags_gfg'].notna().sum()

print(f"Total GFG rows:              {total_gfg}")
print(f"Rows with GFG topic tags:    {got_topic_tags}")
print(f"Rows with GFG company tags:  {got_company_tags}")
print(f"Coverage: {got_topic_tags/total_gfg*100:.1f}%")

# Show a sample
print("\nSample row:")
sample = enriched[enriched['topic_tags_gfg'].notna()].iloc[0]
print("Title:            ", sample['title'])
print("LeetCode tags:    ", sample['topicTags'])
print("GFG topic tags:   ", sample['topic_tags_gfg'])
print("GFG company tags: ", sample['company_tags_gfg'])


Total GFG rows:              41
Rows with GFG topic tags:    37
Rows with GFG company tags:  20
Coverage: 90.2%

Sample row:
Title:             01 Matrix
LeetCode tags:     [{'name': 'Array', 'slug': 'array'}, {'name': 'Dynamic Programming', 'slug': 'dynamic-programming'}, {'name': 'Breadth-First Search', 'slug': 'breadth-first-search'}, {'name': 'Matrix', 'slug': 'matrix'}]
GFG topic tags:    ["Matrix", "Graph", "BFS", "Data Structures", "Algorithms"]
GFG company tags:  ["Bloomberg", "Amazon", "Microsoft", "Accenture", "Google", "Flipkart", "Uber", "NPCI"]
