### Using the LeetCode GraphQL API

In [1]:
import requests
from bs4 import BeautifulSoup
import psycopg2

In [1]:
import requests
import json

import time

# LeetCode GraphQL endpoint and the headers sent with every request.
url = "https://leetcode.com/graphql"
headers = {
    "Content-Type": "application/json",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
}
# Define the GraphQL query for fetching problems
problem_list_query = '''
query problemsetQuestionList($limit: Int!, $skip: Int!) {
    problemsetQuestionListV2(
        limit: $limit
        skip: $skip
    ) {
        questions {
            titleSlug
            title
            difficulty
        }
    }
}
'''

# Page through the problem list, collecting every question dict
# (titleSlug / title / difficulty) into `all_problems`.
all_problems = []
page_size = 100  # Fetch 100 problems at a time
skip = 0
has_more = True
request_timeout = 30  # seconds; without it a stalled connection hangs the cell forever

while has_more:
    # Prepare variables and payload for this page
    problem_vars = {"limit": page_size, "skip": skip}
    problem_payload = {"query": problem_list_query, "variables": problem_vars}

    # Only the network call and JSON decoding are expected to raise here.
    try:
        problem_response = requests.post(
            url, headers=headers, json=problem_payload, timeout=request_timeout
        )
        problem_data = problem_response.json()
    except (requests.RequestException, ValueError) as e:
        print(f"Error fetching problems: {e}")
        has_more = False
        break

    # GraphQL may answer with "data": null (or a null list) next to an
    # "errors" entry, so walk the payload defensively instead of indexing.
    data_section = problem_data.get("data") if isinstance(problem_data, dict) else None
    listing = (data_section or {}).get("problemsetQuestionListV2")
    questions = listing.get("questions") if isinstance(listing, dict) else None
    if questions is None:
        print("Failed to fetch problems:", problem_data)
        has_more = False
        break

    all_problems.extend(questions)
    print(f"Fetched {len(questions)} problems. Total: {len(all_problems)}")

    # A short page means we have reached the end of the list.
    if len(questions) < page_size:
        has_more = False
    else:
        skip += page_size
        # Small delay between pages to avoid rate limiting
        # (skipped after the final page, where it served no purpose).
        time.sleep(1)

print(f"Total problems fetched: {len(all_problems)}")

Fetched 100 problems. Total: 100
Fetched 100 problems. Total: 200
Fetched 100 problems. Total: 300
Fetched 100 problems. Total: 400
Fetched 100 problems. Total: 500
Fetched 100 problems. Total: 600
Fetched 100 problems. Total: 700
Fetched 100 problems. Total: 800
Fetched 100 problems. Total: 900
Fetched 100 problems. Total: 1000
Fetched 100 problems. Total: 1100
Fetched 100 problems. Total: 1200
Fetched 100 problems. Total: 1300
Fetched 100 problems. Total: 1400
Fetched 100 problems. Total: 1500
Fetched 100 problems. Total: 1600
Fetched 100 problems. Total: 1700
Fetched 100 problems. Total: 1800
Fetched 100 problems. Total: 1900
Fetched 100 problems. Total: 2000
Fetched 100 problems. Total: 2100
Fetched 100 problems. Total: 2200
Fetched 100 problems. Total: 2300
Fetched 100 problems. Total: 2400
Fetched 100 problems. Total: 2500
Fetched 100 problems. Total: 2600
Fetched 100 problems. Total: 2700
Fetched 100 problems. Total: 2800
Fetched 100 problems. Total: 2900
Fetched 100 problems. T

In [3]:
from bs4 import BeautifulSoup
import re
import psycopg2
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from dotenv import load_dotenv
import os

load_dotenv()  # This loads the .env file

embeddings = GoogleGenerativeAIEmbeddings(model="models/gemini-embedding-001")

conn = psycopg2.connect(
    dbname="dsa_search",
    user="dsa_user",
    password="dsa_user",
    host="localhost",
    port="5432",
)

# Create a cursor
cur = conn.cursor()

insert_query = """
INSERT INTO dsa_questions_gemini (title, difficulty, topic, link, description, embedding)
VALUES (%s, %s, %s, %s, %s, %s)
"""
url = "https://leetcode.com/graphql"
headers = {
    "Content-Type": "application/json",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
}

# Query for problem details
query = """
query questionData($titleSlug: String!) {
    question(titleSlug: $titleSlug) {
        questionId
        title
        content
        difficulty
        topicTags {
            name
        }
    }
}
"""
# Half-open slice [start, end) of `all_problems` processed by this run.
start = 3500
end = 3686

try:
    # Slicing clamps to the list length, so an `end` past the number of
    # fetched problems no longer raises IndexError.
    for problem in all_problems[start:end]:
        variables = {"titleSlug": problem["titleSlug"]}
        payload = {"query": query, "variables": variables}
        response = requests.post(url, headers=headers, json=payload, timeout=30)
        data = response.json()

        # "data" can be present but null when the API reports an error.
        q = (data.get("data") or {}).get("question") if isinstance(data, dict) else None
        if not q:
            print("❌ Failed:", data)
            continue

        print(f"Fetched problem: {problem['title']}")
        title = q["title"]
        difficulty = q["difficulty"]
        tags = ", ".join(t["name"] for t in q["topicTags"])

        # Content is HTML, let's clean it
        raw_html = q["content"]
        if not raw_html:
            print(f"❌ Skipping question '{title}' because content is None")
            continue

        # Keep only the statement that precedes the first "Example ..." element.
        # Iterate top-level tags only: a recursive find_all() also yields every
        # nested tag, whose text is already part of its parent, which duplicated
        # words in the stored description and skewed the embedding.
        soup = BeautifulSoup(raw_html, "html.parser")
        description_parts = []
        for elem in soup.find_all(recursive=False):
            elem_text = elem.get_text()
            if elem_text.strip().startswith("Example"):
                break
            description_parts.append(elem_text)

        # Collapse all whitespace runs (including non-breaking spaces) to one space.
        text = re.sub(r"\s+", " ", " ".join(description_parts)).strip()
        if not text:
            print(f"❌ Skipping question '{title}' because description is empty")
            continue

        embedding = embeddings.embed_query(text)
        cur.execute(
            insert_query,
            (
                title,
                difficulty,
                tags,
                f"https://leetcode.com/problems/{problem['titleSlug']}",
                text,
                embedding,
            ),
        )

    # All rows of this run are committed together, as before.
    conn.commit()
finally:
    # Release the cursor and connection even when a request, the embedding
    # call or an insert raises part-way through the run.
    cur.close()
    conn.close()

Fetched problem: Maximize Active Section with Trade II
Fetched problem: Minimum Cost to Reach Every Position
Fetched problem: Longest Palindrome After Substring Concatenation I
Fetched problem: Longest Palindrome After Substring Concatenation II
Fetched problem: Minimum Operations to Make Elements Within K Subarrays Equal
Fetched problem: Find Time Required to Eliminate Bacterial Strains
❌ Skipping question 'Find Time Required to Eliminate Bacterial Strains' because content is None
Fetched problem: Minimum Pair Removal to Sort Array I
Fetched problem: Implement Router
Fetched problem: Maximum Product of Subsequences With an Alternating Sum Equal to K
Fetched problem: Minimum Pair Removal to Sort Array II
Fetched problem: Make a Positive Array
❌ Skipping question 'Make a Positive Array' because content is None
Fetched problem: Minimum Operations to Make Array Sum Divisible by K
Fetched problem: Number of Unique XOR Triplets I
Fetched problem: Number of Unique XOR Triplets II
Fetched pro

### Adjacency list creation


In [28]:
import psycopg2
import numpy as np
import ipywidgets
from sklearn.metrics.pairwise import cosine_similarity
from tqdm.notebook import tqdm
import ast  # for safely parsing string to list

# Open the PostgreSQL connection used for both reading embeddings and writing edges.
conn = psycopg2.connect(
    host="localhost",
    port="5432",
    dbname="dsa_search",
    user="dsa_user",
    password="dsa_user",
)
cur = conn.cursor()

# Load every question id together with its stored embedding.
cur.execute("SELECT id, embedding FROM dsa_questions_gemini")
rows = cur.fetchall()

ids = [question_id for question_id, _ in rows]

# The embedding column may come back as a string such as '[-0.00898, ...]';
# those are parsed with ast.literal_eval, anything else is used as returned.
embeddings = np.array(
    [ast.literal_eval(raw) if isinstance(raw, str) else raw for _, raw in rows],
    dtype=np.float32,
)

# Edge selection parameters.
TOP_K = 10  # number of neighbors to store
THRESHOLD = 0.8  # minimum cosine similarity for an edge

# Insert statement; pairs that already exist are left untouched.
insert_query = """
INSERT INTO dsa_question_edges_gemini (source_id, target_id, score)
VALUES (%s, %s, %s)
ON CONFLICT (source_id, target_id) DO NOTHING
"""

# For each question, score it against all questions and keep its best neighbors.
for i, src_id in enumerate(tqdm(ids)):
    sims = cosine_similarity(embeddings[i : i + 1], embeddings)[0]
    sims[i] = -1  # a question is never its own neighbor

    # Indices of the TOP_K highest scores, best first.
    best_first = np.argsort(sims)[::-1][:TOP_K]

    for idx in best_first:
        if sims[idx] >= THRESHOLD:
            edge = (src_id, ids[idx], float(sims[idx]))
            cur.execute(insert_query, edge)

conn.commit()
cur.close()
conn.close()
print("✅ Knowledge graph edges stored successfully!")

  0%|          | 0/3327 [00:00<?, ?it/s]

✅ Knowledge graph edges stored successfully!


### TF-IDF embeddings

In [2]:
import psycopg2
import json
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
import string

# Fetch the NLTK resources used below (nltk.download is called for each one).
for resource in ("stopwords", "wordnet", "omw-1.4"):
    nltk.download(resource)

# --------------------------
# Shared preprocessing resources
# --------------------------
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))


def preprocess_text(text):
    """Normalize *text* into a space-joined string of lemmatized tokens.

    The text is lower-cased, newlines/carriage returns/tabs become spaces,
    ASCII punctuation is removed, the result is split on whitespace, tokens
    found in ``stop_words`` are dropped and the remaining tokens are passed
    through ``lemmatizer.lemmatize``.
    """
    normalized = text.lower()
    for whitespace_char in ("\n", "\r", "\t"):
        normalized = normalized.replace(whitespace_char, " ")

    punctuation_table = str.maketrans("", "", string.punctuation)
    normalized = normalized.translate(punctuation_table)

    kept = []
    for word in normalized.split():
        if word in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(word))
    return " ".join(kept)


# --------------------------
# Open the database connection
# --------------------------
conn = psycopg2.connect(
    host="localhost",
    port="5432",
    dbname="dsa_search",
    user="dsa_user",
    password="dsa_user",
)
cur = conn.cursor()

# --------------------------
# Read the questions already stored with Gemini embeddings
# --------------------------
cur.execute(
    "SELECT id, title, description, difficulty, topic, link FROM dsa_questions_gemini;"
)
rows = cur.fetchall()

# --------------------------
# One preprocessed document per row: title followed by description
# --------------------------
documents = []
for row in rows:
    documents.append(preprocess_text(f"{row[1]} {row[2]}"))

# --------------------------
# Fit the TF-IDF model (vocabulary capped at 3000 features)
# --------------------------
vectorizer = TfidfVectorizer(max_features=3000)
X = vectorizer.fit_transform(documents)

conn.commit()

# --------------------------
# Store each row's non-zero TF-IDF weights as a JSON object
# keyed by feature index
# --------------------------
for row_index, (_, title, description, difficulty, topic, link) in enumerate(rows):
    dense_weights = X[row_index].toarray()[0]

    tfidf_dict = {}
    for j in dense_weights.nonzero()[0]:
        tfidf_dict[str(j)] = float(dense_weights[j])

    params = (title, description, difficulty, topic, link, json.dumps(tfidf_dict))
    cur.execute(
        """
        INSERT INTO dsa_questions_tfidf (title, description, difficulty, topic, link, embedding)
        VALUES (%s, %s, %s, %s, %s, %s)
        """,
        params,
    )

conn.commit()
cur.close()
conn.close()

print("TF-IDF embeddings generated and stored successfully!")

[nltk_data] Downloading package stopwords to /home/prasad/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/prasad/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/prasad/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


TF-IDF embeddings generated and stored successfully!


In [7]:
import joblib
# Persist the fitted TF-IDF vectorizer to disk with joblib so it can be
# loaded again later without refitting.
joblib.dump(value=vectorizer, filename="tfidf_vectorizer.pkl")
print("✅ TF-IDF Vectorizer saved to tfidf_vectorizer.pkl")

✅ TF-IDF Vectorizer saved to tfidf_vectorizer.pkl


In [6]:
import psycopg2
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import json

# --------------------------
# DB connection
# --------------------------
conn = psycopg2.connect(
    dbname="dsa_search",
    user="dsa_user",
    password="dsa_user",
    host="localhost",
    port="5432",
)
cur = conn.cursor()

# --------------------------
# Cosine similarity parameters
# --------------------------
THRESHOLD = 0.3  # TF-IDF vectors are sparse; similarity will be lower
TOP_K = 10  # number of neighbors

insert_query = """
INSERT INTO dsa_question_edges_tfidf (source_id, target_id, score)
VALUES (%s, %s, %s)
ON CONFLICT (source_id, target_id) DO NOTHING
"""

edges_stored = False
try:
    # --------------------------
    # Fetch all questions and TF-IDF embeddings
    # --------------------------
    cur.execute("SELECT id, embedding FROM dsa_questions_tfidf")
    rows = cur.fetchall()

    ids = [row[0] for row in rows]

    # Each embedding is a sparse {feature_index: weight} mapping, returned
    # either as a JSON string or as an already-decoded dict.
    sparse_rows = []
    for row in rows:
        emb_data = row[1]
        if isinstance(emb_data, str):
            emb_dict = json.loads(emb_data)
        elif isinstance(emb_data, dict):
            emb_dict = emb_data
        else:
            raise TypeError(f"Unexpected embedding type: {type(emb_data)}")
        sparse_rows.append({int(k): v for k, v in emb_dict.items()})

    # Width of the dense matrix = highest feature index seen + 1.
    # `default=0` covers an empty table or rows with no non-zero weights,
    # where a bare max() would raise ValueError.
    max_len = max((max(sparse) + 1 for sparse in sparse_rows if sparse), default=0)

    # Fill one pre-allocated matrix instead of building a differently sized
    # array per row and padding each of them afterwards.
    embeddings = np.zeros((len(sparse_rows), max_len), dtype=np.float32)
    for row_idx, sparse in enumerate(sparse_rows):
        if sparse:
            embeddings[row_idx, list(sparse)] = list(sparse.values())

    # --------------------------
    # Compute top-K cosine similarity and write the adjacency list
    # --------------------------
    if embeddings.size:  # nothing to compare when there are no rows/features
        for i, src_id in enumerate(tqdm(ids)):
            src_emb = embeddings[i].reshape(1, -1)

            sims = cosine_similarity(src_emb, embeddings)[0]
            sims[i] = -1  # ignore self

            # Indices of the TOP_K largest similarities, best first.
            top_k_idx = sims.argsort()[-TOP_K:][::-1]

            for idx in top_k_idx:
                if sims[idx] >= THRESHOLD:
                    cur.execute(insert_query, (src_id, ids[idx], float(sims[idx])))

        conn.commit()
        edges_stored = True
finally:
    # Always release the cursor and connection, including when an embedding
    # has an unexpected type or an insert fails.
    cur.close()
    conn.close()

if edges_stored:
    print("✅ TF-IDF knowledge graph edges stored successfully!")
else:
    print("⚠️ No TF-IDF embeddings found; no edges were stored.")

100%|██████████| 2927/2927 [01:28<00:00, 33.23it/s]

✅ TF-IDF knowledge graph edges stored successfully!





### CSES problem set

In [22]:
import requests
from bs4 import BeautifulSoup
import psycopg2
from langchain_google_genai import GoogleGenerativeAIEmbeddings
import dotenv


dotenv.load_dotenv()  # This loads the .env file

embeddings = GoogleGenerativeAIEmbeddings(model="models/gemini-embedding-001")

conn = psycopg2.connect(
    dbname="dsa_search",
    user="dsa_user",
    password="dsa_user",
    host="localhost",
    port="5432",
)

# Create a cursor
cur = conn.cursor()

insert_query = """
INSERT INTO dsa_questions_gemini (title, difficulty, topic, link, description, embedding)
VALUES (%s, %s, %s, %s, %s, %s)
"""

problemset = "https://cses.fi/problemset/list/"

try:
    res = requests.get(problemset, timeout=30)
    res.raise_for_status()  # an error page would otherwise be scraped as if it were the list
    soup = BeautifulSoup(res.text, "html.parser")

    # Keep only links to task pages. This replaces skipping a fixed number of
    # leading navigation links, which breaks whenever the page header changes.
    problems = [
        anchor
        for anchor in soup.find_all("a", href=True)
        if anchor["href"].startswith("/problemset/task/")
    ]

    for anchor in problems:
        href = anchor["href"]
        print(anchor.text, href)
        url = f"https://cses.fi{href}"

        # Separate names for the task page so the listing's `res`/`soup`
        # are not overwritten while iterating.
        task_res = requests.get(url, timeout=30)
        if not task_res.ok:
            print(f"❌ Skipping {url}: HTTP {task_res.status_code}")
            continue
        task_soup = BeautifulSoup(task_res.text, "html.parser")

        # find() returns None when the tag is absent; skip such pages instead
        # of failing with AttributeError on `.text`.
        title_tag = task_soup.find("title")
        first_paragraph = task_soup.find("p")
        if title_tag is None or first_paragraph is None:
            print(f"❌ Skipping {url}: missing <title> or <p>")
            continue

        title = title_tag.text
        description = first_paragraph.text  # first <p> of the task page
        link = url
        difficulty = "Unknown"
        topic = "Unknown"
        embedding = embeddings.embed_query(description)
        cur.execute(
            insert_query, (title, difficulty, topic, link, description, embedding)
        )
        print(title)

    conn.commit()
finally:
    # Release the cursor and connection even if a request, the embedding
    # call or an insert raises.
    cur.close()
    conn.close()

Weird Algorithm /problemset/task/1068
CSES - Weird Algorithm
Missing Number /problemset/task/1083
CSES - Missing Number
Repetitions /problemset/task/1069
CSES - Repetitions
Increasing Array /problemset/task/1094
CSES - Increasing Array
Permutations /problemset/task/1070
CSES - Permutations
Number Spiral /problemset/task/1071
CSES - Number Spiral
Two Knights /problemset/task/1072
CSES - Two Knights
Two Sets /problemset/task/1092
CSES - Two Sets
Bit Strings /problemset/task/1617
CSES - Bit Strings
Trailing Zeros /problemset/task/1618
CSES - Trailing Zeros
Coin Piles /problemset/task/1754
CSES - Coin Piles
Palindrome Reorder /problemset/task/1755
CSES - Palindrome Reorder
Gray Code /problemset/task/2205
CSES - Gray Code
Tower of Hanoi /problemset/task/2165
CSES - Tower of Hanoi
Creating Strings /problemset/task/1622
CSES - Creating Strings
Apple Division /problemset/task/1623
CSES - Apple Division
Chessboard and Queens /problemset/task/1624
CSES - Chessboard and Queens
Raab Game I /proble

# AtCoder problem sheet

In [27]:
import requests
from bs4 import BeautifulSoup
import psycopg2
from langchain_google_genai import GoogleGenerativeAIEmbeddings
import dotenv

# Headers sent with the page request below.
headers = {
    "Content-Type": "application/json",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
}
dotenv.load_dotenv()  # This loads the .env file

embeddings = GoogleGenerativeAIEmbeddings(model="models/gemini-embedding-001")

# Database handle and insert statement; nothing is inserted by this cell yet.
conn = psycopg2.connect(
    host="localhost",
    port="5432",
    dbname="dsa_search",
    user="dsa_user",
    password="dsa_user",
)
cur = conn.cursor()

insert_query = """
INSERT INTO dsa_questions_gemini (title, difficulty, topic, link, description, embedding)
VALUES (%s, %s, %s, %s, %s, %s)
"""

problemset = "https://atcoder.jp/contests/practice2/tasks/practice2_a"

# Download the page and dump every anchor that has an href, first as the raw
# list and then one "text href" pair per line.
res = requests.get(problemset, headers=headers)
soup = BeautifulSoup(res.text, "html.parser")
problems = soup.find_all("a", href=True)
print(problems)
for anchor in problems:
    label = anchor.text
    target = anchor.get("href")  # type: ignore
    print(label, target)

# TODO: per-task scraping (fetch each task page, embed its description and
# insert a row with `insert_query`) is not implemented for this site; the
# earlier draft here was a commented-out copy of the CSES loop.

conn.commit()
cur.close()
conn.close()

[<a class="navbar-brand" href="/home"></a>, <a class="contest-title" href="/contests/practice2">AtCoder Library Practice Contest</a>, <a aria-expanded="false" aria-haspopup="true" class="dropdown-toggle" data-toggle="dropdown" href="#" role="button">
<img src="//img.atcoder.jp/assets/top/img/flag-lang/en.png"/> English <span class="caret"></span>
</a>, <a href="/contests/practice2/tasks/practice2_a?lang=ja"><img src="//img.atcoder.jp/assets/top/img/flag-lang/ja.png"/> 日本語</a>, <a href="/contests/practice2/tasks/practice2_a?lang=en"><img src="//img.atcoder.jp/assets/top/img/flag-lang/en.png"/> English</a>, <a href="/register?continue=https%3A%2F%2Fatcoder.jp%2Fcontests%2Fpractice2%2Ftasks%2Fpractice2_a">Sign Up</a>, <a href="/login?continue=https%3A%2F%2Fatcoder.jp%2Fcontests%2Fpractice2%2Ftasks%2Fpractice2_a">Sign In</a>, <a href="http://www.timeanddate.com/worldclock/fixedtime.html?iso=20200907T2231&amp;p1=248" target="blank"><time class="fixtime fixtime-full">2020-09-07 22:31:51+0900