In [1]:
pip install requests




Below code was used to pull NLP requests 


In [None]:
import requests
import time
import json

# === SETTINGS ===
TAGS = ["nlp", "nltk", "spacy", "transformers", "text-classification"]
SITE = "stackoverflow"
PAGE_SIZE = 100
MAX_PAGES = 200
OUTPUT_FILE = "stackoverflow_nlp_posts_with_answers.json"
API_KEY = "rl_MdVXEpV7UgL1XMwe5e5BwoqEa"  # Replace with your key
TARGET_COUNT = 20000

all_questions = []
valid_question_count = 0

# === START LOOPING THROUGH TAGS AND PAGES ===
for tag in TAGS:
    if valid_question_count >= TARGET_COUNT:
        break

    for page in range(1, MAX_PAGES + 1):
        if valid_question_count >= TARGET_COUNT:
            break

        print(f"Fetching page {page} for tag [{tag}]...")

        # Get questions
        url = "https://api.stackexchange.com/2.3/questions"
        params = {
            "page": page,
            "pagesize": PAGE_SIZE,
            "order": "desc",
            "sort": "creation",
            "tagged": tag,
            "site": SITE,
            "filter": "withbody",
            "key": API_KEY
        }

        response = requests.get(url, params=params)
        if response.status_code != 200:
            print(f"⚠️ Error {response.status_code}: {response.text}")
            break

        try:
            data = response.json()
        except ValueError:
            print("⚠️ Invalid JSON response. Skipping this page.")
            continue

        if "items" not in data or not data["items"]:
            print(f"No items found for tag [{tag}], page {page}")
            break

        # Filter questions with accepted_answer_id
        questions_with_answers = [
            item for item in data["items"]
            if item.get("accepted_answer_id")
        ]

        # Batch accepted_answer_ids (max 100 per API call)
        accepted_ids = [str(item["accepted_answer_id"]) for item in questions_with_answers]
        id_batches = [accepted_ids[i:i+100] for i in range(0, len(accepted_ids), 100)]

        accepted_answers = {}
        for batch in id_batches:
            ids_str = ";".join(batch)
            ans_url = f"https://api.stackexchange.com/2.3/answers/{ids_str}"
            ans_params = {
                "site": SITE,
                "filter": "withbody",
                "key": API_KEY
            }
            ans_response = requests.get(ans_url, params=ans_params)
            if ans_response.status_code == 200:
                try:
                    ans_data = ans_response.json()
                    for ans in ans_data.get("items", []):
                        accepted_answers[ans["answer_id"]] = ans.get("body")
                except ValueError:
                    continue

            time.sleep(0.5)  # avoid hitting batch rate limits

        # Combine only valid Q&A pairs
        for item in questions_with_answers:
            if valid_question_count >= TARGET_COUNT:
                break

            aid = item["accepted_answer_id"]
            abody = accepted_answers.get(aid)

            if abody:  # Only store if body exists
                question_data = {
                    "question_id": item.get("question_id"),
                    "title": item.get("title"),
                    "body": item.get("body"),
                    "tags": item.get("tags"),
                    "accepted_answer_id": aid,
                    "accepted_answer_body": abody,
                    "score": item.get("score")
                }
                all_questions.append(question_data)
                valid_question_count += 1

        print(f"✅ Total collected so far: {valid_question_count}")
        time.sleep(0.5)

# === SAVE TO FILE ===
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    json.dump(all_questions, f, indent=4)

print(f"🎉 Done! Saved {valid_question_count} valid Q&A posts to {OUTPUT_FILE}")


Fetching page 1 for tag [nlp]...
✅ Total collected so far: 17
Fetching page 2 for tag [nlp]...
✅ Total collected so far: 37
Fetching page 3 for tag [nlp]...
✅ Total collected so far: 56
Fetching page 4 for tag [nlp]...
✅ Total collected so far: 66
Fetching page 5 for tag [nlp]...
✅ Total collected so far: 78
Fetching page 6 for tag [nlp]...
✅ Total collected so far: 96
Fetching page 7 for tag [nlp]...
✅ Total collected so far: 117
Fetching page 8 for tag [nlp]...
✅ Total collected so far: 145
Fetching page 9 for tag [nlp]...
✅ Total collected so far: 172
Fetching page 10 for tag [nlp]...
✅ Total collected so far: 200
Fetching page 11 for tag [nlp]...
✅ Total collected so far: 224
Fetching page 12 for tag [nlp]...
✅ Total collected so far: 240
Fetching page 13 for tag [nlp]...
✅ Total collected so far: 270
Fetching page 14 for tag [nlp]...
✅ Total collected so far: 293
Fetching page 15 for tag [nlp]...
✅ Total collected so far: 323
Fetching page 16 for tag [nlp]...
✅ Total collected so 

the beelow code was used to convert the NLP json to CSV 

In [2]:
import json
import pandas as pd

# === File Paths ===
INPUT_JSON = "stackoverflow_nlp_posts_with_answers.json"
OUTPUT_CSV = "stackoverflow_nlp_posts_with_answers.csv"

# === Load JSON Data ===
with open(INPUT_JSON, "r", encoding="utf-8") as file:
    data = json.load(file)

# === Convert to DataFrame ===
df = pd.DataFrame(data)

# === Add Link to Question ===
df["link"] = "https://stackoverflow.com/questions/" + df["question_id"].astype(str)

# === Extract First Tag ===
df["tag"] = df["tags"].apply(lambda x: x[0] if isinstance(x, list) and len(x) > 0 else None)

# === Reorder and Keep Specific Columns ===
ordered_columns = [
    "question_id",
    "title",
    "body",
    "tags",
    "accepted_answer_id",
    "accepted_answer_body",
    "link",
    "tag"
]

df = df[ordered_columns]

# === Save to CSV ===
df.to_csv(OUTPUT_CSV, index=False, encoding="utf-8")

print(f"✅ CSV saved successfully as: {OUTPUT_CSV}")


✅ CSV saved successfully as: stackoverflow_nlp_posts_with_answers.csv


In [5]:
df = pd.read_csv("stackoverflow_nlp_posts_with_answers.csv")
df

Unnamed: 0,question_id,title,body,tags,accepted_answer_id,accepted_answer_body,link,tag
0,79549787,Why does Presidio with spacy nlp engine not re...,<p>I'm using spaCy with the pl_core_news_lg mo...,"['python', 'nlp', 'spacy', 'presidio']",79552218,<p>The configuration file is missing the 'labe...,https://stackoverflow.com/questions/79549787,python
1,79548202,GPT-2 and other models from huggingface -100 l...,<p>I understand the -100 label id is used so t...,"['nlp', 'huggingface-transformers', 'pre-train...",79551169,<p>The author of the tutorial you mentioned se...,https://stackoverflow.com/questions/79548202,nlp
2,79523269,Trouble getting importing gensim to work in colab,<p>I am trying to import gensim into colab.</p...,"['numpy', 'nlp', 'dependencies', 'google-colab...",79523777,<p>You have to restart the session for the und...,https://stackoverflow.com/questions/79523269,numpy
3,79501178,Store images instead of showing in a server,<p>I am running the code found on this <a href...,"['python', 'nlp', 'large-language-model']",79501337,<p>I can't test it but ...</p>\n<p>I checked <...,https://stackoverflow.com/questions/79501178,python
4,79482283,Presidio with Langchain Experimental does not ...,<p>I am using presidio/langchain_experimental ...,"['python', 'nlp', 'spacy', 'langchain', 'presi...",79495969,<p>After some test I was able to find the solu...,https://stackoverflow.com/questions/79482283,python
...,...,...,...,...,...,...,...,...
9595,17490361,"learning, validation, and testing classifier",<p>I'm working on Sentiment Analysis for text ...,"['machine-learning', 'text-classification']",17494417,"<p>In your example, I don't think there is a m...",https://stackoverflow.com/questions/17490361,machine-learning
9596,16823609,Natural Language Processing - Converting Text ...,<p>So I've been working on a natural language ...,"['java', 'nlp', 'svm', 'text-classification']",16824208,<p>I'm not sure what values your attributes ca...,https://stackoverflow.com/questions/16823609,java
9597,16694088,How can i classify text documents with using S...,<p>Almost all of the examples are based on num...,"['svm', 'knn', 'document-classification', 'tex...",16694673,<p>The common approach is to use a bag of word...,https://stackoverflow.com/questions/16694088,svm
9598,16266842,MAXent classifier NLTK output understand,<p>I am trying to understand the <code>classif...,"['python', 'machine-learning', 'nltk', 'text-c...",16310378,"<p>It seems that you have two labels, <code>""R...",https://stackoverflow.com/questions/16266842,python


the below code was used to get nltk requests 

In [6]:
import requests
import time
import json

# === SETTINGS ===
TAGS = [ "nltk"]
SITE = "stackoverflow"
PAGE_SIZE = 100
MAX_PAGES = 200
OUTPUT_FILE = "stackoverflow_nlp_posts_with_answers_nltk.json"
API_KEY = "rl_MdVXEpV7UgL1XMwe5e5BwoqEa"  # Replace with your key
TARGET_COUNT = 20000

all_questions = []
valid_question_count = 0

# === START LOOPING THROUGH TAGS AND PAGES ===
for tag in TAGS:
    if valid_question_count >= TARGET_COUNT:
        break

    for page in range(1, MAX_PAGES + 1):
        if valid_question_count >= TARGET_COUNT:
            break

        print(f"Fetching page {page} for tag [{tag}]...")

        # Get questions
        url = "https://api.stackexchange.com/2.3/questions"
        params = {
            "page": page,
            "pagesize": PAGE_SIZE,
            "order": "desc",
            "sort": "creation",
            "tagged": tag,
            "site": SITE,
            "filter": "withbody",
            "key": API_KEY
        }

        response = requests.get(url, params=params)
        if response.status_code != 200:
            print(f"⚠️ Error {response.status_code}: {response.text}")
            break

        try:
            data = response.json()
        except ValueError:
            print("⚠️ Invalid JSON response. Skipping this page.")
            continue

        if "items" not in data or not data["items"]:
            print(f"No items found for tag [{tag}], page {page}")
            break

        # Filter questions with accepted_answer_id
        questions_with_answers = [
            item for item in data["items"]
            if item.get("accepted_answer_id")
        ]

        # Batch accepted_answer_ids (max 100 per API call)
        accepted_ids = [str(item["accepted_answer_id"]) for item in questions_with_answers]
        id_batches = [accepted_ids[i:i+100] for i in range(0, len(accepted_ids), 100)]

        accepted_answers = {}
        for batch in id_batches:
            ids_str = ";".join(batch)
            ans_url = f"https://api.stackexchange.com/2.3/answers/{ids_str}"
            ans_params = {
                "site": SITE,
                "filter": "withbody",
                "key": API_KEY
            }
            ans_response = requests.get(ans_url, params=ans_params)
            if ans_response.status_code == 200:
                try:
                    ans_data = ans_response.json()
                    for ans in ans_data.get("items", []):
                        accepted_answers[ans["answer_id"]] = ans.get("body")
                except ValueError:
                    continue

            time.sleep(0.5)  # avoid hitting batch rate limits

        # Combine only valid Q&A pairs
        for item in questions_with_answers:
            if valid_question_count >= TARGET_COUNT:
                break

            aid = item["accepted_answer_id"]
            abody = accepted_answers.get(aid)

            if abody:  # Only store if body exists
                question_data = {
                    "question_id": item.get("question_id"),
                    "title": item.get("title"),
                    "body": item.get("body"),
                    "tags": item.get("tags"),
                    "accepted_answer_id": aid,
                    "accepted_answer_body": abody,
                    "score": item.get("score")
                }
                all_questions.append(question_data)
                valid_question_count += 1

        print(f"✅ Total collected so far: {valid_question_count}")
        time.sleep(0.5)

# === SAVE TO FILE ===
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    json.dump(all_questions, f, indent=4)

print(f"🎉 Done! Saved {valid_question_count} valid Q&A posts to {OUTPUT_FILE}")


Fetching page 1 for tag [nltk]...
✅ Total collected so far: 26
Fetching page 2 for tag [nltk]...
✅ Total collected so far: 51
Fetching page 3 for tag [nltk]...
✅ Total collected so far: 81
Fetching page 4 for tag [nltk]...
✅ Total collected so far: 111
Fetching page 5 for tag [nltk]...
✅ Total collected so far: 141
Fetching page 6 for tag [nltk]...
✅ Total collected so far: 171
Fetching page 7 for tag [nltk]...
✅ Total collected so far: 201
Fetching page 8 for tag [nltk]...
✅ Total collected so far: 231
Fetching page 9 for tag [nltk]...
✅ Total collected so far: 261
Fetching page 10 for tag [nltk]...
✅ Total collected so far: 291
Fetching page 11 for tag [nltk]...
✅ Total collected so far: 321
Fetching page 12 for tag [nltk]...
✅ Total collected so far: 351
Fetching page 13 for tag [nltk]...
✅ Total collected so far: 381
Fetching page 14 for tag [nltk]...
✅ Total collected so far: 411
Fetching page 15 for tag [nltk]...
✅ Total collected so far: 441
Fetching page 16 for tag [nltk]...
✅ 

In [7]:
import json
import pandas as pd

# === File Paths ===
INPUT_JSON = "stackoverflow_nlp_posts_with_answers_nltk.json"
OUTPUT_CSV = "stackoverflow_nlp_posts_with_answers_nltk.csv"

# === Load JSON Data ===
with open(INPUT_JSON, "r", encoding="utf-8") as file:
    data = json.load(file)

# === Convert to DataFrame ===
df = pd.DataFrame(data)

# === Add Link to Question ===
df["link"] = "https://stackoverflow.com/questions/" + df["question_id"].astype(str)

# === Extract First Tag ===
df["tag"] = df["tags"].apply(lambda x: x[0] if isinstance(x, list) and len(x) > 0 else None)

# === Reorder and Keep Specific Columns ===
ordered_columns = [
    "question_id",
    "title",
    "body",
    "tags",
    "accepted_answer_id",
    "accepted_answer_body",
    "link",
    "tag"
]

df = df[ordered_columns]

# === Save to CSV ===
df.to_csv(OUTPUT_CSV, index=False, encoding="utf-8")

print(f"✅ CSV saved successfully as: {OUTPUT_CSV}")


✅ CSV saved successfully as: stackoverflow_nlp_posts_with_answers_nltk.csv


In [8]:
df_nltk = pd.read_csv("stackoverflow_nlp_posts_with_answers_nltk.csv")

In [9]:
df_nltk.head()

Unnamed: 0,question_id,title,body,tags,accepted_answer_id,accepted_answer_body,link,tag
0,79485382,nltk.NaiveBayesClassifier.classify() input par...,<p>I have the following trained classifier:</p...,"['classification', 'nltk', 'naivebayes', 'pyth...",79485768,<p>Use the feature without the label: <code>{'...,https://stackoverflow.com/questions/79485382,classification
1,79312133,Getting all leaf words (reverse stemming) into...,<p>On the same lines as the solution provided ...,"['python', 'nlp', 'nltk']",79312987,<p>One solution using nested list comprehensio...,https://stackoverflow.com/questions/79312133,python
2,79293919,Determining most popular words in the English ...,"<p>Forgive me if my wording is awful, but I'm ...","['python', 'nlp', 'nltk', 'detection']",79294074,<p>You need a external dataset for this task. ...,https://stackoverflow.com/questions/79293919,python
3,79229713,define equality predicate Lambda-Calculus nltk,<p>I am trying to define a Lambda-Calculus rep...,"['python', 'nltk', 'grammar', 'lambda-calculus']",79230197,"<p>By its syntactic definition, <code>are</cod...",https://stackoverflow.com/questions/79229713,python
4,78884251,Unable to install wordnet with nltk 3.9.0 as i...,"<p>It is not possible to import nltk, and the ...","['python', 'nltk', 'wordnet']",78884294,<p>This bug was introduced in nltk 3.9.0 (rele...,https://stackoverflow.com/questions/78884251,python


In [13]:
df_nltk.shape

(2124, 8)

the below code will be used to get spacy json 


In [14]:
import requests
import time
import json

# === SETTINGS ===
TAGS = ["spacy"]
SITE = "stackoverflow"
PAGE_SIZE = 100
MAX_PAGES = 200
OUTPUT_FILE = "stackoverflow_nlp_posts_with_answers_spacy.json"
API_KEY = "rl_MdVXEpV7UgL1XMwe5e5BwoqEa"  # Replace with your key
TARGET_COUNT = 20000

all_questions = []
valid_question_count = 0

# === START LOOPING THROUGH TAGS AND PAGES ===
for tag in TAGS:
    if valid_question_count >= TARGET_COUNT:
        break

    for page in range(1, MAX_PAGES + 1):
        if valid_question_count >= TARGET_COUNT:
            break

        print(f"Fetching page {page} for tag [{tag}]...")

        # Get questions
        url = "https://api.stackexchange.com/2.3/questions"
        params = {
            "page": page,
            "pagesize": PAGE_SIZE,
            "order": "desc",
            "sort": "creation",
            "tagged": tag,
            "site": SITE,
            "filter": "withbody",
            "key": API_KEY
        }

        response = requests.get(url, params=params)
        if response.status_code != 200:
            print(f"⚠️ Error {response.status_code}: {response.text}")
            break

        try:
            data = response.json()
        except ValueError:
            print("⚠️ Invalid JSON response. Skipping this page.")
            continue

        if "items" not in data or not data["items"]:
            print(f"No items found for tag [{tag}], page {page}")
            break

        # Filter questions with accepted_answer_id
        questions_with_answers = [
            item for item in data["items"]
            if item.get("accepted_answer_id")
        ]

        # Batch accepted_answer_ids (max 100 per API call)
        accepted_ids = [str(item["accepted_answer_id"]) for item in questions_with_answers]
        id_batches = [accepted_ids[i:i+100] for i in range(0, len(accepted_ids), 100)]

        accepted_answers = {}
        for batch in id_batches:
            ids_str = ";".join(batch)
            ans_url = f"https://api.stackexchange.com/2.3/answers/{ids_str}"
            ans_params = {
                "site": SITE,
                "filter": "withbody",
                "key": API_KEY
            }
            ans_response = requests.get(ans_url, params=ans_params)
            if ans_response.status_code == 200:
                try:
                    ans_data = ans_response.json()
                    for ans in ans_data.get("items", []):
                        accepted_answers[ans["answer_id"]] = ans.get("body")
                except ValueError:
                    continue

            time.sleep(0.5)  # avoid hitting batch rate limits

        # Combine only valid Q&A pairs
        for item in questions_with_answers:
            if valid_question_count >= TARGET_COUNT:
                break

            aid = item["accepted_answer_id"]
            abody = accepted_answers.get(aid)

            if abody:  # Only store if body exists
                question_data = {
                    "question_id": item.get("question_id"),
                    "title": item.get("title"),
                    "body": item.get("body"),
                    "tags": item.get("tags"),
                    "accepted_answer_id": aid,
                    "accepted_answer_body": abody,
                    "score": item.get("score")
                }
                all_questions.append(question_data)
                valid_question_count += 1

        print(f"✅ Total collected so far: {valid_question_count}")
        time.sleep(0.5)

# === SAVE TO FILE ===
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    json.dump(all_questions, f, indent=4)

print(f"🎉 Done! Saved {valid_question_count} valid Q&A posts to {OUTPUT_FILE}")


Fetching page 1 for tag [spacy]...
✅ Total collected so far: 22
Fetching page 2 for tag [spacy]...
✅ Total collected so far: 51
Fetching page 3 for tag [spacy]...
✅ Total collected so far: 81
Fetching page 4 for tag [spacy]...
✅ Total collected so far: 105
Fetching page 5 for tag [spacy]...
✅ Total collected so far: 135
Fetching page 6 for tag [spacy]...
✅ Total collected so far: 165
Fetching page 7 for tag [spacy]...
✅ Total collected so far: 195
Fetching page 8 for tag [spacy]...
✅ Total collected so far: 225
Fetching page 9 for tag [spacy]...
✅ Total collected so far: 255
Fetching page 10 for tag [spacy]...
✅ Total collected so far: 285
Fetching page 11 for tag [spacy]...
✅ Total collected so far: 315
Fetching page 12 for tag [spacy]...
✅ Total collected so far: 345
Fetching page 13 for tag [spacy]...
✅ Total collected so far: 375
Fetching page 14 for tag [spacy]...
✅ Total collected so far: 405
Fetching page 15 for tag [spacy]...
✅ Total collected so far: 435
Fetching page 16 for t

the below code was used to convert spacy json to CSV 


In [15]:
import json
import pandas as pd

# === File Paths ===
INPUT_JSON = "stackoverflow_nlp_posts_with_answers_spacy.json"
OUTPUT_CSV = "stackoverflow_nlp_posts_with_answers_spacy.csv"

# === Load JSON Data ===
with open(INPUT_JSON, "r", encoding="utf-8") as file:
    data = json.load(file)

# === Convert to DataFrame ===
df = pd.DataFrame(data)

# === Add Link to Question ===
df["link"] = "https://stackoverflow.com/questions/" + df["question_id"].astype(str)

# === Extract First Tag ===
df["tag"] = df["tags"].apply(lambda x: x[0] if isinstance(x, list) and len(x) > 0 else None)

# === Reorder and Keep Specific Columns ===
ordered_columns = [
    "question_id",
    "title",
    "body",
    "tags",
    "accepted_answer_id",
    "accepted_answer_body",
    "link",
    "tag"
]

df = df[ordered_columns]

# === Save to CSV ===
df.to_csv(OUTPUT_CSV, index=False, encoding="utf-8")

print(f"✅ CSV saved successfully as: {OUTPUT_CSV}")


✅ CSV saved successfully as: stackoverflow_nlp_posts_with_answers_spacy.csv


In [16]:
df_spacy = pd.read_csv("stackoverflow_nlp_posts_with_answers_spacy.csv")

In [17]:
df_spacy.head()

Unnamed: 0,question_id,title,body,tags,accepted_answer_id,accepted_answer_body,link,tag
0,79549787,Why does Presidio with spacy nlp engine not re...,<p>I'm using spaCy with the pl_core_news_lg mo...,"['python', 'nlp', 'spacy', 'presidio']",79552218,<p>The configuration file is missing the 'labe...,https://stackoverflow.com/questions/79549787,python
1,79482283,Presidio with Langchain Experimental does not ...,<p>I am using presidio/langchain_experimental ...,"['python', 'nlp', 'spacy', 'langchain', 'presi...",79495969,<p>After some test I was able to find the solu...,https://stackoverflow.com/questions/79482283,python
2,79330953,Lemma of puncutation in spacy,"<p>I'm using spacy for some downstream tasks, ...","['python', 'spacy', 'lemmatization']",79331038,"<p>I can confirm the issue with German, but wh...",https://stackoverflow.com/questions/79330953,python
3,79292283,Attaching custom KB to Spacy &quot;entity_link...,<p>I want to run an entity linking job using a...,"['spacy', 'named-entity-recognition', 'entity-...",79293967,<p>What happens here is that this line</p>\n<p...,https://stackoverflow.com/questions/79292283,spacy
4,79159805,How can I share a complex spaCy NLP model acro...,<p>I'm working on a multiprocessing python app...,"['nlp', 'multiprocessing', 'python-multiproces...",79162232,<p>I would strongly advise you not to treat NL...,https://stackoverflow.com/questions/79159805,nlp


the below code is used to pull text_classification json 

In [19]:
import requests
import time
import json

# === SETTINGS ===
TAGS = ["text-classification"]
SITE = "stackoverflow"
PAGE_SIZE = 100
MAX_PAGES = 200
OUTPUT_FILE = "stackoverflow_nlp_posts_with_answers_text-classification.json"
API_KEY = "rl_MdVXEpV7UgL1XMwe5e5BwoqEa"  # Replace with your key
TARGET_COUNT = 20000

all_questions = []
valid_question_count = 0

# === START LOOPING THROUGH TAGS AND PAGES ===
for tag in TAGS:
    if valid_question_count >= TARGET_COUNT:
        break

    for page in range(1, MAX_PAGES + 1):
        if valid_question_count >= TARGET_COUNT:
            break

        print(f"Fetching page {page} for tag [{tag}]...")

        # Get questions
        url = "https://api.stackexchange.com/2.3/questions"
        params = {
            "page": page,
            "pagesize": PAGE_SIZE,
            "order": "desc",
            "sort": "creation",
            "tagged": tag,
            "site": SITE,
            "filter": "withbody",
            "key": API_KEY
        }

        response = requests.get(url, params=params)
        if response.status_code != 200:
            print(f"⚠️ Error {response.status_code}: {response.text}")
            break

        try:
            data = response.json()
        except ValueError:
            print("⚠️ Invalid JSON response. Skipping this page.")
            continue

        if "items" not in data or not data["items"]:
            print(f"No items found for tag [{tag}], page {page}")
            break

        # Filter questions with accepted_answer_id
        questions_with_answers = [
            item for item in data["items"]
            if item.get("accepted_answer_id")
        ]

        # Batch accepted_answer_ids (max 100 per API call)
        accepted_ids = [str(item["accepted_answer_id"]) for item in questions_with_answers]
        id_batches = [accepted_ids[i:i+100] for i in range(0, len(accepted_ids), 100)]

        accepted_answers = {}
        for batch in id_batches:
            ids_str = ";".join(batch)
            ans_url = f"https://api.stackexchange.com/2.3/answers/{ids_str}"
            ans_params = {
                "site": SITE,
                "filter": "withbody",
                "key": API_KEY
            }
            ans_response = requests.get(ans_url, params=ans_params)
            if ans_response.status_code == 200:
                try:
                    ans_data = ans_response.json()
                    for ans in ans_data.get("items", []):
                        accepted_answers[ans["answer_id"]] = ans.get("body")
                except ValueError:
                    continue

            time.sleep(0.5)  # avoid hitting batch rate limits

        # Combine only valid Q&A pairs
        for item in questions_with_answers:
            if valid_question_count >= TARGET_COUNT:
                break

            aid = item["accepted_answer_id"]
            abody = accepted_answers.get(aid)

            if abody:  # Only store if body exists
                question_data = {
                    "question_id": item.get("question_id"),
                    "title": item.get("title"),
                    "body": item.get("body"),
                    "tags": item.get("tags"),
                    "accepted_answer_id": aid,
                    "accepted_answer_body": abody,
                    "score": item.get("score")
                }
                all_questions.append(question_data)
                valid_question_count += 1

        print(f"✅ Total collected so far: {valid_question_count}")
        time.sleep(0.5)

# === SAVE TO FILE ===
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    json.dump(all_questions, f, indent=4)

print(f"🎉 Done! Saved {valid_question_count} valid Q&A posts to {OUTPUT_FILE}")


Fetching page 1 for tag [text-classification]...
✅ Total collected so far: 28
Fetching page 2 for tag [text-classification]...
✅ Total collected so far: 51
Fetching page 3 for tag [text-classification]...
✅ Total collected so far: 74
Fetching page 4 for tag [text-classification]...
✅ Total collected so far: 104
Fetching page 5 for tag [text-classification]...
✅ Total collected so far: 134
Fetching page 6 for tag [text-classification]...
✅ Total collected so far: 164
Fetching page 7 for tag [text-classification]...
✅ Total collected so far: 194
Fetching page 8 for tag [text-classification]...
✅ Total collected so far: 224
Fetching page 9 for tag [text-classification]...
✅ Total collected so far: 254
Fetching page 10 for tag [text-classification]...
✅ Total collected so far: 284
Fetching page 11 for tag [text-classification]...
✅ Total collected so far: 314
Fetching page 12 for tag [text-classification]...
✅ Total collected so far: 344
Fetching page 13 for tag [text-classification]...
✅ 

text-classification CSV Conversion below 

In [20]:
import json
import pandas as pd

# === File Paths ===
INPUT_JSON = "stackoverflow_nlp_posts_with_answers_text-classification.json"
OUTPUT_CSV = "stackoverflow_nlp_posts_with_answers_text-classification.csv"

# === Load JSON Data ===
with open(INPUT_JSON, "r", encoding="utf-8") as file:
    data = json.load(file)

# === Convert to DataFrame ===
df = pd.DataFrame(data)

# === Add Link to Question ===
df["link"] = "https://stackoverflow.com/questions/" + df["question_id"].astype(str)

# === Extract First Tag ===
df["tag"] = df["tags"].apply(lambda x: x[0] if isinstance(x, list) and len(x) > 0 else None)

# === Reorder and Keep Specific Columns ===
ordered_columns = [
    "question_id",
    "title",
    "body",
    "tags",
    "accepted_answer_id",
    "accepted_answer_body",
    "link",
    "tag"
]

df = df[ordered_columns]

# === Save to CSV ===
df.to_csv(OUTPUT_CSV, index=False, encoding="utf-8")

print(f"✅ CSV saved successfully as: {OUTPUT_CSV}")


✅ CSV saved successfully as: stackoverflow_nlp_posts_with_answers_text-classification.csv


In [21]:
df_text_classification = pd.read_csv("stackoverflow_nlp_posts_with_answers_text-classification.csv")
df_text_classification.head()

Unnamed: 0,question_id,title,body,tags,accepted_answer_id,accepted_answer_body,link,tag
0,79247672,Error in getting Captum text explanations for ...,<p>I have the following code that I am using t...,"['machine-learning', 'pytorch', 'nlp', 'huggin...",79248379,<p>You need to slightly change the gradients c...,https://stackoverflow.com/questions/79247672,machine-learning
1,79247594,euclidian distance from word to sentence after...,<p>I have dataframe with 1000 text rows.</p>\n...,"['pandas', 'dataframe', 'nlp', 'text-classific...",79248087,<p>I am not convinced that the Euclidean dista...,https://stackoverflow.com/questions/79247594,pandas
2,79192127,How can I get the confidence variable from a C...,<p>I am using the CreateML tool to train a tex...,"['machine-learning', 'text-classification', 'c...",79209144,"<p>To best use the text classifiers, you shoul...",https://stackoverflow.com/questions/79192127,machine-learning
3,79016929,Machine learning model predicts training label...,<p>I am trying to build a model to predict &qu...,"['python', 'machine-learning', 'text-classific...",79017406,<p>The problem is that you read the test data ...,https://stackoverflow.com/questions/79016929,python
4,77879635,How to reset parameters from AutoModelForSeque...,<p>Currently to reinitialize a model for <code...,"['python', 'machine-learning', 'huggingface-tr...",77879847,"<p>That is the purpose of <a href=""https://hug...",https://stackoverflow.com/questions/77879635,python


In [22]:
import requests
import time
import json

# === SETTINGS ===
TAGS = ["lemmatization"]
SITE = "stackoverflow"
PAGE_SIZE = 100
MAX_PAGES = 200
OUTPUT_FILE = "stackoverflow_nlp_posts_with_answers_lemmatization.json"
API_KEY = "rl_MdVXEpV7UgL1XMwe5e5BwoqEa"  # Replace with your key
TARGET_COUNT = 20000

all_questions = []
valid_question_count = 0

# === START LOOPING THROUGH TAGS AND PAGES ===
for tag in TAGS:
    if valid_question_count >= TARGET_COUNT:
        break

    for page in range(1, MAX_PAGES + 1):
        if valid_question_count >= TARGET_COUNT:
            break

        print(f"Fetching page {page} for tag [{tag}]...")

        # Get questions
        url = "https://api.stackexchange.com/2.3/questions"
        params = {
            "page": page,
            "pagesize": PAGE_SIZE,
            "order": "desc",
            "sort": "creation",
            "tagged": tag,
            "site": SITE,
            "filter": "withbody",
            "key": API_KEY
        }

        response = requests.get(url, params=params)
        if response.status_code != 200:
            print(f"⚠️ Error {response.status_code}: {response.text}")
            break

        try:
            data = response.json()
        except ValueError:
            print("⚠️ Invalid JSON response. Skipping this page.")
            continue

        if "items" not in data or not data["items"]:
            print(f"No items found for tag [{tag}], page {page}")
            break

        # Filter questions with accepted_answer_id
        questions_with_answers = [
            item for item in data["items"]
            if item.get("accepted_answer_id")
        ]

        # Batch accepted_answer_ids (max 100 per API call)
        accepted_ids = [str(item["accepted_answer_id"]) for item in questions_with_answers]
        id_batches = [accepted_ids[i:i+100] for i in range(0, len(accepted_ids), 100)]

        accepted_answers = {}
        for batch in id_batches:
            ids_str = ";".join(batch)
            ans_url = f"https://api.stackexchange.com/2.3/answers/{ids_str}"
            ans_params = {
                "site": SITE,
                "filter": "withbody",
                "key": API_KEY
            }
            ans_response = requests.get(ans_url, params=ans_params)
            if ans_response.status_code == 200:
                try:
                    ans_data = ans_response.json()
                    for ans in ans_data.get("items", []):
                        accepted_answers[ans["answer_id"]] = ans.get("body")
                except ValueError:
                    continue

            time.sleep(0.5)  # avoid hitting batch rate limits

        # Combine only valid Q&A pairs
        for item in questions_with_answers:
            if valid_question_count >= TARGET_COUNT:
                break

            aid = item["accepted_answer_id"]
            abody = accepted_answers.get(aid)

            if abody:  # Only store if body exists
                question_data = {
                    "question_id": item.get("question_id"),
                    "title": item.get("title"),
                    "body": item.get("body"),
                    "tags": item.get("tags"),
                    "accepted_answer_id": aid,
                    "accepted_answer_body": abody,
                    "score": item.get("score")
                }
                all_questions.append(question_data)
                valid_question_count += 1

        print(f"✅ Total collected so far: {valid_question_count}")
        time.sleep(0.5)

# === SAVE TO FILE ===
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    json.dump(all_questions, f, indent=4)

print(f"🎉 Done! Saved {valid_question_count} valid Q&A posts to {OUTPUT_FILE}")


Fetching page 1 for tag [lemmatization]...
✅ Total collected so far: 30
Fetching page 2 for tag [lemmatization]...
✅ Total collected so far: 60
Fetching page 3 for tag [lemmatization]...
✅ Total collected so far: 90
Fetching page 4 for tag [lemmatization]...
✅ Total collected so far: 120
Fetching page 5 for tag [lemmatization]...
✅ Total collected so far: 141
Fetching page 6 for tag [lemmatization]...
No items found for tag [lemmatization], page 6
🎉 Done! Saved 141 valid Q&A posts to stackoverflow_nlp_posts_with_answers_lemmatization.json


In [23]:
import json
import pandas as pd

# === File Paths ===
INPUT_JSON = "stackoverflow_nlp_posts_with_answers_lemmatization.json"
OUTPUT_CSV = "stackoverflow_nlp_posts_with_answers_lemmatization.csv"

# === Load JSON Data ===
with open(INPUT_JSON, "r", encoding="utf-8") as file:
    data = json.load(file)

# === Convert to DataFrame ===
df = pd.DataFrame(data)

# === Add Link to Question ===
df["link"] = "https://stackoverflow.com/questions/" + df["question_id"].astype(str)

# === Extract First Tag ===
df["tag"] = df["tags"].apply(lambda x: x[0] if isinstance(x, list) and len(x) > 0 else None)

# === Reorder and Keep Specific Columns ===
ordered_columns = [
    "question_id",
    "title",
    "body",
    "tags",
    "accepted_answer_id",
    "accepted_answer_body",
    "link",
    "tag"
]

df = df[ordered_columns]

# === Save to CSV ===
df.to_csv(OUTPUT_CSV, index=False, encoding="utf-8")

print(f"✅ CSV saved successfully as: {OUTPUT_CSV}")


✅ CSV saved successfully as: stackoverflow_nlp_posts_with_answers_lemmatization.csv


In [24]:
df_lemmantization = pd.read_csv("stackoverflow_nlp_posts_with_answers_lemmatization.csv")
df_lemmantization.head()

Unnamed: 0,question_id,title,body,tags,accepted_answer_id,accepted_answer_body,link,tag
0,79330953,Lemma of puncutation in spacy,"<p>I'm using spacy for some downstream tasks, ...","['python', 'spacy', 'lemmatization']",79331038,"<p>I can confirm the issue with German, but wh...",https://stackoverflow.com/questions/79330953,python
1,79081924,"With spaCy, how can I get all lemmas from a st...",<p>I have a pandas data frame with a column of...,"['python', 'pandas', 'nlp', 'spacy', 'lemmatiz...",79086290,<p>There are many ways to speed up SpaCy proce...,https://stackoverflow.com/questions/79081924,python
2,78489915,How to lemmatize text column in pandas datafra...,<p>I read csv file into pandas dataframe.</p>\...,"['pandas', 'nlp', 'tokenize', 'lemmatization',...",78491545,"<p>No, you don't necessarily have to tokenize ...",https://stackoverflow.com/questions/78489915,pandas
3,78278881,How to speed up the lemmatization of a Serie i...,<p>I got this line that lemmatize a serie of a...,"['python', 'dataframe', 'optimization', 'serie...",78304621,"<p>I found a way to speed up this. In my case,...",https://stackoverflow.com/questions/78278881,python
4,78215873,Comparison between stemmiation and lemmatization,"<p>Based on several research , i found followi...","['python', 'nltk', 'stemming', 'lemmatization']",78216510,<p>Here is an example of what parts of speech ...,https://stackoverflow.com/questions/78215873,python


In [25]:
import requests
import time
import json

# === SETTINGS ===
TAG = "nlp"
SITE = "stackoverflow"
PAGE_SIZE = 100
MAX_PAGES = 200
TARGET_COUNT = 20000
API_KEY = "rl_MdVXEpV7UgL1XMwe5e5BwoqEa"

# Files
ORIGINAL_JSON = "stackoverflow_nlp_posts_with_answers.json"  # already collected
OUTPUT_FILE = "stackoverflow_nlp_posts_with_answers_second_set.json"

# === Load existing question IDs to avoid duplicates ===
try:
    with open(ORIGINAL_JSON, "r", encoding="utf-8") as f:
        existing_data = json.load(f)
        existing_ids = {q["question_id"] for q in existing_data}
except FileNotFoundError:
    print(f"⚠️ Original file {ORIGINAL_JSON} not found. Proceeding without filtering.")
    existing_ids = set()

print(f"🔍 Found {len(existing_ids)} existing question IDs to exclude.")

# === Initialize ===
all_questions = []
valid_question_count = 0

# === FETCH QUESTIONS ===
for page in range(1, MAX_PAGES + 1):
    if valid_question_count >= TARGET_COUNT:
        break

    print(f"Fetching page {page} for tag [{TAG}]...")

    url = "https://api.stackexchange.com/2.3/questions"
    params = {
        "page": page,
        "pagesize": PAGE_SIZE,
        "order": "desc",
        "sort": "creation",
        "tagged": TAG,
        "site": SITE,
        "filter": "withbody",
        "key": API_KEY
    }

    response = requests.get(url, params=params)
    if response.status_code != 200:
        print(f"⚠️ Error {response.status_code}: {response.text}")
        break

    try:
        data = response.json()
    except ValueError:
        print("⚠️ Invalid JSON response. Skipping this page.")
        continue

    if "items" not in data or not data["items"]:
        print(f"No items found on page {page}")
        break

    # Filter for new, answered questions only
    questions_with_answers = [
        item for item in data["items"]
        if item.get("accepted_answer_id") and item.get("question_id") not in existing_ids
    ]

    # Batch accepted answer IDs
    accepted_ids = [str(item["accepted_answer_id"]) for item in questions_with_answers]
    id_batches = [accepted_ids[i:i+100] for i in range(0, len(accepted_ids), 100)]

    accepted_answers = {}
    for batch in id_batches:
        ids_str = ";".join(batch)
        ans_url = f"https://api.stackexchange.com/2.3/answers/{ids_str}"
        ans_params = {
            "site": SITE,
            "filter": "withbody",
            "key": API_KEY
        }
        ans_resp = requests.get(ans_url, params=ans_params)
        if ans_resp.status_code == 200:
            try:
                ans_data = ans_resp.json()
                for ans in ans_data.get("items", []):
                    accepted_answers[ans["answer_id"]] = ans.get("body")
            except ValueError:
                continue
        time.sleep(0.5)

    # Store new unique Q&A pairs only
    for item in questions_with_answers:
        if valid_question_count >= TARGET_COUNT:
            break

        aid = item["accepted_answer_id"]
        abody = accepted_answers.get(aid)

        if abody:
            question_id = item.get("question_id")
            if question_id in existing_ids:
                continue  # double-check

            question_data = {
                "question_id": question_id,
                "title": item.get("title"),
                "body": item.get("body"),
                "tags": item.get("tags"),
                "accepted_answer_id": aid,
                "accepted_answer_body": abody,
                "score": item.get("score")
            }
            all_questions.append(question_data)
            valid_question_count += 1

    print(f"✅ Total new questions collected so far: {valid_question_count}")
    time.sleep(0.5)

# === SAVE TO NEW FILE ===
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    json.dump(all_questions, f, indent=4)

print(f"🎉 Done! Saved {valid_question_count} NEW Q&A posts to {OUTPUT_FILE}")


🔍 Found 8520 existing question IDs to exclude.
Fetching page 1 for tag [nlp]...
✅ Total new questions collected so far: 0
Fetching page 2 for tag [nlp]...
✅ Total new questions collected so far: 0
Fetching page 3 for tag [nlp]...
✅ Total new questions collected so far: 0
Fetching page 4 for tag [nlp]...
✅ Total new questions collected so far: 0
Fetching page 5 for tag [nlp]...
✅ Total new questions collected so far: 0
Fetching page 6 for tag [nlp]...
✅ Total new questions collected so far: 0
Fetching page 7 for tag [nlp]...
✅ Total new questions collected so far: 0
Fetching page 8 for tag [nlp]...
✅ Total new questions collected so far: 0
Fetching page 9 for tag [nlp]...
✅ Total new questions collected so far: 0
Fetching page 10 for tag [nlp]...
✅ Total new questions collected so far: 0
Fetching page 11 for tag [nlp]...
✅ Total new questions collected so far: 0
Fetching page 12 for tag [nlp]...
✅ Total new questions collected so far: 0
Fetching page 13 for tag [nlp]...
✅ Total new ques

nlp tag second set CSV Conversion 

In [26]:
import json
import pandas as pd

# === File Paths ===
INPUT_JSON = "stackoverflow_nlp_posts_with_answers_second_set.json"
OUTPUT_CSV = "stackoverflow_nlp_posts_with_answers_second_set.csv"

# === Load JSON Data ===
with open(INPUT_JSON, "r", encoding="utf-8") as file:
    data = json.load(file)

# === Convert to DataFrame ===
df = pd.DataFrame(data)

# === Add Link to Question ===
df["link"] = "https://stackoverflow.com/questions/" + df["question_id"].astype(str)

# === Extract First Tag ===
df["tag"] = df["tags"].apply(lambda x: x[0] if isinstance(x, list) and len(x) > 0 else None)

# === Reorder and Keep Specific Columns ===
ordered_columns = [
    "question_id",
    "title",
    "body",
    "tags",
    "accepted_answer_id",
    "accepted_answer_body",
    "link",
    "tag"
]

df = df[ordered_columns]

# === Save to CSV ===
df.to_csv(OUTPUT_CSV, index=False, encoding="utf-8")

print(f"✅ CSV saved successfully as: {OUTPUT_CSV}")


✅ CSV saved successfully as: stackoverflow_nlp_posts_with_answers_second_set.csv


In [27]:
import json

# === File Paths ===
FILE_1 = "stackoverflow_nlp_posts_with_answers.json"
FILE_2 = "stackoverflow_nlp_posts_with_answers_second_set.json"
OUTPUT_FILE = "stackoverflow_nlp_combined.json"

# === Load both JSON files ===
with open(FILE_1, "r", encoding="utf-8") as f1:
    data1 = json.load(f1)

with open(FILE_2, "r", encoding="utf-8") as f2:
    data2 = json.load(f2)

# === Merge and remove duplicates based on question_id ===
combined = {item["question_id"]: item for item in data1}
for item in data2:
    qid = item["question_id"]
    if qid not in combined:
        combined[qid] = item

# === Convert back to list and save ===
merged_data = list(combined.values())
with open(OUTPUT_FILE, "w", encoding="utf-8") as f_out:
    json.dump(merged_data, f_out, indent=4)

print(f"✅ Combined dataset saved to {OUTPUT_FILE} with {len(merged_data)} unique questions.")


✅ Combined dataset saved to stackoverflow_nlp_combined.json with 10459 unique questions.


In [29]:
import requests
import time
import json

# === SETTINGS ===
TAGS = ["stemming"]
SITE = "stackoverflow"
PAGE_SIZE = 100
MAX_PAGES = 200
OUTPUT_FILE = "stackoverflow_nlp_posts_with_answers_stemming.json"
API_KEY = "rl_MdVXEpV7UgL1XMwe5e5BwoqEa"  # Replace with your key
TARGET_COUNT = 20000

all_questions = []
valid_question_count = 0

# === START LOOPING THROUGH TAGS AND PAGES ===
for tag in TAGS:
    if valid_question_count >= TARGET_COUNT:
        break

    for page in range(1, MAX_PAGES + 1):
        if valid_question_count >= TARGET_COUNT:
            break

        print(f"Fetching page {page} for tag [{tag}]...")

        # Get questions
        url = "https://api.stackexchange.com/2.3/questions"
        params = {
            "page": page,
            "pagesize": PAGE_SIZE,
            "order": "desc",
            "sort": "creation",
            "tagged": tag,
            "site": SITE,
            "filter": "withbody",
            "key": API_KEY
        }

        response = requests.get(url, params=params)
        if response.status_code != 200:
            print(f"⚠️ Error {response.status_code}: {response.text}")
            break

        try:
            data = response.json()
        except ValueError:
            print("⚠️ Invalid JSON response. Skipping this page.")
            continue

        if "items" not in data or not data["items"]:
            print(f"No items found for tag [{tag}], page {page}")
            break

        # Filter questions with accepted_answer_id
        questions_with_answers = [
            item for item in data["items"]
            if item.get("accepted_answer_id")
        ]

        # Batch accepted_answer_ids (max 100 per API call)
        accepted_ids = [str(item["accepted_answer_id"]) for item in questions_with_answers]
        id_batches = [accepted_ids[i:i+100] for i in range(0, len(accepted_ids), 100)]

        accepted_answers = {}
        for batch in id_batches:
            ids_str = ";".join(batch)
            ans_url = f"https://api.stackexchange.com/2.3/answers/{ids_str}"
            ans_params = {
                "site": SITE,
                "filter": "withbody",
                "key": API_KEY
            }
            ans_response = requests.get(ans_url, params=ans_params)
            if ans_response.status_code == 200:
                try:
                    ans_data = ans_response.json()
                    for ans in ans_data.get("items", []):
                        accepted_answers[ans["answer_id"]] = ans.get("body")
                except ValueError:
                    continue

            time.sleep(0.5)  # avoid hitting batch rate limits

        # Combine only valid Q&A pairs
        for item in questions_with_answers:
            if valid_question_count >= TARGET_COUNT:
                break

            aid = item["accepted_answer_id"]
            abody = accepted_answers.get(aid)

            if abody:  # Only store if body exists
                question_data = {
                    "question_id": item.get("question_id"),
                    "title": item.get("title"),
                    "body": item.get("body"),
                    "tags": item.get("tags"),
                    "accepted_answer_id": aid,
                    "accepted_answer_body": abody,
                    "score": item.get("score")
                }
                all_questions.append(question_data)
                valid_question_count += 1

        print(f"✅ Total collected so far: {valid_question_count}")
        time.sleep(0.5)

# === SAVE TO FILE ===
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    json.dump(all_questions, f, indent=4)

print(f"🎉 Done! Saved {valid_question_count} valid Q&A posts to {OUTPUT_FILE}")


Fetching page 1 for tag [stemming]...
✅ Total collected so far: 30
Fetching page 2 for tag [stemming]...
✅ Total collected so far: 60
Fetching page 3 for tag [stemming]...
✅ Total collected so far: 90
Fetching page 4 for tag [stemming]...
✅ Total collected so far: 120
Fetching page 5 for tag [stemming]...
✅ Total collected so far: 150
Fetching page 6 for tag [stemming]...
✅ Total collected so far: 170
Fetching page 7 for tag [stemming]...
No items found for tag [stemming], page 7
🎉 Done! Saved 170 valid Q&A posts to stackoverflow_nlp_posts_with_answers_stemming.json


In [30]:
import json
import pandas as pd

# === File Paths ===
INPUT_JSON = "stackoverflow_nlp_posts_with_answers_stemming.json"
OUTPUT_CSV = "stackoverflow_nlp_posts_with_answers_stemming.csv"

# === Load JSON Data ===
with open(INPUT_JSON, "r", encoding="utf-8") as file:
    data = json.load(file)

# === Convert to DataFrame ===
df = pd.DataFrame(data)

# === Add Link to Question ===
df["link"] = "https://stackoverflow.com/questions/" + df["question_id"].astype(str)

# === Extract First Tag ===
df["tag"] = df["tags"].apply(lambda x: x[0] if isinstance(x, list) and len(x) > 0 else None)

# === Reorder and Keep Specific Columns ===
ordered_columns = [
    "question_id",
    "title",
    "body",
    "tags",
    "accepted_answer_id",
    "accepted_answer_body",
    "link",
    "tag"
]

df = df[ordered_columns]

# === Save to CSV ===
df.to_csv(OUTPUT_CSV, index=False, encoding="utf-8")

print(f"✅ CSV saved successfully as: {OUTPUT_CSV}")


✅ CSV saved successfully as: stackoverflow_nlp_posts_with_answers_stemming.csv


In [31]:
import requests
import time
import json

# === SETTINGS ===
TAGS = ["named-entity-recognition"]
SITE = "stackoverflow"
PAGE_SIZE = 100
MAX_PAGES = 200
OUTPUT_FILE = "stackoverflow_nlp_posts_with_answers_named-entity-recognition.json"
API_KEY = "rl_MdVXEpV7UgL1XMwe5e5BwoqEa"  # Replace with your key
TARGET_COUNT = 20000

all_questions = []
valid_question_count = 0

# === START LOOPING THROUGH TAGS AND PAGES ===
for tag in TAGS:
    if valid_question_count >= TARGET_COUNT:
        break

    for page in range(1, MAX_PAGES + 1):
        if valid_question_count >= TARGET_COUNT:
            break

        print(f"Fetching page {page} for tag [{tag}]...")

        # Get questions
        url = "https://api.stackexchange.com/2.3/questions"
        params = {
            "page": page,
            "pagesize": PAGE_SIZE,
            "order": "desc",
            "sort": "creation",
            "tagged": tag,
            "site": SITE,
            "filter": "withbody",
            "key": API_KEY
        }

        response = requests.get(url, params=params)
        if response.status_code != 200:
            print(f"⚠️ Error {response.status_code}: {response.text}")
            break

        try:
            data = response.json()
        except ValueError:
            print("⚠️ Invalid JSON response. Skipping this page.")
            continue

        if "items" not in data or not data["items"]:
            print(f"No items found for tag [{tag}], page {page}")
            break

        # Filter questions with accepted_answer_id
        questions_with_answers = [
            item for item in data["items"]
            if item.get("accepted_answer_id")
        ]

        # Batch accepted_answer_ids (max 100 per API call)
        accepted_ids = [str(item["accepted_answer_id"]) for item in questions_with_answers]
        id_batches = [accepted_ids[i:i+100] for i in range(0, len(accepted_ids), 100)]

        accepted_answers = {}
        for batch in id_batches:
            ids_str = ";".join(batch)
            ans_url = f"https://api.stackexchange.com/2.3/answers/{ids_str}"
            ans_params = {
                "site": SITE,
                "filter": "withbody",
                "key": API_KEY
            }
            ans_response = requests.get(ans_url, params=ans_params)
            if ans_response.status_code == 200:
                try:
                    ans_data = ans_response.json()
                    for ans in ans_data.get("items", []):
                        accepted_answers[ans["answer_id"]] = ans.get("body")
                except ValueError:
                    continue

            time.sleep(0.5)  # avoid hitting batch rate limits

        # Combine only valid Q&A pairs
        for item in questions_with_answers:
            if valid_question_count >= TARGET_COUNT:
                break

            aid = item["accepted_answer_id"]
            abody = accepted_answers.get(aid)

            if abody:  # Only store if body exists
                question_data = {
                    "question_id": item.get("question_id"),
                    "title": item.get("title"),
                    "body": item.get("body"),
                    "tags": item.get("tags"),
                    "accepted_answer_id": aid,
                    "accepted_answer_body": abody,
                    "score": item.get("score")
                }
                all_questions.append(question_data)
                valid_question_count += 1

        print(f"✅ Total collected so far: {valid_question_count}")
        time.sleep(0.5)

# === SAVE TO FILE ===
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    json.dump(all_questions, f, indent=4)

print(f"🎉 Done! Saved {valid_question_count} valid Q&A posts to {OUTPUT_FILE}")


Fetching page 1 for tag [named-entity-recognition]...
✅ Total collected so far: 16
Fetching page 2 for tag [named-entity-recognition]...
✅ Total collected so far: 46
Fetching page 3 for tag [named-entity-recognition]...
✅ Total collected so far: 76
Fetching page 4 for tag [named-entity-recognition]...
✅ Total collected so far: 106
Fetching page 5 for tag [named-entity-recognition]...
✅ Total collected so far: 136
Fetching page 6 for tag [named-entity-recognition]...
✅ Total collected so far: 166
Fetching page 7 for tag [named-entity-recognition]...
✅ Total collected so far: 196
Fetching page 8 for tag [named-entity-recognition]...
✅ Total collected so far: 226
Fetching page 9 for tag [named-entity-recognition]...
✅ Total collected so far: 256
Fetching page 10 for tag [named-entity-recognition]...
✅ Total collected so far: 286
Fetching page 11 for tag [named-entity-recognition]...
✅ Total collected so far: 316
Fetching page 12 for tag [named-entity-recognition]...
✅ Total collected so f

In [32]:
import json
import pandas as pd

# === File Paths ===
INPUT_JSON = "stackoverflow_nlp_posts_with_answers_named-entity-recognition.json"
OUTPUT_CSV = "stackoverflow_nlp_posts_with_answers_named-entity-recognition.csv"

# === Load JSON Data ===
with open(INPUT_JSON, "r", encoding="utf-8") as file:
    data = json.load(file)

# === Convert to DataFrame ===
df = pd.DataFrame(data)

# === Add Link to Question ===
df["link"] = "https://stackoverflow.com/questions/" + df["question_id"].astype(str)

# === Extract First Tag ===
df["tag"] = df["tags"].apply(lambda x: x[0] if isinstance(x, list) and len(x) > 0 else None)

# === Reorder and Keep Specific Columns ===
ordered_columns = [
    "question_id",
    "title",
    "body",
    "tags",
    "accepted_answer_id",
    "accepted_answer_body",
    "link",
    "tag"
]

df = df[ordered_columns]

# === Save to CSV ===
df.to_csv(OUTPUT_CSV, index=False, encoding="utf-8")

print(f"✅ CSV saved successfully as: {OUTPUT_CSV}")


✅ CSV saved successfully as: stackoverflow_nlp_posts_with_answers_named-entity-recognition.csv


In [33]:
import requests
import time
import json

# === SETTINGS ===
TAGS = ["language-detection"]
SITE = "stackoverflow"
PAGE_SIZE = 100
MAX_PAGES = 200
OUTPUT_FILE = "stackoverflow_nlp_posts_with_answers_language-detection.json"
API_KEY = "rl_MdVXEpV7UgL1XMwe5e5BwoqEa"  # Replace with your key
TARGET_COUNT = 20000

all_questions = []
valid_question_count = 0

# === START LOOPING THROUGH TAGS AND PAGES ===
for tag in TAGS:
    if valid_question_count >= TARGET_COUNT:
        break

    for page in range(1, MAX_PAGES + 1):
        if valid_question_count >= TARGET_COUNT:
            break

        print(f"Fetching page {page} for tag [{tag}]...")

        # Get questions
        url = "https://api.stackexchange.com/2.3/questions"
        params = {
            "page": page,
            "pagesize": PAGE_SIZE,
            "order": "desc",
            "sort": "creation",
            "tagged": tag,
            "site": SITE,
            "filter": "withbody",
            "key": API_KEY
        }

        response = requests.get(url, params=params)
        if response.status_code != 200:
            print(f"⚠️ Error {response.status_code}: {response.text}")
            break

        try:
            data = response.json()
        except ValueError:
            print("⚠️ Invalid JSON response. Skipping this page.")
            continue

        if "items" not in data or not data["items"]:
            print(f"No items found for tag [{tag}], page {page}")
            break

        # Filter questions with accepted_answer_id
        questions_with_answers = [
            item for item in data["items"]
            if item.get("accepted_answer_id")
        ]

        # Batch accepted_answer_ids (max 100 per API call)
        accepted_ids = [str(item["accepted_answer_id"]) for item in questions_with_answers]
        id_batches = [accepted_ids[i:i+100] for i in range(0, len(accepted_ids), 100)]

        accepted_answers = {}
        for batch in id_batches:
            ids_str = ";".join(batch)
            ans_url = f"https://api.stackexchange.com/2.3/answers/{ids_str}"
            ans_params = {
                "site": SITE,
                "filter": "withbody",
                "key": API_KEY
            }
            ans_response = requests.get(ans_url, params=ans_params)
            if ans_response.status_code == 200:
                try:
                    ans_data = ans_response.json()
                    for ans in ans_data.get("items", []):
                        accepted_answers[ans["answer_id"]] = ans.get("body")
                except ValueError:
                    continue

            time.sleep(0.5)  # avoid hitting batch rate limits

        # Combine only valid Q&A pairs
        for item in questions_with_answers:
            if valid_question_count >= TARGET_COUNT:
                break

            aid = item["accepted_answer_id"]
            abody = accepted_answers.get(aid)

            if abody:  # Only store if body exists
                question_data = {
                    "question_id": item.get("question_id"),
                    "title": item.get("title"),
                    "body": item.get("body"),
                    "tags": item.get("tags"),
                    "accepted_answer_id": aid,
                    "accepted_answer_body": abody,
                    "score": item.get("score")
                }
                all_questions.append(question_data)
                valid_question_count += 1

        print(f"✅ Total collected so far: {valid_question_count}")
        time.sleep(0.5)

# === SAVE TO FILE ===
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    json.dump(all_questions, f, indent=4)

print(f"🎉 Done! Saved {valid_question_count} valid Q&A posts to {OUTPUT_FILE}")


Fetching page 1 for tag [language-detection]...
✅ Total collected so far: 30
Fetching page 2 for tag [language-detection]...
✅ Total collected so far: 60
Fetching page 3 for tag [language-detection]...
No items found for tag [language-detection], page 3
🎉 Done! Saved 60 valid Q&A posts to stackoverflow_nlp_posts_with_answers_language-detection.json


In [35]:
import json
import pandas as pd

# === File Paths ===
INPUT_JSON = "stackoverflow_nlp_posts_with_answers_language-detection.json"
OUTPUT_CSV = "stackoverflow_nlp_posts_with_answers_language-detection.csv"

# === Load JSON Data ===
with open(INPUT_JSON, "r", encoding="utf-8") as file:
    data = json.load(file)

# === Convert to DataFrame ===
df = pd.DataFrame(data)

# === Add Link to Question ===
df["link"] = "https://stackoverflow.com/questions/" + df["question_id"].astype(str)

# === Extract First Tag ===
df["tag"] = df["tags"].apply(lambda x: x[0] if isinstance(x, list) and len(x) > 0 else None)

# === Reorder and Keep Specific Columns ===
ordered_columns = [
    "question_id",
    "title",
    "body",
    "tags",
    "accepted_answer_id",
    "accepted_answer_body",
    "link",
    "tag"
]

df = df[ordered_columns]

# === Save to CSV ===
df.to_csv(OUTPUT_CSV, index=False, encoding="utf-8")

print(f"✅ CSV saved successfully as: {OUTPUT_CSV}")


✅ CSV saved successfully as: stackoverflow_nlp_posts_with_answers_language-detection.csv


In [36]:
import requests
import time
import json

# === SETTINGS ===
TAGS = ["text-summarization"]
SITE = "stackoverflow"
PAGE_SIZE = 100
MAX_PAGES = 200
OUTPUT_FILE = "stackoverflow_nlp_posts_with_answers_text-summarization.json"
API_KEY = "rl_MdVXEpV7UgL1XMwe5e5BwoqEa"  # Replace with your key
TARGET_COUNT = 20000

all_questions = []
valid_question_count = 0

# === START LOOPING THROUGH TAGS AND PAGES ===
for tag in TAGS:
    if valid_question_count >= TARGET_COUNT:
        break

    for page in range(1, MAX_PAGES + 1):
        if valid_question_count >= TARGET_COUNT:
            break

        print(f"Fetching page {page} for tag [{tag}]...")

        # Get questions
        url = "https://api.stackexchange.com/2.3/questions"
        params = {
            "page": page,
            "pagesize": PAGE_SIZE,
            "order": "desc",
            "sort": "creation",
            "tagged": tag,
            "site": SITE,
            "filter": "withbody",
            "key": API_KEY
        }

        response = requests.get(url, params=params)
        if response.status_code != 200:
            print(f"⚠️ Error {response.status_code}: {response.text}")
            break

        try:
            data = response.json()
        except ValueError:
            print("⚠️ Invalid JSON response. Skipping this page.")
            continue

        if "items" not in data or not data["items"]:
            print(f"No items found for tag [{tag}], page {page}")
            break

        # Filter questions with accepted_answer_id
        questions_with_answers = [
            item for item in data["items"]
            if item.get("accepted_answer_id")
        ]

        # Batch accepted_answer_ids (max 100 per API call)
        accepted_ids = [str(item["accepted_answer_id"]) for item in questions_with_answers]
        id_batches = [accepted_ids[i:i+100] for i in range(0, len(accepted_ids), 100)]

        accepted_answers = {}
        for batch in id_batches:
            ids_str = ";".join(batch)
            ans_url = f"https://api.stackexchange.com/2.3/answers/{ids_str}"
            ans_params = {
                "site": SITE,
                "filter": "withbody",
                "key": API_KEY
            }
            ans_response = requests.get(ans_url, params=ans_params)
            if ans_response.status_code == 200:
                try:
                    ans_data = ans_response.json()
                    for ans in ans_data.get("items", []):
                        accepted_answers[ans["answer_id"]] = ans.get("body")
                except ValueError:
                    continue

            time.sleep(0.5)  # avoid hitting batch rate limits

        # Combine only valid Q&A pairs
        for item in questions_with_answers:
            if valid_question_count >= TARGET_COUNT:
                break

            aid = item["accepted_answer_id"]
            abody = accepted_answers.get(aid)

            if abody:  # Only store if body exists
                question_data = {
                    "question_id": item.get("question_id"),
                    "title": item.get("title"),
                    "body": item.get("body"),
                    "tags": item.get("tags"),
                    "accepted_answer_id": aid,
                    "accepted_answer_body": abody,
                    "score": item.get("score")
                }
                all_questions.append(question_data)
                valid_question_count += 1

        print(f"✅ Total collected so far: {valid_question_count}")
        time.sleep(0.5)

# === SAVE TO FILE ===
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    json.dump(all_questions, f, indent=4)

print(f"🎉 Done! Saved {valid_question_count} valid Q&A posts to {OUTPUT_FILE}")


Fetching page 1 for tag [text-summarization]...
No items found for tag [text-summarization], page 1
🎉 Done! Saved 0 valid Q&A posts to stackoverflow_nlp_posts_with_answers_text-summarization.json


In [37]:
import requests
import time
import json

# === SETTINGS ===
TAGS = ["bert"]
SITE = "stackoverflow"
PAGE_SIZE = 100
MAX_PAGES = 200
OUTPUT_FILE = "stackoverflow_nlp_posts_with_answers_bert.json"
API_KEY = "rl_MdVXEpV7UgL1XMwe5e5BwoqEa"  # Replace with your key
TARGET_COUNT = 20000

all_questions = []
valid_question_count = 0

# === START LOOPING THROUGH TAGS AND PAGES ===
for tag in TAGS:
    if valid_question_count >= TARGET_COUNT:
        break

    for page in range(1, MAX_PAGES + 1):
        if valid_question_count >= TARGET_COUNT:
            break

        print(f"Fetching page {page} for tag [{tag}]...")

        # Get questions
        url = "https://api.stackexchange.com/2.3/questions"
        params = {
            "page": page,
            "pagesize": PAGE_SIZE,
            "order": "desc",
            "sort": "creation",
            "tagged": tag,
            "site": SITE,
            "filter": "withbody",
            "key": API_KEY
        }

        response = requests.get(url, params=params)
        if response.status_code != 200:
            print(f"⚠️ Error {response.status_code}: {response.text}")
            break

        try:
            data = response.json()
        except ValueError:
            print("⚠️ Invalid JSON response. Skipping this page.")
            continue

        if "items" not in data or not data["items"]:
            print(f"No items found for tag [{tag}], page {page}")
            break

        # Filter questions with accepted_answer_id
        questions_with_answers = [
            item for item in data["items"]
            if item.get("accepted_answer_id")
        ]

        # Batch accepted_answer_ids (max 100 per API call)
        accepted_ids = [str(item["accepted_answer_id"]) for item in questions_with_answers]
        id_batches = [accepted_ids[i:i+100] for i in range(0, len(accepted_ids), 100)]

        accepted_answers = {}
        for batch in id_batches:
            ids_str = ";".join(batch)
            ans_url = f"https://api.stackexchange.com/2.3/answers/{ids_str}"
            ans_params = {
                "site": SITE,
                "filter": "withbody",
                "key": API_KEY
            }
            ans_response = requests.get(ans_url, params=ans_params)
            if ans_response.status_code == 200:
                try:
                    ans_data = ans_response.json()
                    for ans in ans_data.get("items", []):
                        accepted_answers[ans["answer_id"]] = ans.get("body")
                except ValueError:
                    continue

            time.sleep(0.5)  # avoid hitting batch rate limits

        # Combine only valid Q&A pairs
        for item in questions_with_answers:
            if valid_question_count >= TARGET_COUNT:
                break

            aid = item["accepted_answer_id"]
            abody = accepted_answers.get(aid)

            if abody:  # Only store if body exists
                question_data = {
                    "question_id": item.get("question_id"),
                    "title": item.get("title"),
                    "body": item.get("body"),
                    "tags": item.get("tags"),
                    "accepted_answer_id": aid,
                    "accepted_answer_body": abody,
                    "score": item.get("score")
                }
                all_questions.append(question_data)
                valid_question_count += 1

        print(f"✅ Total collected so far: {valid_question_count}")
        time.sleep(0.5)

# === SAVE TO FILE ===
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    json.dump(all_questions, f, indent=4)

print(f"🎉 Done! Saved {valid_question_count} valid Q&A posts to {OUTPUT_FILE}")


Fetching page 1 for tag [bert]...
✅ Total collected so far: 10
Fetching page 2 for tag [bert]...
✅ Total collected so far: 30
Fetching page 3 for tag [bert]...
✅ Total collected so far: 45
Fetching page 4 for tag [bert]...
✅ Total collected so far: 73
Fetching page 5 for tag [bert]...
✅ Total collected so far: 99
Fetching page 6 for tag [bert]...
✅ Total collected so far: 129
Fetching page 7 for tag [bert]...
✅ Total collected so far: 153
Fetching page 8 for tag [bert]...
✅ Total collected so far: 183
Fetching page 9 for tag [bert]...
✅ Total collected so far: 210
Fetching page 10 for tag [bert]...
✅ Total collected so far: 239
Fetching page 11 for tag [bert]...
✅ Total collected so far: 269
Fetching page 12 for tag [bert]...
✅ Total collected so far: 299
Fetching page 13 for tag [bert]...
✅ Total collected so far: 329
Fetching page 14 for tag [bert]...
✅ Total collected so far: 357
Fetching page 15 for tag [bert]...
✅ Total collected so far: 387
Fetching page 16 for tag [bert]...
✅ To

In [38]:
import json
import pandas as pd

# === File Paths ===
INPUT_JSON = "stackoverflow_nlp_posts_with_answers_bert.json"
OUTPUT_CSV = "stackoverflow_nlp_posts_with_answers_bert.csv"

# === Load JSON Data ===
with open(INPUT_JSON, "r", encoding="utf-8") as file:
    data = json.load(file)

# === Convert to DataFrame ===
df = pd.DataFrame(data)

# === Add Link to Question ===
df["link"] = "https://stackoverflow.com/questions/" + df["question_id"].astype(str)

# === Extract First Tag ===
df["tag"] = df["tags"].apply(lambda x: x[0] if isinstance(x, list) and len(x) > 0 else None)

# === Reorder and Keep Specific Columns ===
ordered_columns = [
    "question_id",
    "title",
    "body",
    "tags",
    "accepted_answer_id",
    "accepted_answer_body",
    "link",
    "tag"
]

df = df[ordered_columns]

# === Save to CSV ===
df.to_csv(OUTPUT_CSV, index=False, encoding="utf-8")

print(f"✅ CSV saved successfully as: {OUTPUT_CSV}")



✅ CSV saved successfully as: stackoverflow_nlp_posts_with_answers_bert.csv


In [39]:
import requests
import time
import json

# === SETTINGS ===
TAGS = ["sentence-transformers"]
SITE = "stackoverflow"
PAGE_SIZE = 100
MAX_PAGES = 200
OUTPUT_FILE = "stackoverflow_nlp_posts_with_answers_sentence-transformers.json"
API_KEY = "rl_MdVXEpV7UgL1XMwe5e5BwoqEa"  # Replace with your key
TARGET_COUNT = 20000

all_questions = []
valid_question_count = 0

# === START LOOPING THROUGH TAGS AND PAGES ===
for tag in TAGS:
    if valid_question_count >= TARGET_COUNT:
        break

    for page in range(1, MAX_PAGES + 1):
        if valid_question_count >= TARGET_COUNT:
            break

        print(f"Fetching page {page} for tag [{tag}]...")

        # Get questions
        url = "https://api.stackexchange.com/2.3/questions"
        params = {
            "page": page,
            "pagesize": PAGE_SIZE,
            "order": "desc",
            "sort": "creation",
            "tagged": tag,
            "site": SITE,
            "filter": "withbody",
            "key": API_KEY
        }

        response = requests.get(url, params=params)
        if response.status_code != 200:
            print(f"⚠️ Error {response.status_code}: {response.text}")
            break

        try:
            data = response.json()
        except ValueError:
            print("⚠️ Invalid JSON response. Skipping this page.")
            continue

        if "items" not in data or not data["items"]:
            print(f"No items found for tag [{tag}], page {page}")
            break

        # Filter questions with accepted_answer_id
        questions_with_answers = [
            item for item in data["items"]
            if item.get("accepted_answer_id")
        ]

        # Batch accepted_answer_ids (max 100 per API call)
        accepted_ids = [str(item["accepted_answer_id"]) for item in questions_with_answers]
        id_batches = [accepted_ids[i:i+100] for i in range(0, len(accepted_ids), 100)]

        accepted_answers = {}
        for batch in id_batches:
            ids_str = ";".join(batch)
            ans_url = f"https://api.stackexchange.com/2.3/answers/{ids_str}"
            ans_params = {
                "site": SITE,
                "filter": "withbody",
                "key": API_KEY
            }
            ans_response = requests.get(ans_url, params=ans_params)
            if ans_response.status_code == 200:
                try:
                    ans_data = ans_response.json()
                    for ans in ans_data.get("items", []):
                        accepted_answers[ans["answer_id"]] = ans.get("body")
                except ValueError:
                    continue

            time.sleep(0.5)  # avoid hitting batch rate limits

        # Combine only valid Q&A pairs
        for item in questions_with_answers:
            if valid_question_count >= TARGET_COUNT:
                break

            aid = item["accepted_answer_id"]
            abody = accepted_answers.get(aid)

            if abody:  # Only store if body exists
                question_data = {
                    "question_id": item.get("question_id"),
                    "title": item.get("title"),
                    "body": item.get("body"),
                    "tags": item.get("tags"),
                    "accepted_answer_id": aid,
                    "accepted_answer_body": abody,
                    "score": item.get("score")
                }
                all_questions.append(question_data)
                valid_question_count += 1

        print(f"✅ Total collected so far: {valid_question_count}")
        time.sleep(0.5)

# === SAVE TO FILE ===
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    json.dump(all_questions, f, indent=4)

print(f"🎉 Done! Saved {valid_question_count} valid Q&A posts to {OUTPUT_FILE}")


Fetching page 1 for tag [sentence-transformers]...
✅ Total collected so far: 8
Fetching page 2 for tag [sentence-transformers]...
✅ Total collected so far: 36
Fetching page 3 for tag [sentence-transformers]...
✅ Total collected so far: 50
Fetching page 4 for tag [sentence-transformers]...
No items found for tag [sentence-transformers], page 4
🎉 Done! Saved 50 valid Q&A posts to stackoverflow_nlp_posts_with_answers_sentence-transformers.json


In [40]:
import json
import pandas as pd

# === File Paths ===
INPUT_JSON = "stackoverflow_nlp_posts_with_answers_sentence-transformers.json"
OUTPUT_CSV = "stackoverflow_nlp_posts_with_answers_sentence-transformers.csv"

# === Load JSON Data ===
with open(INPUT_JSON, "r", encoding="utf-8") as file:
    data = json.load(file)

# === Convert to DataFrame ===
df = pd.DataFrame(data)

# === Add Link to Question ===
df["link"] = "https://stackoverflow.com/questions/" + df["question_id"].astype(str)

# === Extract First Tag ===
df["tag"] = df["tags"].apply(lambda x: x[0] if isinstance(x, list) and len(x) > 0 else None)

# === Reorder and Keep Specific Columns ===
ordered_columns = [
    "question_id",
    "title",
    "body",
    "tags",
    "accepted_answer_id",
    "accepted_answer_body",
    "link",
    "tag"
]

df = df[ordered_columns]

# === Save to CSV ===
df.to_csv(OUTPUT_CSV, index=False, encoding="utf-8")

print(f"✅ CSV saved successfully as: {OUTPUT_CSV}")



✅ CSV saved successfully as: stackoverflow_nlp_posts_with_answers_sentence-transformers.csv


In [42]:
import requests
import time
import json

# === SETTINGS ===
TAGS = ["huggingface-transformers"]
SITE = "stackoverflow"
PAGE_SIZE = 100
MAX_PAGES = 200
OUTPUT_FILE = "stackoverflow_nlp_posts_with_answers_huggingface-transformers.json"
API_KEY = "rl_MdVXEpV7UgL1XMwe5e5BwoqEa"  # Replace with your key
TARGET_COUNT = 20000

all_questions = []
valid_question_count = 0

# === START LOOPING THROUGH TAGS AND PAGES ===
for tag in TAGS:
    if valid_question_count >= TARGET_COUNT:
        break

    for page in range(1, MAX_PAGES + 1):
        if valid_question_count >= TARGET_COUNT:
            break

        print(f"Fetching page {page} for tag [{tag}]...")

        # Get questions
        url = "https://api.stackexchange.com/2.3/questions"
        params = {
            "page": page,
            "pagesize": PAGE_SIZE,
            "order": "desc",
            "sort": "creation",
            "tagged": tag,
            "site": SITE,
            "filter": "withbody",
            "key": API_KEY
        }

        response = requests.get(url, params=params)
        if response.status_code != 200:
            print(f"⚠️ Error {response.status_code}: {response.text}")
            break

        try:
            data = response.json()
        except ValueError:
            print("⚠️ Invalid JSON response. Skipping this page.")
            continue

        if "items" not in data or not data["items"]:
            print(f"No items found for tag [{tag}], page {page}")
            break

        # Filter questions with accepted_answer_id
        questions_with_answers = [
            item for item in data["items"]
            if item.get("accepted_answer_id")
        ]

        # Batch accepted_answer_ids (max 100 per API call)
        accepted_ids = [str(item["accepted_answer_id"]) for item in questions_with_answers]
        id_batches = [accepted_ids[i:i+100] for i in range(0, len(accepted_ids), 100)]

        accepted_answers = {}
        for batch in id_batches:
            ids_str = ";".join(batch)
            ans_url = f"https://api.stackexchange.com/2.3/answers/{ids_str}"
            ans_params = {
                "site": SITE,
                "filter": "withbody",
                "key": API_KEY
            }
            ans_response = requests.get(ans_url, params=ans_params)
            if ans_response.status_code == 200:
                try:
                    ans_data = ans_response.json()
                    for ans in ans_data.get("items", []):
                        accepted_answers[ans["answer_id"]] = ans.get("body")
                except ValueError:
                    continue

            time.sleep(0.5)  # avoid hitting batch rate limits

        # Combine only valid Q&A pairs
        for item in questions_with_answers:
            if valid_question_count >= TARGET_COUNT:
                break

            aid = item["accepted_answer_id"]
            abody = accepted_answers.get(aid)

            if abody:  # Only store if body exists
                question_data = {
                    "question_id": item.get("question_id"),
                    "title": item.get("title"),
                    "body": item.get("body"),
                    "tags": item.get("tags"),
                    "accepted_answer_id": aid,
                    "accepted_answer_body": abody,
                    "score": item.get("score")
                }
                all_questions.append(question_data)
                valid_question_count += 1

        print(f"✅ Total collected so far: {valid_question_count}")
        time.sleep(0.5)

# === SAVE TO FILE ===
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    json.dump(all_questions, f, indent=4)

print(f"🎉 Done! Saved {valid_question_count} valid Q&A posts to {OUTPUT_FILE}")


Fetching page 1 for tag [huggingface-transformers]...
✅ Total collected so far: 17
Fetching page 2 for tag [huggingface-transformers]...
✅ Total collected so far: 32
Fetching page 3 for tag [huggingface-transformers]...
✅ Total collected so far: 58
Fetching page 4 for tag [huggingface-transformers]...
✅ Total collected so far: 75
Fetching page 5 for tag [huggingface-transformers]...
✅ Total collected so far: 89
Fetching page 6 for tag [huggingface-transformers]...
✅ Total collected so far: 100
Fetching page 7 for tag [huggingface-transformers]...
✅ Total collected so far: 123
Fetching page 8 for tag [huggingface-transformers]...
✅ Total collected so far: 151
Fetching page 9 for tag [huggingface-transformers]...
✅ Total collected so far: 174
Fetching page 10 for tag [huggingface-transformers]...
✅ Total collected so far: 192
Fetching page 11 for tag [huggingface-transformers]...
✅ Total collected so far: 210
Fetching page 12 for tag [huggingface-transformers]...
✅ Total collected so far

In [43]:
import json
import pandas as pd

# === File Paths ===
INPUT_JSON = "stackoverflow_nlp_posts_with_answers_huggingface-transformers.json"
OUTPUT_CSV = "stackoverflow_nlp_posts_with_answers_huggingface-transformers.csv"

# === Load JSON Data ===
with open(INPUT_JSON, "r", encoding="utf-8") as file:
    data = json.load(file)

# === Convert to DataFrame ===
df = pd.DataFrame(data)

# === Add Link to Question ===
df["link"] = "https://stackoverflow.com/questions/" + df["question_id"].astype(str)

# === Extract First Tag ===
df["tag"] = df["tags"].apply(lambda x: x[0] if isinstance(x, list) and len(x) > 0 else None)

# === Reorder and Keep Specific Columns ===
ordered_columns = [
    "question_id",
    "title",
    "body",
    "tags",
    "accepted_answer_id",
    "accepted_answer_body",
    "link",
    "tag"
]

df = df[ordered_columns]

# === Save to CSV ===
df.to_csv(OUTPUT_CSV, index=False, encoding="utf-8")

print(f"✅ CSV saved successfully as: {OUTPUT_CSV}")



✅ CSV saved successfully as: stackoverflow_nlp_posts_with_answers_huggingface-transformers.csv


In [44]:
import requests
import time
import json

# === SETTINGS ===
TAGS = ["information-retrieval"]
SITE = "stackoverflow"
PAGE_SIZE = 100
MAX_PAGES = 200
OUTPUT_FILE = "stackoverflow_nlp_posts_with_answers_information=retrieval.json"
API_KEY = "rl_MdVXEpV7UgL1XMwe5e5BwoqEa"  # Replace with your key
TARGET_COUNT = 20000

all_questions = []
valid_question_count = 0

# === START LOOPING THROUGH TAGS AND PAGES ===
for tag in TAGS:
    if valid_question_count >= TARGET_COUNT:
        break

    for page in range(1, MAX_PAGES + 1):
        if valid_question_count >= TARGET_COUNT:
            break

        print(f"Fetching page {page} for tag [{tag}]...")

        # Get questions
        url = "https://api.stackexchange.com/2.3/questions"
        params = {
            "page": page,
            "pagesize": PAGE_SIZE,
            "order": "desc",
            "sort": "creation",
            "tagged": tag,
            "site": SITE,
            "filter": "withbody",
            "key": API_KEY
        }

        response = requests.get(url, params=params)
        if response.status_code != 200:
            print(f"⚠️ Error {response.status_code}: {response.text}")
            break

        try:
            data = response.json()
        except ValueError:
            print("⚠️ Invalid JSON response. Skipping this page.")
            continue

        if "items" not in data or not data["items"]:
            print(f"No items found for tag [{tag}], page {page}")
            break

        # Filter questions with accepted_answer_id
        questions_with_answers = [
            item for item in data["items"]
            if item.get("accepted_answer_id")
        ]

        # Batch accepted_answer_ids (max 100 per API call)
        accepted_ids = [str(item["accepted_answer_id"]) for item in questions_with_answers]
        id_batches = [accepted_ids[i:i+100] for i in range(0, len(accepted_ids), 100)]

        accepted_answers = {}
        for batch in id_batches:
            ids_str = ";".join(batch)
            ans_url = f"https://api.stackexchange.com/2.3/answers/{ids_str}"
            ans_params = {
                "site": SITE,
                "filter": "withbody",
                "key": API_KEY
            }
            ans_response = requests.get(ans_url, params=ans_params)
            if ans_response.status_code == 200:
                try:
                    ans_data = ans_response.json()
                    for ans in ans_data.get("items", []):
                        accepted_answers[ans["answer_id"]] = ans.get("body")
                except ValueError:
                    continue

            time.sleep(0.5)  # avoid hitting batch rate limits

        # Combine only valid Q&A pairs
        for item in questions_with_answers:
            if valid_question_count >= TARGET_COUNT:
                break

            aid = item["accepted_answer_id"]
            abody = accepted_answers.get(aid)

            if abody:  # Only store if body exists
                question_data = {
                    "question_id": item.get("question_id"),
                    "title": item.get("title"),
                    "body": item.get("body"),
                    "tags": item.get("tags"),
                    "accepted_answer_id": aid,
                    "accepted_answer_body": abody,
                    "score": item.get("score")
                }
                all_questions.append(question_data)
                valid_question_count += 1

        print(f"✅ Total collected so far: {valid_question_count}")
        time.sleep(0.5)

# === SAVE TO FILE ===
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    json.dump(all_questions, f, indent=4)

print(f"🎉 Done! Saved {valid_question_count} valid Q&A posts to {OUTPUT_FILE}")


Fetching page 1 for tag [information-retrieval]...
✅ Total collected so far: 17
Fetching page 2 for tag [information-retrieval]...
✅ Total collected so far: 47
Fetching page 3 for tag [information-retrieval]...
✅ Total collected so far: 77
Fetching page 4 for tag [information-retrieval]...
✅ Total collected so far: 107
Fetching page 5 for tag [information-retrieval]...
✅ Total collected so far: 137
Fetching page 6 for tag [information-retrieval]...
✅ Total collected so far: 167
Fetching page 7 for tag [information-retrieval]...
✅ Total collected so far: 197
Fetching page 8 for tag [information-retrieval]...
✅ Total collected so far: 227
Fetching page 9 for tag [information-retrieval]...
✅ Total collected so far: 257
Fetching page 10 for tag [information-retrieval]...
✅ Total collected so far: 287
Fetching page 11 for tag [information-retrieval]...
✅ Total collected so far: 317
Fetching page 12 for tag [information-retrieval]...
✅ Total collected so far: 347
Fetching page 13 for tag [in

In [45]:
import json
import pandas as pd

# === File Paths ===
INPUT_JSON = "stackoverflow_nlp_posts_with_answers_information=retrieval.json"
OUTPUT_CSV = "stackoverflow_nlp_posts_with_answers_information=retrieval.csv"

# === Load JSON Data ===
with open(INPUT_JSON, "r", encoding="utf-8") as file:
    data = json.load(file)

# === Convert to DataFrame ===
df = pd.DataFrame(data)

# === Add Link to Question ===
df["link"] = "https://stackoverflow.com/questions/" + df["question_id"].astype(str)

# === Extract First Tag ===
df["tag"] = df["tags"].apply(lambda x: x[0] if isinstance(x, list) and len(x) > 0 else None)

# === Reorder and Keep Specific Columns ===
ordered_columns = [
    "question_id",
    "title",
    "body",
    "tags",
    "accepted_answer_id",
    "accepted_answer_body",
    "link",
    "tag"
]

df = df[ordered_columns]

# === Save to CSV ===
df.to_csv(OUTPUT_CSV, index=False, encoding="utf-8")

print(f"✅ CSV saved successfully as: {OUTPUT_CSV}")



✅ CSV saved successfully as: stackoverflow_nlp_posts_with_answers_information=retrieval.csv


In [46]:
import requests
import time
import json

# === SETTINGS ===
TAGS = ["openai-api"]
SITE = "stackoverflow"
PAGE_SIZE = 100
MAX_PAGES = 200
OUTPUT_FILE = "stackoverflow_nlp_posts_with_answers_openai-api.json"
API_KEY = "rl_MdVXEpV7UgL1XMwe5e5BwoqEa"  # Replace with your key
TARGET_COUNT = 20000

all_questions = []
valid_question_count = 0

# === START LOOPING THROUGH TAGS AND PAGES ===
for tag in TAGS:
    if valid_question_count >= TARGET_COUNT:
        break

    for page in range(1, MAX_PAGES + 1):
        if valid_question_count >= TARGET_COUNT:
            break

        print(f"Fetching page {page} for tag [{tag}]...")

        # Get questions
        url = "https://api.stackexchange.com/2.3/questions"
        params = {
            "page": page,
            "pagesize": PAGE_SIZE,
            "order": "desc",
            "sort": "creation",
            "tagged": tag,
            "site": SITE,
            "filter": "withbody",
            "key": API_KEY
        }

        response = requests.get(url, params=params)
        if response.status_code != 200:
            print(f"⚠️ Error {response.status_code}: {response.text}")
            break

        try:
            data = response.json()
        except ValueError:
            print("⚠️ Invalid JSON response. Skipping this page.")
            continue

        if "items" not in data or not data["items"]:
            print(f"No items found for tag [{tag}], page {page}")
            break

        # Filter questions with accepted_answer_id
        questions_with_answers = [
            item for item in data["items"]
            if item.get("accepted_answer_id")
        ]

        # Batch accepted_answer_ids (max 100 per API call)
        accepted_ids = [str(item["accepted_answer_id"]) for item in questions_with_answers]
        id_batches = [accepted_ids[i:i+100] for i in range(0, len(accepted_ids), 100)]

        accepted_answers = {}
        for batch in id_batches:
            ids_str = ";".join(batch)
            ans_url = f"https://api.stackexchange.com/2.3/answers/{ids_str}"
            ans_params = {
                "site": SITE,
                "filter": "withbody",
                "key": API_KEY
            }
            ans_response = requests.get(ans_url, params=ans_params)
            if ans_response.status_code == 200:
                try:
                    ans_data = ans_response.json()
                    for ans in ans_data.get("items", []):
                        accepted_answers[ans["answer_id"]] = ans.get("body")
                except ValueError:
                    continue

            time.sleep(0.5)  # avoid hitting batch rate limits

        # Combine only valid Q&A pairs
        for item in questions_with_answers:
            if valid_question_count >= TARGET_COUNT:
                break

            aid = item["accepted_answer_id"]
            abody = accepted_answers.get(aid)

            if abody:  # Only store if body exists
                question_data = {
                    "question_id": item.get("question_id"),
                    "title": item.get("title"),
                    "body": item.get("body"),
                    "tags": item.get("tags"),
                    "accepted_answer_id": aid,
                    "accepted_answer_body": abody,
                    "score": item.get("score")
                }
                all_questions.append(question_data)
                valid_question_count += 1

        print(f"✅ Total collected so far: {valid_question_count}")
        time.sleep(0.5)

# === SAVE TO FILE ===
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    json.dump(all_questions, f, indent=4)

print(f"🎉 Done! Saved {valid_question_count} valid Q&A posts to {OUTPUT_FILE}")


Fetching page 1 for tag [openai-api]...
✅ Total collected so far: 16
Fetching page 2 for tag [openai-api]...
✅ Total collected so far: 29
Fetching page 3 for tag [openai-api]...
✅ Total collected so far: 45
Fetching page 4 for tag [openai-api]...
✅ Total collected so far: 65
Fetching page 5 for tag [openai-api]...
✅ Total collected so far: 82
Fetching page 6 for tag [openai-api]...
✅ Total collected so far: 97
Fetching page 7 for tag [openai-api]...
✅ Total collected so far: 109
Fetching page 8 for tag [openai-api]...
✅ Total collected so far: 126
Fetching page 9 for tag [openai-api]...
✅ Total collected so far: 152
Fetching page 10 for tag [openai-api]...
✅ Total collected so far: 180
Fetching page 11 for tag [openai-api]...
✅ Total collected so far: 201
Fetching page 12 for tag [openai-api]...
✅ Total collected so far: 217
Fetching page 13 for tag [openai-api]...
✅ Total collected so far: 247
Fetching page 14 for tag [openai-api]...
✅ Total collected so far: 273
Fetching page 15 for 

In [47]:
import json
import pandas as pd

# === File Paths ===
INPUT_JSON = "stackoverflow_nlp_posts_with_answers_openai-api.json"
OUTPUT_CSV = "stackoverflow_nlp_posts_with_answers_openai-api.csv"

# === Load JSON Data ===
with open(INPUT_JSON, "r", encoding="utf-8") as file:
    data = json.load(file)

# === Convert to DataFrame ===
df = pd.DataFrame(data)

# === Add Link to Question ===
df["link"] = "https://stackoverflow.com/questions/" + df["question_id"].astype(str)

# === Extract First Tag ===
df["tag"] = df["tags"].apply(lambda x: x[0] if isinstance(x, list) and len(x) > 0 else None)

# === Reorder and Keep Specific Columns ===
ordered_columns = [
    "question_id",
    "title",
    "body",
    "tags",
    "accepted_answer_id",
    "accepted_answer_body",
    "link",
    "tag"
]

df = df[ordered_columns]

# === Save to CSV ===
df.to_csv(OUTPUT_CSV, index=False, encoding="utf-8")

print(f"✅ CSV saved successfully as: {OUTPUT_CSV}")



✅ CSV saved successfully as: stackoverflow_nlp_posts_with_answers_openai-api.csv


In [48]:
import requests
import time
import json

# === SETTINGS ===
TAGS = ["topic-modeling"]
SITE = "stackoverflow"
PAGE_SIZE = 100
MAX_PAGES = 200
OUTPUT_FILE = "stackoverflow_nlp_posts_with_answers_topic-modeling.json"
API_KEY = "rl_MdVXEpV7UgL1XMwe5e5BwoqEa"  # Replace with your key
TARGET_COUNT = 20000

all_questions = []
valid_question_count = 0

# === START LOOPING THROUGH TAGS AND PAGES ===
for tag in TAGS:
    if valid_question_count >= TARGET_COUNT:
        break

    for page in range(1, MAX_PAGES + 1):
        if valid_question_count >= TARGET_COUNT:
            break

        print(f"Fetching page {page} for tag [{tag}]...")

        # Get questions
        url = "https://api.stackexchange.com/2.3/questions"
        params = {
            "page": page,
            "pagesize": PAGE_SIZE,
            "order": "desc",
            "sort": "creation",
            "tagged": tag,
            "site": SITE,
            "filter": "withbody",
            "key": API_KEY
        }

        response = requests.get(url, params=params)
        if response.status_code != 200:
            print(f"⚠️ Error {response.status_code}: {response.text}")
            break

        try:
            data = response.json()
        except ValueError:
            print("⚠️ Invalid JSON response. Skipping this page.")
            continue

        if "items" not in data or not data["items"]:
            print(f"No items found for tag [{tag}], page {page}")
            break

        # Filter questions with accepted_answer_id
        questions_with_answers = [
            item for item in data["items"]
            if item.get("accepted_answer_id")
        ]

        # Batch accepted_answer_ids (max 100 per API call)
        accepted_ids = [str(item["accepted_answer_id"]) for item in questions_with_answers]
        id_batches = [accepted_ids[i:i+100] for i in range(0, len(accepted_ids), 100)]

        accepted_answers = {}
        for batch in id_batches:
            ids_str = ";".join(batch)
            ans_url = f"https://api.stackexchange.com/2.3/answers/{ids_str}"
            ans_params = {
                "site": SITE,
                "filter": "withbody",
                "key": API_KEY
            }
            ans_response = requests.get(ans_url, params=ans_params)
            if ans_response.status_code == 200:
                try:
                    ans_data = ans_response.json()
                    for ans in ans_data.get("items", []):
                        accepted_answers[ans["answer_id"]] = ans.get("body")
                except ValueError:
                    continue

            time.sleep(0.5)  # avoid hitting batch rate limits

        # Combine only valid Q&A pairs
        for item in questions_with_answers:
            if valid_question_count >= TARGET_COUNT:
                break

            aid = item["accepted_answer_id"]
            abody = accepted_answers.get(aid)

            if abody:  # Only store if body exists
                question_data = {
                    "question_id": item.get("question_id"),
                    "title": item.get("title"),
                    "body": item.get("body"),
                    "tags": item.get("tags"),
                    "accepted_answer_id": aid,
                    "accepted_answer_body": abody,
                    "score": item.get("score")
                }
                all_questions.append(question_data)
                valid_question_count += 1

        print(f"✅ Total collected so far: {valid_question_count}")
        time.sleep(0.5)

# === SAVE TO FILE ===
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    json.dump(all_questions, f, indent=4)

print(f"🎉 Done! Saved {valid_question_count} valid Q&A posts to {OUTPUT_FILE}")


Fetching page 1 for tag [topic-modeling]...
✅ Total collected so far: 21
Fetching page 2 for tag [topic-modeling]...
✅ Total collected so far: 49
Fetching page 3 for tag [topic-modeling]...
✅ Total collected so far: 79
Fetching page 4 for tag [topic-modeling]...
✅ Total collected so far: 109
Fetching page 5 for tag [topic-modeling]...
✅ Total collected so far: 139
Fetching page 6 for tag [topic-modeling]...
✅ Total collected so far: 169
Fetching page 7 for tag [topic-modeling]...
✅ Total collected so far: 199
Fetching page 8 for tag [topic-modeling]...
✅ Total collected so far: 229
Fetching page 9 for tag [topic-modeling]...
✅ Total collected so far: 259
Fetching page 10 for tag [topic-modeling]...
✅ Total collected so far: 289
Fetching page 11 for tag [topic-modeling]...
No items found for tag [topic-modeling], page 11
🎉 Done! Saved 289 valid Q&A posts to stackoverflow_nlp_posts_with_answers_topic-modeling.json


In [49]:
import json
import pandas as pd

# === File Paths ===
INPUT_JSON = "stackoverflow_nlp_posts_with_answers_topic-modeling.json"
OUTPUT_CSV = "stackoverflow_nlp_posts_with_answers_topic-modeling.csv"

# === Load JSON Data ===
with open(INPUT_JSON, "r", encoding="utf-8") as file:
    data = json.load(file)

# === Convert to DataFrame ===
df = pd.DataFrame(data)

# === Add Link to Question ===
df["link"] = "https://stackoverflow.com/questions/" + df["question_id"].astype(str)

# === Extract First Tag ===
df["tag"] = df["tags"].apply(lambda x: x[0] if isinstance(x, list) and len(x) > 0 else None)

# === Reorder and Keep Specific Columns ===
ordered_columns = [
    "question_id",
    "title",
    "body",
    "tags",
    "accepted_answer_id",
    "accepted_answer_body",
    "link",
    "tag"
]

df = df[ordered_columns]

# === Save to CSV ===
df.to_csv(OUTPUT_CSV, index=False, encoding="utf-8")

print(f"✅ CSV saved successfully as: {OUTPUT_CSV}")



✅ CSV saved successfully as: stackoverflow_nlp_posts_with_answers_topic-modeling.csv


In [50]:
import requests
import time
import json

# === SETTINGS ===
TAGS = ["question-answering"]
SITE = "stackoverflow"
PAGE_SIZE = 100
MAX_PAGES = 200
OUTPUT_FILE = "stackoverflow_nlp_posts_with_answers_question-answering.json"
API_KEY = "rl_MdVXEpV7UgL1XMwe5e5BwoqEa"  # Replace with your key
TARGET_COUNT = 20000

all_questions = []
valid_question_count = 0

# === START LOOPING THROUGH TAGS AND PAGES ===
for tag in TAGS:
    if valid_question_count >= TARGET_COUNT:
        break

    for page in range(1, MAX_PAGES + 1):
        if valid_question_count >= TARGET_COUNT:
            break

        print(f"Fetching page {page} for tag [{tag}]...")

        # Get questions
        url = "https://api.stackexchange.com/2.3/questions"
        params = {
            "page": page,
            "pagesize": PAGE_SIZE,
            "order": "desc",
            "sort": "creation",
            "tagged": tag,
            "site": SITE,
            "filter": "withbody",
            "key": API_KEY
        }

        response = requests.get(url, params=params)
        if response.status_code != 200:
            print(f"⚠️ Error {response.status_code}: {response.text}")
            break

        try:
            data = response.json()
        except ValueError:
            print("⚠️ Invalid JSON response. Skipping this page.")
            continue

        if "items" not in data or not data["items"]:
            print(f"No items found for tag [{tag}], page {page}")
            break

        # Filter questions with accepted_answer_id
        questions_with_answers = [
            item for item in data["items"]
            if item.get("accepted_answer_id")
        ]

        # Batch accepted_answer_ids (max 100 per API call)
        accepted_ids = [str(item["accepted_answer_id"]) for item in questions_with_answers]
        id_batches = [accepted_ids[i:i+100] for i in range(0, len(accepted_ids), 100)]

        accepted_answers = {}
        for batch in id_batches:
            ids_str = ";".join(batch)
            ans_url = f"https://api.stackexchange.com/2.3/answers/{ids_str}"
            ans_params = {
                "site": SITE,
                "filter": "withbody",
                "key": API_KEY
            }
            ans_response = requests.get(ans_url, params=ans_params)
            if ans_response.status_code == 200:
                try:
                    ans_data = ans_response.json()
                    for ans in ans_data.get("items", []):
                        accepted_answers[ans["answer_id"]] = ans.get("body")
                except ValueError:
                    continue

            time.sleep(0.5)  # avoid hitting batch rate limits

        # Combine only valid Q&A pairs
        for item in questions_with_answers:
            if valid_question_count >= TARGET_COUNT:
                break

            aid = item["accepted_answer_id"]
            abody = accepted_answers.get(aid)

            if abody:  # Only store if body exists
                question_data = {
                    "question_id": item.get("question_id"),
                    "title": item.get("title"),
                    "body": item.get("body"),
                    "tags": item.get("tags"),
                    "accepted_answer_id": aid,
                    "accepted_answer_body": abody,
                    "score": item.get("score")
                }
                all_questions.append(question_data)
                valid_question_count += 1

        print(f"✅ Total collected so far: {valid_question_count}")
        time.sleep(0.5)

# === SAVE TO FILE ===
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    json.dump(all_questions, f, indent=4)

print(f"🎉 Done! Saved {valid_question_count} valid Q&A posts to {OUTPUT_FILE}")


Fetching page 1 for tag [question-answering]...
✅ Total collected so far: 30
Fetching page 2 for tag [question-answering]...
✅ Total collected so far: 60
Fetching page 3 for tag [question-answering]...
✅ Total collected so far: 71
Fetching page 4 for tag [question-answering]...
No items found for tag [question-answering], page 4
🎉 Done! Saved 71 valid Q&A posts to stackoverflow_nlp_posts_with_answers_question-answering.json


In [None]:
import json
import pandas as pd

# === File Paths ===
INPUT_JSON = "stackoverflow_nlp_posts_with_answers_question-answering.json"
OUTPUT_CSV = "stackoverflow_nlp_posts_with_answers_question-answering.csv"

# === Load JSON Data ===
with open(INPUT_JSON, "r", encoding="utf-8") as file:
    data = json.load(file)

# === Convert to DataFrame ===
df = pd.DataFrame(data)

# === Add Link to Question ===
df["link"] = "https://stackoverflow.com/questions/" + df["question_id"].astype(str)

# === Extract First Tag ===
df["tag"] = df["tags"].apply(lambda x: x[0] if isinstance(x, list) and len(x) > 0 else None)

# === Reorder and Keep Specific Columns ===
ordered_columns = [
    "question_id",
    "title",
    "body",
    "tags",
    "accepted_answer_id",
    "accepted_answer_body",
    "link",
    "tag"
]

df = df[ordered_columns]

# === Save to CSV ===
df.to_csv(OUTPUT_CSV, index=False, encoding="utf-8")

print(f" CSV saved successfully as: {OUTPUT_CSV}")



✅ CSV saved successfully as: stackoverflow_nlp_posts_with_answers_question-answering.csv


In [None]:
import requests
import time
import json

# === SETTINGS ===
TAGS = ["tf-idf"]
SITE = "stackoverflow"
PAGE_SIZE = 100
MAX_PAGES = 200
OUTPUT_FILE = "stackoverflow_nlp_posts_with_answers_tf-idf.json"
API_KEY = "rl_MdVXEpV7UgL1XMwe5e5BwoqEa"  # Replace with your key
TARGET_COUNT = 20000

all_questions = []
valid_question_count = 0

# === START LOOPING THROUGH TAGS AND PAGES ===
for tag in TAGS:
    if valid_question_count >= TARGET_COUNT:
        break

    for page in range(1, MAX_PAGES + 1):
        if valid_question_count >= TARGET_COUNT:
            break

        print(f"Fetching page {page} for tag [{tag}]...")

        # Get questions
        url = "https://api.stackexchange.com/2.3/questions"
        params = {
            "page": page,
            "pagesize": PAGE_SIZE,
            "order": "desc",
            "sort": "creation",
            "tagged": tag,
            "site": SITE,
            "filter": "withbody",
            "key": API_KEY
        }

        response = requests.get(url, params=params)
        if response.status_code != 200:
            print(f" Error {response.status_code}: {response.text}")
            break

        try:
            data = response.json()
        except ValueError:
            print(" Invalid JSON response. Skipping this page.")
            continue

        if "items" not in data or not data["items"]:
            print(f"No items found for tag [{tag}], page {page}")
            break

        # Filter questions with accepted_answer_id
        questions_with_answers = [
            item for item in data["items"]
            if item.get("accepted_answer_id")
        ]

        # Batch accepted_answer_ids (max 100 per API call)
        accepted_ids = [str(item["accepted_answer_id"]) for item in questions_with_answers]
        id_batches = [accepted_ids[i:i+100] for i in range(0, len(accepted_ids), 100)]

        accepted_answers = {}
        for batch in id_batches:
            ids_str = ";".join(batch)
            ans_url = f"https://api.stackexchange.com/2.3/answers/{ids_str}"
            ans_params = {
                "site": SITE,
                "filter": "withbody",
                "key": API_KEY
            }
            ans_response = requests.get(ans_url, params=ans_params)
            if ans_response.status_code == 200:
                try:
                    ans_data = ans_response.json()
                    for ans in ans_data.get("items", []):
                        accepted_answers[ans["answer_id"]] = ans.get("body")
                except ValueError:
                    continue

            time.sleep(0.5)  # avoid hitting batch rate limits

        # Combine only valid Q&A pairs
        for item in questions_with_answers:
            if valid_question_count >= TARGET_COUNT:
                break

            aid = item["accepted_answer_id"]
            abody = accepted_answers.get(aid)

            if abody:  # Only store if body exists
                question_data = {
                    "question_id": item.get("question_id"),
                    "title": item.get("title"),
                    "body": item.get("body"),
                    "tags": item.get("tags"),
                    "accepted_answer_id": aid,
                    "accepted_answer_body": abody,
                    "score": item.get("score")
                }
                all_questions.append(question_data)
                valid_question_count += 1

        print(f" Total collected so far: {valid_question_count}")
        time.sleep(0.5)

# === SAVE TO FILE ===
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    json.dump(all_questions, f, indent=4)

print(f" Done! Saved {valid_question_count} valid Q&A posts to {OUTPUT_FILE}")


Fetching page 1 for tag [tf-idf]...
✅ Total collected so far: 30
Fetching page 2 for tag [tf-idf]...
✅ Total collected so far: 60
Fetching page 3 for tag [tf-idf]...
✅ Total collected so far: 90
Fetching page 4 for tag [tf-idf]...
✅ Total collected so far: 120
Fetching page 5 for tag [tf-idf]...
✅ Total collected so far: 150
Fetching page 6 for tag [tf-idf]...
✅ Total collected so far: 180
Fetching page 7 for tag [tf-idf]...
✅ Total collected so far: 210
Fetching page 8 for tag [tf-idf]...
✅ Total collected so far: 240
Fetching page 9 for tag [tf-idf]...
✅ Total collected so far: 270
Fetching page 10 for tag [tf-idf]...
✅ Total collected so far: 300
Fetching page 11 for tag [tf-idf]...
✅ Total collected so far: 330
Fetching page 12 for tag [tf-idf]...
✅ Total collected so far: 360
Fetching page 13 for tag [tf-idf]...
✅ Total collected so far: 390
Fetching page 14 for tag [tf-idf]...
✅ Total collected so far: 392
Fetching page 15 for tag [tf-idf]...
No items found for tag [tf-idf], page

In [None]:
import json
import pandas as pd

# === File Paths ===
INPUT_JSON = "stackoverflow_nlp_posts_with_answers_tf-idf.json"
OUTPUT_CSV = "stackoverflow_nlp_posts_with_answers_tf-idf.csv"

# === Load JSON Data ===
with open(INPUT_JSON, "r", encoding="utf-8") as file:
    data = json.load(file)

# === Convert to DataFrame ===
df = pd.DataFrame(data)

# === Add Link to Question ===
df["link"] = "https://stackoverflow.com/questions/" + df["question_id"].astype(str)

# === Extract First Tag ===
df["tag"] = df["tags"].apply(lambda x: x[0] if isinstance(x, list) and len(x) > 0 else None)

# === Reorder and Keep Specific Columns ===
ordered_columns = [
    "question_id",
    "title",
    "body",
    "tags",
    "accepted_answer_id",
    "accepted_answer_body",
    "link",
    "tag"
]

df = df[ordered_columns]

# === Save to CSV ===
df.to_csv(OUTPUT_CSV, index=False, encoding="utf-8")

print(f" CSV saved successfully as: {OUTPUT_CSV}")



✅ CSV saved successfully as: stackoverflow_nlp_posts_with_answers_tf-idf.csv


In [None]:
import requests
import time
import json

# === SETTINGS ===
TAGS = ["genism"]
SITE = "stackoverflow"
PAGE_SIZE = 100
MAX_PAGES = 200
OUTPUT_FILE = "stackoverflow_nlp_posts_with_answers_genism.json"
API_KEY = "rl_MdVXEpV7UgL1XMwe5e5BwoqEa"  # Replace with your key
TARGET_COUNT = 20000

all_questions = []
valid_question_count = 0

# === START LOOPING THROUGH TAGS AND PAGES ===
for tag in TAGS:
    if valid_question_count >= TARGET_COUNT:
        break

    for page in range(1, MAX_PAGES + 1):
        if valid_question_count >= TARGET_COUNT:
            break

        print(f"Fetching page {page} for tag [{tag}]...")

        # Get questions
        url = "https://api.stackexchange.com/2.3/questions"
        params = {
            "page": page,
            "pagesize": PAGE_SIZE,
            "order": "desc",
            "sort": "creation",
            "tagged": tag,
            "site": SITE,
            "filter": "withbody",
            "key": API_KEY
        }

        response = requests.get(url, params=params)
        if response.status_code != 200:
            print(f" Error {response.status_code}: {response.text}")
            break

        try:
            data = response.json()
        except ValueError:
            print(" Invalid JSON response. Skipping this page.")
            continue

        if "items" not in data or not data["items"]:
            print(f"No items found for tag [{tag}], page {page}")
            break

        # Filter questions with accepted_answer_id
        questions_with_answers = [
            item for item in data["items"]
            if item.get("accepted_answer_id")
        ]

        # Batch accepted_answer_ids (max 100 per API call)
        accepted_ids = [str(item["accepted_answer_id"]) for item in questions_with_answers]
        id_batches = [accepted_ids[i:i+100] for i in range(0, len(accepted_ids), 100)]

        accepted_answers = {}
        for batch in id_batches:
            ids_str = ";".join(batch)
            ans_url = f"https://api.stackexchange.com/2.3/answers/{ids_str}"
            ans_params = {
                "site": SITE,
                "filter": "withbody",
                "key": API_KEY
            }
            ans_response = requests.get(ans_url, params=ans_params)
            if ans_response.status_code == 200:
                try:
                    ans_data = ans_response.json()
                    for ans in ans_data.get("items", []):
                        accepted_answers[ans["answer_id"]] = ans.get("body")
                except ValueError:
                    continue

            time.sleep(0.5)  # avoid hitting batch rate limits

        # Combine only valid Q&A pairs
        for item in questions_with_answers:
            if valid_question_count >= TARGET_COUNT:
                break

            aid = item["accepted_answer_id"]
            abody = accepted_answers.get(aid)

            if abody:  # Only store if body exists
                question_data = {
                    "question_id": item.get("question_id"),
                    "title": item.get("title"),
                    "body": item.get("body"),
                    "tags": item.get("tags"),
                    "accepted_answer_id": aid,
                    "accepted_answer_body": abody,
                    "score": item.get("score")
                }
                all_questions.append(question_data)
                valid_question_count += 1

        print(f" Total collected so far: {valid_question_count}")
        time.sleep(0.5)

# === SAVE TO FILE ===
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    json.dump(all_questions, f, indent=4)

print(f" Done! Saved {valid_question_count} valid Q&A posts to {OUTPUT_FILE}")


Fetching page 1 for tag [genism]...
No items found for tag [genism], page 1
🎉 Done! Saved 0 valid Q&A posts to stackoverflow_nlp_posts_with_answers_genism.json


In [None]:
import requests
import time
import json

# === SETTINGS ===
TAGS = ["text-mining"]
SITE = "stackoverflow"
PAGE_SIZE = 100
MAX_PAGES = 200
OUTPUT_FILE = "stackoverflow_nlp_posts_with_answers_text-mining.json"
API_KEY = "rl_MdVXEpV7UgL1XMwe5e5BwoqEa"  # Replace with your key
TARGET_COUNT = 20000

all_questions = []
valid_question_count = 0

# === START LOOPING THROUGH TAGS AND PAGES ===
for tag in TAGS:
    if valid_question_count >= TARGET_COUNT:
        break

    for page in range(1, MAX_PAGES + 1):
        if valid_question_count >= TARGET_COUNT:
            break

        print(f"Fetching page {page} for tag [{tag}]...")

        # Get questions
        url = "https://api.stackexchange.com/2.3/questions"
        params = {
            "page": page,
            "pagesize": PAGE_SIZE,
            "order": "desc",
            "sort": "creation",
            "tagged": tag,
            "site": SITE,
            "filter": "withbody",
            "key": API_KEY
        }

        response = requests.get(url, params=params)
        if response.status_code != 200:
            print(f" Error {response.status_code}: {response.text}")
            break

        try:
            data = response.json()
        except ValueError:
            print(" Invalid JSON response. Skipping this page.")
            continue

        if "items" not in data or not data["items"]:
            print(f"No items found for tag [{tag}], page {page}")
            break

        # Filter questions with accepted_answer_id
        questions_with_answers = [
            item for item in data["items"]
            if item.get("accepted_answer_id")
        ]

        # Batch accepted_answer_ids (max 100 per API call)
        accepted_ids = [str(item["accepted_answer_id"]) for item in questions_with_answers]
        id_batches = [accepted_ids[i:i+100] for i in range(0, len(accepted_ids), 100)]

        accepted_answers = {}
        for batch in id_batches:
            ids_str = ";".join(batch)
            ans_url = f"https://api.stackexchange.com/2.3/answers/{ids_str}"
            ans_params = {
                "site": SITE,
                "filter": "withbody",
                "key": API_KEY
            }
            ans_response = requests.get(ans_url, params=ans_params)
            if ans_response.status_code == 200:
                try:
                    ans_data = ans_response.json()
                    for ans in ans_data.get("items", []):
                        accepted_answers[ans["answer_id"]] = ans.get("body")
                except ValueError:
                    continue

            time.sleep(0.5)  # avoid hitting batch rate limits

        # Combine only valid Q&A pairs
        for item in questions_with_answers:
            if valid_question_count >= TARGET_COUNT:
                break

            aid = item["accepted_answer_id"]
            abody = accepted_answers.get(aid)

            if abody:  # Only store if body exists
                question_data = {
                    "question_id": item.get("question_id"),
                    "title": item.get("title"),
                    "body": item.get("body"),
                    "tags": item.get("tags"),
                    "accepted_answer_id": aid,
                    "accepted_answer_body": abody,
                    "score": item.get("score")
                }
                all_questions.append(question_data)
                valid_question_count += 1

        print(f" Total collected so far: {valid_question_count}")
        time.sleep(0.5)

# === SAVE TO FILE ===
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    json.dump(all_questions, f, indent=4)

print(f" Done! Saved {valid_question_count} valid Q&A posts to {OUTPUT_FILE}")


Fetching page 1 for tag [text-mining]...
✅ Total collected so far: 29
Fetching page 2 for tag [text-mining]...
✅ Total collected so far: 59
Fetching page 3 for tag [text-mining]...
✅ Total collected so far: 89
Fetching page 4 for tag [text-mining]...
✅ Total collected so far: 119
Fetching page 5 for tag [text-mining]...
✅ Total collected so far: 149
Fetching page 6 for tag [text-mining]...
✅ Total collected so far: 179
Fetching page 7 for tag [text-mining]...
✅ Total collected so far: 209
Fetching page 8 for tag [text-mining]...
✅ Total collected so far: 239
Fetching page 9 for tag [text-mining]...
✅ Total collected so far: 269
Fetching page 10 for tag [text-mining]...
✅ Total collected so far: 299
Fetching page 11 for tag [text-mining]...
✅ Total collected so far: 329
Fetching page 12 for tag [text-mining]...
✅ Total collected so far: 359
Fetching page 13 for tag [text-mining]...
✅ Total collected so far: 389
Fetching page 14 for tag [text-mining]...
✅ Total collected so far: 419
Fetc

In [None]:
import json
import pandas as pd

# === File Paths ===
INPUT_JSON = "stackoverflow_nlp_posts_with_answers_text-mining.json"
OUTPUT_CSV = "stackoverflow_nlp_posts_with_answers_text-mining.csv"

# === Load JSON Data ===
with open(INPUT_JSON, "r", encoding="utf-8") as file:
    data = json.load(file)

# === Convert to DataFrame ===
df = pd.DataFrame(data)

# === Add Link to Question ===
df["link"] = "https://stackoverflow.com/questions/" + df["question_id"].astype(str)

# === Extract First Tag ===
df["tag"] = df["tags"].apply(lambda x: x[0] if isinstance(x, list) and len(x) > 0 else None)

# === Reorder and Keep Specific Columns ===
ordered_columns = [
    "question_id",
    "title",
    "body",
    "tags",
    "accepted_answer_id",
    "accepted_answer_body",
    "link",
    "tag"
]

df = df[ordered_columns]

# === Save to CSV ===
df.to_csv(OUTPUT_CSV, index=False, encoding="utf-8")

print(f" CSV saved successfully as: {OUTPUT_CSV}")



✅ CSV saved successfully as: stackoverflow_nlp_posts_with_answers_text-mining.csv


In [None]:
import requests
import time
import json

# === SETTINGS ===
TAGS = ["pos-tagging"]
SITE = "stackoverflow"
PAGE_SIZE = 100
MAX_PAGES = 200
OUTPUT_FILE = "stackoverflow_nlp_posts_with_answers_pos-tagging.json"
API_KEY = "rl_MdVXEpV7UgL1XMwe5e5BwoqEa"  # Replace with your key
TARGET_COUNT = 20000

all_questions = []
valid_question_count = 0

# === START LOOPING THROUGH TAGS AND PAGES ===
for tag in TAGS:
    if valid_question_count >= TARGET_COUNT:
        break

    for page in range(1, MAX_PAGES + 1):
        if valid_question_count >= TARGET_COUNT:
            break

        print(f"Fetching page {page} for tag [{tag}]...")

        # Get questions
        url = "https://api.stackexchange.com/2.3/questions"
        params = {
            "page": page,
            "pagesize": PAGE_SIZE,
            "order": "desc",
            "sort": "creation",
            "tagged": tag,
            "site": SITE,
            "filter": "withbody",
            "key": API_KEY
        }

        response = requests.get(url, params=params)
        if response.status_code != 200:
            print(f" Error {response.status_code}: {response.text}")
            break

        try:
            data = response.json()
        except ValueError:
            print(" Invalid JSON response. Skipping this page.")
            continue

        if "items" not in data or not data["items"]:
            print(f"No items found for tag [{tag}], page {page}")
            break

        # Filter questions with accepted_answer_id
        questions_with_answers = [
            item for item in data["items"]
            if item.get("accepted_answer_id")
        ]

        # Batch accepted_answer_ids (max 100 per API call)
        accepted_ids = [str(item["accepted_answer_id"]) for item in questions_with_answers]
        id_batches = [accepted_ids[i:i+100] for i in range(0, len(accepted_ids), 100)]

        accepted_answers = {}
        for batch in id_batches:
            ids_str = ";".join(batch)
            ans_url = f"https://api.stackexchange.com/2.3/answers/{ids_str}"
            ans_params = {
                "site": SITE,
                "filter": "withbody",
                "key": API_KEY
            }
            ans_response = requests.get(ans_url, params=ans_params)
            if ans_response.status_code == 200:
                try:
                    ans_data = ans_response.json()
                    for ans in ans_data.get("items", []):
                        accepted_answers[ans["answer_id"]] = ans.get("body")
                except ValueError:
                    continue

            time.sleep(0.5)  # avoid hitting batch rate limits

        # Combine only valid Q&A pairs
        for item in questions_with_answers:
            if valid_question_count >= TARGET_COUNT:
                break

            aid = item["accepted_answer_id"]
            abody = accepted_answers.get(aid)

            if abody:  # Only store if body exists
                question_data = {
                    "question_id": item.get("question_id"),
                    "title": item.get("title"),
                    "body": item.get("body"),
                    "tags": item.get("tags"),
                    "accepted_answer_id": aid,
                    "accepted_answer_body": abody,
                    "score": item.get("score")
                }
                all_questions.append(question_data)
                valid_question_count += 1

        print(f" Total collected so far: {valid_question_count}")
        time.sleep(0.5)

# === SAVE TO FILE ===
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    json.dump(all_questions, f, indent=4)

print(f" Done! Saved {valid_question_count} valid Q&A posts to {OUTPUT_FILE}")


Fetching page 1 for tag [pos-tagging]...
✅ Total collected so far: 30
Fetching page 2 for tag [pos-tagging]...
✅ Total collected so far: 60
Fetching page 3 for tag [pos-tagging]...
✅ Total collected so far: 90
Fetching page 4 for tag [pos-tagging]...
✅ Total collected so far: 120
Fetching page 5 for tag [pos-tagging]...
✅ Total collected so far: 150
Fetching page 6 for tag [pos-tagging]...
✅ Total collected so far: 180
Fetching page 7 for tag [pos-tagging]...
No items found for tag [pos-tagging], page 7
🎉 Done! Saved 180 valid Q&A posts to stackoverflow_nlp_posts_with_answers_pos-tagging.json


In [None]:
import json
import pandas as pd

# === File Paths ===
INPUT_JSON = "stackoverflow_nlp_posts_with_answers_pos-tagging.json"
OUTPUT_CSV = "stackoverflow_nlp_posts_with_answers_pos-tagging.csv"

# === Load JSON Data ===
with open(INPUT_JSON, "r", encoding="utf-8") as file:
    data = json.load(file)

# === Convert to DataFrame ===
df = pd.DataFrame(data)

# === Add Link to Question ===
df["link"] = "https://stackoverflow.com/questions/" + df["question_id"].astype(str)

# === Extract First Tag ===
df["tag"] = df["tags"].apply(lambda x: x[0] if isinstance(x, list) and len(x) > 0 else None)

# === Reorder and Keep Specific Columns ===
ordered_columns = [
    "question_id",
    "title",
    "body",
    "tags",
    "accepted_answer_id",
    "accepted_answer_body",
    "link",
    "tag"
]

df = df[ordered_columns]

# === Save to CSV ===
df.to_csv(OUTPUT_CSV, index=False, encoding="utf-8")

print(f" CSV saved successfully as: {OUTPUT_CSV}")



✅ CSV saved successfully as: stackoverflow_nlp_posts_with_answers_pos-tagging.csv


In [None]:
import requests
import time
import json

# === SETTINGS ===
TAGS = [
    "sequence-labeling",
    "dependency-parsing",
    "word-embedding",
    "glove",
    "core-nlp",
    "langchain",
    "few-shot-learning",
    "zero-shot-learning",
    "semantic-similarity"
]
SITE = "stackoverflow"
PAGE_SIZE = 100
MAX_PAGES = 200
OUTPUT_FILE = "stackoverflow_nlp_posts_with_answers_final-stretch.json"
API_KEY = "rl_MdVXEpV7UgL1XMwe5e5BwoqEa"  # Replace with your key
TARGET_COUNT = 20000

all_questions = []
valid_question_count = 0

# === START LOOPING THROUGH TAGS AND PAGES ===
for tag in TAGS:
    if valid_question_count >= TARGET_COUNT:
        break

    for page in range(1, MAX_PAGES + 1):
        if valid_question_count >= TARGET_COUNT:
            break

        print(f"Fetching page {page} for tag [{tag}]...")

        # Get questions
        url = "https://api.stackexchange.com/2.3/questions"
        params = {
            "page": page,
            "pagesize": PAGE_SIZE,
            "order": "desc",
            "sort": "creation",
            "tagged": tag,
            "site": SITE,
            "filter": "withbody",
            "key": API_KEY
        }

        response = requests.get(url, params=params)
        if response.status_code != 200:
            print(f" Error {response.status_code}: {response.text}")
            break

        try:
            data = response.json()
        except ValueError:
            print(" Invalid JSON response. Skipping this page.")
            continue

        if "items" not in data or not data["items"]:
            print(f"No items found for tag [{tag}], page {page}")
            break

        # Filter questions with accepted_answer_id
        questions_with_answers = [
            item for item in data["items"]
            if item.get("accepted_answer_id")
        ]

        # Batch accepted_answer_ids (max 100 per API call)
        accepted_ids = [str(item["accepted_answer_id"]) for item in questions_with_answers]
        id_batches = [accepted_ids[i:i+100] for i in range(0, len(accepted_ids), 100)]

        accepted_answers = {}
        for batch in id_batches:
            ids_str = ";".join(batch)
            ans_url = f"https://api.stackexchange.com/2.3/answers/{ids_str}"
            ans_params = {
                "site": SITE,
                "filter": "withbody",
                "key": API_KEY
            }
            ans_response = requests.get(ans_url, params=ans_params)
            if ans_response.status_code == 200:
                try:
                    ans_data = ans_response.json()
                    for ans in ans_data.get("items", []):
                        accepted_answers[ans["answer_id"]] = ans.get("body")
                except ValueError:
                    continue

            time.sleep(0.5)  # avoid hitting batch rate limits

        # Combine only valid Q&A pairs
        for item in questions_with_answers:
            if valid_question_count >= TARGET_COUNT:
                break

            aid = item["accepted_answer_id"]
            abody = accepted_answers.get(aid)

            if abody:  # Only store if body exists
                question_data = {
                    "question_id": item.get("question_id"),
                    "title": item.get("title"),
                    "body": item.get("body"),
                    "tags": item.get("tags"),
                    "accepted_answer_id": aid,
                    "accepted_answer_body": abody,
                    "score": item.get("score")
                }
                all_questions.append(question_data)
                valid_question_count += 1

        print(f" Total collected so far: {valid_question_count}")
        time.sleep(0.5)

# === SAVE TO FILE ===
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    json.dump(all_questions, f, indent=4)

print(f" Done! Saved {valid_question_count} valid Q&A posts to {OUTPUT_FILE}")


Fetching page 1 for tag [sequence-labeling]...
No items found for tag [sequence-labeling], page 1
Fetching page 1 for tag [dependency-parsing]...
✅ Total collected so far: 30
Fetching page 2 for tag [dependency-parsing]...
No items found for tag [dependency-parsing], page 2
Fetching page 1 for tag [word-embedding]...
✅ Total collected so far: 50
Fetching page 2 for tag [word-embedding]...
✅ Total collected so far: 80
Fetching page 3 for tag [word-embedding]...
✅ Total collected so far: 110
Fetching page 4 for tag [word-embedding]...
✅ Total collected so far: 140
Fetching page 5 for tag [word-embedding]...
✅ Total collected so far: 170
Fetching page 6 for tag [word-embedding]...
✅ Total collected so far: 200
Fetching page 7 for tag [word-embedding]...
✅ Total collected so far: 230
Fetching page 8 for tag [word-embedding]...
✅ Total collected so far: 260
Fetching page 9 for tag [word-embedding]...
✅ Total collected so far: 290
Fetching page 10 for tag [word-embedding]...
✅ Total collecte

In [None]:
import json
import pandas as pd

# === File Paths ===
INPUT_JSON = "stackoverflow_nlp_posts_with_answers_final-stretch.json"
OUTPUT_CSV = "stackoverflow_nlp_posts_with_answers_final-stretch.csv"

# === Load JSON Data ===
with open(INPUT_JSON, "r", encoding="utf-8") as file:
    data = json.load(file)

# === Convert to DataFrame ===
df = pd.DataFrame(data)

# === Add Link to Question ===
df["link"] = "https://stackoverflow.com/questions/" + df["question_id"].astype(str)

# === Extract First Tag ===
df["tag"] = df["tags"].apply(lambda x: x[0] if isinstance(x, list) and len(x) > 0 else None)

# === Reorder and Keep Specific Columns ===
ordered_columns = [
    "question_id",
    "title",
    "body",
    "tags",
    "accepted_answer_id",
    "accepted_answer_body",
    "link",
    "tag"
]

df = df[ordered_columns]

# === Save to CSV ===
df.to_csv(OUTPUT_CSV, index=False, encoding="utf-8")

print(f" CSV saved successfully as: {OUTPUT_CSV}")



✅ CSV saved successfully as: stackoverflow_nlp_posts_with_answers_final-stretch.csv


In [None]:
import pandas as pd

# === List of all 20 CSV files ===
csv_files = [
 "stackoverflow_nlp_posts_with_answers.csv","stackoverflow_nlp_posts_with_answers_bert.csv","stackoverflow_nlp_posts_with_answers_final-stretch.csv","stackoverflow_nlp_posts_with_answers_huggingface-transformers.csv","stackoverflow_nlp_posts_with_answers_information=retrieval.csv","stackoverflow_nlp_posts_with_answers_language-detection.csv","stackoverflow_nlp_posts_with_answers_lemmatization.csv","stackoverflow_nlp_posts_with_answers_named-entity-recognition.csv","stackoverflow_nlp_posts_with_answers_nltk.csv","stackoverflow_nlp_posts_with_answers_openai-api.csv","stackoverflow_nlp_posts_with_answers_pos-tagging.csv","stackoverflow_nlp_posts_with_answers_question-answering.csv","stackoverflow_nlp_posts_with_answers_second_set.csv","stackoverflow_nlp_posts_with_answers_sentence-transformers.csv","stackoverflow_nlp_posts_with_answers_spacy.csv","stackoverflow_nlp_posts_with_answers_stemming.csv","stackoverflow_nlp_posts_with_answers_text-classification.csv","stackoverflow_nlp_posts_with_answers_text-mining.csv","stackoverflow_nlp_posts_with_answers_tf-idf.csv","stackoverflow_nlp_posts_with_answers_topic-modeling.csv"
]

# === Load and concatenate all CSVs ===
all_dataframes = [pd.read_csv(file) for file in csv_files]
combined_df = pd.concat(all_dataframes, ignore_index=True)

# === Drop duplicate rows by question_id if it exists ===
if "question_id" in combined_df.columns:
    combined_df = combined_df.drop_duplicates(subset="question_id")

# === Save combined CSV ===
combined_df.to_csv("combined_20_csvs.csv", index=False, encoding="utf-8")

print(f" Combined CSV saved as 'combined_20_csvs.csv' with {len(combined_df)} unique rows.")


✅ Combined CSV saved as 'combined_20_csvs.csv' with 14188 unique rows.


In [70]:
df_c = pd.read_csv("combined_20_csvs.csv")
df_c.head()

Unnamed: 0,question_id,title,body,tags,accepted_answer_id,accepted_answer_body,link,tag
0,79549787,Why does Presidio with spacy nlp engine not re...,<p>I'm using spaCy with the pl_core_news_lg mo...,"['python', 'nlp', 'spacy', 'presidio']",79552218,<p>The configuration file is missing the 'labe...,https://stackoverflow.com/questions/79549787,python
1,79548202,GPT-2 and other models from huggingface -100 l...,<p>I understand the -100 label id is used so t...,"['nlp', 'huggingface-transformers', 'pre-train...",79551169,<p>The author of the tutorial you mentioned se...,https://stackoverflow.com/questions/79548202,nlp
2,79523269,Trouble getting importing gensim to work in colab,<p>I am trying to import gensim into colab.</p...,"['numpy', 'nlp', 'dependencies', 'google-colab...",79523777,<p>You have to restart the session for the und...,https://stackoverflow.com/questions/79523269,numpy
3,79501178,Store images instead of showing in a server,<p>I am running the code found on this <a href...,"['python', 'nlp', 'large-language-model']",79501337,<p>I can't test it but ...</p>\n<p>I checked <...,https://stackoverflow.com/questions/79501178,python
4,79482283,Presidio with Langchain Experimental does not ...,<p>I am using presidio/langchain_experimental ...,"['python', 'nlp', 'spacy', 'langchain', 'presi...",79495969,<p>After some test I was able to find the solu...,https://stackoverflow.com/questions/79482283,python


In [71]:
df_c.shape

(14188, 8)

In [None]:
import requests
import time
import json

# === SETTINGS ===
TAGS = [
    "nlp-transformer",
    "sequence-to-sequence",
    "text-generation",
    "attention-mechanism",
    "sentence-similarity",
    "text-embedding",
    "intent-classification",
    "chatbot"
]

SITE = "stackoverflow"
PAGE_SIZE = 100
MAX_PAGES = 200
OUTPUT_FILE = "stackoverflow_nlp_posts_with_answers_final-stretch_p2.json"
API_KEY = "rl_MdVXEpV7UgL1XMwe5e5BwoqEa"  # Replace with your key
TARGET_COUNT = 20000

all_questions = []
valid_question_count = 0

# === START LOOPING THROUGH TAGS AND PAGES ===
for tag in TAGS:
    if valid_question_count >= TARGET_COUNT:
        break

    for page in range(1, MAX_PAGES + 1):
        if valid_question_count >= TARGET_COUNT:
            break

        print(f"Fetching page {page} for tag [{tag}]...")

        # Get questions
        url = "https://api.stackexchange.com/2.3/questions"
        params = {
            "page": page,
            "pagesize": PAGE_SIZE,
            "order": "desc",
            "sort": "creation",
            "tagged": tag,
            "site": SITE,
            "filter": "withbody",
            "key": API_KEY
        }

        response = requests.get(url, params=params)
        if response.status_code != 200:
            print(f" Error {response.status_code}: {response.text}")
            break

        try:
            data = response.json()
        except ValueError:
            print(f" Invalid JSON response. Skipping this page.")
            continue

        if "items" not in data or not data["items"]:
            print(f"No items found for tag [{tag}], page {page}")
            break

        # Filter questions with accepted_answer_id
        questions_with_answers = [
            item for item in data["items"]
            if item.get("accepted_answer_id")
        ]

        # Batch accepted_answer_ids (max 100 per API call)
        accepted_ids = [str(item["accepted_answer_id"]) for item in questions_with_answers]
        id_batches = [accepted_ids[i:i+100] for i in range(0, len(accepted_ids), 100)]

        accepted_answers = {}
        for batch in id_batches:
            ids_str = ";".join(batch)
            ans_url = f"https://api.stackexchange.com/2.3/answers/{ids_str}"
            ans_params = {
                "site": SITE,
                "filter": "withbody",
                "key": API_KEY
            }
            ans_response = requests.get(ans_url, params=ans_params)
            if ans_response.status_code == 200:
                try:
                    ans_data = ans_response.json()
                    for ans in ans_data.get("items", []):
                        accepted_answers[ans["answer_id"]] = ans.get("body")
                except ValueError:
                    continue

            time.sleep(0.5)  # avoid hitting batch rate limits

        # Combine only valid Q&A pairs
        for item in questions_with_answers:
            if valid_question_count >= TARGET_COUNT:
                break

            aid = item["accepted_answer_id"]
            abody = accepted_answers.get(aid)

            if abody:  # Only store if body exists
                question_data = {
                    "question_id": item.get("question_id"),
                    "title": item.get("title"),
                    "body": item.get("body"),
                    "tags": item.get("tags"),
                    "accepted_answer_id": aid,
                    "accepted_answer_body": abody,
                    "score": item.get("score")
                }
                all_questions.append(question_data)
                valid_question_count += 1

        print(f" Total collected so far: {valid_question_count}")
        time.sleep(0.5)

# === SAVE TO FILE ===
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    json.dump(all_questions, f, indent=4)

print(f" Done! Saved {valid_question_count} valid Q&A posts to {OUTPUT_FILE}")


Fetching page 1 for tag [nlp-transformer]...
No items found for tag [nlp-transformer], page 1
Fetching page 1 for tag [sequence-to-sequence]...
✅ Total collected so far: 30
Fetching page 2 for tag [sequence-to-sequence]...
No items found for tag [sequence-to-sequence], page 2
Fetching page 1 for tag [text-generation]...
✅ Total collected so far: 32
Fetching page 2 for tag [text-generation]...
No items found for tag [text-generation], page 2
Fetching page 1 for tag [attention-mechanism]...
No items found for tag [attention-mechanism], page 1
Fetching page 1 for tag [sentence-similarity]...
✅ Total collected so far: 59
Fetching page 2 for tag [sentence-similarity]...
✅ Total collected so far: 89
Fetching page 3 for tag [sentence-similarity]...
✅ Total collected so far: 98
Fetching page 4 for tag [sentence-similarity]...
No items found for tag [sentence-similarity], page 4
Fetching page 1 for tag [text-embedding]...
No items found for tag [text-embedding], page 1
Fetching page 1 for tag [

In [None]:
import json
import pandas as pd

# === File Paths ===
INPUT_JSON = "stackoverflow_nlp_posts_with_answers_final-stretch_p2.json"
OUTPUT_CSV = "stackoverflow_nlp_posts_with_answers_final-stretch_p2.csv"

# === Load JSON Data ===
with open(INPUT_JSON, "r", encoding="utf-8") as file:
    data = json.load(file)

# === Convert to DataFrame ===
df = pd.DataFrame(data)

# === Add Link to Question ===
df["link"] = "https://stackoverflow.com/questions/" + df["question_id"].astype(str)

# === Extract First Tag ===
df["tag"] = df["tags"].apply(lambda x: x[0] if isinstance(x, list) and len(x) > 0 else None)

# === Reorder and Keep Specific Columns ===
ordered_columns = [
    "question_id",
    "title",
    "body",
    "tags",
    "accepted_answer_id",
    "accepted_answer_body",
    "link",
    "tag"
]

df = df[ordered_columns]

# === Save to CSV ===
df.to_csv(OUTPUT_CSV, index=False, encoding="utf-8")

print(f" CSV saved successfully as: {OUTPUT_CSV}")



✅ CSV saved successfully as: stackoverflow_nlp_posts_with_answers_final-stretch_p2.csv


In [None]:
import pandas as pd

# === File Paths ===
csv_file_1 = "combined_20_csvs.csv"
csv_file_2 = "stackoverflow_nlp_posts_with_answers_final-stretch_p2.csv"
output_csv = "combined_V1_csv.csv"

# === Load the CSVs ===
df1 = pd.read_csv(csv_file_1)
df2 = pd.read_csv(csv_file_2)

# === Combine them ===
combined_df = pd.concat([df1, df2], ignore_index=True)

# === Remove duplicates based on 'question_id' column ===
if "question_id" in combined_df.columns:
    combined_df = combined_df.drop_duplicates(subset="question_id")

# === Save the combined file ===
combined_df.to_csv(output_csv, index=False, encoding="utf-8")

print(f" Combined CSV saved as '{output_csv}' with {len(combined_df)} unique rows.")


✅ Combined CSV saved as 'combined_V1_csv.csv' with 15344 unique rows.


In [None]:
import requests
import time
import json



# === SETTINGS ===
TAGS = [
    "text-matching",
    "semantic-search",
    "intent-recognition",
    "slot-filling",
    "relation-extraction",
    "multi-label-classification",
    "zero-shot-classification",
    "co-reference-resolution",
    "text-vectorization",
    "rasa-nlu"
]


SITE = "stackoverflow"
PAGE_SIZE = 100
MAX_PAGES = 200
OUTPUT_FILE = "stackoverflow_nlp_posts_with_answers_final-stretch_p3.json"
API_KEY = "rl_MdVXEpV7UgL1XMwe5e5BwoqEa"  # Replace with your key
TARGET_COUNT = 20000

all_questions = []
valid_question_count = 0

# === START LOOPING THROUGH TAGS AND PAGES ===
for tag in TAGS:
    if valid_question_count >= TARGET_COUNT:
        break

    for page in range(1, MAX_PAGES + 1):
        if valid_question_count >= TARGET_COUNT:
            break

        print(f"Fetching page {page} for tag [{tag}]...")

        # Get questions
        url = "https://api.stackexchange.com/2.3/questions"
        params = {
            "page": page,
            "pagesize": PAGE_SIZE,
            "order": "desc",
            "sort": "creation",
            "tagged": tag,
            "site": SITE,
            "filter": "withbody",
            "key": API_KEY
        }

        response = requests.get(url, params=params)
        if response.status_code != 200:
            print(f" Error {response.status_code}: {response.text}")
            break

        try:
            data = response.json()
        except ValueError:
            print(" Invalid JSON response. Skipping this page.")
            continue

        if "items" not in data or not data["items"]:
            print(f"No items found for tag [{tag}], page {page}")
            break

        # Filter questions with accepted_answer_id
        questions_with_answers = [
            item for item in data["items"]
            if item.get("accepted_answer_id")
        ]

        # Batch accepted_answer_ids (max 100 per API call)
        accepted_ids = [str(item["accepted_answer_id"]) for item in questions_with_answers]
        id_batches = [accepted_ids[i:i+100] for i in range(0, len(accepted_ids), 100)]

        accepted_answers = {}
        for batch in id_batches:
            ids_str = ";".join(batch)
            ans_url = f"https://api.stackexchange.com/2.3/answers/{ids_str}"
            ans_params = {
                "site": SITE,
                "filter": "withbody",
                "key": API_KEY
            }
            ans_response = requests.get(ans_url, params=ans_params)
            if ans_response.status_code == 200:
                try:
                    ans_data = ans_response.json()
                    for ans in ans_data.get("items", []):
                        accepted_answers[ans["answer_id"]] = ans.get("body")
                except ValueError:
                    continue

            time.sleep(0.5)  # avoid hitting batch rate limits

        # Combine only valid Q&A pairs
        for item in questions_with_answers:
            if valid_question_count >= TARGET_COUNT:
                break

            aid = item["accepted_answer_id"]
            abody = accepted_answers.get(aid)

            if abody:  # Only store if body exists
                question_data = {
                    "question_id": item.get("question_id"),
                    "title": item.get("title"),
                    "body": item.get("body"),
                    "tags": item.get("tags"),
                    "accepted_answer_id": aid,
                    "accepted_answer_body": abody,
                    "score": item.get("score")
                }
                all_questions.append(question_data)
                valid_question_count += 1

        print(f" Total collected so far: {valid_question_count}")
        time.sleep(0.5)

# === SAVE TO FILE ===
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    json.dump(all_questions, f, indent=4)

print(f" Done! Saved {valid_question_count} valid Q&A posts to {OUTPUT_FILE}")


SITE = "stackoverflow"
PAGE_SIZE = 100
MAX_PAGES = 200
OUTPUT_FILE = "stackoverflow_nlp_posts_with_answers_final-stretch_p3.json"
API_KEY = "rl_MdVXEpV7UgL1XMwe5e5BwoqEa"  # Replace with your key
TARGET_COUNT = 20000

all_questions = []
valid_question_count = 0

# === START LOOPING THROUGH TAGS AND PAGES ===
for tag in TAGS:
    if valid_question_count >= TARGET_COUNT:
        break

    for page in range(1, MAX_PAGES + 1):
        if valid_question_count >= TARGET_COUNT:
            break

        print(f"Fetching page {page} for tag [{tag}]...")

        # Get questions
        url = "https://api.stackexchange.com/2.3/questions"
        params = {
            "page": page,
            "pagesize": PAGE_SIZE,
            "order": "desc",
            "sort": "creation",
            "tagged": tag,
            "site": SITE,
            "filter": "withbody",
            "key": API_KEY
        }

        response = requests.get(url, params=params)
        if response.status_code != 200:
            print(f" Error {response.status_code}: {response.text}")
            break

        try:
            data = response.json()
        except ValueError:
            print(" Invalid JSON response. Skipping this page.")
            continue

        if "items" not in data or not data["items"]:
            print(f"No items found for tag [{tag}], page {page}")
            break

        # Filter questions with accepted_answer_id
        questions_with_answers = [
            item for item in data["items"]
            if item.get("accepted_answer_id")
        ]

        # Batch accepted_answer_ids (max 100 per API call)
        accepted_ids = [str(item["accepted_answer_id"]) for item in questions_with_answers]
        id_batches = [accepted_ids[i:i+100] for i in range(0, len(accepted_ids), 100)]

        accepted_answers = {}
        for batch in id_batches:
            ids_str = ";".join(batch)
            ans_url = f"https://api.stackexchange.com/2.3/answers/{ids_str}"
            ans_params = {
                "site": SITE,
                "filter": "withbody",
                "key": API_KEY
            }
            ans_response = requests.get(ans_url, params=ans_params)
            if ans_response.status_code == 200:
                try:
                    ans_data = ans_response.json()
                    for ans in ans_data.get("items", []):
                        accepted_answers[ans["answer_id"]] = ans.get("body")
                except ValueError:
                    continue

            time.sleep(0.5)  # avoid hitting batch rate limits

        # Combine only valid Q&A pairs
        for item in questions_with_answers:
            if valid_question_count >= TARGET_COUNT:
                break

            aid = item["accepted_answer_id"]
            abody = accepted_answers.get(aid)

            if abody:  # Only store if body exists
                question_data = {
                    "question_id": item.get("question_id"),
                    "title": item.get("title"),
                    "body": item.get("body"),
                    "tags": item.get("tags"),
                    "accepted_answer_id": aid,
                    "accepted_answer_body": abody,
                    "score": item.get("score")
                }
                all_questions.append(question_data)
                valid_question_count += 1

        print(f" Total collected so far: {valid_question_count}")
        time.sleep(0.5)

# === SAVE TO FILE ===
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    json.dump(all_questions, f, indent=4)

print(f" Done! Saved {valid_question_count} valid Q&A posts to {OUTPUT_FILE}")


Fetching page 1 for tag [text-matching]...
No items found for tag [text-matching], page 1
Fetching page 1 for tag [semantic-search]...
✅ Total collected so far: 9
Fetching page 2 for tag [semantic-search]...
No items found for tag [semantic-search], page 2
Fetching page 1 for tag [intent-recognition]...
No items found for tag [intent-recognition], page 1
Fetching page 1 for tag [slot-filling]...
No items found for tag [slot-filling], page 1
Fetching page 1 for tag [relation-extraction]...
✅ Total collected so far: 9
Fetching page 2 for tag [relation-extraction]...
No items found for tag [relation-extraction], page 2
Fetching page 1 for tag [multi-label-classification]...
No items found for tag [multi-label-classification], page 1
Fetching page 1 for tag [zero-shot-classification]...
No items found for tag [zero-shot-classification], page 1
Fetching page 1 for tag [co-reference-resolution]...
No items found for tag [co-reference-resolution], page 1
Fetching page 1 for tag [text-vectoriz

In [None]:
import pandas as pd

# === File Paths ===
csv_file_1 = "combined_20_csvs.csv"
csv_file_2 = "stackoverflow_nlp_posts_with_answers_final-stretch_p2.csv"
output_csv = "combined_V1_csv.csv"

# === Load the CSVs ===
df1 = pd.read_csv(csv_file_1)
df2 = pd.read_csv(csv_file_2)

# === Combine them ===
combined_df = pd.concat([df1, df2], ignore_index=True)

# === Remove duplicates based on 'question_id' column ===
if "question_id" in combined_df.columns:
    combined_df = combined_df.drop_duplicates(subset="question_id")

# === Save the combined file ===
combined_df.to_csv(output_csv, index=False, encoding="utf-8")

print(f" Combined CSV saved as '{output_csv}' with {len(combined_df)} unique rows.")


In [None]:
import json
import pandas as pd

# === File Paths ===
INPUT_JSON = "stackoverflow_nlp_posts_with_answers_final-stretch_p3.json"
OUTPUT_CSV = "stackoverflow_nlp_posts_with_answers_final-stretch_p3.csv"

# === Load JSON Data ===
with open(INPUT_JSON, "r", encoding="utf-8") as file:
    data = json.load(file)

# === Convert to DataFrame ===
df = pd.DataFrame(data)

# === Add Link to Question ===
df["link"] = "https://stackoverflow.com/questions/" + df["question_id"].astype(str)

# === Extract First Tag ===
df["tag"] = df["tags"].apply(lambda x: x[0] if isinstance(x, list) and len(x) > 0 else None)

# === Reorder and Keep Specific Columns ===
ordered_columns = [
    "question_id",
    "title",
    "body",
    "tags",
    "accepted_answer_id",
    "accepted_answer_body",
    "link",
    "tag"
]

df = df[ordered_columns]

# === Save to CSV ===
df.to_csv(OUTPUT_CSV, index=False, encoding="utf-8")

print(f" CSV saved successfully as: {OUTPUT_CSV}")



✅ CSV saved successfully as: stackoverflow_nlp_posts_with_answers_final-stretch_p3.csv


In [None]:
import pandas as pd

# === File Paths ===
csv_file_1 = "combined_20_csvs.csv"
csv_file_2 = "stackoverflow_nlp_posts_with_answers_final-stretch_p3.csv"
output_csv = "combined_V2_csv.csv"

# === Load the CSVs ===
df1 = pd.read_csv(csv_file_1)
df2 = pd.read_csv(csv_file_2)

# === Combine them ===
combined_df = pd.concat([df1, df2], ignore_index=True)

# === Remove duplicates based on 'question_id' column ===
if "question_id" in combined_df.columns:
    combined_df = combined_df.drop_duplicates(subset="question_id")

# === Save the combined file ===
combined_df.to_csv(output_csv, index=False, encoding="utf-8")

print(f" Combined CSV saved as '{output_csv}' with {len(combined_df)} unique rows.")


✅ Combined CSV saved as 'combined_V2_csv.csv' with 14343 unique rows.


In [80]:
df_V2_csv = pd.read_csv("combined_V2_csv.csv")
df_V2_csv.shape

(14343, 8)

In [None]:
import requests
import time
import json



# === SETTINGS ===
TAGS = [
    "text-cleaning",
    "text-normalization",
    "language-translation",
    "term-extraction",
    "ocr-to-text",
    "unsupervised-nlp",
    "sentence-segmentation",
    "language-identification",
    "document-similarity"
]



SITE = "stackoverflow"
PAGE_SIZE = 100
MAX_PAGES = 200
OUTPUT_FILE = "stackoverflow_nlp_posts_with_answers_final-stretch_p4.json"
API_KEY = "rl_MdVXEpV7UgL1XMwe5e5BwoqEa"  # Replace with your key
TARGET_COUNT = 20000

all_questions = []
valid_question_count = 0

# === START LOOPING THROUGH TAGS AND PAGES ===
for tag in TAGS:
    if valid_question_count >= TARGET_COUNT:
        break

    for page in range(1, MAX_PAGES + 1):
        if valid_question_count >= TARGET_COUNT:
            break

        print(f"Fetching page {page} for tag [{tag}]...")

        # Get questions
        url = "https://api.stackexchange.com/2.3/questions"
        params = {
            "page": page,
            "pagesize": PAGE_SIZE,
            "order": "desc",
            "sort": "creation",
            "tagged": tag,
            "site": SITE,
            "filter": "withbody",
            "key": API_KEY
        }

        response = requests.get(url, params=params)
        if response.status_code != 200:
            print(f" Error {response.status_code}: {response.text}")
            break

        try:
            data = response.json()
        except ValueError:
            print(" Invalid JSON response. Skipping this page.")
            continue

        if "items" not in data or not data["items"]:
            print(f"No items found for tag [{tag}], page {page}")
            break

        # Filter questions with accepted_answer_id
        questions_with_answers = [
            item for item in data["items"]
            if item.get("accepted_answer_id")
        ]

        # Batch accepted_answer_ids (max 100 per API call)
        accepted_ids = [str(item["accepted_answer_id"]) for item in questions_with_answers]
        id_batches = [accepted_ids[i:i+100] for i in range(0, len(accepted_ids), 100)]

        accepted_answers = {}
        for batch in id_batches:
            ids_str = ";".join(batch)
            ans_url = f"https://api.stackexchange.com/2.3/answers/{ids_str}"
            ans_params = {
                "site": SITE,
                "filter": "withbody",
                "key": API_KEY
            }
            ans_response = requests.get(ans_url, params=ans_params)
            if ans_response.status_code == 200:
                try:
                    ans_data = ans_response.json()
                    for ans in ans_data.get("items", []):
                        accepted_answers[ans["answer_id"]] = ans.get("body")
                except ValueError:
                    continue

            time.sleep(0.5)  # avoid hitting batch rate limits

        # Combine only valid Q&A pairs
        for item in questions_with_answers:
            if valid_question_count >= TARGET_COUNT:
                break

            aid = item["accepted_answer_id"]
            abody = accepted_answers.get(aid)

            if abody:  # Only store if body exists
                question_data = {
                    "question_id": item.get("question_id"),
                    "title": item.get("title"),
                    "body": item.get("body"),
                    "tags": item.get("tags"),
                    "accepted_answer_id": aid,
                    "accepted_answer_body": abody,
                    "score": item.get("score")
                }
                all_questions.append(question_data)
                valid_question_count += 1

        print(f" Total collected so far: {valid_question_count}")
        time.sleep(0.5)

# === SAVE TO FILE ===
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    json.dump(all_questions, f, indent=4)

print(f" Done! Saved {valid_question_count} valid Q&A posts to {OUTPUT_FILE}")





Fetching page 1 for tag [text-cleaning]...
No items found for tag [text-cleaning], page 1
Fetching page 1 for tag [text-normalization]...
✅ Total collected so far: 12
Fetching page 2 for tag [text-normalization]...
No items found for tag [text-normalization], page 2
Fetching page 1 for tag [language-translation]...
✅ Total collected so far: 38
Fetching page 2 for tag [language-translation]...
✅ Total collected so far: 68
Fetching page 3 for tag [language-translation]...
✅ Total collected so far: 98
Fetching page 4 for tag [language-translation]...
✅ Total collected so far: 128
Fetching page 5 for tag [language-translation]...
✅ Total collected so far: 129
Fetching page 6 for tag [language-translation]...
No items found for tag [language-translation], page 6
Fetching page 1 for tag [term-extraction]...
No items found for tag [term-extraction], page 1
Fetching page 1 for tag [ocr-to-text]...
No items found for tag [ocr-to-text], page 1
Fetching page 1 for tag [unsupervised-nlp]...
No ite