In [1]:
pip install requests

Note: you may need to restart the kernel to use updated packages.


In [None]:
import requests
import csv
import json
import time
import pandas as pd

# Load existing CSV to avoid duplicates
existing_df = pd.read_csv('combined_V2_csv.csv')
existing_ids = set(existing_df['question_id'].astype(int))

# API Setup
API_KEY = 'rl_MdVXEpV7UgL1XMwe5e5BwoqEa'  # Replace with your real API key
BASE_URL = "https://api.stackexchange.com/2.3/questions"
SITE = "stackoverflow"
TAG = "nlp"
PAGESIZE = 100
MAX_PAGES = 200

# Output JSON file
output_data = []

# Pagination
for page in range(1, MAX_PAGES + 1):
    print(f"Fetching page {page}...")
    params = {
        'page': page,
        'pagesize': PAGESIZE,
        'order': 'desc',
        'sort': 'activity',
        'tagged': TAG,
        'site': SITE,
        'filter': 'withbody',
        'key': API_KEY
    }

    response = requests.get(BASE_URL, params=params)
    
    if response.status_code == 502:  # Retry on bad gateway
        print("502 Error. Retrying after 5 seconds...")
        time.sleep(5)
        continue

    if response.status_code != 200:
        print(f"API Error: {response.status_code}")
        break

    items = response.json().get('items', [])
    if not items:
        print("No more items.")
        break

    for item in items:
        qid = item.get('question_id')
        if not item.get('accepted_answer_id') or qid in existing_ids:
            continue

        # Fetch accepted answer body
        answer_id = item['accepted_answer_id']
        answer_url = f"https://api.stackexchange.com/2.3/answers/{answer_id}"
        ans_params = {
            'order': 'desc',
            'sort': 'activity',
            'site': SITE,
            'filter': 'withbody',
            'key': API_KEY
        }

        ans_resp = requests.get(answer_url, params=ans_params)
        if ans_resp.status_code != 200:
            continue

        ans_items = ans_resp.json().get('items', [])
        if not ans_items:
            continue

        answer_body = ans_items[0].get('body', '')
        if not answer_body:
            continue

        # All conditions met; collect required fields
        entry = {
            'question_id': qid,
            'title': item.get('title'),
            'body': item.get('body'),
            'tags': item.get('tags'),
            'accepted_answer_id': answer_id,
            'accepted_answer_body': answer_body,
            'score': item.get('score')
        }
        output_data.append(entry)

    time.sleep(1.5)  # Avoid hitting rate limits

# Save to JSON
with open('nlp_questions_output.json', 'w', encoding='utf-8') as f:
    json.dump(output_data, f, ensure_ascii=False, indent=2)

print(f"Done. Saved {len(output_data)} new entries to nlp_questions_output.json")


📥 Fetching page 1...
📥 Fetching page 2...
📥 Fetching page 3...
📥 Fetching page 4...
📥 Fetching page 5...
📥 Fetching page 6...
📥 Fetching page 7...
📥 Fetching page 8...
📥 Fetching page 9...
📥 Fetching page 10...
📥 Fetching page 11...
📥 Fetching page 12...
📥 Fetching page 13...
📥 Fetching page 14...
📥 Fetching page 15...
📥 Fetching page 16...
📥 Fetching page 17...
📥 Fetching page 18...
📥 Fetching page 19...
📥 Fetching page 20...
📥 Fetching page 21...
📥 Fetching page 22...
📥 Fetching page 23...
📥 Fetching page 24...
📥 Fetching page 25...
📥 Fetching page 26...
📥 Fetching page 27...
📥 Fetching page 28...
📥 Fetching page 29...
📥 Fetching page 30...
📥 Fetching page 31...
📥 Fetching page 32...
📥 Fetching page 33...
📥 Fetching page 34...
📥 Fetching page 35...
📥 Fetching page 36...
📥 Fetching page 37...
📥 Fetching page 38...
📥 Fetching page 39...
📥 Fetching page 40...
📥 Fetching page 41...
📥 Fetching page 42...
📥 Fetching page 43...
📥 Fetching page 44...
📥 Fetching page 45...
📥 Fetching page 46.

In [None]:
import json
import csv

# Load JSON data
with open('nlp_questions_output.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Define CSV file path
csv_file = 'nlp_questions_converted.csv'

# Define headers (columns)
headers = [
    'question_id',
    'title',
    'body',
    'tags',
    'accepted_answer_id',
    'accepted_answer_body',
    'link',
    'tag'
]

# Write to CSV
with open(csv_file, 'w', newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(f, fieldnames=headers)
    writer.writeheader()

    for entry in data:
        writer.writerow({
            'question_id': entry.get('question_id'),
            'title': entry.get('title'),
            'body': entry.get('body'),
            'tags': ", ".join(entry.get('tags', [])),
            'accepted_answer_id': entry.get('accepted_answer_id'),
            'accepted_answer_body': entry.get('accepted_answer_body'),
            'link': f"https://stackoverflow.com/q/{entry.get('question_id')}",
            'tag': ", ".join(entry.get('tags', []))  # 'tag' is repeated for clarity
        })

print("CSV file saved as 'nlp_questions_converted.csv'")


✅ CSV file saved as 'nlp_questions_converted.csv'


In [None]:
import pandas as pd

# Load both CSV files
csv1 = pd.read_csv('combined_V2_csv.csv')
csv2 = pd.read_csv('nlp_questions_converted.csv')

# Combine them
combined = pd.concat([csv1, csv2], ignore_index=True)

# Save the combined CSV
combined.to_csv('combined_V3_output.csv', index=False)

print("Combined file saved as 'combined_V3_output.csv'")


✅ Combined file saved as 'combined_V3_output.csv'


In [7]:
df = pd.read_csv("combined_V3_output.csv")
df.shape

(14589, 8)

In [None]:
import requests
import csv
import json
import time

# API Setup
API_KEY = 'rl_MdVXEpV7UgL1XMwe5e5BwoqEa'  # Replace with your actual API key
BASE_URL = "https://api.stackexchange.com/2.3/questions"
SITE = "stackoverflow"
TAG = "nlp"
PAGESIZE = 100
MAX_PAGES = 200

# Output JSON file
output_data = []

# Pagination
for page in range(1, MAX_PAGES + 1):
    print(f"Fetching page {page}...")
    params = {
        'page': page,
        'pagesize': PAGESIZE,
        'order': 'desc',
        'sort': 'activity',
        'tagged': TAG,
        'site': SITE,
        'filter': 'withbody',
        'key': API_KEY
    }

    response = requests.get(BASE_URL, params=params)
    
    if response.status_code == 502:  # Retry on bad gateway
        print("502 Error. Retrying after 5 seconds...")
        time.sleep(5)
        continue

    if response.status_code != 200:
        print(f"API Error: {response.status_code}")
        break

    items = response.json().get('items', [])
    if not items:
        print("No more items.")
        break

    for item in items:
        qid = item.get('question_id')
        if not item.get('accepted_answer_id'):
            continue

        # Fetch accepted answer body
        answer_id = item['accepted_answer_id']
        answer_url = f"https://api.stackexchange.com/2.3/answers/{answer_id}"
        ans_params = {
            'order': 'desc',
            'sort': 'activity',
            'site': SITE,
            'filter': 'withbody',
            'key': API_KEY
        }

        ans_resp = requests.get(answer_url, params=ans_params)
        if ans_resp.status_code != 200:
            continue

        ans_items = ans_resp.json().get('items', [])
        if not ans_items:
            continue

        answer_body = ans_items[0].get('body', '')
        if not answer_body:
            continue

        # Collect required fields
        entry = {
            'question_id': qid,
            'title': item.get('title'),
            'body': item.get('body'),
            'tags': item.get('tags'),
            'accepted_answer_id': answer_id,
            'accepted_answer_body': answer_body,
            'score': item.get('score')
        }
        output_data.append(entry)

    time.sleep(1.5)  # Avoid hitting rate limits

# Save to JSON
with open('nlp_questions_output_nodup.json', 'w', encoding='utf-8') as f:
    json.dump(output_data, f, ensure_ascii=False, indent=2)

print(f"Done. Saved {len(output_data)} entries to nlp_questions_output_nodup.json")


📥 Fetching page 1...
📥 Fetching page 2...
📥 Fetching page 3...
📥 Fetching page 4...
📥 Fetching page 5...
📥 Fetching page 6...
📥 Fetching page 7...
📥 Fetching page 8...
📥 Fetching page 9...
📥 Fetching page 10...
📥 Fetching page 11...
📥 Fetching page 12...
📥 Fetching page 13...
📥 Fetching page 14...
📥 Fetching page 15...
📥 Fetching page 16...
📥 Fetching page 17...
📥 Fetching page 18...
📥 Fetching page 19...
📥 Fetching page 20...
📥 Fetching page 21...
📥 Fetching page 22...
📥 Fetching page 23...
📥 Fetching page 24...
📥 Fetching page 25...
📥 Fetching page 26...
📥 Fetching page 27...
📥 Fetching page 28...
📥 Fetching page 29...
📥 Fetching page 30...
📥 Fetching page 31...
📥 Fetching page 32...
📥 Fetching page 33...
📥 Fetching page 34...
📥 Fetching page 35...
📥 Fetching page 36...
📥 Fetching page 37...
📥 Fetching page 38...
📥 Fetching page 39...
📥 Fetching page 40...
📥 Fetching page 41...
📥 Fetching page 42...
📥 Fetching page 43...
📥 Fetching page 44...
📥 Fetching page 45...
📥 Fetching page 46.

In [None]:
import json
import csv

# Load JSON data
with open('nlp_questions_output_nodup.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Define CSV file path
csv_file = 'nlp_questions_converted.csv'

# Define headers (columns)
headers = [
    'question_id',
    'title',
    'body',
    'tags',
    'accepted_answer_id',
    'accepted_answer_body',
    'link',
    'tag'
]

# Write to CSV
with open(csv_file, 'w', newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(f, fieldnames=headers)
    writer.writeheader()

    for entry in data:
        writer.writerow({
            'question_id': entry.get('question_id'),
            'title': entry.get('title'),
            'body': entry.get('body'),
            'tags': ", ".join(entry.get('tags', [])),
            'accepted_answer_id': entry.get('accepted_answer_id'),
            'accepted_answer_body': entry.get('accepted_answer_body'),
            'link': f"https://stackoverflow.com/q/{entry.get('question_id')}",
            'tag': ", ".join(entry.get('tags', []))
        })

print("CSV file saved as 'nlp_questions_converted.csv'")


✅ CSV file saved as 'nlp_questions_converted.csv'


In [14]:
df_2 = pd.read_csv("nlp_questions_converted.csv")
df_2.head()
df_2.shape


(8136, 8)

In [16]:
import pandas as pd

# Load both CSV files
csv1 = pd.read_csv('combined_V3_output.csv')
csv2 = pd.read_csv('nlp_questions_converted.csv')

# Combine them
combined = pd.concat([csv1, csv2], ignore_index=True)

# Save the combined CSV
combined.to_csv('combined_V4_output.csv', index=False)

print("✅ Combined file saved as 'combined_V4_output.csv'")


✅ Combined file saved as 'combined_V4_output.csv'


In [17]:
df_f = pd.read_csv("combined_V4_output.csv")
df_f.shape

(22725, 8)

In [2]:
import pandas as pd 

In [3]:
df_f = pd.read_csv("combined_V4_output.csv")
df_f.head()

Unnamed: 0,question_id,title,body,tags,accepted_answer_id,accepted_answer_body,link,tag
0,79549787,Why does Presidio with spacy nlp engine not re...,<p>I'm using spaCy with the pl_core_news_lg mo...,"['python', 'nlp', 'spacy', 'presidio']",79552218,<p>The configuration file is missing the 'labe...,https://stackoverflow.com/questions/79549787,python
1,79548202,GPT-2 and other models from huggingface -100 l...,<p>I understand the -100 label id is used so t...,"['nlp', 'huggingface-transformers', 'pre-train...",79551169,<p>The author of the tutorial you mentioned se...,https://stackoverflow.com/questions/79548202,nlp
2,79523269,Trouble getting importing gensim to work in colab,<p>I am trying to import gensim into colab.</p...,"['numpy', 'nlp', 'dependencies', 'google-colab...",79523777,<p>You have to restart the session for the und...,https://stackoverflow.com/questions/79523269,numpy
3,79501178,Store images instead of showing in a server,<p>I am running the code found on this <a href...,"['python', 'nlp', 'large-language-model']",79501337,<p>I can't test it but ...</p>\n<p>I checked <...,https://stackoverflow.com/questions/79501178,python
4,79482283,Presidio with Langchain Experimental does not ...,<p>I am using presidio/langchain_experimental ...,"['python', 'nlp', 'spacy', 'langchain', 'presi...",79495969,<p>After some test I was able to find the solu...,https://stackoverflow.com/questions/79482283,python
