In [3]:
pip install stackapi

Collecting stackapi
  Downloading StackAPI-0.3.1-py3-none-any.whl.metadata (2.3 kB)
Downloading StackAPI-0.3.1-py3-none-any.whl (7.2 kB)
Installing collected packages: stackapi
Successfully installed stackapi-0.3.1
Note: you may need to restart the kernel to use updated packages.


In [1]:
from stackapi import StackAPI
import pandas as pd
from datetime import datetime
import calendar
import time

In [2]:
API_KEYS = [
    'rl_aLCzCCSQbfYe8u564o3ZjHCbQ',
    'rl_A2PxPHTZyZXSGdpgXD1BdCVgS',
    'rl_XZH4h9xu9QcJ2N95h9M1Jn125'
]

TAGS = ['nlp', 'bert', 'tokenisation', 'language model', 'summarization', 'text classification', 'spacy', 'nltk', 'sentiment analysis', 'stemming', 'lemmatization', 'natural language processing']
BATCH_SIZE = 30
PAGE_SIZE = 100
MAX_POSTS = 30000  # Slightly above 30k to ensure full coverage
START_YEAR = 2008
END_YEAR = 2025



In [5]:
key_index = 0
posts = []
seen_ids = set()

In [7]:
def get_site():
    return StackAPI('stackoverflow', key=API_KEYS[key_index], page_size=PAGE_SIZE)

SITE = get_site()

def switch_key():
    global key_index, SITE
    key_index = (key_index + 1) % len(API_KEYS)
    print(f"\n Switching to key ending in: {API_KEYS[key_index][-5:]}")
    SITE = get_site()

def collect_questions_for_year(tag, year):
    global SITE
    fromdate = calendar.timegm(datetime(year, 1, 1).utctimetuple())
    todate = calendar.timegm(datetime(year, 12, 31).utctimetuple())
    page = 1

    while True:
        if len(posts) >= MAX_POSTS:
            break
        try:
            print(f"[{tag}] Year: {year} | Page: {page} | Key: {API_KEYS[key_index][-5:]}")
            questions = SITE.fetch(
                'questions',
                tagged=tag,
                fromdate=fromdate,
                todate=todate,
                filter='withbody',
                page=page
            )

            for item in questions.get('items', []):
                qid = item.get('question_id')
                if qid not in seen_ids:
                    seen_ids.add(qid)
                    posts.append({
                        'post_id': qid,
                        'title': item.get('title', ''),
                        'description': item.get('body', ''),
                        'tags': ', '.join(item.get('tags', [])),
                        'accepted_answer_id': item.get('accepted_answer_id'),
                        'creation_date': pd.to_datetime(item.get('creation_date', 0), unit='s'),
                        'view_count': item.get('view_count', 0),
                        'tag_matched': tag  # <-- New column to store which tag matched
                })

            if not questions.get('has_more', False):
                break
            page += 1
            time.sleep(0.5)

        except Exception as e:
            print(f" Error on page {page}, year {year}: {e}")
            switch_key()
            time.sleep(3)
            continue
    print(f"Collected {len(seen_ids)} unique posts so far.")

In [9]:
for year in range(START_YEAR, END_YEAR + 1):
    for tag in TAGS:
        collect_questions_for_year(tag, year)
    if len(posts) >= MAX_POSTS:
        break

df = pd.DataFrame(posts)
print(f"\n Total RAW posts collected: {len(df)}")


[nlp] Year: 2008 | Page: 1 | Key: jHCbQ
Collected 44 unique posts so far.
[bert] Year: 2008 | Page: 1 | Key: jHCbQ
Collected 44 unique posts so far.
[tokenisation] Year: 2008 | Page: 1 | Key: jHCbQ
Collected 44 unique posts so far.
[language model] Year: 2008 | Page: 1 | Key: jHCbQ
Collected 44 unique posts so far.
[summarization] Year: 2008 | Page: 1 | Key: jHCbQ
Collected 44 unique posts so far.
[text classification] Year: 2008 | Page: 1 | Key: jHCbQ
Collected 44 unique posts so far.
[spacy] Year: 2008 | Page: 1 | Key: jHCbQ
Collected 44 unique posts so far.
[nltk] Year: 2008 | Page: 1 | Key: jHCbQ
Collected 46 unique posts so far.
[sentiment analysis] Year: 2008 | Page: 1 | Key: jHCbQ
Collected 46 unique posts so far.
[stemming] Year: 2008 | Page: 1 | Key: jHCbQ
Collected 48 unique posts so far.
[lemmatization] Year: 2008 | Page: 1 | Key: jHCbQ
Collected 48 unique posts so far.
[natural language processing] Year: 2008 | Page: 1 | Key: jHCbQ
Collected 48 unique posts so far.
[nlp] Ye

In [11]:
df.to_csv('stackoverflow_raw_posts.csv', index=False)
print(f"\n Saved ALL {len(df)} RAW posts to 'stackoverflow_raw_posts.csv'")



 Saved ALL 23089 RAW posts to 'stackoverflow_raw_posts.csv'


In [13]:
def fetch_accepted_answers(df):
    SITE_ANS = StackAPI('stackoverflow', key=API_KEYS[-1])
    answer_bodies = {}
    accepted_ids = df['accepted_answer_id'].dropna().astype(int).unique().tolist()
    print(f" Fetching {len(accepted_ids)} accepted answers in batches...")

    for i in range(0, len(accepted_ids), BATCH_SIZE):
        batch = accepted_ids[i:i + BATCH_SIZE]
        try:
            result = SITE_ANS.fetch('answers/{ids}', ids=batch, filter='withbody')
            for item in result.get('items', []):
                answer_bodies[item['answer_id']] = item.get('body', '')
        except Exception as e:
            print(f" Batch fetch error: {e}")
            time.sleep(5)
        time.sleep(0.5)

    return answer_bodies

accepted_answers = fetch_accepted_answers(df)

# CORRECT MERGE: Add answers WITHOUT REMOVING ANY ROW
df['accepted_answer'] = df['accepted_answer_id'].map(accepted_answers).fillna('')
df.drop(columns='accepted_answer_id', inplace=True)

# SAVE FINAL CSV CLEARLY
df.to_csv('stackoverflow_nlp_with_answers.csv', index=False)
print(f" Saved {len(df)} posts (with or without answers) to 'stackoverflow_nlp_with_answers.csv'")

 Fetching 9517 accepted answers in batches...
 Saved 23089 posts (with or without answers) to 'stackoverflow_nlp_with_answers.csv'
