In [13]:
from stackapi import StackAPI
from tqdm import tqdm
import polars as pl
import os
import requests

## Initialisation


In [2]:
BASE_MAX_PAGES = 400

KEY = "wDnPPn8eHO*UvPtOokRQxA(("

SITE = StackAPI("stackoverflow")
SITE.page_size = 100  # max page size

### Common schemas


In [3]:
badge_count_schema = {  # Based on https://api.stackexchange.com/docs/types/badge-count
    "bronze": pl.Int64,
    "gold": pl.Int64,
    "silver": pl.Int64,
}
user_type_enum = pl.Enum(
    [
        "unregistered",
        "registered",
        "moderator",
        "team_admin",
        "does_not_exist",
    ]
)
shallow_user_schema = {  # Based on https://api.stackexchange.com/docs/types/shallow-user
    "accept_rate": pl.Int64,
    # "account_id": pl.Int64, # Only available in 2.3
    "badge_counts": pl.Struct(badge_count_schema),
    "display_name": pl.String,
    "link": pl.String,
    "profile_image": pl.String,
    "reputation": pl.Int64,
    "user_id": pl.Int64,
    "user_type": user_type_enum,
}
comment_schema = {  # Based on https://api.stackexchange.com/docs/types/comment
    "body": pl.String,
    "body_markdown": pl.String,
    "can_flag": pl.Boolean,
    "comment_id": pl.Int64,
    "content_license": pl.String,
    "creation_date": pl.Date,
    "edited": pl.Boolean,
    "link": pl.String,
    "owner": pl.Struct(shallow_user_schema),
    "post_id": pl.Int64,
    "post_type": pl.Enum(["question", "answer", "article"]),
    "reply_to_user": pl.Struct(shallow_user_schema),
    "score": pl.Int64,
    # "upvoted": pl.Boolean, # Private
}
notice_schema = {  # Based on https://api.stackexchange.com/docs/types/notice
    "body": pl.String,
    "creation_date": pl.Date,
    "owner_user_id": pl.Int64,
}

### Question Type and Dataframe

Schema from the [Stack Exchange API Documentation](https://api.stackexchange.com/docs/types/question)

| name                  | type             | included in the default filter | note                                                       | is private |
| --------------------- | ---------------- | ------------------------------ | ---------------------------------------------------------- | ---------- |
| accepted_answer_id    | integer          | Yes                            | may be absent                                              |            |
| answer_count          | integer          | Yes                            |                                                            |            |
| answers               | array            | No                             | may be absent                                              |            |
| body                  | string           | No                             | unchanged in unsafe filters                                |            |
| body_markdown         | string           | No                             | introduced in version 2.2                                  |            |
| bounty_amount         | integer          | Yes                            | may be absent                                              |            |
| bounty_closes_date    | date             | Yes                            | may be absent                                              |            |
| bounty_user           | shallow_user     | No                             | may be absent, introduced in version 2.2                   |            |
| can_answer            | boolean          | No                             | introduced in version 2.3                                  |            |
| can_close             | boolean          | No                             | introduced in version 2.2                                  |            |
| can_comment           | boolean          | No                             | introduced in version 2.3                                  |            |
| can_edit              | boolean          | No                             | introduced in version 2.3                                  |            |
| can_flag              | boolean          | No                             | introduced in version 2.2                                  |            |
| can_suggest_edit      | boolean          | No                             | introduced in version 2.3                                  |            |
| close_vote_count      | integer          | No                             | introduced in version 2.1                                  |            |
| closed_date           | date             | Yes                            | may be absent                                              |            |
| closed_details        | closed_details   | No                             | may be absent, introduced in version 2.2                   |            |
| closed_reason         | string           | Yes                            | may be absent                                              |            |
| collectives           | array            | Yes                            | introduced in version 2.3                                  |            |
| comment_count         | integer          | No                             | introduced in version 2.2                                  |            |
| comments              | array            | No                             | may be absent                                              |            |
| community_owned_date  | date             | Yes                            | may be absent                                              |            |
| content_license       | string           | Yes                            | introduced in version 2.2                                  |            |
| creation_date         | date             | Yes                            |                                                            |            |
| delete_vote_count     | integer          | No                             | introduced in version 2.1                                  |            |
| down_vote_count       | integer          | No                             |                                                            |            |
| downvoted             | boolean          | No                             | requires the private_info scope, introduced in version 2.2 | Yes        |
| favorite_count        | integer          | No                             |                                                            |            |
| favorited             | boolean          | No                             | requires the private_info scope, introduced in version 2.2 | Yes        |
| is_answered           | boolean          | Yes                            |                                                            |            |
| last_activity_date    | date             | Yes                            |                                                            |            |
| last_edit_date        | date             | Yes                            | may be absent                                              |            |
| last_editor           | shallow_user     | No                             | introduced in version 2.2                                  |            |
| link                  | string           | Yes                            | unchanged in unsafe filters                                |            |
| locked_date           | date             | Yes                            | may be absent                                              |            |
| migrated_from         | migration_info   | Yes                            | may be absent                                              |            |
| migrated_to           | migration_info   | Yes                            | may be absent                                              |            |
| notice                | notice           | No                             | introduced in version 2.1                                  |            |
| owner                 | shallow_user     | Yes                            | may be absent                                              |            |
| posted_by_collectives | array            | Yes                            | introduced in version 2.3                                  |            |
| protected_date        | date             | Yes                            | may be absent                                              |            |
| question_id           | integer          | Yes                            | refers to a question                                       |            |
| reopen_vote_count     | integer          | No                             | introduced in version 2.1                                  |            |
| score                 | integer          | Yes                            |                                                            |            |
| share_link            | string           | No                             | unchanged in unsafe filters, introduced in version 2.2     |            |
| tags                  | array of strings | Yes                            |                                                            |            |
| title                 | string           | Yes                            |                                                            |            |
| up_vote_count         | integer          | No                             |                                                            |            |
| upvoted               | boolean          | No                             | requires the private_info scope, introduced in version 2.2 | Yes        |
| view_count            | integer          | Yes                            |                                                            |            |


In [4]:
schema = {
    "accepted_answer_id": pl.Int64,
    "answer_count": pl.Int64,
    "answers": pl.List,
    "body": pl.String,
    "body_markdown": pl.String,
    "bounty_amount": pl.Int64,
    "bounty_closes_date": pl.Date,
    "bounty_user": pl.Struct,  # assuming shallow_user is a structured type
    # "can_answer": pl.Boolean, # Only available in 2.3
    # "can_close": pl.Boolean, # Only available in 2.3
    # "can_comment": pl.Boolean, # Only available in 2.3
    # "can_edit": pl.Boolean, # Only available in 2.3
    # "can_flag": pl.Boolean, # Only available in 2.3
    # "can_suggest_edit": pl.Boolean, # Only available in 2.3
    "close_vote_count": pl.Int64,
    "closed_date": pl.Date,
    "closed_details": pl.Struct,  # assuming closed_details is a structured type
    "closed_reason": pl.String,
    # "collectives": pl.List, # Only available in 2.3
    "comment_count": pl.Int64,
    "comments": pl.List(
        pl.Struct(comment_schema)
    ),  # assuming comment is a structured type
    "community_owned_date": pl.Date,
    "content_license": pl.String,
    "creation_date": pl.Date,
    "delete_vote_count": pl.Int64,
    "down_vote_count": pl.Int64,
    # "downvoted": pl.Boolean, # Private
    "favorite_count": pl.Int64,
    # "favorited": pl.Boolean, # Private
    "is_answered": pl.Boolean,
    "last_activity_date": pl.Date,
    "last_edit_date": pl.Date,
    "last_editor": pl.Struct(shallow_user_schema),
    "link": pl.String,
    "locked_date": pl.Date,
    "migrated_from": pl.Struct,  # assuming migration_info is a structured type
    "migrated_to": pl.Struct,  # assuming migration_info is a structured type
    "notice": pl.Struct(notice_schema),
    "owner": pl.Struct(shallow_user_schema),
    # "posted_by_collectives": pl.List, # Only available in 2.3
    "protected_date": pl.Date,
    "question_id": pl.Int64,
    "reopen_vote_count": pl.Int64,
    "score": pl.Int64,
    "share_link": pl.String,
    "tags": pl.List(pl.String),
    "title": pl.String,
    "up_vote_count": pl.Int64,
    # "upvoted": pl.Boolean, # Private
    "view_count": pl.Int64,
}

df_questions = pl.DataFrame(schema=schema)

### Answer Type and Dataframe

Schema from the [Stack Exchange API Documentation](https://api.stackexchange.com/docs/types/answer)

| name                  | type             | included in the default filter | note                                                   | is private |
| --------------------- | ---------------- | ------------------------------ | ------------------------------------------------------ | ---------- |
| accepted              | boolean          | No                             | introduced in version 2.2, requires private_info       | Yes        |
| answer_id             | integer          | Yes                            | refers to an answer                                    |            |
| awarded_bounty_amount | integer          | No                             | may be absent, introduced in version 2.2               |            |
| awarded_bounty_users  | array            | No                             | may be absent, introduced in version 2.2               |            |
| body                  | string           | No                             | unchanged in unsafe filters                            |            |
| body_markdown         | string           | No                             | introduced in version 2.2                              |            |
| can_comment           | boolean          | No                             | introduced in version 2.3                              |            |
| can_edit              | boolean          | No                             | introduced in version 2.3                              |            |
| can_flag              | boolean          | No                             | unchanged in unsafe filters, introduced in version 2.2 |            |
| can_suggest_edit      | boolean          | No                             | introduced in version 2.3                              |            |
| collectives           | array            | Yes                            | introduced in version 2.3                              |            |
| comment_count         | integer          | No                             | introduced in version 2.2                              |            |
| comments              | array            | No                             | may be absent                                          |            |
| community_owned_date  | date             | Yes                            | may be absent                                          |            |
| content_license       | string           | Yes                            | introduced in version 2.2                              |            |
| creation_date         | date             | Yes                            |                                                        |            |
| down_vote_count       | integer          | No                             |                                                        |            |
| downvoted             | boolean          | No                             | requires private_info, introduced in version 2.2       | Yes        |
| is_accepted           | boolean          | Yes                            |                                                        |            |
| last_activity_date    | date             | Yes                            |                                                        |            |
| last_edit_date        | date             | Yes                            | may be absent                                          |            |
| last_editor           | shallow_user     | No                             | introduced in version 2.2                              |            |
| link                  | string           | No                             | unchanged in unsafe filters                            |            |
| locked_date           | date             | Yes                            | may be absent                                          |            |
| owner                 | shallow_user     | Yes                            | may be absent                                          |            |
| posted_by_collectives | array            | Yes                            | introduced in version 2.3                              |            |
| question_id           | integer          | Yes                            | refers to a question                                   |            |
| recommendations       | array            | Yes                            | introduced in version 2.3                              |            |
| score                 | integer          | Yes                            |                                                        |            |
| share_link            | string           | No                             | unchanged in unsafe filters, introduced in version 2.2 |            |
| tags                  | array of strings | No                             | introduced in version 2.1                              |            |
| title                 | string           | No                             |                                                        |            |
| up_vote_count         | integer          | No                             |                                                        |            |
| upvoted               | boolean          | No                             | requires private_info, introduced in version 2.2       | Yes        |


In [5]:
schema = {
    # "accepted": pl.Boolean, # Private
    "answer_id": pl.Int64,
    "awarded_bounty_amount": pl.Int64,
    "awarded_bounty_users": pl.List(
        pl.Struct(shallow_user_schema)
    ),  # assuming shallow_user is a structured type
    "body": pl.String,
    "body_markdown": pl.String,
    "can_comment": pl.Boolean,
    # "can_edit": pl.Boolean, # Only available in 2.3
    "can_flag": pl.Boolean,
    # "can_suggest_edit": pl.Boolean,# Only available in 2.3
    # "collectives": pl.List(pl.Struct),  # assuming collective is a structured type # Only available in 2.3
    "comment_count": pl.Int64,
    "comments": pl.List(
        pl.Struct(comment_schema)
    ),  # assuming comment is a structured type
    "community_owned_date": pl.Date,
    "content_license": pl.String,
    "creation_date": pl.Date,
    "down_vote_count": pl.Int64,
    # "downvoted": pl.Boolean, # Private
    "is_accepted": pl.Boolean,
    "last_activity_date": pl.Date,
    "last_edit_date": pl.Date,
    "last_editor": pl.Struct(shallow_user_schema),
    "link": pl.String,
    "locked_date": pl.Date,
    "owner": pl.Struct(shallow_user_schema),
    # "posted_by_collectives": pl.List( # Only available in 2.3
    #    pl.Struct
    # ),  # assuming collective is a structured type
    "question_id": pl.Int64,
    # "recommendations": pl.List( # Only available in 2.3
    #    pl.Struct
    # ),  # assuming collective_recommendation is a structured type
    "score": pl.Int64,
    "share_link": pl.String,
    "tags": pl.List(pl.String),
    "title": pl.String,
    "up_vote_count": pl.Int64,
    # "upvoted": pl.Boolean, # Private
}

df_answers = pl.DataFrame(schema=schema)

### User Type and Dataframe

Schema from the [Stack Exchange API Documentation](https://api.stackexchange.com/docs/types/user)

| name                      | type        | included in the default filter | note                                       |
| ------------------------- | ----------- | ------------------------------ | ------------------------------------------ |
| about_me                  | string      | No                             | may be absent, unchanged in unsafe filters |
| accept_rate               | integer     | Yes                            | may be absent                              |
| account_id                | integer     | Yes                            |                                            |
| age                       | integer     | Yes                            | may be absent                              |
| answer_count              | integer     | No                             |                                            |
| badge_counts              | badge_count | Yes                            |                                            |
| collectives               | array       | Yes                            | may be absent, introduced in version 2.3   |
| creation_date             | date        | Yes                            |                                            |
| display_name              | string      | Yes                            |                                            |
| down_vote_count           | integer     | No                             |                                            |
| is_employee               | boolean     | Yes                            |                                            |
| last_access_date          | date        | Yes                            |                                            |
| last_modified_date        | date        | Yes                            | may be absent                              |
| link                      | string      | Yes                            | unchanged in unsafe filters                |
| location                  | string      | Yes                            | may be absent                              |
| profile_image             | string      | Yes                            | unchanged in unsafe filters                |
| question_count            | integer     | No                             |                                            |
| reputation                | integer     | Yes                            |                                            |
| reputation_change_day     | integer     | Yes                            |                                            |
| reputation_change_month   | integer     | Yes                            |                                            |
| reputation_change_quarter | integer     | Yes                            |                                            |
| reputation_change_week    | integer     | Yes                            |                                            |
| reputation_change_year    | integer     | Yes                            |                                            |
| timed_penalty_date        | date        | Yes                            | may be absent                              |
| up_vote_count             | integer     | No                             |                                            |
| user_id                   | integer     | Yes                            | refers to a user                           |
| user_type                 | enum        | Yes                            | one of several user types                  |
| view_count                | integer     | No                             |                                            |
| website_url               | string      | Yes                            | may be absent, unchanged in unsafe filters |


In [7]:
schema = {
    "about_me": pl.String,
    "accept_rate": pl.Int64,
    "account_id": pl.Int64,
    "age": pl.Int64,
    "answer_count": pl.Int64,
    "badge_counts": pl.Struct(
        badge_count_schema
    ),  # Assuming badge_count is a structured type
    # "collectives": pl.List( # Only available in 2.3
    #    pl.Struct
    # ),  # Assuming collective_membership is a structured type
    "creation_date": pl.Date,
    "display_name": pl.String,
    "down_vote_count": pl.Int64,
    "is_employee": pl.Boolean,
    "last_access_date": pl.Date,
    "last_modified_date": pl.Date,
    "link": pl.String,
    "location": pl.String,
    "profile_image": pl.String,
    "question_count": pl.Int64,
    "reputation": pl.Int64,
    "reputation_change_day": pl.Int64,
    "reputation_change_month": pl.Int64,
    "reputation_change_quarter": pl.Int64,
    "reputation_change_week": pl.Int64,
    "reputation_change_year": pl.Int64,
    "timed_penalty_date": pl.Date,
    "up_vote_count": pl.Int64,
    "user_id": pl.Int64,
    "user_type": user_type_enum,
    "view_count": pl.Int64,
    "website_url": pl.String,
}

df_users = pl.DataFrame(schema=schema)

## Scraping the Data

We are going to scrape the following:

- users with the most reputation on Stack Overflow.
  - then scrape the questions of these users.
    - then scrape the answer(s) to these questions
      - then scrape the user(s) who answered the question
  - then scrape the answers of these users
    - then scrape the question(s) to these answers
      - then scrape the user(s) who asked the question
- the most popular questions on Stack Overflow.
  - then scrape the user(s) who asked the question
  - then scrape the answer(s) to these questions
    - then scrape the user(s) who answered the question


#### Defining helper functions


In [8]:
def fetch_users_with_most_reputation(filter):
    """
    Fetches users with the most reputation.

    Parameters:
        filter (str): The filter string to use for fetching user data.

    Returns:
        A list of users sorted in descending order of reputation.
    """
    SITE.max_pages = BASE_MAX_PAGES
    return SITE.fetch("users", order="desc", sort="reputation", filter=filter)


def fetch_questions_by_user(user_id: int, filter):
    """
    Fetches questions asked by a user.

    Parameters:
        user_id (int): The ID of the user whose questions are to be fetched.
        filter (str): The filter string to use for fetching question data.

    Returns:
        list: A list of questions asked by the user.
    """
    SITE.max_pages = BASE_MAX_PAGES // 2
    return SITE.fetch(f"users/{user_id}/questions", filter=filter)


def fetch_answers_to_question(question_id: int, filter):
    """
    Fetches answers to a specific question.

    Parameters:
        question_id (int): The ID of the question.
        filter (str): The filter string to use for fetching answer data.

    Returns:
        list: A list of answers to the question, sorted by votes.
    """
    SITE.max_pages = 3
    return SITE.fetch(f"questions/{question_id}/answers", sort="votes", filter=filter)


def fetch_user(user_id: int, filter):
    """
    Fetches user information based on the given user ID.

    Parameters:
        user_id (int): The ID of the user to fetch.
        filter (str): The filter string to use for fetching user data.

    Returns:
        dict: A dictionary containing the user information.
    """
    return SITE.fetch(f"users/{user_id}", filter=filter)


def fetch_answers_by_user(user_id: int, filter):
    """
    Fetches answers by a given user.

    Parameters:
        user_id (int): The ID of the user.
        filter (str): The filter string to use for fetching answers data.

    Returns:
        List: A list of answers by the user.
    """
    SITE.max_pages = BASE_MAX_PAGES // 2
    return SITE.fetch(f"users/{user_id}/answers", sort="votes", filter=filter)


def fetch_question(question_id: int, filter):
    """
    Fetches a question based on the given question ID.

    Parameters:
        question_id (int): The ID of the question to fetch.
        filter (str): The filter string to use for fetching question data.

    Returns:
        dict: A dictionary containing the details of the fetched question.
    """
    return SITE.fetch(f"questions/{question_id}", filter=filter)


def fetch_most_popular_questions(filter):
    """
    Fetches the most popular questions from the site.

    Parameters:
        filter (str): The filter string to use for fetching popular questions.

    Returns:
        A list of popular questions.
    """
    SITE.max_pages = BASE_MAX_PAGES
    return SITE.fetch("questions", order="desc", sort="votes", filter=filter)

#### Creating the filters


In [15]:
def create_filter(include_fields, base="default", unsafe=False):
    """
    Create a custom filter on Stack Exchange API to include specified fields.

    Parameters:
      include_fields (List(str)): List of fields to include in the filter.
      base (str): Base filter to use, default is 'default'.
      unsafe (bool): Boolean indicating whether to create an unsafe filter.

    Returns:
      The filter string.
    """
    url = "https://api.stackexchange.com/2.2/filters/create"
    params = {
        "include": ";".join(include_fields),
        "base": base,
        "unsafe": str(unsafe).lower(),
    }
    response = requests.post(url, params=params)
    if response.status_code != 200:
        raise Exception(
            f"Failed to create filter: {response.status_code} {response.text}"
        )
    data = response.json()
    return data["items"][0]["filter"]

In [10]:
include_fields_comments = [
    "comment.body",
    "comment.body_markdown",
    "comment.can_flag",
    "comment.comment_id",
    "comment.content_license",
    "comment.creation_date",
    "comment.edited",
    "comment.link",
    "comment.owner",
    "comment.post_id",
    "comment.post_type",
    "comment.reply_to_user",
    "comment.score",
    # "comment.upvoted",
]
include_fields_notices = [
    "notice.body",
    "notice.creation_date",
    "notice.owner_user_id",
]
include_fields_shallow_user = [
    "shallow_user.accept_rate",
    # "shallow_user.account_id",
    "shallow_user.badge_counts",
    "shallow_user.display_name",
    "shallow_user.link",
    "shallow_user.profile_image",
    "shallow_user.reputation",
    "shallow_user.user_id",
    "shallow_user.user_type",
]

In [11]:
include_fields_users = [
    "user.about_me",
    "user.accept_rate",
    "user.account_id",
    "user.age",
    "user.answer_count",
    "user.badge_counts",
    # "user.collectives",
    "user.creation_date",
    "user.display_name",
    "user.down_vote_count",
    "user.is_employee",
    "user.last_access_date",
    "user.last_modified_date",
    "user.link",
    "user.location",
    "user.profile_image",
    "user.question_count",
    "user.reputation",
    "user.reputation_change_day",
    "user.reputation_change_month",
    "user.reputation_change_quarter",
    "user.reputation_change_week",
    "user.reputation_change_year",
    "user.timed_penalty_date",
    "user.up_vote_count",
    "user.user_id",
    "user.user_type",
    "user.view_count",
    "user.website_url",
]
include_fields_questions = (
    [
        "question.accepted_answer_id",
        "question.answer_count",
        "question.answers",
        "question.body",
        "question.body_markdown",
        "question.bounty_amount",
        "question.bounty_closes_date",
        "question.bounty_user",
        # "question.can_answer",
        # "question.can_close",
        # "question.can_comment",
        # "question.can_edit",
        # "question.can_flag",
        # "question.can_suggest_edit",
        "question.close_vote_count",
        "question.closed_date",
        "question.closed_details",
        "question.closed_reason",
        # "question.collectives",
        "question.comment_count",
        "question.comments",
        "question.community_owned_date",
        "question.content_license",
        "question.creation_date",
        "question.delete_vote_count",
        "question.down_vote_count",
        # "question.downvoted",
        # "question.favorited",
        "question.favorite_count",
        "question.is_answered",
        "question.last_activity_date",
        "question.last_edit_date",
        "question.last_editor",
        "question.link",
        "question.locked_date",
        "question.migrated_from",
        "question.migrated_to",
        "question.notice",
        "question.owner",
        "question.posted_by_collectives",
        "question.protected_date",
        "question.question_id",
        "question.reopen_vote_count",
        "question.score",
        "question.share_link",
        "question.tags",
        "question.title",
        "question.up_vote_count",
        "question.view_count",
    ]
    + include_fields_comments
    + include_fields_notices
    + include_fields_shallow_user
)
include_fields_answers = (
    [
        # "answer.accepted",
        "answer.answer_id",
        "answer.awarded_bounty_amount",
        "answer.awarded_bounty_users",
        "answer.body",
        "answer.body_markdown",
        "answer.can_comment",
        # "answer.can_edit",
        "answer.can_flag",
        # "answer.can_suggest_edit",
        "answer.comment_count",
        "answer.comments",
        "answer.community_owned_date",
        "answer.content_license",
        "answer.creation_date",
        "answer.down_vote_count",
        # "answer.downvoted",
        "answer.is_accepted",
        "answer.last_activity_date",
        "answer.last_edit_date",
        "answer.last_editor",
        "answer.link",
        "answer.locked_date",
        "answer.owner",
        "answer.question_id",
        # "answer.recommendations",
        "answer.score",
        "answer.share_link",
        "answer.tags",
        "answer.title",
        "answer.up_vote_count",
        # "answer.upvoted",
    ]
    + include_fields_comments
    + include_fields_shallow_user
)

In [16]:
filter_string_users = create_filter(include_fields_users, unsafe=True)
filter_string_questions = create_filter(include_fields_questions, unsafe=True)
filter_string_answers = create_filter(include_fields_answers, unsafe=True)

# Print the filter strings
print(f"Users filter: {filter_string_users}")
print(f"Questions filter: {filter_string_questions}")
print(f"Answers filter: {filter_string_answers}")

Users filter: )SpJt3h4EyN6R1eVkMKLA0rE5cpLe3AfnRPEXU_L-IAXX0(XqJ7hp4v07n4aC
Questions filter: ufw)7OHo5Ofx9WQmh2dMlMN7Gh)KJ9KM9jO9pc*19gsBP3hVa1s5bn(-_Su6o1WHwsAr
Answers filter: ufw)7OHo5OfohTRiWXCN4vu2tw.SBW2t3PL4gsMnEDnXbeSpFPS373FRQP7wR0zCNtPR


#### Getting the data


In [18]:
# Fetch users with the most reputation
users_data = fetch_users_with_most_reputation(filter_string_users)
for user in tqdm(users_data["items"], desc="Users with Most Reputation"):
    # Add user to DataFrame
    df_users = df_users.append(user, ignore_index=True)

    # Fetch questions asked by the user
    questions_data = fetch_questions_by_user(user["user_id"], filter_string_questions)
    for question in tqdm(
        questions_data["items"], desc="Questions by User", leave=False
    ):
        # Add question to DataFrame
        df_questions = df_questions.append(question, ignore_index=True)

        # Fetch answers to the question
        answers_data = fetch_answers_to_question(
            question["question_id"], filter_string_answers
        )
        for answer in tqdm(
            answers_data["items"], desc="Answers to Question", leave=False
        ):
            # Add answer to DataFrame
            df_answers = df_answers.append(answer, ignore_index=True)

            # Fetch user who answered
            answering_user_data = fetch_user(
                answer["owner"]["user_id"], filter_string_users
            )
            # Assuming a check for 'items' presence is done here
            df_users = df_users.append(
                answering_user_data["items"][0], ignore_index=True
            )

# Fetch most popular questions
popular_questions_data = fetch_most_popular_questions(filter_string_questions)
for question in tqdm(popular_questions_data["items"], desc="Most Popular Questions"):
    # Add question to DataFrame
    df_questions = df_questions.append(question, ignore_index=True)

    # Fetch answers to the question
    answers_data = fetch_answers_to_question(
        question["question_id"], filter_string_answers
    )
    for answer in tqdm(
        answers_data["items"], desc="Answers to Popular Question", leave=False
    ):
        # Add answer to DataFrame
        df_answers = df_answers.append(answer, ignore_index=True)

        # Fetch user who answered
        answering_user_data = fetch_user(
            answer["owner"]["user_id"], filter_string_users
        )
        # Assuming a check for 'items' presence is done here
        df_users = df_users.append(answering_user_data["items"][0], ignore_index=True)

StackAPIError: ('https://api.stackexchange.com/2.3/users/?pagesize=100&page=26&filter=%29SpJt3h4EyN6R1eVkMKLA0rE5cpLe3AfnRPEXU_L-IAXX0%28XqJ7hp4v07n4aC&order=desc&sort=reputation&site=stackoverflow', 403, 'access_denied', 'page above 25 requires access token or app key')