In [4]:
from stackapi import StackAPI
from tqdm import tqdm
import polars as pl
import os

## Initialisation


In [10]:
# Stack Exchange API client for the "stackoverflow" site.
SITE = StackAPI("stackoverflow")
# Items requested per page; 100 is noted here as the maximum page size.
SITE.page_size = 100  # max page size

### Question Type and Dataframe

Schema from the [Stack Exchange API Documentation](https://api.stackexchange.com/docs/types/question)

| name                  | type             | included in the default filter | note                                                       | is private |
| --------------------- | ---------------- | ------------------------------ | ---------------------------------------------------------- | ---------- |
| accepted_answer_id    | integer          | Yes                            | may be absent                                              |            |
| answer_count          | integer          | Yes                            |                                                            |            |
| answers               | array            | No                             | may be absent                                              |            |
| body                  | string           | No                             | unchanged in unsafe filters                                |            |
| body_markdown         | string           | No                             | introduced in version 2.2                                  |            |
| bounty_amount         | integer          | Yes                            | may be absent                                              |            |
| bounty_closes_date    | date             | Yes                            | may be absent                                              |            |
| bounty_user           | shallow_user     | No                             | may be absent, introduced in version 2.2                   |            |
| can_answer            | boolean          | No                             | introduced in version 2.3                                  |            |
| can_close             | boolean          | No                             | introduced in version 2.2                                  |            |
| can_comment           | boolean          | No                             | introduced in version 2.3                                  |            |
| can_edit              | boolean          | No                             | introduced in version 2.3                                  |            |
| can_flag              | boolean          | No                             | introduced in version 2.2                                  |            |
| can_suggest_edit      | boolean          | No                             | introduced in version 2.3                                  |            |
| close_vote_count      | integer          | No                             | introduced in version 2.1                                  |            |
| closed_date           | date             | Yes                            | may be absent                                              |            |
| closed_details        | closed_details   | No                             | may be absent, introduced in version 2.2                   |            |
| closed_reason         | string           | Yes                            | may be absent                                              |            |
| collectives           | array            | Yes                            | introduced in version 2.3                                  |            |
| comment_count         | integer          | No                             | introduced in version 2.2                                  |            |
| comments              | array            | No                             | may be absent                                              |            |
| community_owned_date  | date             | Yes                            | may be absent                                              |            |
| content_license       | string           | Yes                            | introduced in version 2.2                                  |            |
| creation_date         | date             | Yes                            |                                                            |            |
| delete_vote_count     | integer          | No                             | introduced in version 2.1                                  |            |
| down_vote_count       | integer          | No                             |                                                            |            |
| downvoted             | boolean          | No                             | requires the private_info scope, introduced in version 2.2 | Yes        |
| favorite_count        | integer          | No                             |                                                            |            |
| favorited             | boolean          | No                             | requires the private_info scope, introduced in version 2.2 | Yes        |
| is_answered           | boolean          | Yes                            |                                                            |            |
| last_activity_date    | date             | Yes                            |                                                            |            |
| last_edit_date        | date             | Yes                            | may be absent                                              |            |
| last_editor           | shallow_user     | No                             | introduced in version 2.2                                  |            |
| link                  | string           | Yes                            | unchanged in unsafe filters                                |            |
| locked_date           | date             | Yes                            | may be absent                                              |            |
| migrated_from         | migration_info   | Yes                            | may be absent                                              |            |
| migrated_to           | migration_info   | Yes                            | may be absent                                              |            |
| notice                | notice           | No                             | introduced in version 2.1                                  |            |
| owner                 | shallow_user     | Yes                            | may be absent                                              |            |
| posted_by_collectives | array            | Yes                            | introduced in version 2.3                                  |            |
| protected_date        | date             | Yes                            | may be absent                                              |            |
| question_id           | integer          | Yes                            | refers to a question                                       |            |
| reopen_vote_count     | integer          | No                             | introduced in version 2.1                                  |            |
| score                 | integer          | Yes                            |                                                            |            |
| share_link            | string           | No                             | unchanged in unsafe filters, introduced in version 2.2     |            |
| tags                  | array of strings | Yes                            |                                                            |            |
| title                 | string           | Yes                            |                                                            |            |
| up_vote_count         | integer          | No                             |                                                            |            |
| upvoted               | boolean          | No                             | requires the private_info scope, introduced in version 2.2 | Yes        |
| view_count            | integer          | Yes                            |                                                            |            |


In [11]:
# Column layout for questions: same field names, in the same order, as the
# question table above. Nested API objects stay as un-parameterised
# pl.Struct / pl.List because their inner fields are not spelled out here.
# NOTE(review): the Stack Exchange API documents dates as Unix epoch seconds;
# confirm the ingestion code converts them before they reach pl.Date columns.
schema = dict(
    accepted_answer_id=pl.Int64,
    answer_count=pl.Int64,
    answers=pl.List,
    body=pl.Utf8,
    body_markdown=pl.Utf8,
    bounty_amount=pl.Int64,
    bounty_closes_date=pl.Date,
    bounty_user=pl.Struct,  # shallow_user object (assumed to be a struct)
    can_answer=pl.Boolean,
    can_close=pl.Boolean,
    can_comment=pl.Boolean,
    can_edit=pl.Boolean,
    can_flag=pl.Boolean,
    can_suggest_edit=pl.Boolean,
    close_vote_count=pl.Int64,
    closed_date=pl.Date,
    closed_details=pl.Struct,  # closed_details object (assumed to be a struct)
    closed_reason=pl.Utf8,
    collectives=pl.List,
    comment_count=pl.Int64,
    comments=pl.List,
    community_owned_date=pl.Date,
    content_license=pl.Utf8,
    creation_date=pl.Date,
    delete_vote_count=pl.Int64,
    down_vote_count=pl.Int64,
    downvoted=pl.Boolean,
    favorite_count=pl.Int64,
    favorited=pl.Boolean,
    is_answered=pl.Boolean,
    last_activity_date=pl.Date,
    last_edit_date=pl.Date,
    last_editor=pl.Struct,  # shallow_user object (assumed to be a struct)
    link=pl.Utf8,
    locked_date=pl.Date,
    migrated_from=pl.Struct,  # migration_info object (assumed to be a struct)
    migrated_to=pl.Struct,  # migration_info object (assumed to be a struct)
    notice=pl.Struct,  # notice object (assumed to be a struct)
    owner=pl.Struct,  # shallow_user object (assumed to be a struct)
    posted_by_collectives=pl.List,
    protected_date=pl.Date,
    question_id=pl.Int64,
    reopen_vote_count=pl.Int64,
    score=pl.Int64,
    share_link=pl.Utf8,
    tags=pl.List(pl.Utf8),
    title=pl.Utf8,
    up_vote_count=pl.Int64,
    upvoted=pl.Boolean,
    view_count=pl.Int64,
)

# Empty frame carrying only the question column names and dtypes.
df_questions = pl.DataFrame(schema=schema)

### Answer Type and Dataframe

Schema from the [Stack Exchange API Documentation](https://api.stackexchange.com/docs/types/answer)

| name                  | type             | included in the default filter | note                                                   | is private |
| --------------------- | ---------------- | ------------------------------ | ------------------------------------------------------ | ---------- |
| accepted              | boolean          | No                             | introduced in version 2.2, requires private_info       | Yes        |
| answer_id             | integer          | Yes                            | refers to an answer                                    |            |
| awarded_bounty_amount | integer          | No                             | may be absent, introduced in version 2.2               |            |
| awarded_bounty_users  | array            | No                             | may be absent, introduced in version 2.2               |            |
| body                  | string           | No                             | unchanged in unsafe filters                            |            |
| body_markdown         | string           | No                             | introduced in version 2.2                              |            |
| can_comment           | boolean          | No                             | introduced in version 2.3                              |            |
| can_edit              | boolean          | No                             | introduced in version 2.3                              |            |
| can_flag              | boolean          | No                             | unchanged in unsafe filters, introduced in version 2.2 |            |
| can_suggest_edit      | boolean          | No                             | introduced in version 2.3                              |            |
| collectives           | array            | Yes                            | introduced in version 2.3                              |            |
| comment_count         | integer          | No                             | introduced in version 2.2                              |            |
| comments              | array            | No                             | may be absent                                          |            |
| community_owned_date  | date             | Yes                            | may be absent                                          |            |
| content_license       | string           | Yes                            | introduced in version 2.2                              |            |
| creation_date         | date             | Yes                            |                                                        |            |
| down_vote_count       | integer          | No                             |                                                        |            |
| downvoted             | boolean          | No                             | requires private_info, introduced in version 2.2       | Yes        |
| is_accepted           | boolean          | Yes                            |                                                        |            |
| last_activity_date    | date             | Yes                            |                                                        |            |
| last_edit_date        | date             | Yes                            | may be absent                                          |            |
| last_editor           | shallow_user     | No                             | introduced in version 2.2                              |            |
| link                  | string           | No                             | unchanged in unsafe filters                            |            |
| locked_date           | date             | Yes                            | may be absent                                          |            |
| owner                 | shallow_user     | Yes                            | may be absent                                          |            |
| posted_by_collectives | array            | Yes                            | introduced in version 2.3                              |            |
| question_id           | integer          | Yes                            | refers to a question                                   |            |
| recommendations       | array            | Yes                            | introduced in version 2.3                              |            |
| score                 | integer          | Yes                            |                                                        |            |
| share_link            | string           | No                             | unchanged in unsafe filters, introduced in version 2.2 |            |
| tags                  | array of strings | No                             | introduced in version 2.1                              |            |
| title                 | string           | No                             |                                                        |            |
| up_vote_count         | integer          | No                             |                                                        |            |
| upvoted               | boolean          | No                             | requires private_info, introduced in version 2.2       | Yes        |


In [None]:
# Column layout for answers: same field names, in the same order, as the
# answer table above. Arrays of API objects are typed as lists of
# un-parameterised structs because their inner fields are not spelled out here.
# NOTE(review): the Stack Exchange API documents dates as Unix epoch seconds;
# confirm the ingestion code converts them before they reach pl.Date columns.
schema = dict(
    accepted=pl.Boolean,
    answer_id=pl.Int64,
    awarded_bounty_amount=pl.Int64,
    awarded_bounty_users=pl.List(pl.Struct),  # shallow_user items (assumed structs)
    body=pl.Utf8,
    body_markdown=pl.Utf8,
    can_comment=pl.Boolean,
    can_edit=pl.Boolean,
    can_flag=pl.Boolean,
    can_suggest_edit=pl.Boolean,
    collectives=pl.List(pl.Struct),  # collective items (assumed structs)
    comment_count=pl.Int64,
    comments=pl.List(pl.Struct),  # comment items (assumed structs)
    community_owned_date=pl.Date,
    content_license=pl.Utf8,
    creation_date=pl.Date,
    down_vote_count=pl.Int64,
    downvoted=pl.Boolean,
    is_accepted=pl.Boolean,
    last_activity_date=pl.Date,
    last_edit_date=pl.Date,
    last_editor=pl.Struct,  # shallow_user object (assumed to be a struct)
    link=pl.Utf8,
    locked_date=pl.Date,
    owner=pl.Struct,  # shallow_user object (assumed to be a struct)
    posted_by_collectives=pl.List(pl.Struct),  # collective items (assumed structs)
    question_id=pl.Int64,
    recommendations=pl.List(pl.Struct),  # collective_recommendation items (assumed structs)
    score=pl.Int64,
    share_link=pl.Utf8,
    tags=pl.List(pl.Utf8),
    title=pl.Utf8,
    up_vote_count=pl.Int64,
    upvoted=pl.Boolean,
)

# Empty frame carrying only the answer column names and dtypes.
df_answers = pl.DataFrame(schema=schema)

### User Type and Dataframe

Schema from the [Stack Exchange API Documentation](https://api.stackexchange.com/docs/types/user)

| name                      | type        | included in the default filter | note                                       |
| ------------------------- | ----------- | ------------------------------ | ------------------------------------------ |
| about_me                  | string      | No                             | may be absent, unchanged in unsafe filters |
| accept_rate               | integer     | Yes                            | may be absent                              |
| account_id                | integer     | Yes                            |                                            |
| age                       | integer     | Yes                            | may be absent                              |
| answer_count              | integer     | No                             |                                            |
| badge_counts              | badge_count | Yes                            |                                            |
| collectives               | array       | Yes                            | may be absent, introduced in version 2.3   |
| creation_date             | date        | Yes                            |                                            |
| display_name              | string      | Yes                            |                                            |
| down_vote_count           | integer     | No                             |                                            |
| is_employee               | boolean     | Yes                            |                                            |
| last_access_date          | date        | Yes                            |                                            |
| last_modified_date        | date        | Yes                            | may be absent                              |
| link                      | string      | Yes                            | unchanged in unsafe filters                |
| location                  | string      | Yes                            | may be absent                              |
| profile_image             | string      | Yes                            | unchanged in unsafe filters                |
| question_count            | integer     | No                             |                                            |
| reputation                | integer     | Yes                            |                                            |
| reputation_change_day     | integer     | Yes                            |                                            |
| reputation_change_month   | integer     | Yes                            |                                            |
| reputation_change_quarter | integer     | Yes                            |                                            |
| reputation_change_week    | integer     | Yes                            |                                            |
| reputation_change_year    | integer     | Yes                            |                                            |
| timed_penalty_date        | date        | Yes                            | may be absent                              |
| up_vote_count             | integer     | No                             |                                            |
| user_id                   | integer     | Yes                            | refers to a user                           |
| user_type                 | enum        | Yes                            | one of several user types                  |
| view_count                | integer     | No                             |                                            |
| website_url               | string      | Yes                            | may be absent, unchanged in unsafe filters |


In [None]:
# Column layout for users: same field names, in the same order, as the
# user table above.
# NOTE(review): the Stack Exchange API documents dates as Unix epoch seconds;
# confirm the ingestion code converts them before they reach pl.Date columns.
schema = dict(
    about_me=pl.Utf8,
    accept_rate=pl.Int64,
    account_id=pl.Int64,
    age=pl.Int64,
    answer_count=pl.Int64,
    badge_counts=pl.Struct,  # badge_count object (assumed to be a struct)
    collectives=pl.List(pl.Struct),  # collective_membership items (assumed structs)
    creation_date=pl.Date,
    display_name=pl.Utf8,
    down_vote_count=pl.Int64,
    is_employee=pl.Boolean,
    last_access_date=pl.Date,
    last_modified_date=pl.Date,
    link=pl.Utf8,
    location=pl.Utf8,
    profile_image=pl.Utf8,
    question_count=pl.Int64,
    reputation=pl.Int64,
    reputation_change_day=pl.Int64,
    reputation_change_month=pl.Int64,
    reputation_change_quarter=pl.Int64,
    reputation_change_week=pl.Int64,
    reputation_change_year=pl.Int64,
    timed_penalty_date=pl.Date,
    up_vote_count=pl.Int64,
    user_id=pl.Int64,
    user_type=pl.Categorical,  # enum in the API table; assumed to be a finite set of categories
    view_count=pl.Int64,
    website_url=pl.Utf8,
)

# Empty frame carrying only the user column names and dtypes.
df_users = pl.DataFrame(schema=schema)

## Scraping the Data

We are going to scrape the following:

- users with the most reputation on Stack Overflow.
  - then scrape the questions of these users.
    - then scrape the answer(s) to these questions
      - then scrape the user(s) who answered the question
  - then scrape the answers of these users
    - then scrape the question(s) these answers belong to
      - then scrape the user(s) who asked the question
- the most popular questions on Stack Overflow.
  - then scrape the user(s) who asked the question
  - then scrape the answer(s) to these questions
    - then scrape the user(s) who answered the question
