In [1]:
import setup  # notebooks/setup.py
setup.init(verbose = True)

Applied nest_asyncio patch for Jupyter compatibility
Changed working directory to: C:\Users\gepanago\PycharmProjects\stack-exchange\StackExchange-Statistics-Service\src


In [2]:
from datetime import datetime
from app.components.stackexchange import StackExchangeClient
import json
from pathlib import Path
MOCK = False
MOCK_DATA = Path('mock_data')

In [6]:
client = StackExchangeClient()
if not MOCK:
    ##
    start_date = datetime(2025, 1, 1)
    end_date = datetime(2025, 1, 4)
    ##
    since = int(start_date.timestamp())
    until = int(end_date.timestamp())
    answers = client.get_answers(since, until)
else:
    filename = "answers_2025-06-01-00-00-00_2025-06-05-00-00-00.json"
    full_path = MOCK_DATA / filename
    answers = json.loads(full_path.read_text(encoding='utf-8'))
print(f"Fetched {len(answers)} number of answers")


Fetched 2450 number of answers


### Save fetched answers
``` If MOCK is set as False, API response is saved in the mock_data directory.```

In [9]:

if not MOCK:
    start_date_string = start_date.__str__().replace(' ', '_').replace(":", "-")
    end_date_string = end_date.__str__().replace(' ', '_').replace(":", "-")
    filename = f"answers_{start_date_string}_{end_date_string}.json"
    full_path = MOCK_DATA / filename
    full_path.write_text(json.dumps(answers, indent=4), encoding='utf-8')
    print(f"File {filename} saved sucessfully.")

File answers_2025-01-01_00-00-00_2025-01-04_00-00-00.json saved sucessfully.


In [10]:
accepted_answers = [answer for answer in answers if answer.get('is_accepted') == True]

In [None]:
accepted_answers[:5]

In [12]:
print(len(accepted_answers))

547


In [13]:
answer_ids = [answer.get('answer_id') for answer in answers]

In [14]:
len(answer_ids)

2450

In [10]:
# if not MOCK:
#     comments = client.get_comments(answer_ids)
# else:    
#     filename = "comments_2025-06-01-00-00-00_2025-06-05-00-00-00.json"
#     full_path = MOCK_DATA / filename
#     comments = json.loads(full_path.read_text(encoding='utf-8'))
#     print(f"Fetched {len(comments)} number of comments")

Fetched 4 number of comments


In [11]:
len(comments)

4

### Some edge cases

#### Empty 

In [12]:
# Example test for empty results
MOCK = False
empty_answers = client.get_answers(int(datetime(2000, 1, 1).timestamp()), int(datetime(2000, 1, 2).timestamp()))
assert len(empty_answers) == 0, "Expected no answers for this date range."

empty_comments = client.get_comments([])
assert len(empty_comments) == 0, "Expected no comments for empty answer IDs."

In [13]:

empty_answers

[]

In [14]:
empty_comments

[]

#### Invalid range

In [15]:
# Invalid date range
invalid_answers = client.get_answers(int(datetime(2025, 6, 5).timestamp()), int(datetime(2025, 6, 1).timestamp()))
assert len(invalid_answers) == 0, "Expected no answers for invalid date range."

# Invalid answer IDs
invalid_comments = client.get_comments([-1, 0, 999999999999])
assert len(invalid_comments) == 0, "Expected no comments for invalid answer IDs."

Error fetching data from StackExchange API, url:https://api.stackexchange.com/2.3/answers/-1;0;999999999999/comments, page: 1 : 400 Client Error: Bad Request for url: https://api.stackexchange.com/2.3/answers/-1;0;999999999999/comments?order=asc&sort=creation&site=stackoverflow&page=1


In [16]:
invalid_answers

[]

In [17]:
invalid_comments

[]

### Statistics computation Schedio

```
import time

start = time.time()
# 1. Initialize counters and a set for unique question IDs
counter_of_accepted_answers = 0
counter_of_not_accepted_answers = 0
sum_accepted_scores = 0
distinct_question_ids = set()


for answer in answers:
    
    qid = answer.get('question_id')
    if qid and qid not in distinct_question_ids:
        distinct_question_ids.add(qid)
    
    
    if answer.get('is_accepted'):
        counter_of_accepted_answers += 1
        sum_accepted_scores += float(answer.get('score', 0))
    else:
        counter_of_not_accepted_answers += 1

if counter_of_accepted_answers > 0:
    avg_score_accepted_answers = sum_accepted_scores / counter_of_accepted_answers
else:
    avg_score_accepted_answers = 0


total_answers = counter_of_accepted_answers + counter_of_not_accepted_answers
if len(distinct_question_ids) > 0:
    avg_answer_count_per_question = total_answers / len(distinct_question_ids)
else:
    avg_answer_count_per_question = 0

end = time.time()
# 6. Print or return the results
print("Total accepted answers:", counter_of_accepted_answers)
print("Average score of accepted answers:", avg_score_accepted_answers)
print("Average answer count per question:", avg_answer_count_per_question)
print("time elapsed for 5 days range (~2000 records): ", (end - start) * 1000, "ms")
```

```
start = time.time()
top_10_answers = sorted(answers, key=lambda x: x['score'], reverse=True)[:10]
end = time.time()
print("time elapsed for 5 days range (~2000 records): ", (end - start) * 1000, "ms")
```
