In [1]:
# Prepare the notebook environment via the project's helper module.
# Per the output printed below, this applies the nest_asyncio patch and
# changes the working directory to the project's `src` folder.
import setup  # notebooks/setup.py
setup.init(verbose = True)

Applied nest_asyncio patch for Jupyter compatibility
Changed working directory to: /Users/gphome/Desktop/projects/Obrela-assignment/StackExchange-Statistics-Service/src


In [2]:
from datetime import datetime
from app.components.stackexchange import StackExchangeClient
import json
from pathlib import Path
# When True, later cells load saved JSON responses from MOCK_DATA instead of
# calling the StackExchange client.
MOCK = True
# Directory holding the saved responses (relative to the working directory).
MOCK_DATA = Path('mock_data')

In [3]:
client = StackExchangeClient()

# Date range of interest. Defined outside the MOCK branch so both modes (and
# the save cell further down) share a single source of truth.
start_date = datetime(2025, 6, 1)
end_date = datetime(2025, 6, 5)

# Timestamp parts used in the mock-data file names,
# e.g. "2025-06-01-00-00-00".
start_date_string = start_date.strftime("%Y-%m-%d-%H-%M-%S")
end_date_string = end_date.strftime("%Y-%m-%d-%H-%M-%S")

if not MOCK:
    # NOTE(review): these datetimes are naive, so timestamp() interprets them
    # in the local timezone — confirm whether the API expects UTC boundaries.
    since = int(start_date.timestamp())
    until = int(end_date.timestamp())
    answers = client.get_answers(since, until)
else:
    # Load the previously saved response for the same date range.
    filename = f"answers_{start_date_string}_{end_date_string}.json"
    full_path = MOCK_DATA / filename
    answers = json.loads(full_path.read_text(encoding='utf-8'))
print(f"Fetched {len(answers)} number of answers")


Fetched 2057 number of answers


### Save fetched answers
If `MOCK` is set to `False`, the API response is saved in the `mock_data` directory.

In [4]:
# Persist the live API response so it can be replayed later in MOCK mode.
if not MOCK:
    # Derive the file-name timestamps from the fetched date range; these
    # strings were previously referenced without ever being defined.
    timestamp_format = "%Y-%m-%d-%H-%M-%S"
    start_date_string = start_date.strftime(timestamp_format)
    end_date_string = end_date.strftime(timestamp_format)
    filename = f"answers_{start_date_string}_{end_date_string}.json"
    full_path = MOCK_DATA / filename
    # Make sure the target directory exists before writing.
    MOCK_DATA.mkdir(parents=True, exist_ok=True)
    full_path.write_text(json.dumps(answers, indent=4), encoding='utf-8')
    print(f"File {filename} saved sucessfully.")

In [5]:
# Keep only the answers flagged as accepted. Uses a plain truthiness check,
# matching how the statistics cell tests the same 'is_accepted' field.
accepted_answers = [answer for answer in answers if answer.get('is_accepted')]

In [6]:
# Preview the first five accepted answers.
accepted_answers[:5]

[{'owner': {'account_id': 9938133,
   'reputation': 53812,
   'user_id': 7355741,
   'user_type': 'registered',
   'profile_image': 'https://i.sstatic.net/6CMNt.jpg?s=256',
   'display_name': 'fmw42',
   'link': 'https://stackoverflow.com/users/7355741/fmw42'},
  'is_accepted': True,
  'score': 3,
  'last_activity_date': 1748725286,
  'creation_date': 1748725286,
  'answer_id': 79646989,
  'question_id': 79644827,
  'content_license': 'CC BY-SA 4.0'},
 {'owner': {'account_id': 23627132,
   'reputation': 341,
   'user_id': 17659480,
   'user_type': 'registered',
   'profile_image': 'https://lh3.googleusercontent.com/a/AATXAJw8tYRnPGJplk7WIWrznLDbVa-Wb90uLGnawndL=k-s256',
   'display_name': 'Navid Abedini',
   'link': 'https://stackoverflow.com/users/17659480/navid-abedini'},
  'is_accepted': True,
  'score': 2,
  'last_activity_date': 1748725425,
  'creation_date': 1748725425,
  'answer_id': 79646993,
  'question_id': 79646705,
  'content_license': 'CC BY-SA 4.0'},
 {'posted_by_collecti

In [6]:
# Report how many answers are marked as accepted.
print(len(accepted_answers))

440


In [7]:
# Collect the ID of every fetched answer (None when a record has no 'answer_id').
answer_ids = [record.get('answer_id') for record in answers]

In [8]:
# Number of collected answer IDs (one entry per fetched answer).
len(answer_ids)

2057

In [16]:
# Fetch the comments for the collected answer IDs, or load the saved sample
# when MOCK is set.
if not MOCK:
    comments = client.get_comments(answer_ids)
else:
    filename = "comments_2025-06-01-00-00-00_2025-06-05-00-00-00.json"
    full_path = MOCK_DATA / filename
    comments = json.loads(full_path.read_text(encoding='utf-8'))
# Report the count in both modes, mirroring the answers cell (the message was
# previously printed only in the mock branch).
print(f"Fetched {len(comments)} number of comments")

Fetched 2057 number of comments


In [17]:
# Number of comment records loaded.
len(comments)

2057

In [18]:
# Preview the first ten loaded records.
# NOTE(review): the records displayed below carry answer fields
# ('is_accepted', 'answer_id') — confirm that the mock comments file really
# contains comments and not a copy of the answers.
comments[:10]

[{'owner': {'account_id': 9938133,
   'reputation': 53812,
   'user_id': 7355741,
   'user_type': 'registered',
   'profile_image': 'https://i.sstatic.net/6CMNt.jpg?s=256',
   'display_name': 'fmw42',
   'link': 'https://stackoverflow.com/users/7355741/fmw42'},
  'is_accepted': True,
  'score': 3,
  'last_activity_date': 1748725286,
  'creation_date': 1748725286,
  'answer_id': 79646989,
  'question_id': 79644827,
  'content_license': 'CC BY-SA 4.0'},
 {'owner': {'account_id': 2375785,
   'reputation': 4919,
   'user_id': 2079189,
   'user_type': 'registered',
   'profile_image': 'https://www.gravatar.com/avatar/6bc9430eeffd18d42ca4fb2efa91e418?s=256&d=identicon&r=PG',
   'display_name': 'mugiseyebrows',
   'link': 'https://stackoverflow.com/users/2079189/mugiseyebrows'},
  'is_accepted': False,
  'score': 0,
  'last_activity_date': 1748725293,
  'creation_date': 1748725293,
  'answer_id': 79646990,
  'question_id': 79646664,
  'content_license': 'CC BY-SA 4.0'},
 {'owner': {'account_i

### Some edge cases

#### Empty results

In [23]:
# Example test for empty results.
# The calls below go through `client` directly and never read MOCK, so the
# configuration flag is left untouched here: reassigning it mid-notebook would
# silently switch earlier cells to live mode when they are re-run.
# NOTE(review): these calls presumably issue live API requests — confirm.
empty_answers = client.get_answers(int(datetime(2000, 1, 1).timestamp()), int(datetime(2000, 1, 2).timestamp()))
assert len(empty_answers) == 0, "Expected no answers for this date range."

# An empty list of answer IDs should yield no comments.
empty_comments = client.get_comments([])
assert len(empty_comments) == 0, "Expected no comments for empty answer IDs."

In [24]:

# Show the result fetched for the year-2000 date range.
empty_answers

[]

In [25]:
# Show the result fetched for an empty list of answer IDs.
empty_comments

[]

#### Invalid range

In [26]:
# Reversed date range: the start timestamp falls after the end timestamp.
reversed_since = int(datetime(2025, 6, 5).timestamp())
reversed_until = int(datetime(2025, 6, 1).timestamp())
invalid_answers = client.get_answers(reversed_since, reversed_until)
assert len(invalid_answers) == 0, "Expected no answers for invalid date range."

# Answer IDs that cannot exist: negative, zero, and an out-of-range value.
bogus_answer_ids = [-1, 0, 999999999999]
invalid_comments = client.get_comments(bogus_answer_ids)
assert len(invalid_comments) == 0, "Expected no comments for invalid answer IDs."

Error fetching data from StackExchange API, url:https://api.stackexchange.com/2.3/answers/-1;0;999999999999/comments, page: 1 : 400 Client Error: Bad Request for url: https://api.stackexchange.com/2.3/answers/-1;0;999999999999/comments?order=asc&sort=creation&site=stackoverflow&page=1


In [27]:
# Show the result returned for the reversed date range.
invalid_answers

[]

In [28]:
# Show the result returned for the invalid answer IDs.
invalid_comments

[]

In [None]:
# Python's boolean literal is capitalized; lowercase `true` is an undefined name.
bool(True)

### Statistics computation

In [48]:
import time

# perf_counter() is the high-resolution clock intended for timing short code.
start = time.perf_counter()
# 1. Initialize counters and a set for unique question IDs
counter_of_accepted_answers = 0
counter_of_not_accepted_answers = 0
sum_accepted_scores = 0
distinct_question_ids = set()

# 2. Single pass over the answers: collect question IDs and tally by acceptance
for answer in answers:
    qid = answer.get('question_id')
    # set.add() already ignores duplicates; only skip records with no ID at all.
    if qid is not None:
        distinct_question_ids.add(qid)

    if answer.get('is_accepted'):
        counter_of_accepted_answers += 1
        sum_accepted_scores += float(answer.get('score', 0))
    else:
        counter_of_not_accepted_answers += 1

# 3. Average score of accepted answers (0 when there are none)
if counter_of_accepted_answers > 0:
    avg_score_accepted_answers = sum_accepted_scores / counter_of_accepted_answers
else:
    avg_score_accepted_answers = 0

# 4. Average number of answers per distinct question (0 when there are none)
total_answers = counter_of_accepted_answers + counter_of_not_accepted_answers
if len(distinct_question_ids) > 0:
    avg_answer_count_per_question = total_answers / len(distinct_question_ids)
else:
    avg_answer_count_per_question = 0

end = time.perf_counter()
# 5. Print the results
print("Total accepted answers:", counter_of_accepted_answers)
print("Average score of accepted answers:", avg_score_accepted_answers)
print("Average answer count per question:", avg_answer_count_per_question)
print("time elapsed for 5 days range (~2000 records): ", (end - start) * 1000, "ms")

Total accepted answers: 440
Average score of accepted answers: 1.6363636363636365
Average answer count per question: 1.1209809264305177
time elapsed for 5 days range (~2000 records):  2.880096435546875 ms


In [49]:
# Time the selection of the ten highest-scored answers.
# perf_counter() is the high-resolution clock intended for timing short code.
start = time.perf_counter()
# Sort by score, highest first, and keep the first ten entries.
top_10_answers = sorted(answers, key=lambda x: x['score'], reverse=True)[:10]
end = time.perf_counter()
print("time elapsed for 5 days range (~2000 records): ", (end - start) * 1000, "ms")


time elapsed for 5 days range (~2000 records):  1.4238357543945312 ms


In [21]:
# Count of answers not marked as accepted, as tallied in the statistics cell.
counter_of_not_accepted_answers

1617

In [28]:
# Average accepted-answer score rounded to two decimals; 0 when nothing was accepted.
0 if counter_of_accepted_answers == 0 else round(sum_accepted_scores / counter_of_accepted_answers, 2)

1.64

In [39]:
# Average number of answers per distinct question. Uses the
# `distinct_question_ids` set built in the statistics cell; the dictionary
# referenced here before is not defined anywhere in the notebook.
len(answers) / len(distinct_question_ids) if distinct_question_ids else 0

1.1209809264305177

## 