## bicycles

- [Tags] 525
- [PostLinks] 6,140
- [Badges] 80,935
- [Users] 40,571
- [Votes] 283,664
- [Comments] 131,281
- [Posts] 56,860
- [PostHistory] 146,878

## coffee

- [Tags] 115
- [PostLinks] 602
- [Comments] 4,365
- [Badges] 10,852
- [Votes] 20,663
- [Posts] 3,936
- [Users] 8,256
- [PostHistory] 10,178

## ukrainian

- [Tags] 120
- [PostLinks] 399
- [Badges] 6,248
- [Users] 3,080
- [Comments] 6,954
- [Votes] 28,867
- [Posts] 5,069
- [PostHistory] 16,102

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
import enum
import matplotlib.pyplot as plt


In [3]:
class ModelType(str, enum.Enum):
    """Tables of a Stack Exchange dump that this notebook can load.

    Every member's value is the table name as a plain string. Because the
    enum mixes in ``str``, a member also compares equal to that string.
    """

    BADGES = "Badges"
    COMMENTS = "Comments"
    POST_HISTORY = "PostHistory"
    POST_LINKS = "PostLinks"
    POSTS = "Posts"
    TAGS = "Tags"
    USERS = "Users"
    VOTES = "Votes"


class ForumType(str, enum.Enum):
    """Stack Exchange forums known to this notebook.

    Every member's value is the forum name as a plain string. Because the
    enum mixes in ``str``, a member also compares equal to that string.
    """

    BICYCLES = "bicycles"
    CODEGOLF = "codegolf"
    COFFEE = "coffee"
    CRYPTO = "crypto"
    GAMEDEV = "gamedev"
    PETS = "pets"
    UKRAINIAN = "ukrainian"

    
def read_stackexchange_csv(name, forum_name, forum_sufix='stackexchange.com', base_path='data'):
    """Load one table of a forum dump from its CSV file.

    The file is looked up at
    ``<base_path>/<forum_name>.<forum_sufix>/<name>.csv``.

    Args:
        name: Table name, used as the CSV file stem.
        forum_name: First part of the forum directory name.
        forum_sufix: Second part of the forum directory name, joined to
            ``forum_name`` with a dot.
        base_path: Directory that contains the per-forum directories.

    Returns:
        Whatever ``pd.read_csv`` returns for that file.
    """
    forum_dir = Path(base_path).joinpath(f'{forum_name}.{forum_sufix}')
    csv_path = forum_dir.joinpath(f'{name}.csv')
    return pd.read_csv(csv_path)


def read_stackexchange(model_type, forum_type):
    """Load a table for a forum, both identified by enum members.

    Args:
        model_type: Object whose ``.value`` is the table name
            (e.g. a ``ModelType`` member).
        forum_type: Object whose ``.value`` is the forum name
            (e.g. a ``ForumType`` member).

    Returns:
        The result of ``read_stackexchange_csv`` called with those two
        values and its default directory suffix and base path.
    """
    table_name = model_type.value
    forum_name = forum_type.value
    return read_stackexchange_csv(table_name, forum_name)


In [4]:
# Load the Posts table of each forum under study (read in the order listed).
bicycles_posts_df, coffee_posts_df, ukrainian_posts_df = (
    read_stackexchange(ModelType.POSTS, forum)
    for forum in (ForumType.BICYCLES, ForumType.COFFEE, ForumType.UKRAINIAN)
)


In [5]:
def compare(posts, n=30):
    """Pair each question with its answers and measure the answer delay in hours.

    Args:
        posts: DataFrame with ``Id``, ``PostTypeId``, ``ParentId`` and
            ``CreationDate`` columns. It is not modified.
        n: Maximum number of leading rows to return. Defaults to 30, the
            value that used to be hard-coded. Pass ``None`` for all rows.

    Returns:
        A DataFrame indexed by question id with one row per
        (question, answer) pair. It is a left join, so a question without
        answers appears once with missing answer columns. Columns, in order:
        ``CreationDate_question``, ``CreationDate_datetime_question``,
        ``Id`` (the answer id), ``CreationDate_answer``,
        ``CreationDate_datetime_answer`` and ``diff_hours`` (answer creation
        time minus question creation time, as float hours).
    """
    # PostTypeId: 1 -> Question, 2 -> Answer
    is_question = posts.PostTypeId == 1
    is_answer = posts.PostTypeId == 2

    # .assign() returns a new frame, so the parsed timestamps are never
    # written into a selection of `posts` (which can raise
    # SettingWithCopyWarning, depending on the pandas version).
    questions = posts.loc[is_question, ['Id', 'CreationDate']].assign(
        CreationDate_datetime=lambda frame: pd.to_datetime(frame['CreationDate'])
    )
    answers = posts.loc[is_answer, ['Id', 'ParentId', 'CreationDate']].assign(
        CreationDate_datetime=lambda frame: pd.to_datetime(frame['CreationDate'])
    )

    # An answer's ParentId is the Id of the question it belongs to.
    df = questions.set_index('Id').join(
        answers.set_index('ParentId'), lsuffix='_question', rsuffix='_answer'
    )

    elapsed = df['CreationDate_datetime_answer'] - df['CreationDate_datetime_question']
    # Dividing a timedelta by one hour yields a float number of hours.
    df['diff_hours'] = elapsed / np.timedelta64(1, 'h')

    return df if n is None else df.head(n)


In [6]:
# Show the first question/answer pairs of the bicycles forum with their answer delay.
compare(posts=bicycles_posts_df)


Unnamed: 0,CreationDate_question,CreationDate_datetime_question,Id,CreationDate_answer,CreationDate_datetime_answer,diff_hours
1,2010-08-25T19:41:17.837,2010-08-25 19:41:17.837,7.0,2010-08-25T19:43:43.510,2010-08-25 19:43:43.510,0.040465
1,2010-08-25T19:41:17.837,2010-08-25 19:41:17.837,29.0,2010-08-25T19:57:37.627,2010-08-25 19:57:37.627,0.272164
1,2010-08-25T19:41:17.837,2010-08-25 19:41:17.837,30.0,2010-08-25T19:57:46.833,2010-08-25 19:57:46.833,0.274721
1,2010-08-25T19:41:17.837,2010-08-25 19:41:17.837,32.0,2010-08-25T19:58:30.700,2010-08-25 19:58:30.700,0.286906
1,2010-08-25T19:41:17.837,2010-08-25 19:41:17.837,39.0,2010-08-25T20:01:34.487,2010-08-25 20:01:34.487,0.337958
1,2010-08-25T19:41:17.837,2010-08-25 19:41:17.837,55.0,2010-08-25T20:12:54.970,2010-08-25 20:12:54.970,0.526981
1,2010-08-25T19:41:17.837,2010-08-25 19:41:17.837,138.0,2010-08-25T22:12:30.850,2010-08-25 22:12:30.850,2.520281
1,2010-08-25T19:41:17.837,2010-08-25 19:41:17.837,3494.0,2011-04-21T00:02:34.430,2011-04-21 00:02:34.430,5716.354609
1,2010-08-25T19:41:17.837,2010-08-25 19:41:17.837,3509.0,2011-04-21T19:56:52.790,2011-04-21 19:56:52.790,5736.259709
1,2010-08-25T19:41:17.837,2010-08-25 19:41:17.837,10905.0,2012-08-22T04:14:33.700,2012-08-22 04:14:33.700,17456.554406
