## bicycles

- [Tags] 525
- [PostLinks] 6,140
- [Badges] 80,935
- [Users] 40,571
- [Votes] 283,664
- [Comments] 131,281
- [Posts] 56,860
- [PostHistory] 146,878

## coffee

- [Tags] 115
- [PostLinks] 602
- [Comments] 4,365
- [Badges] 10,852
- [Votes] 20,663
- [Posts] 3,936
- [Users] 8,256
- [PostHistory] 10,178

## ukrainian

- [Tags] 120
- [PostLinks] 399
- [Badges] 6,248
- [Users] 3,080
- [Comments] 6,954
- [Votes] 28,867
- [Posts] 5,069
- [PostHistory] 16,102

In [3]:
import numpy as np
import pandas as pd
from pathlib import Path
import enum
import matplotlib.pyplot as plt

In [5]:
class ModelType(str, enum.Enum):
    """Tables of a Stack Exchange data dump.

    Each member's value is the base name of the table's CSV file:
    ``read_stackexchange`` passes ``.value`` on as the ``name`` argument of
    ``read_stackexchange_csv``, which opens ``<name>.csv``.
    """

    BADGES = 'Badges'
    COMMENTS = 'Comments'
    POST_HISTORY = 'PostHistory'
    POST_LINKS = 'PostLinks'
    POSTS = 'Posts'
    TAGS = 'Tags'
    USERS = 'Users'
    VOTES = 'Votes'


class ForumType(str, enum.Enum):
    """Stack Exchange forums that can be loaded.

    Each member's value is the forum name that ``read_stackexchange`` passes
    to ``read_stackexchange_csv``, where it becomes the first part of the
    data directory name (``<forum_name>.<forum_sufix>``).
    """

    BICYCLES = 'bicycles'
    CODEGOLF = 'codegolf'
    COFFEE = 'coffee'
    CRYPTO = 'crypto'
    GAMEDEV = 'gamedev'
    PETS = 'pets'
    UKRAINIAN = 'ukrainian'

    
def read_stackexchange_csv(name, forum_name, forum_sufix='stackexchange.com', base_path='data'):
    """Load one table of a forum dump from ``<base_path>/<forum_name>.<forum_sufix>/<name>.csv``.

    Parameters
    ----------
    name : str
        Base name of the CSV file (without the ``.csv`` extension).
    forum_name : str
        Forum name; the first part of the forum directory name.
    forum_sufix : str, optional
        Second part of the forum directory name, joined with a dot.
    base_path : str or os.PathLike, optional
        Directory that contains the per-forum directories.

    Returns
    -------
    pandas.DataFrame
        The file's contents as parsed by ``pd.read_csv`` with default options.
    """
    forum_dir = f'{forum_name}.{forum_sufix}'
    csv_path = Path(base_path).joinpath(forum_dir, f'{name}.csv')
    return pd.read_csv(csv_path)


def read_stackexchange(model_type, forum_type):
    """Load the table ``model_type`` of the forum ``forum_type`` as a DataFrame.

    Both arguments are enum members; their ``.value`` strings are forwarded
    to ``read_stackexchange_csv`` as the file name and forum name, leaving
    the directory suffix and base path at that function's defaults.
    """
    table_name = model_type.value
    forum_name = forum_type.value
    return read_stackexchange_csv(table_name, forum_name)

## Załadowanie danych

In [7]:
# Load the Posts table of each forum analysed below.
bicycles_posts_df = read_stackexchange(
    model_type=ModelType.POSTS, forum_type=ForumType.BICYCLES
)
coffee_posts_df = read_stackexchange(
    model_type=ModelType.POSTS, forum_type=ForumType.COFFEE
)
ukrainian_posts_df = read_stackexchange(
    model_type=ModelType.POSTS, forum_type=ForumType.UKRAINIAN
)

### Czas od pytania do odpowiedzi

In [131]:
def compare(posts):
    """Summarise the delay between questions and their answers.

    Every answer (``PostTypeId == 2``) is paired with its question
    (``PostTypeId == 1``) through ``ParentId == Id``. ``diff_time`` is the
    answer's ``CreationDate`` minus the question's. For each question the
    smallest and the largest ``diff_time`` are taken, and the minimum and
    maximum of those two per-question series are returned.

    Questions without any answer drop out (inner join). Nothing is clipped,
    so a negative ``diff_time`` shows up whenever an answer's
    ``CreationDate`` is earlier than its question's.

    Parameters
    ----------
    posts : pandas.DataFrame
        Must provide the columns ``Id``, ``PostTypeId``, ``ParentId`` and
        ``CreationDate`` (in a form ``pd.to_datetime`` can parse). The frame
        is not modified.

    Returns
    -------
    pandas.DataFrame
        Index ``['min', 'max']``, columns ``diff_time_min`` and
        ``diff_time_max``, holding timedeltas.
    """
    # PostTypeId: 1 -> Question, 2 -> Answer
    is_question = posts['PostTypeId'] == 1
    is_answer = posts['PostTypeId'] == 2

    # Build fresh frames holding only the columns the computation needs,
    # instead of adding columns to a slice of `posts`.
    questions = pd.DataFrame({
        'Id_question': posts.loc[is_question, 'Id'],
        'asked_at': pd.to_datetime(posts.loc[is_question, 'CreationDate']),
    })
    answers = pd.DataFrame({
        'ParentId': posts.loc[is_answer, 'ParentId'],
        'answered_at': pd.to_datetime(posts.loc[is_answer, 'CreationDate']),
    })

    # One row per (question, answer) pair.
    pairs = questions.merge(
        answers, left_on='Id_question', right_on='ParentId', how='inner'
    )
    pairs = pairs.assign(diff_time=pairs['answered_at'] - pairs['asked_at'])

    # Fastest and slowest answer for every question.
    per_question = pairs.groupby('Id_question').agg(
        diff_time_min=('diff_time', 'min'),
        diff_time_max=('diff_time', 'max'),
    )

    # Extremes of both per-question series across the whole forum.
    return per_question.agg(
        {'diff_time_min': ['min', 'max'], 'diff_time_max': ['min', 'max']}
    )


In [132]:
# Extremes of the per-question fastest/slowest answer delays for bicycles.
# NOTE(review): the output below has a negative minimum, i.e. at least one
# answer is dated before its question — worth investigating in the data.
compare(bicycles_posts_df)

Unnamed: 0,diff_time_min,diff_time_max
min,-292 days +20:16:49.880000,0 days 00:00:00
max,1619 days 12:16:06.473000,3806 days 00:28:05.467000


In [133]:
# Extremes of the per-question fastest/slowest answer delays for coffee.
compare(coffee_posts_df)

Unnamed: 0,diff_time_min,diff_time_max
min,0 days 00:00:00,0 days 00:00:00
max,1679 days 21:41:25.017000,2178 days 04:13:18.530000


In [None]:
# Extremes of the per-question fastest/slowest answer delays for ukrainian,
# matching the bicycles and coffee cells (the bare frame would only dump the
# raw posts table).
compare(ukrainian_posts_df)