In [93]:
import datetime
from dask import dataframe as dd
import pandas as pd

In [94]:
csv = '/home/centos/stackoverflow_med.csv'
# load the csv file into a dask dataframe
df = dd.read_csv(csv)

In [95]:
len(df)

49999

In [96]:
# list the columns
df.columns

Index([u'Unnamed: 0', u'id', u'owneruserid', u'posttypeid', u'creationdate',
       u'CreationDayOfWeek', u'acceptedanswerid', u'score', u'answercount',
       u'viewcount', u'owneruserid.1', u'lasteditoruserid', u'lasteditdate',
       u'LastActivityDate', u'CommentCount', u'FavoriteCount', u'site',
       u'acceptedAnswerId', u'acceptedAnswerCreationDate',
       u'acceptedAnswerCreationDayOfWeek', u'acceptedAnswerScore',
       u'acceptedAnswerCommentCount', u'acceptedAnswerOwnerUserId', u'site.1'],
      dtype='object')

In [97]:
# compute the average number of answers for each day of the week
df.groupby(df.CreationDayOfWeek).answercount.mean().compute()

CreationDayOfWeek
1    5.553293
2    5.481440
3    5.525365
4    5.423727
5    5.314909
6    5.511560
7    5.611783
Name: answercount, dtype: float64

In [104]:
def to_datetime(row, col="creationdate"):
    try:
        return datetime.datetime.strptime(row[col], '%Y-%m-%d %H:%M:%S')
    except ValueError:
        # return a future date for easy discard
        return datetime.datetime.strptime('2020-01-01 12:00:00', '%Y-%m-%d %H:%M:%S')

def time_to_answer(row):
    try:
        td = datetime.datetime.strptime(row['acceptedAnswerCreationDate'], '%Y-%m-%d %H:%M:%S') - datetime.datetime.strptime(row['creationdate'], '%Y-%m-%d %H:%M:%S')
        return td.total_seconds()
    except ValueError:
        return None

# create a row that holds the creation time hour
def to_hour(row, col="creationDateTime"):
    # covert a datetime row to the hour
    return row[col].hour

# create a new column with datetime objects built from the creationdatetime field
df["creationDateTime"] = df.apply(lambda row: to_datetime(row), axis=1)

df['timeToAnswer'] = df.apply(lambda row: time_to_answer(row), axis=1)

df['creationHour'] = df.apply(lambda row: to_hour(row), axis=1)

In [105]:
df.columns

Index([u'Unnamed: 0', u'id', u'owneruserid', u'posttypeid', u'creationdate',
       u'CreationDayOfWeek', u'acceptedanswerid', u'score', u'answercount',
       u'viewcount', u'owneruserid.1', u'lasteditoruserid', u'lasteditdate',
       u'LastActivityDate', u'CommentCount', u'FavoriteCount', u'site',
       u'acceptedAnswerId', u'acceptedAnswerCreationDate',
       u'acceptedAnswerCreationDayOfWeek', u'acceptedAnswerScore',
       u'acceptedAnswerCommentCount', u'acceptedAnswerOwnerUserId', u'site.1',
       u'creationDateTime', u'timeToAnswer', u'creationHour'],
      dtype='object')

In [106]:
df.timeToAnswer.head(25)

0         2105.0
1        51283.0
2       320946.0
3       309708.0
4      1266293.0
5        40631.0
6       134599.0
7        28376.0
8     35704540.0
9        61683.0
10        2568.0
11        4502.0
12     2321923.0
13     2902455.0
14      502328.0
15     3889517.0
16       10514.0
17       16066.0
18        5580.0
19        6586.0
20        5449.0
21      356724.0
22       16279.0
23        5849.0
24       18650.0
Name: timeToAnswer, dtype: float64

In [107]:
# weed out the bad rows that have a future creation date:
df2 = df[(df.creationDateTime < pd.to_datetime('now'))]

In [108]:
len(df2)

49999

In [109]:
# with the new dataframe, compute the number of questions by hour
df2.groupby('creationHour').count().compute()

Unnamed: 0_level_0,Unnamed: 0,id,owneruserid,posttypeid,creationdate,CreationDayOfWeek,acceptedanswerid,score,answercount,viewcount,...,site,acceptedAnswerId,acceptedAnswerCreationDate,acceptedAnswerCreationDayOfWeek,acceptedAnswerScore,acceptedAnswerCommentCount,acceptedAnswerOwnerUserId,site.1,creationDateTime,timeToAnswer
creationHour,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1584,1584,1584,1584,1584,1584,1222,1584,1584,1584,...,1584,1584,1584,1584,1584,1584,1584,1584,1584,1584
1,1393,1393,1393,1393,1393,1393,1050,1393,1393,1393,...,1393,1393,1393,1393,1393,1393,1393,1393,1393,1393
2,1318,1318,1318,1318,1318,1318,1006,1318,1318,1318,...,1318,1318,1318,1318,1318,1318,1318,1318,1318,1318
3,1291,1291,1291,1291,1291,1291,958,1291,1291,1291,...,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291
4,1271,1271,1271,1271,1271,1271,958,1271,1271,1271,...,1271,1271,1271,1271,1271,1271,1271,1271,1271,1271
5,1197,1197,1197,1197,1197,1197,866,1197,1197,1197,...,1197,1197,1197,1197,1197,1197,1197,1197,1197,1197
6,1128,1128,1128,1128,1128,1128,818,1128,1128,1128,...,1128,1128,1128,1128,1128,1128,1128,1128,1128,1128
7,1331,1331,1331,1331,1331,1331,990,1331,1331,1331,...,1331,1331,1331,1331,1331,1331,1331,1331,1331,1331
8,1465,1465,1465,1465,1465,1465,1093,1465,1465,1465,...,1465,1465,1465,1465,1465,1465,1465,1465,1465,1465
9,1843,1843,1843,1843,1843,1843,1373,1843,1843,1843,...,1843,1843,1843,1843,1843,1843,1843,1843,1843,1843


In [110]:
df2.describe()

Unnamed: 0_level_0,Unnamed: 0,id,owneruserid,posttypeid,CreationDayOfWeek,acceptedanswerid,score,answercount,viewcount,owneruserid.1,lasteditoruserid,CommentCount,FavoriteCount,acceptedAnswerId,acceptedAnswerCreationDayOfWeek,acceptedAnswerScore,acceptedAnswerCommentCount,acceptedAnswerOwnerUserId,creationHour
npartitions=1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
,int64,int64,int64,int64,int64,float64,int64,int64,int64,int64,float64,int64,int64,float64,float64,float64,float64,float64,int64
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


In [111]:
df2.acceptedanswerid.head(10)

0        7.0
1       31.0
2     1404.0
3     1248.0
4        NaN
5        NaN
6    12446.0
7       26.0
8      531.0
9       49.0
Name: acceptedanswerid, dtype: float64

In [112]:
# count how many questions have an accepted answer:
df3 = df2[(df2.acceptedanswerid.notnull())]

In [113]:
total_with_accepted_answer = len(df3)

In [114]:
# get the percentage:
float(total_with_accepted_answer)/49999.00*100.00

76.1695233904678

In [115]:
# what percentage of questions that have an accepted answer got that answer within: 
#   i) one hour of the original post?
#  ii) one day of the original post?
# iii) one week of the original post?
df_i = df[(df.timeToAnswer < 60*60)]
df_ii = df[(df.timeToAnswer < 60*60*24)]
df_iii = df[(df.timeToAnswer < 60*60*24*7)]

In [116]:
tot_within_hour = len(df_i)

In [117]:
tot_within_day = len(df_ii)

In [118]:
tot_within_week = len(df_iii)

In [121]:
# percentages
total = 49999.0
print "Within one hour: {}%".format(float(tot_within_hour)/total*100.0)
print "Within one day: {}%".format(float(tot_within_day)/total*100.0)
print "Within one week: {}%".format(float(tot_within_week)/total*100.0)

Within one hour: 0.0040000800016%
Within one day: 0.894017880358%
Within one week: 26.4625292506%


In [120]:
tot_within_week

13231

In [122]:
# within one month
df_month =  df[(df.timeToAnswer < 60*60*24*30)]

In [123]:
tot_within_month = len(df_month)

In [125]:
print "Within one month: {}%".format(float(tot_within_month)/total*100.0)

Within one month: 76.0975219504%
