In [48]:
from dask.distributed import Client, Executor, progress
from dask import dataframe as dd
import datetime
import numpy as np

In [2]:
client = Client('10.10.100.15:8786')

In [3]:
client

<Client: scheduler='tcp://10.10.100.15:8786' processes=18 cores=18>

In [4]:
client.get_versions(check=True)

{'client': {'host': [('python', '3.6.1.final.0'),
   ('python-bits', 64),
   ('OS', 'Linux'),
   ('OS-release', '4.4.0-81-generic'),
   ('machine', 'x86_64'),
   ('processor', 'x86_64'),
   ('byteorder', 'little'),
   ('LC_ALL', 'None'),
   ('LANG', 'en_US.UTF-8'),
   ('LOCALE', 'en_US.UTF-8')],
  'packages': {'optional': [('numpy', '1.12.1'), ('pandas', '0.20.1')],
   'required': [('dask', '0.14.3'),
    ('distributed', '1.16.3'),
    ('msgpack', '0.4.8'),
    ('cloudpickle', '0.2.2'),
    ('toolz', '0.8.2')]}},
 'scheduler': {'host': [['python', '3.6.1.final.0'],
   ['python-bits', 64],
   ['OS', 'Linux'],
   ['OS-release', '4.4.0-81-generic'],
   ['machine', 'x86_64'],
   ['processor', 'x86_64'],
   ['byteorder', 'little'],
   ['LC_ALL', 'None'],
   ['LANG', 'en_US.UTF-8'],
   ['LOCALE', 'en_US.UTF-8']],
  'packages': {'optional': [['numpy', '1.12.1'], ['pandas', '0.20.1']],
   'required': [['dask', '0.14.3'],
    ['distributed', '1.16.3'],
    ['msgpack', '0.4.8'],
    ['cloudpickl

In [30]:
# create the dataframe by reading in the CSV
csv = '/root/stackoverflow.csv'

# load the csv file into a dask dataframe
df = dd.read_csv(csv, parse_dates=['creationdate', 'acceptedAnswerCreationDate'])
df = client.persist(df)
progress(df)

In [31]:
df.head()

Unnamed: 0.1,Unnamed: 0,id,owneruserid,posttypeid,creationdate,CreationDayOfWeek,acceptedanswerid,score,answercount,viewcount,...,CommentCount,FavoriteCount,site,acceptedAnswerId,acceptedAnswerCreationDate,acceptedAnswerCreationDayOfWeek,acceptedAnswerScore,acceptedAnswerCommentCount,acceptedAnswerOwnerUserId,site.1
0,0,4,8,1,2008-07-31 21:42:53,5,7.0,491,13,31416,...,4,37,stackoverflow,7.0,2008-07-31 22:17:58,5.0,349.0,1.0,9.0,stackoverflow
1,1,6,9,1,2008-07-31 22:08:09,5,31.0,217,5,14712,...,0,8,stackoverflow,31.0,2008-08-01 12:22:52,6.0,111.0,0.0,0.0,stackoverflow
2,2,9,1,1,2008-07-31 23:41:00,5,1404.0,1509,58,388201,...,6,341,stackoverflow,1404.0,2008-08-04 16:50:06,2.0,1403.0,30.0,212.0,stackoverflow
3,3,11,1,1,2008-07-31 23:55:38,5,1248.0,1182,33,120658,...,3,517,stackoverflow,1248.0,2008-08-04 13:57:26,2.0,792.0,16.0,268.0,stackoverflow
4,4,13,9,1,2008-08-01 00:42:39,6,,482,24,138485,...,3,123,stackoverflow,12446.0,2008-08-15 16:27:32,6.0,64.0,1.0,5.0,stackoverflow


In [33]:
df.acceptedAnswerCreationDate

Dask Series Structure:
npartitions=36
None    datetime64[ns]
None               ...
             ...      
None               ...
None               ...
Name: acceptedAnswerCreationDate, dtype: datetime64[ns]
Dask Name: getitem, 72 tasks

In [7]:
# count the number of total rows:
len(df)

14015379

In [8]:
# inspect the data set
df.columns

Index(['Unnamed: 0', 'id', 'owneruserid', 'posttypeid', 'creationdate',
       'CreationDayOfWeek', 'acceptedanswerid', 'score', 'answercount',
       'viewcount', 'owneruserid.1', 'lasteditoruserid', 'lasteditdate',
       'LastActivityDate', 'CommentCount', 'FavoriteCount', 'site',
       'acceptedAnswerId', 'acceptedAnswerCreationDate',
       'acceptedAnswerCreationDayOfWeek', 'acceptedAnswerScore',
       'acceptedAnswerCommentCount', 'acceptedAnswerOwnerUserId', 'site.1'],
      dtype='object')

In [34]:
# do some basic aggregation -- 
# compute the average number of answers for each day of the week
df.groupby(df.CreationDayOfWeek).answercount.mean().compute()

CreationDayOfWeek
1    1.538740
2    1.564522
3    1.581523
4    1.579910
5    1.579372
6    1.590603
7    1.547162
Name: answercount, dtype: float64

In [35]:
# count number of rows with an accepted answer
len(df[(df.acceptedanswerid.notnull())])

7561435

In [36]:
# count the number of rows without an accepted answer -- sanity check
len(df[(df.acceptedanswerid.isnull())])

6453944

In [72]:
# create a new dataframe that only has the rows with an accepted answer:
df2 = df[(df.acceptedanswerid.notnull())]

# Also filter out rows where the acceptedAnswerCreationDate is before the creationdate
df3 = df[(df['acceptedAnswerCreationDate'] > df['creationdate'])]

# create a row that holds the time to answer, in seconds
df2['timeToAnswer'] = df2['acceptedAnswerCreationDate'] - df2['creationdate']
df3['timeToAnswer'] = df3['acceptedAnswerCreationDate'] - df3['creationdate']

In [73]:
len(df3)

7561154

In [74]:
df3.timeToAnswer.head()

0    0 days 00:35:05
1    0 days 14:14:43
2    3 days 17:09:06
3    3 days 14:01:48
4   14 days 15:44:53
Name: timeToAnswer, dtype: timedelta64[ns]

In [44]:
df2.head()

Unnamed: 0.1,Unnamed: 0,id,owneruserid,posttypeid,creationdate,CreationDayOfWeek,acceptedanswerid,score,answercount,viewcount,...,FavoriteCount,site,acceptedAnswerId,acceptedAnswerCreationDate,acceptedAnswerCreationDayOfWeek,acceptedAnswerScore,acceptedAnswerCommentCount,acceptedAnswerOwnerUserId,site.1,timeToAnswer
0,0,4,8,1,2008-07-31 21:42:53,5,7.0,491,13,31416,...,37,stackoverflow,7.0,2008-07-31 22:17:58,5.0,349.0,1.0,9.0,stackoverflow,0 days 00:35:05
1,1,6,9,1,2008-07-31 22:08:09,5,31.0,217,5,14712,...,8,stackoverflow,31.0,2008-08-01 12:22:52,6.0,111.0,0.0,0.0,stackoverflow,0 days 14:14:43
2,2,9,1,1,2008-07-31 23:41:00,5,1404.0,1509,58,388201,...,341,stackoverflow,1404.0,2008-08-04 16:50:06,2.0,1403.0,30.0,212.0,stackoverflow,3 days 17:09:06
3,3,11,1,1,2008-07-31 23:55:38,5,1248.0,1182,33,120658,...,517,stackoverflow,1248.0,2008-08-04 13:57:26,2.0,792.0,16.0,268.0,stackoverflow,3 days 14:01:48
6,6,16,2,1,2008-08-01 04:59:34,6,12446.0,91,5,74191,...,13,stackoverflow,531.0,2008-08-02 18:22:53,7.0,162.0,2.0,157.0,stackoverflow,1 days 13:23:19


In [51]:
# create a new column from the timeToAnswer timedelta column
df2['daysToAnswer'] = (df2['timeToAnswer'] / np.timedelta64(1, 'D')).astype(int)

# make a simple assumption that 30 days in a month
df2['monthsToAnswer'] = (df2['timeToAnswer'] / np.timedelta64(30, 'D')).astype(int)

In [75]:
# create a new column from the timeToAnswer timedelta column
df3['daysToAnswer'] = (df3['timeToAnswer'] / np.timedelta64(1, 'D')).astype(int)

# make a simple assumption that 30 days in a month
df3['monthsToAnswer'] = (df3['timeToAnswer'] / np.timedelta64(30, 'D')).astype(int)

In [52]:
df2.head()

Unnamed: 0.1,Unnamed: 0,id,owneruserid,posttypeid,creationdate,CreationDayOfWeek,acceptedanswerid,score,answercount,viewcount,...,acceptedAnswerId,acceptedAnswerCreationDate,acceptedAnswerCreationDayOfWeek,acceptedAnswerScore,acceptedAnswerCommentCount,acceptedAnswerOwnerUserId,site.1,timeToAnswer,daysToAnswer,monthsToAnswer
0,0,4,8,1,2008-07-31 21:42:53,5,7.0,491,13,31416,...,7.0,2008-07-31 22:17:58,5.0,349.0,1.0,9.0,stackoverflow,0 days 00:35:05,0,0
1,1,6,9,1,2008-07-31 22:08:09,5,31.0,217,5,14712,...,31.0,2008-08-01 12:22:52,6.0,111.0,0.0,0.0,stackoverflow,0 days 14:14:43,0,0
2,2,9,1,1,2008-07-31 23:41:00,5,1404.0,1509,58,388201,...,1404.0,2008-08-04 16:50:06,2.0,1403.0,30.0,212.0,stackoverflow,3 days 17:09:06,3,0
3,3,11,1,1,2008-07-31 23:55:38,5,1248.0,1182,33,120658,...,1248.0,2008-08-04 13:57:26,2.0,792.0,16.0,268.0,stackoverflow,3 days 14:01:48,3,0
6,6,16,2,1,2008-08-01 04:59:34,6,12446.0,91,5,74191,...,531.0,2008-08-02 18:22:53,7.0,162.0,2.0,157.0,stackoverflow,1 days 13:23:19,1,0


In [76]:
#df2.monthsToAnswer.value_counts().compute()
df3.groupby('monthsToAnswer').count().compute()

Unnamed: 0_level_0,Unnamed: 0,id,owneruserid,posttypeid,creationdate,CreationDayOfWeek,acceptedanswerid,score,answercount,viewcount,...,site,acceptedAnswerId,acceptedAnswerCreationDate,acceptedAnswerCreationDayOfWeek,acceptedAnswerScore,acceptedAnswerCommentCount,acceptedAnswerOwnerUserId,site.1,timeToAnswer,daysToAnswer
monthsToAnswer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,38054,38054,38054,38054,38054,38054,29247,38054,38054,38054,...,38054,38054,38054,38054,38054,38054,38054,38054,38054,38054
1,117763,117763,117763,117763,117763,117763,86737,117763,117763,117763,...,117763,117763,117763,117763,117763,117763,117763,117763,117763,117763
2,159730,159730,159730,159730,159730,159730,115459,159730,159730,159730,...,159730,159730,159730,159730,159730,159730,159730,159730,159730,159730
3,259275,259275,259275,259275,259275,259275,188265,259275,259275,259275,...,259275,259275,259275,259275,259275,259275,259275,259275,259275,259275
4,538631,538631,538631,538631,538631,538631,380584,538631,538631,538631,...,538631,538631,538631,538631,538631,538631,538631,538631,538631,538631
5,263313,263313,263313,263313,263313,263313,181642,263313,263313,263313,...,263313,263313,263313,263313,263313,263313,263313,263313,263313,263313
6,307447,307447,307447,307447,307447,307447,210334,307447,307447,307447,...,307447,307447,307447,307447,307447,307447,307447,307447,307447,307447
7,535396,535396,535396,535396,535396,535396,364099,535396,535396,535396,...,535396,535396,535396,535396,535396,535396,535396,535396,535396,535396
8,332646,332646,332646,332646,332646,332646,223215,332646,332646,332646,...,332646,332646,332646,332646,332646,332646,332646,332646,332646,332646
9,316091,316091,316091,316091,316091,316091,206958,316091,316091,316091,...,316091,316091,316091,316091,316091,316091,316091,316091,316091,316091


In [23]:
# count the number of rows without an accepted answer creation date
len(df2)

7561435

In [26]:
df2.timeToAnswer.head(1)

0    2105
Name: timeToAnswer, dtype: int64

In [38]:
import pandas as pd
def to_year(x):
    if x < 0:
        return 'neg'
    if x < 60*60*24*365:
        return 'year 1'
    if x < 60*60*24*365*2:
        return 'year 2'
    if x < 60*60*24*365*3:
        return 'year 3'
    if x < 60*60*24*365*4:
        return 'year 4'
    
df2.groupby(by=to_year)[df2.timeToAnswer]

KeyError: 'Column not found: Dask Series Structure:\nnpartitions=36\nNone    int64\nNone      ...\n        ...  \nNone      ...\nNone      ...\nName: timeToAnswer, dtype: int64\nDask Name: getitem, 252 tasks'

In [33]:
import numpy as np
pd.Series.hist(df2.timeToAnswer.astype(np.int))

TypeError: 'float' object cannot be interpreted as an integer

In [23]:
# what percentage of questions that have an accepted answer got that answer within: 
#   i) one hour of the original post?
#  ii) one day of the original post?
# iii) one week of the original post?
#  iv) two weeks of the original post?
df_i = df2[(df2.timeToAnswer < 60*60)]
df_ii = df2[(df2.timeToAnswer < 60*60*24)]
df_iii = df2[(df2.timeToAnswer < 60*60*24*7)]
df_iv = df2[(df2.timeToAnswer < 60*60*24*7*2)]

In [24]:
tot_within_hour = len(df_i)

In [25]:
tot_within_day = len(df_ii)

In [26]:
tot_within_week = len(df_iii)

In [27]:
tot_within_two_weekd = len(df_iv)

In [29]:
total = len(df2)
print("Within one hour: {}%".format(float(tot_within_hour)/total*100.0))
print("Within one day: {}%".format(float(tot_within_day)/total*100.0))
print("Within one week: {}%".format(float(tot_within_week)/total*100.0))
print("Within two weeks: {}%".format(float(tot_within_two_weekd)/total*100.0))

Within one hour: 38.33878093245528%
Within one day: 38.343899008587655%
Within one week: 38.47818568829858%
Within two weeks: 38.554586530202265%


In [30]:
tot_within_hour

2898962

In [31]:
tot_within_day

2899349

In [32]:
tot_within_week

2909503

In [33]:
tot_within_two_weekd

2915280

In [34]:
df_month = df2[(df2.timeToAnswer < 60*60*24*7*4)]

In [35]:
within_month = len(df_month)

tornado.application - ERROR - Exception in callback None
Traceback (most recent call last):
  File "/root/anaconda3/lib/python3.6/site-packages/tornado/ioloop.py", line 887, in start
    fd_obj, handler_func = self._handlers[fd]
KeyError: 58


In [36]:
within_month

2926533

In [40]:
df2[(df2.timeToAnswer > 0)].mean().compute()

Unnamed: 0                         3.540919e+06
id                                 1.261774e+07
owneruserid                        1.105332e+06
posttypeid                         1.000000e+00
CreationDayOfWeek                  3.999107e+00
acceptedanswerid                   1.278405e+07
score                              3.762527e+00
answercount                        2.126443e+00
viewcount                          3.964038e+03
owneruserid.1                      1.105332e+06
lasteditoruserid                   9.195380e+05
CommentCount                       1.604489e+00
FavoriteCount                      1.211838e+00
acceptedAnswerId                   1.987514e+07
acceptedAnswerCreationDayOfWeek    4.000965e+00
acceptedAnswerScore                4.515524e+00
acceptedAnswerCommentCount         1.978022e+00
acceptedAnswerOwnerUserId          1.412591e+06
timeToAnswer                       3.621236e+07
dtype: float64

In [41]:
len(df2[(df2.timeToAnswer > 0)])

4662475

In [42]:
df2[(df2.timeToAnswer < 0)].head()

Unnamed: 0.1,Unnamed: 0,id,owneruserid,posttypeid,creationdate,CreationDayOfWeek,acceptedanswerid,score,answercount,viewcount,...,FavoriteCount,site,acceptedAnswerId,acceptedAnswerCreationDate,acceptedAnswerCreationDayOfWeek,acceptedAnswerScore,acceptedAnswerCommentCount,acceptedAnswerOwnerUserId,site.1,timeToAnswer
182127,182127,1030715,95624,1,2009-06-23 05:07:17,3,1031536.0,1,4,5388,...,1,stackoverflow,237686.0,2008-10-26 07:19:22,1.0,14.0,0.0,16371.0,stackoverflow,-20728075.0


In [7]:
df3 = df[(df.creationdate > df.acceptedAnswerCreationDate)]

In [8]:
df3.head(n=10)

Unnamed: 0.1,Unnamed: 0,id,owneruserid,posttypeid,creationdate,CreationDayOfWeek,acceptedanswerid,score,answercount,viewcount,...,CommentCount,FavoriteCount,site,acceptedAnswerId,acceptedAnswerCreationDate,acceptedAnswerCreationDayOfWeek,acceptedAnswerScore,acceptedAnswerCommentCount,acceptedAnswerOwnerUserId,site.1
164385,164385,946700,116834,1,2009-06-03 19:33:18,4,,4,5,3500,...,0,9,stackoverflow,84599.0,2008-09-17 15:37:46,4.0,15.0,2.0,13850.0,stackoverflow
182127,182127,1030715,95624,1,2009-06-23 05:07:17,3,1031536.0,1,4,5388,...,0,1,stackoverflow,237686.0,2008-10-26 07:19:22,1.0,14.0,0.0,16371.0,stackoverflow


In [9]:
len(df3)

40

In [50]:
df3[:10]

Unnamed: 0_level_0,Unnamed: 0,id,owneruserid,posttypeid,creationdate,CreationDayOfWeek,acceptedanswerid,score,answercount,viewcount,owneruserid.1,lasteditoruserid,lasteditdate,LastActivityDate,CommentCount,FavoriteCount,site,acceptedAnswerId,acceptedAnswerCreationDate,acceptedAnswerCreationDayOfWeek,acceptedAnswerScore,acceptedAnswerCommentCount,acceptedAnswerOwnerUserId,site.1
npartitions=36,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
,int64,int64,int64,int64,object,int64,float64,int64,int64,int64,int64,float64,object,object,int64,int64,object,float64,object,float64,float64,float64,float64,object
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


In [22]:
df3.head(n=15, npartitions=-1)

Unnamed: 0.1,Unnamed: 0,id,owneruserid,posttypeid,creationdate,CreationDayOfWeek,acceptedanswerid,score,answercount,viewcount,...,CommentCount,FavoriteCount,site,acceptedAnswerId,acceptedAnswerCreationDate,acceptedAnswerCreationDayOfWeek,acceptedAnswerScore,acceptedAnswerCommentCount,acceptedAnswerOwnerUserId,site.1
164385,164385,946700,116834,1,2009-06-03 19:33:18,4,,4,5,3500,...,0,9,stackoverflow,84599.0,2008-09-17 15:37:46,4.0,15.0,2.0,13850.0,stackoverflow
182127,182127,1030715,95624,1,2009-06-23 05:07:17,3,1031536.0,1,4,5388,...,0,1,stackoverflow,237686.0,2008-10-26 07:19:22,1.0,14.0,0.0,16371.0,stackoverflow
34663,379353,1883316,8835,1,2009-12-10 19:19:59,5,2059165.0,1,2,547,...,0,0,stackoverflow,2443025.0,2009-08-07 13:18:47,6.0,3.0,0.0,107156.0,stackoverflow
134061,478751,2291015,171365,1,2010-02-18 18:13:26,5,2291050.0,0,2,3597,...,4,1,stackoverflow,2970430.0,2009-12-08 15:03:11,3.0,3.0,5.0,0.0,stackoverflow
81521,766924,3373073,4067,1,2010-07-30 15:41:02,6,,1,1,285,...,0,1,stackoverflow,1882765.0,2009-12-10 17:51:20,5.0,149.0,4.0,163053.0,stackoverflow
258007,943410,4005873,485273,1,2010-10-23 20:15:53,7,,1,1,21049,...,5,1,stackoverflow,5328063.0,2010-03-27 21:17:28,7.0,3.0,1.0,187793.0,stackoverflow
67754,1433393,5727458,644384,1,2011-04-20 08:22:18,4,,5,2,702,...,5,0,stackoverflow,7785209.0,2011-01-22 15:27:38,7.0,2.0,1.0,158483.0,stackoverflow
228674,1594313,6278921,522089,1,2011-06-08 12:48:57,4,,2,3,3406,...,0,0,stackoverflow,8577237.0,2010-12-23 20:28:06,5.0,6.0,3.0,276311.0,stackoverflow
85167,1789000,6945572,710502,1,2011-08-04 17:05:12,5,6945989.0,6,5,27481,...,4,1,stackoverflow,9542948.0,2010-11-29 10:01:25,2.0,4.0,0.0,124238.0,stackoverflow
156720,1860553,7188670,911802,1,2011-08-25 10:17:36,5,,0,2,863,...,1,0,stackoverflow,9898235.0,2011-05-09 09:51:05,2.0,17.0,4.0,310174.0,stackoverflow


In [23]:
df4 = df[(df.creationdate <= df.acceptedAnswerCreationDate)]

In [24]:
len(df4)

7561154

In [29]:
df5 = df[(df.creationdate.isnull())]

In [30]:
len(df5)

0

In [31]:
df6 = df[(df.acceptedAnswerCreationDate.isnull())]

In [32]:
len(df6)

6454185

In [33]:
7561435 + _

14015620