In [79]:
import datetime
from dask import dataframe as dd
import pandas as pd

In [80]:
csv = '/home/centos/stackoverflow.csv'

In [81]:
# load the csv file into a dask dataframe
df = dd.read_csv(csv)

In [82]:
# count the total number of rows
len(df)

14015379

In [14]:
# list the columns
df.columns

Index([u'Unnamed: 0', u'id', u'owneruserid', u'posttypeid', u'creationdate',
       u'CreationDayOfWeek', u'acceptedanswerid', u'score', u'answercount',
       u'viewcount', u'owneruserid.1', u'lasteditoruserid', u'lasteditdate',
       u'LastActivityDate', u'CommentCount', u'FavoriteCount', u'site',
       u'acceptedAnswerId', u'acceptedAnswerCreationDate',
       u'acceptedAnswerCreationDayOfWeek', u'acceptedAnswerScore',
       u'acceptedAnswerCommentCount', u'acceptedAnswerOwnerUserId', u'site.1'],
      dtype='object')

In [7]:
# compute the average number of answers for each day of the week
df.groupby(df.CreationDayOfWeek).answercount.mean().compute()

CreationDayOfWeek
1    1.538740
2    1.564522
3    1.581523
4    1.579910
5    1.579372
6    1.590603
7    1.547162
Name: answercount, dtype: float64

In [16]:
df.creationdate.head()

0    2008-07-31 21:42:53
1    2008-07-31 22:08:09
2    2008-07-31 23:41:00
3    2008-07-31 23:55:38
4    2008-08-01 00:42:39
Name: creationdate, dtype: object

In [33]:
def to_datetime(row, col="creationdate"):
    try:
        return datetime.datetime.strptime(row[col], '%Y-%m-%d %H:%M:%S')
    except ValueError:
        # return a future date for easy discard
        return datetime.datetime.strptime('2020-01-01 12:00:00', '%Y-%m-%d %H:%M:%S')

# create a new column with datetime objects built from the creationdatetime field
df["creationDateTime"] = df.apply (lambda row: to_datetime(row), axis=1)

In [34]:
df.columns

Index([u'Unnamed: 0', u'id', u'owneruserid', u'posttypeid', u'creationdate',
       u'CreationDayOfWeek', u'acceptedanswerid', u'score', u'answercount',
       u'viewcount', u'owneruserid.1', u'lasteditoruserid', u'lasteditdate',
       u'LastActivityDate', u'CommentCount', u'FavoriteCount', u'site',
       u'acceptedAnswerId', u'acceptedAnswerCreationDate',
       u'acceptedAnswerCreationDayOfWeek', u'acceptedAnswerScore',
       u'acceptedAnswerCommentCount', u'acceptedAnswerOwnerUserId', u'site.1',
       u'creationDateTime'],
      dtype='object')

In [57]:
small = df.head(1000)

In [58]:
type(small)

pandas.core.frame.DataFrame

In [59]:
len(small)

1000

In [64]:
small.creationDateTime.iloc[0]

Timestamp('2008-07-31 21:42:53')

In [69]:
small.creationDateTime.iloc[0] < pd.to_datetime('now')#pd.Timestamp('2018-01-23 00:00:00', tz=None)

True

In [68]:
pd.to_datetime('now')

Timestamp('2017-07-15 18:38:05')

In [36]:
# create a row that holds the creation time hour
def to_hour(row, col="creationDateTime"):
    # covert a datetime row to the hour
    return row[col].hour

df['creationHour'] = df.apply(lambda row: to_hour(row), axis=1)

In [37]:
df.columns

Index([u'Unnamed: 0', u'id', u'owneruserid', u'posttypeid', u'creationdate',
       u'CreationDayOfWeek', u'acceptedanswerid', u'score', u'answercount',
       u'viewcount', u'owneruserid.1', u'lasteditoruserid', u'lasteditdate',
       u'LastActivityDate', u'CommentCount', u'FavoriteCount', u'site',
       u'acceptedAnswerId', u'acceptedAnswerCreationDate',
       u'acceptedAnswerCreationDayOfWeek', u'acceptedAnswerScore',
       u'acceptedAnswerCommentCount', u'acceptedAnswerOwnerUserId', u'site.1',
       u'creationDateTime', u'creationHour'],
      dtype='object')

In [70]:
# weed out the bad rows that have a future creation date:
df2 = df[(df.creationDateTime < pd.to_datetime('now'))]

<dask.dataframe.groupby.DataFrameGroupBy at 0x7f5b483d3510>

In [71]:
df2.columns

Index([u'Unnamed: 0', u'id', u'owneruserid', u'posttypeid', u'creationdate',
       u'CreationDayOfWeek', u'acceptedanswerid', u'score', u'answercount',
       u'viewcount', u'owneruserid.1', u'lasteditoruserid', u'lasteditdate',
       u'LastActivityDate', u'CommentCount', u'FavoriteCount', u'site',
       u'acceptedAnswerId', u'acceptedAnswerCreationDate',
       u'acceptedAnswerCreationDayOfWeek', u'acceptedAnswerScore',
       u'acceptedAnswerCommentCount', u'acceptedAnswerOwnerUserId', u'site.1',
       u'creationDateTime', u'creationHour'],
      dtype='object')

In [72]:
len(df2)

14015379

In [None]:
# with the new dataframe, compute the number of questions by hour
df2.groupby(df2.creationHour).compute()

In [73]:
df3 = df[(df.acceptedanswerid == pd.isnull)]

In [74]:
len(df3)

0