In [1]:
from pyspark import SparkContext
sc = SparkContext.getOrCreate()
sqlContext = SQLContext(sc)

In [4]:
itPostsRows = sc.textFile("../Data/Italian_Stack_Exchange/italianPosts.csv")
itPostsSplit = itPostsRows.map(lambda x: x.split("~"))
from pyspark.sql import Row
from datetime import datetime
def toIntSafe(inval):
  try:
    return int(inval)
  except ValueError:
    return None

def toTimeSafe(inval):
  try:
    return datetime.strptime(inval, "%Y-%m-%d %H:%M:%S.%f")
  except ValueError:
    return None

def toLongSafe(inval):
  try:
    return long(inval)
  except ValueError:
    return None
    
def stringToPost(row):
  r = row.encode('utf8').split("~")
  return Row(
    toIntSafe(r[0]),
    toTimeSafe(r[1]),
    toIntSafe(r[2]),
    r[3],
    toIntSafe(r[4]),
    toTimeSafe(r[5]),
    toIntSafe(r[6]),
    toIntSafe(r[7]),
    r[8],
    toIntSafe(r[9]),
    toLongSafe(r[10]),
    toLongSafe(r[11]),
    long(r[12]))
from pyspark.sql.types import *
postSchema = StructType([
  StructField("commentCount", IntegerType(), True),
  StructField("lastActivityDate", TimestampType(), True),
  StructField("ownerUserId", LongType(), True),
  StructField("body", StringType(), True),
  StructField("score", IntegerType(), True),
  StructField("creationDate", TimestampType(), True),
  StructField("viewCount", IntegerType(), True),
  StructField("title", StringType(), True),
  StructField("tags", StringType(), True),
  StructField("answerCount", IntegerType(), True),
  StructField("acceptedAnswerId", LongType(), True),
  StructField("postTypeId", LongType(), True),
  StructField("id", LongType(), False)
  ])
rowRDD = itPostsRows.map(lambda x: stringToPost(x))
itPostsDFStruct = sqlContext.createDataFrame(rowRDD, postSchema)

In [5]:
from pyspark.sql.functions import *
itPostsDFStruct.withColumn('duration',datediff('lastActivityDate','creationDate')).sort('duration', ascending = 0).show()

+------------+--------------------+-----------+--------------------+-----+--------------------+---------+-----+--------------------+-----------+----------------+----------+----+--------+
|commentCount|    lastActivityDate|ownerUserId|                body|score|        creationDate|viewCount|title|                tags|answerCount|acceptedAnswerId|postTypeId|  id|duration|
+------------+--------------------+-----------+--------------------+-----+--------------------+---------+-----+--------------------+-----------+----------------+----------+----+--------+
|           3|2014-09-11 14:37:...|         63|&lt;p&gt;The plur...|    5|2013-11-12 13:34:...|       59| null|&lt;plural&gt;&lt...|          1|            1227|         1|1221|     303|
|           4|2014-09-09 08:54:...|         63|&lt;p&gt;Some wor...|    4|2013-11-12 11:03:...|       80| null|&lt;nouns&gt;&lt;...|          2|            1207|         1|1205|     301|
|           0|2014-09-12 10:55:...|          8|&lt;p&gt;Wikipedi.

In [6]:
itPostsDFStruct.groupBy('ownerUserId').avg('score').sort('avg(score)', ascending=0).show()

+-----------+-----------------+
|ownerUserId|       avg(score)|
+-----------+-----------------+
|          6|             15.0|
|        570|             15.0|
|        730|             12.0|
|        729|             11.0|
|        154|             11.0|
|        217|             10.0|
|        220|             10.0|
|        656|              9.0|
|        445|              9.0|
|        116|              9.0|
|         48|8.666666666666666|
|        114|              8.5|
|        124|8.333333333333334|
|         57|              8.3|
|        590|              8.0|
|         85|              8.0|
|        504|              8.0|
|        158|              8.0|
|         17|7.967741935483871|
|         56|              7.8|
+-----------+-----------------+
only showing top 20 rows



In [7]:
itPostsDFStruct.groupBy('ownerUserId').avg('score').sort('avg(score)', ascending=0)

DataFrame[ownerUserId: bigint, avg(score): double]