In [1]:
from pyspark import SparkContext
sc = SparkContext.getOrCreate()
sqlContext = SQLContext(sc)

In [2]:
itComentsRows = sc.textFile("../Data/Italian_Stack_Exchange/italianComments.csv")
itCommentsSplit = itComentsRows.map(lambda x : x.encode('utf8').split("~"))

In [3]:
from pyspark.sql import Row
from datetime import datetime

def toTimeSafe(inval):
  try:
    return datetime.strptime(inval, "%Y-%m-%d %H:%M:%S.%f")
  except ValueError:
    return None

In [43]:
def stringToComment(r):
  return Row(
    long(r[0]),
    toTimeSafe(r[1]),
    r[2],
    )

In [44]:
from pyspark.sql.types import *
commentSchema = StructType([
    StructField("id_", LongType(), False),
    StructField("commentDate", TimestampType(), True),
    StructField("comment", StringType(), True),
    ])

In [45]:
rowRDD = itCommentsSplit.map(lambda x: stringToComment(x))

In [46]:
itCommentsDFStruct = sqlContext.createDataFrame(rowRDD, commentSchema)

In [47]:
itCommentsDFStruct.printSchema()

root
 |-- id_: long (nullable = false)
 |-- commentDate: timestamp (nullable = true)
 |-- comment: string (nullable = true)



In [48]:
itCommentsDFStruct.show()

+---+--------------------+--------------------+
|id_|         commentDate|             comment|
+---+--------------------+--------------------+
| 18|2013-11-05 20:39:...|It's going to be ...|
|  6|2013-11-05 20:41:...|Why not &quot;IL ...|
| 18|2013-11-05 20:43:...|    Yep, added that.|
|  6|2013-11-05 20:45:...|La squadra Milan ...|
|  6|2013-11-05 20:46:...|`ExamplesLa (squa...|
| 17|2013-11-05 20:48:...|Actually, no. As ...|
|  6|2013-11-05 20:52:...|Oh, c'mon: http:/...|
| 18|2013-11-05 20:54:...|There's no citati...|
| 12|2013-11-05 20:57:...|Se il genere dei ...|
| 18|2013-11-05 21:02:...|E' un'eccezione: ...|
| 12|2013-11-05 21:03:...|I agree with Dami...|
| 17|2013-11-05 21:14:...|Agreed, even thou...|
|  6|2013-11-05 21:15:...|@GabrielePetronel...|
| 12|2013-11-05 21:15:...|+1, but, neverthe...|
|  6|2013-11-05 21:17:...|@KyriakosKyritsis...|
| 17|2013-11-05 21:18:...|@KyriakosKyritsis...|
| 17|2013-11-05 21:21:...|*Il Cairo* Ã¨ il ...|
|  5|2013-11-05 21:42:...|Conversely, En

In [49]:
import pyspark.sql.functions as func
itCommentsDFStruct.filter(func.to_date(itCommentsDFStruct['commentDate'])=='2013-11-07').show()

+---+--------------------+--------------------+
|id_|         commentDate|             comment|
+---+--------------------+--------------------+
|  5|2013-11-07 07:15:...|Very related ques...|
|  5|2013-11-07 07:21:...|I would add that ...|
| 57|2013-11-07 09:59:...|Nice quote! :D I ...|
| 79|2013-11-07 10:34:...|Great question, I...|
| 22|2013-11-07 10:55:...|That's indeed the...|
|120|2013-11-07 12:08:...|Yes, exactly! Man...|
| 70|2013-11-07 13:23:...|I changed your â...|
| 70|2013-11-07 13:30:...|It depends on usa...|
| 77|2013-11-07 15:25:...|  right, I modify it|
| 37|2013-11-07 15:30:...|@Daniele B: I kno...|
| 12|2013-11-07 15:54:...|As per the posted...|
| 19|2013-11-07 17:11:...|&quot;Ci&quot; in...|
|  5|2013-11-07 18:43:...|@kiamlaluno: *dia...|
| 98|2013-11-07 18:50:...|@kiamlaluno I don...|
| 63|2013-11-07 20:00:...|I am sorry I didn...|
| 22|2013-11-07 20:04:...|Just for clarity,...|
| 63|2013-11-07 20:08:...|I was referring t...|
| 22|2013-11-07 20:12:...|&quot;Mi sei a

In [50]:
import pyspark.sql.functions as func
#instr() : Returns the position of the first occurrence of substr in str
itCommentsDFStruct.filter((func.instr(itCommentsDFStruct['comment'],"@Daniele") > 0)).show()

+---+--------------------+--------------------+
|id_|         commentDate|             comment|
+---+--------------------+--------------------+
| 37|2013-11-07 15:30:...|@Daniele B: I kno...|
+---+--------------------+--------------------+



In [51]:
#to_date() : Converts a string formatted like yyyy-MM-dd into a Date object.
itCommentsDFStruct.filter(func.to_date(itCommentsDFStruct['commentDate'])=='2013-11-07').filter((func.instr(itCommentsDFStruct['comment'],"@Daniele") > 0)).show()

+---+--------------------+--------------------+
|id_|         commentDate|             comment|
+---+--------------------+--------------------+
| 37|2013-11-07 15:30:...|@Daniele B: I kno...|
+---+--------------------+--------------------+

