In [1]:
// Print the version reported by the notebook's ambient `spark` session.
println(s"Current spark version is ${spark.version}")

Current spark version is 2.4.5


In [2]:
import org.apache.spark.sql.types.{StructType, StructField, IntegerType, LongType, StringType}

// Column layout of the headerless CSV: six fields per row, of which only
// `target` and `tweet` are used further down.
val dataSchema = StructType(Seq(
  StructField("target", IntegerType),
  StructField("id", LongType),
  StructField("raw_timestamp", StringType),
  StructField("query_status", StringType),
  StructField("author", StringType),
  StructField("tweet", StringType)
))

// Location of the training CSV.
val dataPath = "/home/jovyan/data/training.1600000.processed.noemoticon.csv"

// Load the file with the explicit schema and reduce it to a binary label:
// target == 4 becomes 1, every other target becomes 0.
// NOTE(review): no charset option is set, so the reader's default applies —
// confirm that matches the file's actual encoding.
val raw_sentiment = spark.read
  .schema(dataSchema)
  .option("header", "false")
  .csv(dataPath)
  .selectExpr("(case when target=4 then 1 else 0 end) as label","tweet")

// Sanity check: number of rows per label.
raw_sentiment.groupBy("label").count().show()

+-----+------+
|label| count|
+-----+------+
|    1|800000|
|    0|800000|
+-----+------+



dataSchema = StructType(StructField(target,IntegerType,true), StructField(id,LongType,true), StructField(raw_timestamp,StringType,true), StructField(query_status,StringType,true), StructField(author,StringType,true), StructField(tweet,StringType,true))
dataPath = /home/jovyan/data/training.1600000.processed.noemoticon.csv
raw_sentiment = [label: int, tweet: string]


[label: int, tweet: string]

In [6]:
raw_sentiment.head(5)

Array([0,@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D], [0,is upset that he can't update his Facebook by texting it... and might cry as a result  School today also. Blah!], [0,@Kenichan I dived many times for the ball. Managed to save 50%  The rest go out of bounds], [0,my whole body feels itchy and like its on fire ], [0,@nationwideclass no, it's not behaving at all. i'm mad. why am i here? because I can't see you all over there. ])

In [7]:
import org.apache.spark.ml.{Pipeline, PipelineModel}
//import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.classification.RandomForestClassifier
import org.apache.spark.ml.feature.{HashingTF, Tokenizer}
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.sql.Row

// Stage 1: split the raw `tweet` text into a `words` array.
val tokenizer = new Tokenizer()
  .setOutputCol("words")
  .setInputCol("tweet")

// Stage 2: hash the tokens into a 1000-dimensional term-frequency vector.
val hashingTF = new HashingTF()
  .setInputCol(tokenizer.getOutputCol)
  .setOutputCol("features")
  .setNumFeatures(1000)

// Disabled alternative classifier, kept for reference (its import is also
// commented out):
//   val lr = new LogisticRegression()
//     .setMaxIter(10)
//     .setRegParam(0.001)

// Stage 3: random forest (100 trees, max depth 8) predicting `label` from `features`.
val rf = new RandomForestClassifier()
  .setNumTrees(100)
  .setMaxDepth(8)
  .setFeaturesCol("features")
  .setLabelCol("label")

// Chain the three stages so they are fitted and applied as one unit.
val pipeline = new Pipeline().setStages(Array(tokenizer, hashingTF, rf))


tokenizer = tok_cdcbc1c4c050
hashingTF = hashingTF_8415ed72c6db
rf = rfc_ab1fa668090c
pipeline = pipeline_68625e8bfb62


pipeline_68625e8bfb62

In [8]:
// Fit every pipeline stage on the full labelled DataFrame.
val model: PipelineModel = pipeline.fit(raw_sentiment)

model = pipeline_68625e8bfb62


pipeline_68625e8bfb62

In [9]:
// Persist the fitted pipeline, replacing anything already stored at this path.
model
  .write
  .overwrite()
  .save("/home/jovyan/models/spark-ml-model")

In [10]:
// Read the persisted pipeline back from disk to check the save/load round trip.
val sameModel: PipelineModel =
  PipelineModel.load("/home/jovyan/models/spark-ml-model")

sameModel = pipeline_68625e8bfb62


pipeline_68625e8bfb62

In [19]:
// Evaluate the DataFrame reference so the notebook echoes its schema summary.
raw_sentiment

[label: int, tweet: string]

In [11]:
// Score the tweets with the pipeline reloaded from disk. The result keeps the
// input columns and adds words, features, rawPrediction, probability and prediction.
// NOTE(review): raw_sentiment is also the DataFrame the pipeline was fitted on, so
// these are in-sample predictions — confirm that no held-out set is intended.
val predictionsDF = sameModel.transform(raw_sentiment)

// Print the first rows of the scored DataFrame (long cells are truncated).
predictionsDF.show()

+-----+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|label|               tweet|               words|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|    0|@switchfoot http:...|[@switchfoot, htt...|(1000,[7,14,21,54...|[43.3951549588202...|[0.43395154958820...|       1.0|
|    0|is upset that he ...|[is, upset, that,...|(1000,[170,193,22...|[51.5212016380652...|[0.51521201638065...|       0.0|
|    0|@Kenichan I dived...|[@kenichan, i, di...|(1000,[10,36,77,1...|[56.8669760193184...|[0.56866976019318...|       0.0|
|    0|my whole body fee...|[my, whole, body,...|(1000,[82,191,296...|[49.6991503847535...|[0.49699150384753...|       1.0|
|    0|@nationwideclass ...|[@nationwideclass...|(1000,[18,96,130,...|[53.7419958713383...|[0.53741995871338...|       0.0|
|    0|@

predictionsDF = [label: int, tweet: string ... 5 more fields]


[label: int, tweet: string ... 5 more fields]

In [12]:
// Print the column names and types of the scored DataFrame as a tree.
// Other ways to inspect it, left disabled:
// predictionsDF.schema
// predictionsDF.describe()
// predictionsDF.stat
predictionsDF.printSchema()

root
 |-- label: integer (nullable = false)
 |-- tweet: string (nullable = true)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [13]:
import org.apache.spark.sql.functions._

// UDF returning element 1 of an ML vector; applied to the classifier's
// `probability` column this is presumably the score for label 1.
val getProbability = udf { probabilities: org.apache.spark.ml.linalg.Vector =>
  probabilities(1)
}

getProbability = UserDefinedFunction(<function1>,DoubleType,Some(List(org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7)))


UserDefinedFunction(<function1>,DoubleType,Some(List(org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7)))

In [17]:
// UDF returning element 0 of an ML vector; applied to the classifier's
// `probability` column this is presumably the score for label 0.
val getProbability0 = udf { probabilities: org.apache.spark.ml.linalg.Vector =>
  probabilities(0)
}

getProbability0 = UserDefinedFunction(<function1>,DoubleType,Some(List(org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7)))


UserDefinedFunction(<function1>,DoubleType,Some(List(org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7)))

In [18]:
// Show, without truncation, each tweet next to both entries of its probability
// vector: clean_probability is element 1 and clean_probability0 is element 0.
predictionsDF
  .select(
    getProbability($"probability").as("clean_probability"),
    getProbability0($"probability").as("clean_probability0"),
    $"tweet"
  )
  .show(truncate = false)

+-------------------+-------------------+---------------------------------------------------------------------------------------------------------------------+
|clean_probability  |clean_probability0 |tweet                                                                                                                |
+-------------------+-------------------+---------------------------------------------------------------------------------------------------------------------+
|0.5660484504117975 |0.4339515495882024 |@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D  |
|0.48478798361934716|0.5152120163806528 |is upset that he can't update his Facebook by texting it... and might cry as a result  School today also. Blah!      |
|0.43133023980681595|0.568669760193184  |@Kenichan I dived many times for the ball. Managed to save 50%  The rest go out of bounds                            |
|0.5030084961524645 |0.4969915038475355 