In [1]:
from pyspark.sql import SparkSession

In [2]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer

In [3]:
# Create (or reuse) the Spark session inside this cell so it runs on a fresh
# kernel; getOrCreate() returns the already-running session when one exists.
spark = SparkSession.builder.master("local").appName("Word Count").getOrCreate()

# Labelled training documents as (id, text, label) rows.
training = spark.createDataFrame([
    (0, "a b c d e spark", 1.0),
    (1, "b d", 0.0),
    (2, "spark f g h", 1.0),
    (3, "hadoop mapreduce", 0.0)
], ["id", "text", "label"])

In [4]:
# Local-mode Spark session shared by the rest of the notebook.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Word Count")
    .getOrCreate()
)

In [5]:
# Labelled training documents; each row is (id, text, label).
training = spark.createDataFrame(
    data=[
        (0, "a b c d e spark", 1.0),
        (1, "b d", 0.0),
        (2, "spark f g h", 1.0),
        (3, "hadoop mapreduce", 0.0),
    ],
    schema=["id", "text", "label"],
)

In [6]:
# Stage 1: read the "text" column and write its tokens to "words".
tokenizer = Tokenizer(
    inputCol="text",
    outputCol="words",
)

In [7]:
# Stage 2: hash the tokenizer's output column into the "features" column.
hashingTF = HashingTF(
    inputCol=tokenizer.getOutputCol(),
    outputCol="features",
)

In [8]:
# Stage 3: logistic regression capped at 10 iterations with regParam 0.001.
lr = LogisticRegression(
    maxIter=10,
    regParam=0.001,
)

In [9]:
# Chain the three stages in the order they must run.
pipeline = Pipeline(
    stages=[
        tokenizer,
        hashingTF,
        lr,
    ]
)

In [10]:
# Fit every pipeline stage on the labelled training documents.
model = pipeline.fit(dataset=training)

In [11]:
# Unlabelled documents to score; each row is (id, text).
# NOTE(review): "i,j,k" contains no spaces and " l m n" starts with one --
# presumably the tokenizer splits on whitespace, so confirm both are intended.
test = spark.createDataFrame(
    data=[
        (4, "spark i,j,k"),
        (5, " l m n"),
        (6, "spark hadoop spark"),
        (7, "spark apache"),
    ],
    schema=["id", "text"],
)

In [12]:
# Run the fitted pipeline over the unlabelled documents.
prediction = model.transform(dataset=test)

In [13]:
# Print each test document with its class probabilities and predicted label.
selected = prediction.select("id", "text", "probability", "prediction")
for row in selected.collect():
    # Unpack the label under its own name so the `prediction` DataFrame is not
    # rebound to a float; otherwise re-running this cell fails on `.select`.
    rid, text, prob, predicted_label = row
    print("(%d, %s) --> prob=%s, prediction=%f" % (rid, text, str(prob), predicted_label))

(4, spark i,j,k) --> prob=[0.1596407738787475,0.8403592261212525], prediction=1.000000
(5,  l m n) --> prob=[0.8378325685476744,0.16216743145232562], prediction=0.000000
(6, spark hadoop spark) --> prob=[0.06926633132976037,0.9307336686702395], prediction=1.000000
(7, spark apache) --> prob=[0.1596407738787475,0.8403592261212525], prediction=1.000000


In [14]:
from pyspark.ml.feature import HashingTF,IDF,Tokenizer

In [15]:
# Three example sentences, each paired with a numeric label.
sentenceData = spark.createDataFrame(
    [
        (0, "I heard about Spark and I love Spark"),
        (0, "I wish Java could use case classes"),
        (1, "Logistic regression models are neat"),
    ]
).toDF("label", "sentence")

In [16]:
# Tokenize the "sentence" column into a new "words" column.
tokenizer = Tokenizer(
    inputCol="sentence",
    outputCol="words",
)
wordsData = tokenizer.transform(dataset=sentenceData)

In [17]:
# Hash each row's words into a 20-slot vector stored in "rawFeatures".
hashingTF = HashingTF(
    inputCol="words",
    outputCol="rawFeatures",
    numFeatures=20,
)
featurizedData = hashingTF.transform(dataset=wordsData)

In [18]:
# Fit an IDF model on the hashed vectors; it reads "rawFeatures" and will
# write its output to "features".
idf = IDF(
    inputCol="rawFeatures",
    outputCol="features",
)
idfModel = idf.fit(dataset=featurizedData)

In [19]:
# Apply the fitted IDF model, then display only the label and feature columns.
rescaledData = idfModel.transform(dataset=featurizedData)
(
    rescaledData
    .select("label", "features")
    .show()
)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|    0|(20,[0,5,9,13,17]...|
|    0|(20,[2,7,9,13,15]...|
|    1|(20,[4,6,13,15,18...|
+-----+--------------------+

