In [1]:
import os

# Azure Blob Storage connection settings used by the cells below.
# Each value is taken from an environment variable when one is set, so the
# account key does not have to be pasted into (and saved with) the notebook.
# The original placeholder strings remain as the fallback for manual editing.
storage_account_name = os.environ.get("AZURE_STORAGE_ACCOUNT", "<Storage account name>")
storage_account_key = os.environ.get("AZURE_STORAGE_KEY", "<Storage account key>")
container = os.environ.get("AZURE_STORAGE_CONTAINER", "<Container name>")

In [2]:
# Register the storage account key with the Spark session so that the
# wasbs:// path read in the next cell can authenticate against the account.
account_key_conf = "fs.azure.account.key.{0}.blob.core.windows.net".format(storage_account_name)
spark.conf.set(account_key_conf, storage_account_key)

In [3]:
# Location of sentiment.csv inside the configured container / storage account.
sentiment_csv_url = "wasbs://{0}@{1}.blob.core.windows.net/sentiment.csv".format(container, storage_account_name)

# Load the CSV with a header row and inferred column types. The quote and
# escape characters are both a double quote.
data = (
    spark.read
    .options(
        header="true",
        inferSchema="true",
        delimiter=",",
        quote='"',
        escape='"',
    )
    .csv(sentiment_csv_url)
)

# Preview the first five rows.
data.show(n=5)

In [4]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression

# Feature-engineering stages; they are chained into a Pipeline in a later cell.

# text -> words
tokenizer = Tokenizer(outputCol="words", inputCol="text")

# words -> tf, hashed into 2**16 feature slots
hashtf = HashingTF(inputCol="words", outputCol="tf", numFeatures=2 ** 16)

# tf -> features, with the minimum document frequency set to 5
idf = IDF(minDocFreq=5, inputCol="tf", outputCol="features")

# sentiment -> label
label_stringIdx = StringIndexer(outputCol="label", inputCol="sentiment")

In [5]:
# Stages run in list order: tokenize, hash to term frequencies, apply IDF
# weighting, then index the sentiment column into a label.
pipeline_stages = [tokenizer, hashtf, idf, label_stringIdx]
pipeline = Pipeline(stages=pipeline_stages)

In [6]:
# Fit every pipeline stage on the loaded data.
model = pipeline.fit(dataset=data)

# Apply the fitted stages to the same data to build the training frame.
train = model.transform(dataset=data)

# Preview the first five transformed rows.
train.show(n=5)

In [7]:
# Logistic regression limited to 100 iterations. featuresCol / labelCol are
# written out explicitly; they are the estimator's defaults and match the
# outputCol names of the `idf` and `label_stringIdx` stages.
lr = LogisticRegression(featuresCol="features", labelCol="label", maxIter=100)

# Train the classifier on the transformed training frame.
lr_model = lr.fit(dataset=train)

In [8]:
# One ad-hoc sentence to score, wrapped in a single-column DataFrame whose
# column is renamed to "text" so it matches the tokenizer's inputCol.
sample_texts = ["This movie is bad"]
test = (
    spark.createDataFrame(sample_texts, "string")
    .toDF("text")
)

# Run the fitted feature pipeline on the sample.
# NOTE(review): `test` has no "sentiment" column; this presumably relies on the
# fitted StringIndexer stage skipping a missing input column. Confirm on the
# Spark version in use.
test_transformed = model.transform(dataset=test)

# Score the sample with the trained logistic-regression model.
prediction = lr_model.transform(dataset=test_transformed)

In [9]:
# Display the scored sample. The arguments are show()'s defaults, written out
# explicitly.
prediction.show(n=20, truncate=True)