In [1]:
# Initialise findspark ahead of the pyspark imports in the following cell.
# NOTE(review): presumably this makes the local Spark installation importable
# as `pyspark` — confirm against the findspark documentation.
import findspark
findspark.init()

In [2]:
from pyspark import SparkContext
from pyspark.sql import SparkSession

# Build (or reuse) one local session and take the context from it.
# Constructing SparkContext("local", ...) directly fails when this cell is
# executed a second time, because only one SparkContext may be active at once;
# getOrCreate() returns the existing session instead, so the cell is safe to
# re-run. Master and application name are the same values as before.
spark = (
    SparkSession.builder
    .master("local")
    .appName("pyspark-shell")
    .getOrCreate()
)

# `sc` stays available for any cell that wants the low-level context.
sc = spark.sparkContext

# Text classification

## Extract Transform Select

ETS stands for Extract, Transform and Select. Extraction derives features from raw data. Transformation scales, converts or otherwise modifies features. Selection picks a subset of the features.

You can define your own functions to use with Spark as user-defined functions (UDFs). To do so, first import `udf` from pyspark.sql.functions. To declare the type a UDF returns, import the matching type (StringType, IntegerType, FloatType, ArrayType or BooleanType) from pyspark.sql.types.

### Practicing creating a UDF

In [118]:
from pyspark.sql.functions import regexp_replace, split, udf
from pyspark.sql.types import ArrayType, BooleanType, StringType

# Read the text one line per row (column `value`) and discard empty lines.
df = spark.read.text("sherlock.txt")
df = df.filter(df.value != "")

# Strip commas, then replace each run of two spaces with one; the cleaned
# text is carried in a column named `words`.
without_commas = regexp_replace("value", ",", "")
df = df.select(without_commas.alias("words"))
single_spaced = regexp_replace("words", "  ", " ")
df = df.select(single_spaced.alias("words"))

# Tokenise on a single space: `words` becomes an array of strings.
split_df = df.select(split("words", " ").alias("words"))

split_df.show(10, truncate=False)


def _all_but_last(tokens):
    """Return every token except the final one; [] for null, empty or one-token input."""
    if not tokens or len(tokens) <= 1:
        return []
    return tokens[:-1]


def _last_only(tokens):
    """Return a one-element list holding the final token; [] for null, empty or one-token input."""
    if not tokens or len(tokens) <= 1:
        return []
    return tokens[-1:]


string_array = ArrayType(StringType())
in_udf = udf(_all_but_last, string_array)
out_udf = udf(_last_only, string_array)

# `in` = the line without its last word, `out` = the last word on its own.
df2 = split_df.select(
    "words",
    in_udf("words").alias("in"),
    out_udf("words").alias("out"),
)
df2.show(10, truncate=False)

+----------------------------------------------------------------------------------+
|words                                                                             |
+----------------------------------------------------------------------------------+
|[The, Project, Gutenberg, EBook, of, The, Adventures, of, Sherlock, Holmes]       |
|[by, Sir, Arthur, Conan, Doyle]                                                   |
|[(#15, in, our, series, by, Sir, Arthur, Conan, Doyle)]                           |
|[Copyright, laws, are, changing, all, over, the, world., Be, sure, to, check, the]|
|[copyright, laws, for, your, country, before, downloading, or, redistributing]    |
|[this, or, any, other, Project, Gutenberg, eBook.]                                |
|[This, header, should, be, the, first, thing, seen, when, viewing, this, Project] |
|[Gutenberg, file., Please, do, not, remove, it., Do, not, change, or, edit, the]  |
|[header, without, written, permission.]                         

In [119]:
def _has_nonzero(vec):
    """True when `vec` is truthy and numNonzeros() reports a nonzero count."""
    if not vec:
        return False
    return bool(vec.numNonzeros())


def _first_as_str(items):
    """Return str() of the first element of a non-empty Python list, else ""."""
    if not items or type(items) is not list:
        return ""
    return str(items[0])


nonempty_udf = udf(_has_nonzero, BooleanType())

s_udf = udf(_first_as_str, StringType())

### Practicing array column

In [127]:
# Inspect the (name, type) pairs of df2's columns; the value is the cell output.
df2.dtypes

[('words', 'array<string>'), ('in', 'array<string>'), ('out', 'array<string>')]

In [132]:
from pyspark.sql.functions import array_contains

# Rows whose token array contains the literal token "5".
df2.where(array_contains("words", "5")).show()

# Tokens treated as noise: the empty string, single digits, most single
# letters and "pp".
TRIVIAL_TOKENS = {'',  '0',  '1',  '2',  '3',  '4',  '5',  '6',  '7',  '8',  '9',  'b',  'c',  'e',  'f',  'g',  'h',  'j',  'k',  'l',  'm',  'n',  'o',  'p',  'pp',  'q',  'r',  's',  't',  'u',  'v',  'w',  'x',  'y',  'z'}


def _drop_trivial(tokens):
    """Remove TRIVIAL_TOKENS from a token list, keeping order and repeats.

    The earlier `list(set(x) - TRIVIAL_TOKENS)` form also collapsed repeated
    words to a single occurrence and returned the survivors in arbitrary set
    order. That changes word counts and makes the output order unstable.
    Filtering in place removes only the trivial tokens.

    Null or empty input is returned unchanged.
    """
    if not tokens:
        return tokens
    return [token for token in tokens if token not in TRIVIAL_TOKENS]


rm_trivial_udf = udf(_drop_trivial, ArrayType(StringType()))

# Clean both halves of each line; `words` itself is left untouched.
df_after = df2.withColumn('in', rm_trivial_udf('in')).withColumn('out', rm_trivial_udf('out'))

df_after.where(array_contains('words','5')).show(truncate=False)

+--------------------+--------------------+------------+
|               words|                  in|         out|
+--------------------+--------------------+------------+
|[On, the, night, ...|[On, the, night, ...|        [to]|
|[during, the, sum...|[during, the, sum...|        [in]|
|[August, 5, the, ...|[August, 5, the, ...|        [in]|
|[[11], Proposed, ...|[[11], Proposed, ...|     [1798.]|
|[5, James, Monroe...|[5, James, Monroe...|  [Tompkins]|
|   [Friends, the, 5]|      [Friends, the]|         [5]|
|[Fundamental, art...|[Fundamental, art...|         [5]|
|[Fundamental, ord...|[Fundamental, ord...|         [5]|
| [Hooker, Thomas, 5]|    [Hooker, Thomas]|         [5]|
|[Hutchinson, Anne...|  [Hutchinson, Anne]|         [5]|
|[Williams, Roger,...|[Williams, Roger, 5]|        [42]|
|[nucleinate, of, ...|[nucleinate, of, ...|[solution).]|
|[freely, as, poss...|[freely, as, poss...|     [hours]|
|[out, by, an, exp...|[out, by, an, exp...|         [5]|
|[needle, of, plat...|[needle, 

### Creating feature data for classification

With numNonzeros() you can check whether a vector contains at least one nonzero entry.

CountVectorizer is a feature extractor. It takes an array of strings as input and converts it into a sparse vector.

### Creating a UDF for vector data

In [201]:
from pyspark.sql import Row
from pyspark.ml.linalg import Vectors

# Five sparse vectors of size 12847, each holding a single 1.0 at the listed
# position, wrapped in one-column rows named `output`.
hot_positions = [65, 8, 47, 89, 94]
source_data = [
    Row(output=Vectors.sparse(12847, [position], [1.0]))
    for position in hot_positions
]

df = spark.createDataFrame(source_data)
df.show()
df.dtypes

+------------------+
|            output|
+------------------+
|(12847,[65],[1.0])|
| (12847,[8],[1.0])|
|(12847,[47],[1.0])|
|(12847,[89],[1.0])|
|(12847,[94],[1.0])|
+------------------+



[('output', 'vector')]

In [211]:
from pyspark.sql.types import FloatType


def _first_index(vec):
    """Return the first entry of `vec.indices`, converted to float."""
    return float(vec.indices[0])


first_udf = udf(_first_index, FloatType())

# Apply the UDF to the vector column and show the values it extracts.
extracted = df.select(first_udf("output").alias("result"))
extracted.show(5)

+------+
|result|
+------+
|  65.0|
|   8.0|
|  47.0|
|  89.0|
|  94.0|
+------+



### Applying a UDF to vector data

In [214]:
# Attach the index extracted by first_udf as a `label` column next to the vector.
label_column = first_udf("output")
df_new = df.withColumn("label", label_column)
df_new.show()

+------------------+-----+
|            output|label|
+------------------+-----+
|(12847,[65],[1.0])| 65.0|
| (12847,[8],[1.0])|  8.0|
|(12847,[47],[1.0])| 47.0|
|(12847,[89],[1.0])| 89.0|
|(12847,[94],[1.0])| 94.0|
+------------------+-----+



### Transforming text to vector format


In [286]:
# Put the `words` column name back before fitting: a later cell renames
# `words` to `sentence`, so on a re-run df2 arrives here with `sentence`.
# NOTE(review): on a first pass df2 has no `sentence` column (it was built with
# `words`, `in`, `out`); withColumnRenamed is documented as a no-op in that
# case — confirm.
df2 = df2.withColumnRenamed("sentence", "words")

In [287]:
# CountVectorizer is not imported by any earlier cell, so without this import
# the cell raises NameError on a fresh kernel.
from pyspark.ml.feature import CountVectorizer

# Fit the vocabulary on the full token arrays in `words`; the fitted model
# reads a column named `words` and writes its vectors to `vec`.
cv = CountVectorizer(inputCol="words", outputCol="vec")
model = cv.fit(df2)

In [288]:
# Rename the full token array to `sentence`. This frees the name `words`, which
# the fitted model reads, so that `in` and `out` can each be renamed to `words`
# in turn when they are vectorised.
df2 = df2.withColumnRenamed("words", "sentence")

In [290]:
def _vectorize_column(frame, column, vec_name):
    """Run `model` over `column` of `frame` and store the vectors as `vec_name`.

    The fitted model reads `words` and writes `vec`, so `column` is renamed to
    `words` for the transform and both names are restored/renamed afterwards.
    """
    as_words = frame.withColumnRenamed(column, 'words')
    transformed = model.transform(as_words)
    return (
        transformed
        .withColumnRenamed('words', column)
        .withColumnRenamed('vec', vec_name)
    )


# Vectorise the sentence-without-last-word column.
result = _vectorize_column(df2, 'in', 'invec')
result.drop('sentence').show(3, False)

# Vectorise the last-word column on top of the previous result.
result = _vectorize_column(result, 'out', 'outvec')
result.select('invec', 'outvec').show(3, False)

+-------------------------------------------------------------------+--------+------------------------------------------------------------------------------+
|in                                                                 |out     |invec                                                                         |
+-------------------------------------------------------------------+--------+------------------------------------------------------------------------------+
|[The, Project, Gutenberg, EBook, of, The, Adventures, of, Sherlock]|[Holmes]|(69609,[1,20,516,1139,1275,13924,30142],[2.0,2.0,1.0,1.0,1.0,1.0,1.0])        |
|[by, Sir, Arthur, Conan]                                           |[Doyle] |(69609,[16,3434,4703,21302],[1.0,1.0,1.0,1.0])                                |
|[(#15, in, our, series, by, Sir, Arthur, Conan]                    |[Doyle)]|(69609,[6,16,98,1226,3434,4703,21302,61595],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0])|
+---------------------------------------------------

## Text Classification

To guess the last word of a sentence you can use machine learning algorithms. The technique used here is not sensitive to the order of the words. The same approach can also be used for recommendations based on a person's viewing history (such as video or music). Logistic regression is a suitable approach for this kind of problem when each example carries one of two labels (true or false).

### Label the data

In [298]:
# Copy the first element of the `out` array into a plain column `endword`.
last_token = result.out.getItem(0)
result = result.withColumn("endword", last_token)
result.show()

+--------------------+--------------------+------------------+--------------------+--------------------+----------------+
|            sentence|                  in|               out|               invec|              outvec|         endword|
+--------------------+--------------------+------------------+--------------------+--------------------+----------------+
|[The, Project, Gu...|[The, Project, Gu...|          [Holmes]|(69609,[1,20,516,...| (69609,[323],[1.0])|          Holmes|
|[by, Sir, Arthur,...|[by, Sir, Arthur,...|           [Doyle]|(69609,[16,3434,4...|(69609,[20000],[1...|           Doyle|
|[(#15, in, our, s...|[(#15, in, our, s...|          [Doyle)]|(69609,[6,16,98,1...|(69609,[49957],[1...|          Doyle)|
|[Copyright, laws,...|[Copyright, laws,...|             [the]|(69609,[0,3,32,33...|   (69609,[0],[1.0])|             the|
|[copyright, laws,...|[copyright, laws,...|  [redistributing]|(69609,[17,25,84,...|(69609,[15592],[1...|  redistributing|
|[this, or, any, o...|[t

In [307]:
from pyspark.sql.functions import lit

# Split the rows by whether the sentence ends in 'him'.
ends_with_him = result.where("endword= 'him'")
ends_otherwise = result.where("endword <> 'him'")

# Positive examples get label 1, all others label 0.
df_pos = ends_with_him.withColumn("label", lit(1))
df_neg = ends_otherwise.withColumn("label", lit(0))

for labelled in (df_pos, df_neg):
    labelled.show(5)

+--------------------+--------------------+-----+--------------------+------------------+-------+-----+
|            sentence|                  in|  out|               invec|            outvec|endword|label|
+--------------------+--------------------+-----+--------------------+------------------+-------+-----+
|[weavers, wagon-m...|[weavers, wagon-m...|[him]|(69609,[2,19,1811...|(69609,[30],[1.0])|    him|    1|
|[Bute, his, teach...|[Bute, his, teach...|[him]|(69609,[2,7,10,14...|(69609,[30],[1.0])|    him|    1|
|[George, III, Wil...|[George, III, Wil...|[him]|(69609,[0,1,8,58,...|(69609,[30],[1.0])|    him|    1|
|[addressed, Georg...|[addressed, Georg...|[him]|(69609,[0,2,3,71,...|(69609,[30],[1.0])|    him|    1|
|[ease, while, the...|[ease, while, the...|[him]|(69609,[0,1,154,1...|(69609,[30],[1.0])|    him|    1|
+--------------------+--------------------+-----+--------------------+------------------+-------+-----+
only showing top 5 rows

+--------------------+-----------------

In [339]:
# Take at most as many negative rows as there are positive rows, then stack
# the two sets into one example frame.
n_positive = df_pos.count()
negatives_capped = df_neg.limit(n_positive)
df_examples = df_pos.union(negatives_capped)
print("Number of examples: ", df_examples.count())

# Peek at a seeded 10% sample (without replacement) of the negative examples.
negative_examples = df_examples.where("endword <> 'him'")
negative_examples.sample(withReplacement=False, fraction=.1, seed=42).show(5)

Number of examples:  484
+--------------------+--------------------+------------------+--------------------+--------------------+----------------+-----+
|            sentence|                  in|               out|               invec|              outvec|         endword|label|
+--------------------+--------------------+------------------+--------------------+--------------------+----------------+-----+
|[Gutenberg, file....|[Gutenberg, file....|             [the]|(69609,[18,25,83,...|   (69609,[0],[1.0])|             the|    0|
|[*****These, eBoo...|[*****These, eBoo...|[Volunteers!*****]|(69609,[1,574,269...|(69609,[24127],[1...|Volunteers!*****|    0|
|[Author:, Sir, Ar...|[Author:, Sir, Ar...|           [Doyle]|(69609,[3434,4703...|(69609,[20000],[1...|           Doyle|    0|
|["I, see, it, I, ...|["I, see, it, I, ...|          [girl?"]|(69609,[2,5,7,15,...|(69609,[53992],[1...|          girl?"|    0|
|["Then, how, many...|["Then, how, many...|         [there?"]|(69609,[33,113,18

### Split the data

In [340]:
# Keep only these columns, in this order.
kept_columns = ["endword", "sentence", "invec", "outvec", "label"]
df_examples = df_examples.select(*kept_columns)

In [341]:
# Expose the input vectors under the name `features`.
# NOTE(review): presumably because the classifier trained below is created
# without an explicit features column and defaults to `features` — confirm.
df_examples = df_examples.withColumnRenamed("invec", "features")

# Seeded 80/20 split into training and test sets.
split_weights = [0.8, 0.2]
df_trainset, df_testset = df_examples.randomSplit(split_weights, seed=42)

print("Number of training: ", df_trainset.count())
print("Number of test: ", df_testset.count())

Number of training:  404
Number of test:  80


### Train the classifier


In [342]:
from pyspark.ml.classification import LogisticRegression

# Hyper-parameters gathered in one place so they are easy to spot and tune.
lr_settings = dict(maxIter=100, regParam=0.4, elasticNetParam=0.0)
logistic = LogisticRegression(**lr_settings)

# Despite the `df_` prefix, df_fitted is the object returned by fit() (the
# trained model), not a DataFrame.
df_fitted = logistic.fit(df_trainset)
print("Training iterations: ", df_fitted.summary.totalIterations)

Training iterations:  27


## Predicting and evaluating

This section predicts on the test data and evaluates the model's accuracy. To apply a trained model to the test data, use the transform() operation. It returns a DataFrame with prediction and probability columns added to the data. Prediction is a double, but here it only takes the values 0 and 1. The probability column is a vector containing two numbers: the first is the estimated probability that the label is false (0), the second is the estimated probability that it is true (1). To measure the performance of this classification, the Area Under the Curve (AUC) can be used.

To evaluate the results, use model.evaluate(df_test) and read model_stats.areaUnderROC, where model_stats is the summary returned by evaluate().

In [354]:
# Evaluate the fitted model on the held-out rows and report the area under
# the ROC curve.
testSummary = df_fitted.evaluate(df_testset)
test_auc = testSummary.areaUnderROC
print("test AUC: %.3f" % test_auc)  # testSummary.accuracy is the alternative metric

test AUC: 0.929


### Predict test data


In [355]:
# Score the test rows and show the true label next to the predicted one.
shown_columns = ["endword", "label", "prediction"]
df_fitted.transform(df_testset).select(*shown_columns).show(10)

+-------+-----+----------+
|endword|label|prediction|
+-------+-----+----------+
|    him|    1|       1.0|
|    him|    1|       0.0|
|    him|    1|       1.0|
|    him|    1|       0.0|
|    him|    1|       1.0|
|    him|    1|       1.0|
|    him|    1|       1.0|
|    him|    1|       1.0|
|    him|    1|       1.0|
|    him|    1|       1.0|
+-------+-----+----------+
only showing top 10 rows



In [373]:
# Score the held-out rows. This assignment used to be commented out, so the
# loop below only worked when `predictions` was left over from earlier kernel
# state and raised NameError on a fresh Restart & Run All.
predictions = df_fitted.transform(df_testset)

# Columns printed for each inspected row, in this order.
fields = ['prediction', 'label', 'endword', 'sentence', 'probability']
for row in predictions.take(8):
    print()
    # `prediction` is a double (0.0 / 1.0) while `label` is an int, so cast
    # before comparing and flag the rows the model got wrong.
    if row.label != int(row.prediction):
        print("INCORRECT ==> ")
    for field in fields:
        print(field, ":", row[field])



prediction : 1.0
label : 1
endword : him
sentence : ['"But', 'between', 'not', 'recognizing', 'him', 'as', 'Emperor', 'and', 'calling', 'him']
probability : [0.30829130715909214,0.6917086928409079]

INCORRECT ==> 
prediction : 0.0
label : 1
endword : him
sentence : ['"How', 'about', 'my', 'son', 'Boris', 'Prince?"', 'said', 'she', 'hurrying', 'after', 'him']
probability : [0.5026205303581466,0.4973794696418534]

prediction : 1.0
label : 1
endword : him
sentence : ['"May', 'I', 'call', 'in', 'that', 'boy', 'who', 'was', 'taken', 'prisoner', 'and', 'give', 'him']
probability : [0.3678261091637896,0.6321738908362105]

INCORRECT ==> 
prediction : 0.0
label : 1
endword : him
sentence : ['"Well', 'perhaps"', 'said', 'he', 'with', 'a', 'sigh.', '"We', "don't", 'expect', 'to', 'get', 'him']
probability : [0.6642412270035128,0.33575877299648715]

prediction : 1.0
label : 1
endword : him
sentence : ['Alpatych', 'went', 'back', 'to', 'the', 'house', 'called', 'the', 'coachman', 'and', 'told', 'h

In [374]:
# TRAIN - PREDICT - EVALUATE