In [1]:
import findspark
findspark.init()

from pyspark import SparkContext
sc = SparkContext("local", "pyspark-shell")

from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()

# Classification

If the missing values are not that high in amount, you can simply remove the record with the missing value. You can use the filter method or more aggresively you can drop all missing values in any column. This should be donw with care because it could result in the loss of a lot of otherwise useful data. 

String columns that represent categories should be converted to integer by using StringIndexer. Fit and transform the data using the indexer. Index values are assigned according to the descending relative frequency of each of the string values. Rather than using frequency of occurence, string can be ordered alpabetically (stringOrderType).

The final step in preparing the data is to consolidate the varius input columns into a single column. Machine learning algorithms in Spark operate on a single vector of predictors (VectorAssembler).

### Removing columns and rows

In [4]:
flights = spark.read.csv("flights.csv", sep=",", header=True, inferSchema=True, nullValue="NA")

flights_drop_column = flights.drop("flight")
print(flights_drop_column.filter("delay IS NULL").count())
flights_valid_delay = flights_drop_column.filter("delay IS NOT NULL")
flights_none_missing = flights_valid_delay.dropna()
print(flights_none_missing.count())

2978
47022


### Column manipulation

In [14]:
from pyspark.sql.functions import round

flights_km = flights_none_missing.withColumn("km", round(flights_none_missing.mile * 1.60934, 0)).drop("mile")
flights_km = flights_km.withColumn("label", (flights_none_missing.delay >=15).cast("integer"))
flights_km.show(5)

+---+---+---+-------+---+------+--------+-----+------+-----+
|mon|dom|dow|carrier|org|depart|duration|delay|    km|label|
+---+---+---+-------+---+------+--------+-----+------+-----+
|  0| 22|  2|     UA|ORD| 16.33|      82|   30| 509.0|    1|
|  2| 20|  4|     UA|SFO|  6.17|      82|   -8| 542.0|    0|
|  9| 13|  1|     AA|ORD| 10.33|     195|   -5|1989.0|    0|
|  5|  2|  1|     UA|SFO|  7.98|     102|    2| 885.0|    0|
|  7|  2|  6|     AA|ORD| 10.83|     135|   54|1180.0|    1|
+---+---+---+-------+---+------+--------+-----+------+-----+
only showing top 5 rows



### Categorical columns


Categorical columns (org and carrier) will be transformed into indexed numerical columns.

In [17]:
from pyspark.ml.feature import StringIndexer

indexer = StringIndexer(inputCol="carrier", outputCol="carrier_idx")
indexer_model = indexer.fit(flights_km)
flights_indexed = indexer_model.transform(flights_km)
flights_indexed = StringIndexer(inputCol="org", outputCol="org_idx").fit(flights_indexed).transform(flights_indexed)
flights_indexed.show()

+---+---+---+-------+---+------+--------+-----+------+-----+-----------+-------+
|mon|dom|dow|carrier|org|depart|duration|delay|    km|label|carrier_idx|org_idx|
+---+---+---+-------+---+------+--------+-----+------+-----+-----------+-------+
|  0| 22|  2|     UA|ORD| 16.33|      82|   30| 509.0|    1|        0.0|    0.0|
|  2| 20|  4|     UA|SFO|  6.17|      82|   -8| 542.0|    0|        0.0|    1.0|
|  9| 13|  1|     AA|ORD| 10.33|     195|   -5|1989.0|    0|        1.0|    0.0|
|  5|  2|  1|     UA|SFO|  7.98|     102|    2| 885.0|    0|        0.0|    1.0|
|  7|  2|  6|     AA|ORD| 10.83|     135|   54|1180.0|    1|        1.0|    0.0|
|  1| 16|  6|     UA|ORD|   8.0|     232|   -7|2317.0|    0|        0.0|    0.0|
|  1| 22|  5|     UA|SJC|  7.98|     250|  -13|2943.0|    0|        0.0|    5.0|
| 11|  8|  1|     OO|SFO|  7.77|      60|   88| 254.0|    1|        2.0|    1.0|
|  4| 26|  1|     AA|SFO| 13.25|     210|  -10|2356.0|    0|        1.0|    1.0|
|  4| 25|  0|     AA|ORD| 13

### Assembling columns

In [29]:
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(inputCols= ["mon", "dom", "dow", "carrier_idx", "org_idx", "km", "depart", "duration"],
                            outputCol="features")
fligt_assembled = assembler.transform(flights_indexed)
fligt_assembled.select("features", "delay").show(5, truncate=False)

+-----------------------------------------+-----+
|features                                 |delay|
+-----------------------------------------+-----+
|[0.0,22.0,2.0,0.0,0.0,509.0,16.33,82.0]  |30   |
|[2.0,20.0,4.0,0.0,1.0,542.0,6.17,82.0]   |-8   |
|[9.0,13.0,1.0,1.0,0.0,1989.0,10.33,195.0]|-5   |
|[5.0,2.0,1.0,0.0,1.0,885.0,7.98,102.0]   |2    |
|[7.0,2.0,6.0,1.0,0.0,1180.0,10.83,135.0] |54   |
+-----------------------------------------+-----+
only showing top 5 rows



## Decision Tree

A decision Tree is constructed using an algorithm called "Recursive  Partitionong". 

Random split method to randomly split data into two sets, a training set and a testing set.

Accuracy = (TN + TP) / (TN + TP + FN + FP) proportion of correct predictions.

### Train/test split

In [42]:
flights_train, flights_test = fligt_assembled.randomSplit([0.8, 0.2], 17)
training_ratio = flights_train.count() / flights_test.count()
print(training_ratio)

3.9253168534618204


### Build a Decision Tree

In [44]:
from pyspark.ml.classification import DecisionTreeClassifier

tree = DecisionTreeClassifier()
tree_model = tree.fit(flights_train)
prediction = tree_model.transform(flights_test)
prediction.select("label", "prediction", "probability").show(5, False)

+-----+----------+----------------------------------------+
|label|prediction|probability                             |
+-----+----------+----------------------------------------+
|1    |0.0       |[0.5386092969650403,0.46139070303495966]|
|0    |1.0       |[0.3535976324064369,0.6464023675935631] |
|0    |0.0       |[0.5386092969650403,0.46139070303495966]|
|1    |1.0       |[0.3535976324064369,0.6464023675935631] |
|1    |1.0       |[0.3535976324064369,0.6464023675935631] |
+-----+----------+----------------------------------------+
only showing top 5 rows



### Evaluate the Decision Tree

In [50]:
prediction.groupBy("label", 'prediction').count().show()

TN = prediction.filter("label = 0 AND prediction = label").count()
TP = prediction.filter("label = 1 AND prediction = label").count()
FP = prediction.filter("label = 1 AND prediction != label").count()
FN = prediction.filter("label = 0 AND prediction != label").count()

accuracy = (TN + TP) / (TN + TP + FP + FN)
print(accuracy)

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|    1|       0.0| 1725|
|    0|       0.0| 2828|
|    1|       1.0| 3185|
|    0|       1.0| 1809|
+-----+----------+-----+

0.6298313606368493


## Logistic Regression

It's another commonly used classification model. It uses a logistic function to model a binary target (1,0 or TRUE, FALSE). 

Coefficients can shift the curve to the right or to the left. They might make the transistion between states more gradual or more rapid. These characteristics are all extracted from the training data and will vary from one set of data to another. 

Precision is the proportion of positive predictions which are correct. TP / (TP + FP)

Recall is the proportion of positive targets which are correctly predicted. TP / (TP + FN)

Choosing a larger or smaller value for the threshold will affect the performance of the model. The ROC curve plots the true positive rate versus the false positive rate. AUC is the are under the ROC curve. AUC indicates how well a model performs across all values of the threshold.

### Build a Logistic Regression model

In [65]:
from pyspark.ml.classification import LogisticRegression

flights_train_lr = flights_train.select("mon", "depart", "duration", "features", "label")
flights_test_lr = flights_test.select("mon", "depart", "duration", "features", "label")

logistic = LogisticRegression().fit(flights_train_lr)
prediction = logistic.transform(flights_train_lr)

prediction.groupBy("label", "prediction").count().show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|    1|       0.0| 6707|
|    0|       0.0|10445|
|    1|       1.0|12426|
|    0|       1.0| 7897|
+-----+----------+-----+



### Evaluate the Logistic Regression model

In [69]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator

TN = prediction.filter("label = 0 AND prediction = label").count()
TP = prediction.filter("label = 1 AND prediction = label").count()
FP = prediction.filter("label = 1 AND prediction != label").count()
FN = prediction.filter("label = 0 AND prediction != label").count()

precision = TP / (TP + FP)
recall = TP / (TP + FN)
print('precision = {:.2f}\nrecall    = {:.2f}'.format(precision, recall))

multi_evaluator = MulticlassClassificationEvaluator()
weighted_precision = multi_evaluator.evaluate(prediction, {multi_evaluator.metricName: "weightedPrecision"})
print(weighted_precision)

binary_evaluator = BinaryClassificationEvaluator()
auc = binary_evaluator.evaluate(prediction, {binary_evaluator.metricName:"areaUnderROC"})
print(auc)

precision = 0.65
recall    = 0.61
0.6102221286967238
0.6480160400989486


The weighted precision indicates what proportion of predictions (positive and negative) are correct.

## Turning Text into Tables

Tokenizing. You split the text into words or tokens. Each document will be transformed into a list of words. Stop words will be removed using StopWordsRemover. It would also be handy to convert the words into numbers. Hashing trick is converting words into numbers. The output in the hash column is presented in sparse format. The first list contains the hashed values and the second list indicates how many time each of those values occurs.

If a word appears in many documents the it's probably going to be less useful for building a classifier. Weight the number of counts for a word in a particular document against how frequently that word occurs across all documents. The effective count are reduced for more common words, giving what is known as the "inverse document frequency"

* remove punctuation and numbers
* tokenize (split into individual words)
* remove stop words
* apply the hashing trick
* convert to TF-IDF representation.

### Punctuation, numbers and tokens

In [122]:
sms = spark.read.csv("sms.csv", sep=";",header=True, inferSchema=True)

from pyspark.sql.functions import regexp_replace
from pyspark.ml.feature import Tokenizer

wrangled = sms.withColumn("text", regexp_replace(sms.text, '[_():;,.!?\\-]', " "))
wrangled = wrangled.withColumn("text", regexp_replace(wrangled.text, "[0-9]", " "))
wrangled = wrangled.withColumn("text", regexp_replace(wrangled.text, " +", " "))
wrangled = wrangled.withColumn("text", regexp_replace(wrangled.text, "I", "i"))
wrangled = Tokenizer(inputCol="text", outputCol="words").transform(wrangled)
wrangled.show(4, truncate=False)

+---+----------------------------------+-----+------------------------------------------+
|id |text                              |label|words                                     |
+---+----------------------------------+-----+------------------------------------------+
|1  |Sorry i'll call later in meeting  |0    |[sorry, i'll, call, later, in, meeting]   |
|2  |Dont worry i guess he's busy      |0    |[dont, worry, i, guess, he's, busy]       |
|3  |Call FREEPHONE now                |1    |[call, freephone, now]                    |
|4  |Win a cash prize or a prize worth |1    |[win, a, cash, prize, or, a, prize, worth]|
+---+----------------------------------+-----+------------------------------------------+
only showing top 4 rows



### Stop words and hashing

IDF calculation weight of the words.

In [123]:
from pyspark.ml.feature import StopWordsRemover, HashingTF, IDF

sw_removed = StopWordsRemover(inputCol="words", outputCol="terms").transform(wrangled)
hashed = HashingTF(inputCol="terms", outputCol="hash", numFeatures=1024).transform(sw_removed)
tf_idf = IDF(inputCol="hash", outputCol="features").fit(hashed).transform(hashed)

tf_idf.select('terms', 'features').show(4, truncate=False)

+--------------------------------+----------------------------------------------------------------------------------------------------+
|terms                           |features                                                                                            |
+--------------------------------+----------------------------------------------------------------------------------------------------+
|[sorry, call, later, meeting]   |(1024,[138,384,577,996],[2.273418200008753,3.6288353225642043,3.5890949939146903,4.104259019279279])|
|[dont, worry, guess, busy]      |(1024,[215,233,276,329],[3.9913186080986836,3.3790235241678332,4.734227298217693,4.58299632849377]) |
|[call, freephone]               |(1024,[133,138],[5.367951058306837,2.273418200008753])                                              |
|[win, cash, prize, prize, worth]|(1024,[31,47,62,389],[3.6632029660684124,4.754846585420428,4.072170704727778,7.064594791043114])    |
+--------------------------------+--------------

### Training a spam classifier


In [124]:
sms_train, sms_test = tf_idf.randomSplit([0.8, 0.2], 13)

logistic = LogisticRegression(regParam=0.2).fit(sms_train)
prediction = logistic.transform(sms_test)
prediction.groupBy("label", "prediction").count().show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|    1|       0.0|   41|
|    0|       0.0|  948|
|    1|       1.0|  105|
|    0|       1.0|    2|
+-----+----------+-----+

