In [2]:
# Obtain the Spark entry point from the project helper: get_spark() returns an
# object indexed by string keys, and the handle used by the rest of the notebook
# (spark.read...) is stored under its 'spark' key.
from pythagorasutils import get_spark
spark = get_spark()['spark']

In [3]:
# Notebook display helpers from the project utilities.
# NOTE(review): presumably these patch DataFrame.show / DataFrame.toPandas for
# notebook-friendly output (a later cell calls .toPandas(40), which stock
# PySpark does not accept) — confirm in pythagorasutils. The meaning of the
# argument 3 is not visible from this notebook.
from pythagorasutils.generic.notebook_utils import Utils
Utils.fix_show(3)
Utils.fix_to_pandas(3)

In [4]:
import pandas as pd
from pyspark.sql import functions as f
from pyspark.sql.window import Window
# Short alias for pyspark.sql.functions.col, used when building column expressions.
c = f.col
from calendar import monthrange
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
from pyspark.sql.types import *

In [5]:
# Load the raw table; the first CSV line supplies the column names. No schema
# is supplied or inferred here, so every column is read as a string and the
# numeric columns have to be cast afterwards.
df_books = spark.read.option('header', True).csv('galbraith.csv')
df_books.show()

+--------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+-----------------+-----------------+----------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+-------------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+-

In [8]:
# Keep the first column untouched and cast every remaining column to double,
# preserving the original column names.
df_books_cast = df_books.select(
    df_books.columns[0],
    *(f.col(name).cast('double').alias(name) for name in df_books.columns[1:]),
)
# Mark the casted frame for caching; it is reused by the following cells.
df_books_cast.cache()
df_books_cast



In [10]:
# List the book names as a pandas frame.
# NOTE(review): toPandas(40) passes a positional argument that stock PySpark's
# toPandas() does not accept; this presumably relies on Utils.fix_to_pandas —
# confirm what the 40 means.
df_books_cast.select('book_name').toPandas(40)

Unnamed: 0,book_name
0,coben_breaker
1,coben_dropshot
2,coben_fadeaway
3,coben_falsemove
4,coben_goneforgood
5,coben_nosecondchance
6,coben_tellnoone
7,galbraith_cuckoos
8,lewis_battle
9,lewis_caspian


In [13]:
# Binary target: 1 when book_name contains 'rowling', 0 for every other row.
df_target = df_books_cast.withColumn(
    'label',
    f.when(f.col('book_name').like('%rowling%'), 1).otherwise(f.lit(0)),
)
# Eyeball the labelling for every book.
df_target.select('book_name', 'label').show(40)

+--------------------+-----+
|book_name           |label|
+--------------------+-----+
|coben_breaker       |0    |
|coben_dropshot      |0    |
|coben_fadeaway      |0    |
|coben_falsemove     |0    |
|coben_goneforgood   |0    |
|coben_nosecondchance|0    |
|coben_tellnoone     |0    |
|galbraith_cuckoos   |0    |
|lewis_battle        |0    |
|lewis_caspian       |0    |
|lewis_chair         |0    |
|lewis_horse         |0    |
|lewis_lion          |0    |
|lewis_nephew        |0    |
|lewis_voyage        |0    |
|rowling_casual      |1    |
|rowling_chamber     |1    |
|rowling_goblet      |1    |
|rowling_hallows     |1    |
|rowling_order       |1    |
|rowling_prince      |1    |
|rowling_prisoner    |1    |
|rowling_stone       |1    |
|tolkien_lord1       |0    |
|tolkien_lord2       |0    |
|tolkien_lord3       |0    |
+--------------------+-----+



In [14]:
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler

In [25]:
# 60/40 random train/test split with a fixed seed for repeatability.
# NOTE(review): the split is neither stratified nor author-aware; with a
# different seed galbraith_cuckoos (labelled 0) could land in the training
# set — confirm that is intended.
splits = df_target.randomSplit(weights=[0.6, 0.4], seed=126)
train, test = splits

In [26]:
# Which books (and labels) ended up in the training partition.
train.select('book_name', 'label').show(40)

+--------------------+-----+
|book_name           |label|
+--------------------+-----+
|coben_breaker       |0    |
|coben_dropshot      |0    |
|coben_fadeaway      |0    |
|coben_goneforgood   |0    |
|coben_nosecondchance|0    |
|lewis_battle        |0    |
|lewis_caspian       |0    |
|lewis_chair         |0    |
|lewis_horse         |0    |
|lewis_lion          |0    |
|lewis_nephew        |0    |
|lewis_voyage        |0    |
|rowling_casual      |1    |
|rowling_goblet      |1    |
|rowling_hallows     |1    |
|rowling_order       |1    |
|rowling_prince      |1    |
|rowling_stone       |1    |
|tolkien_lord2       |0    |
+--------------------+-----+



In [27]:
# Which books (and labels) ended up in the held-out partition.
test.select('book_name', 'label').show(40)

+-----------------+-----+
|book_name        |label|
+-----------------+-----+
|coben_falsemove  |0    |
|coben_tellnoone  |0    |
|galbraith_cuckoos|0    |
|rowling_chamber  |1    |
|rowling_prisoner |1    |
|tolkien_lord1    |0    |
|tolkien_lord3    |0    |
+-----------------+-----+



In [29]:
# Pack every column after the first into a single 'features' vector, then
# drop the individual source columns so only book_name, label and features
# remain.
train_df = (
    VectorAssembler(inputCols=df_books.columns[1:], outputCol='features')
    .transform(train)
    .drop(*df_books.columns[1:])
)
train_df

DataFrame[book_name: string, label: int, features: vector]

In [30]:
# Peek at a single assembled training row to check the feature vector.
train_df.show(1)

+-------------+-----+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [41]:
# Gaussian Naive Bayes on the assembled vectors. The feature and label column
# names are spelled out; they are the estimator's default names and match the
# columns of train_df.
nb = NaiveBayes(featuresCol='features', labelCol='label', modelType="gaussian")
model = nb.fit(train_df)

In [33]:
# Same feature assembly as for the training data, applied to the held-out rows.
# The true label is renamed to 'fixed_label' so it is carried alongside the
# model's output columns under a distinct name.
test_df = (
    VectorAssembler(inputCols=df_books.columns[1:], outputCol='features')
    .transform(test)
    .drop(*df_books.columns[1:])
    .withColumnRenamed('label', 'fixed_label')
)
test_df

DataFrame[book_name: string, fixed_label: int, features: vector]

In [42]:
%%time
# Score the held-out rows with the fitted model.
# NOTE(review): Spark DataFrame transformations are lazy, so this timing most
# likely covers only building the plan; the actual scoring runs when the
# result is shown — confirm before quoting it as inference time.
predictions = model.transform(test_df)

CPU times: user 3.74 ms, sys: 1.89 ms, total: 5.63 ms
Wall time: 116 ms


In [43]:
# Show the scored test rows without the wide feature vector column.
predictions.drop('features').show(50)

+-----------------+-----------+---------------------------------------------+-----------+----------+
|book_name        |fixed_label|rawPrediction                                |probability|prediction|
+-----------------+-----------+---------------------------------------------+-----------+----------+
|coben_falsemove  |0          |[-2.19557537891548E8,-2.290896694862004E9]   |[1.0,0.0]  |0.0       |
|coben_tellnoone  |0          |[-1.2961147923178922E8,-2.1255519766380888E8]|[1.0,0.0]  |0.0       |
|galbraith_cuckoos|0          |[-1.2919532305433773E8,-7.772658318504822E8] |[1.0,0.0]  |0.0       |
|rowling_chamber  |1          |[-3.1736394725826424E8,-26845.389456126228]  |[0.0,1.0]  |1.0       |
|rowling_prisoner |1          |[-3.2028088922886175E8,-430.27857253436775]  |[0.0,1.0]  |1.0       |
|tolkien_lord1    |0          |[-3967.268157110627,-3.2091288007573915E8]   |[1.0,0.0]  |0.0       |
|tolkien_lord3    |0          |[-9759005.244841699,-1.7127798902352235E8]   |[1.0,0.0]  |0.