In [1]:
import os
import findspark
os.environ['PYSPARK_SUBMIT_ARGS'] = (
    "--repositories http://repo.hortonworks.com/content/groups/public/ "
    "--packages com.hortonworks:shc-core:1.1.1-2.1-s_2.11 "
    " pyspark-shell")
findspark.init()

In [2]:
from pyspark import SQLContext, SparkConf, SparkContext
from pyspark.sql import SparkSession
conf = SparkConf()
conf.setAppName("project1")
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

In [3]:
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType
spark = SparkSession.builder.appName("test").getOrCreate()

In [4]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, BooleanType

schema = StructType([
    StructField("marketplace", StringType(), True),
    StructField("customer_id", StringType(), True),
    StructField("review_id", StringType(), True),
    StructField("product_id", StringType(), True),
    StructField("product_parent", StringType(), True),
    StructField("product_title", StringType(), True), # "label" replaces "product_title"
    StructField("product_category", StringType(), True),
    StructField("star_rating", IntegerType(), True),
    StructField("helpful_votes", IntegerType(), True),
    StructField("total_votes", IntegerType(), True),
    StructField("vine", StringType(), True),
    StructField("verified_purchase", StringType(), True),
    StructField("review_headline", StringType(), True),
    StructField("review_body_raw", StringType(), True),
    StructField("review_date", StringType(), True)])

df_test = spark.read.csv('amazon_reviews_us_Grocery_v1_00.tsv', sep="\t", header=True, schema=schema)
#df_test = spark.read.csv('sample.csv', header=True, schema=schema)

In [5]:
df_test.count()

2402458

In [6]:
df_test = df_test.dropna()
df_test.count()
dft = df_test.drop('marketplace','customer_id','review_id','product_id','product_parent','product_category','star_rating','helpful_votes','total_votes','vine','verified_purchase','review_headline','review_date')

In [7]:
dft.count()

2402211

In [8]:
dft2 = dft.select('*').where('product_title is not null or review_body_raw is not null or product_title <> "" or review_body_raw <> "" ')

In [9]:
dft2.count()

2402211

In [10]:
dft.show(50)

+--------------------+--------------------+
|       product_title|     review_body_raw|
+--------------------+--------------------+
|The Cravings Plac...|As a family aller...|
|Mauna Loa Macadam...|My favorite nut. ...|
|Organic Matcha Gr...|This green tea ta...|
|15oz Raspberry Ly...|I love Melissa's ...|
|Stride Spark Kine...|                good|
|Herr's Popcorn Ho...|The popcorn was s...|
|Larabar uber, 1.4...|Love these bars, ...|
|Shirakiku Soba No...|Love the taste bu...|
|Jif Chocolate Nut...|I'm a member of t...|
|Orgain Organic Pl...|Used to be a dece...|
|Bragg - All Natur...|I cannot tell the...|
|Wholesome Sweeten...|Good flavor and s...|
|Kadoya Pure Sesam...|Great to use in r...|
|Nishiki Premium B...|It's rice. Have e...|
|Everly Passion Fr...|Very good tasting...|
|Charms Blue Razzb...|They were perfect...|
|Food Should Taste...|Wow, these are so...|
|Skippy Creamy Pea...|I bought this fro...|
|Celestial Seasoni...|I love this tea, ...|
|Nutiva Organic Vi...|I have use

In [11]:
dft2 = dft2.select('*').where('length(review_body_raw)>200')

In [12]:
dft2.count()

912170

In [13]:
#dft2.select('*').where(dft2.product_title.startswith('Jif')).show()

In [14]:
dft2.show(20)

+--------------------+--------------------+
|       product_title|     review_body_raw|
+--------------------+--------------------+
|The Cravings Plac...|As a family aller...|
|Jif Chocolate Nut...|I'm a member of t...|
|Orgain Organic Pl...|Used to be a dece...|
|Bragg - All Natur...|I cannot tell the...|
|Skippy Creamy Pea...|I bought this fro...|
|Celestial Seasoni...|I love this tea, ...|
|Nutiva Organic Vi...|I have used servo...|
|Hershey's Hallowe...|This is a variety...|
|LifeSavers Hard W...|LifeSavers brand ...|
|Amoretti Premium ...|This product is b...|
|Organic Cotton Ca...|My kids and I are...|
|80pk White Coffee...|I am disappointed...|
|Sorbee Sugar Free...|These are really ...|
|V8 V-Fusion Peach...|Who knew you coul...|
|Natural Touch Kaf...|I have used Kaffr...|
|Jif Chocolate Nut...|Im a choosie Moth...|
|Fiber One Chewy B...|I needed more fib...|
|Gerber Good Start...|Like most working...|
|Eatsmart Naturals...|I LOVE cheese puf...|
|Jelly Belly Bean ...|Played the

In [15]:
from pyspark.sql.functions import col
dft2.groupBy("product_title") \
    .count() \
    .orderBy(col("count").desc()) \
    .show(20)

+--------------------+-----+
|       product_title|count|
+--------------------+-----+
|San Francisco Bay...| 5738|
|Viva Naturals Org...| 3598|
| Davidson's Tea Bulk| 2604|
|Nutiva Organic Vi...| 2562|
|Amazing Grass Gre...| 2057|
|Ekobrew Coffee Re...| 1980|
|Brooklyn Beans Si...| 1558|
|  Senseo Coffee Pods| 1265|
|Tuscan Dairy Whol...| 1231|
|Celestial Seasoni...| 1196|
|Keurig, The Origi...| 1168|
|Grove Square Capp...| 1136|
|Reese's Spreads P...| 1133|
|Timothy's World C...| 1043|
|Keurig Green Moun...| 1017|
|Matcha Green Tea ...|  997|
|Vita Coco Coconut...|  989|
|Nutiva Hi Fiber H...|  977|
|Surge Citrus Flav...|  945|
|YumEarth Organic ...|  909|
+--------------------+-----+
only showing top 20 rows



In [16]:
dft3 = dft2
dft3.show(20)

+--------------------+--------------------+
|       product_title|     review_body_raw|
+--------------------+--------------------+
|The Cravings Plac...|As a family aller...|
|Jif Chocolate Nut...|I'm a member of t...|
|Orgain Organic Pl...|Used to be a dece...|
|Bragg - All Natur...|I cannot tell the...|
|Skippy Creamy Pea...|I bought this fro...|
|Celestial Seasoni...|I love this tea, ...|
|Nutiva Organic Vi...|I have used servo...|
|Hershey's Hallowe...|This is a variety...|
|LifeSavers Hard W...|LifeSavers brand ...|
|Amoretti Premium ...|This product is b...|
|Organic Cotton Ca...|My kids and I are...|
|80pk White Coffee...|I am disappointed...|
|Sorbee Sugar Free...|These are really ...|
|V8 V-Fusion Peach...|Who knew you coul...|
|Natural Touch Kaf...|I have used Kaffr...|
|Jif Chocolate Nut...|Im a choosie Moth...|
|Fiber One Chewy B...|I needed more fib...|
|Gerber Good Start...|Like most working...|
|Eatsmart Naturals...|I LOVE cheese puf...|
|Jelly Belly Bean ...|Played the

In [17]:
from pyspark.sql.functions import lower, col

dft3 = dft3.withColumn("review_body", lower(col("review_body_raw")))
dft3 = dft3.selectExpr("product_title as product_title", "review_body as review_body")  

In [18]:
dft3.show(3)

+--------------------+--------------------+
|       product_title|         review_body|
+--------------------+--------------------+
|The Cravings Plac...|as a family aller...|
|Jif Chocolate Nut...|i'm a member of t...|
|Orgain Organic Pl...|used to be a dece...|
+--------------------+--------------------+
only showing top 3 rows



In [19]:
import pyspark.sql.functions as F
from pyspark.sql import Window 
counts = dft3.groupBy('product_title').count()
counts = counts.selectExpr("product_title as product_title_tmp", "count as count")  

In [20]:
counts.show(1000)

+--------------------+-----+
|   product_title_tmp|count|
+--------------------+-----+
|Eatsmart Naturals...|    6|
|Caps for Keurig K...|    5|
|40 Count - Cake B...|    2|
|Wonka Laffy TaffyJar|   99|
|Banquet, Morning ...|    1|
|Torani Sugar Free...|  195|
|Dogswell, Happy H...|    3|
|Golden Barrel Bla...|   17|
|Juanitas H And S ...|    6|
|Aikane Kona Coffe...|    1|
|Lenny & Larry's t...|    1|
|Philippine Brand ...|   52|
|Carbon's Golden M...|   11|
| Hershey's Chocolate|  112|
|Kashi Crunchy Hon...|    2|
|BetterBody Foods ...|   46|
|The Spice Lab's P...|  254|
|Brooke Bond Taj M...|    5|
|Wake the F'Up Unc...|   24|
|Keurig Starbucks ...|    5|
|Iris Gummies Hemp...|    3|
|Organic Valley - ...|    3|
|GRASS FED, NON-GM...|    1|
|Lance Peanut Bar ...|    3|
|Stash Tea Single-...|    9|
|The Reaper Puree ...|   17|
|C. Howard Violet ...|   53|
|Italian Black Win...|    8|
|Johnnys Potatoes ...|    1|
|Sunsweet Lighter ...|    1|
|Anchovies Monte P...|    1|
|Carnation Bre

In [21]:
dft3 = dft3.join(counts, dft3.product_title == counts.product_title_tmp)
dft3 = dft3.drop('product_title_tmp')

In [22]:
dft3.count()

912170

In [23]:
dft3 = dft3.select('*').where("count > 500")
dft3.show(20)

+--------------------+--------------------+-----+
|       product_title|         review_body|count|
+--------------------+--------------------+-----+
|Orgain Organic Nu...|this particular f...|  544|
|Orgain Organic Nu...|the orgain iced c...|  544|
|Orgain Organic Nu...|my son has been c...|  544|
|Orgain Organic Nu...|i had gone throug...|  544|
|Orgain Organic Nu...|this is easy to d...|  544|
|Orgain Organic Nu...|tasty and nutriti...|  544|
|Orgain Organic Nu...|i initially purch...|  544|
|Orgain Organic Nu...|if you are lookin...|  544|
|Orgain Organic Nu...|i'm a chocoholic ...|  544|
|Orgain Organic Nu...|great for on the ...|  544|
|Orgain Organic Nu...|i absolutely love...|  544|
|Orgain Organic Nu...|my interest in fo...|  544|
|Orgain Organic Nu...|my wife loves the...|  544|
|Orgain Organic Nu...|it is wonderful! ...|  544|
|Orgain Organic Nu...|i bought this on ...|  544|
|Orgain Organic Nu...|these have become...|  544|
|Orgain Organic Nu...|normally after ha...|  544|


In [24]:
dft3.count()

63096

In [25]:
from pyspark.sql.functions import col
dft3.groupBy("product_title") \
    .count() \
    .orderBy(col("count").desc()) \
    .show(20)

+--------------------+-----+
|       product_title|count|
+--------------------+-----+
|San Francisco Bay...| 5738|
|Viva Naturals Org...| 3598|
| Davidson's Tea Bulk| 2604|
|Nutiva Organic Vi...| 2562|
|Amazing Grass Gre...| 2057|
|Ekobrew Coffee Re...| 1980|
|Brooklyn Beans Si...| 1558|
|  Senseo Coffee Pods| 1265|
|Tuscan Dairy Whol...| 1231|
|Celestial Seasoni...| 1196|
|Keurig, The Origi...| 1168|
|Grove Square Capp...| 1136|
|Reese's Spreads P...| 1133|
|Timothy's World C...| 1043|
|Keurig Green Moun...| 1017|
|Matcha Green Tea ...|  997|
|Vita Coco Coconut...|  989|
|Nutiva Hi Fiber H...|  977|
|Surge Citrus Flav...|  945|
|YumEarth Organic ...|  909|
+--------------------+-----+
only showing top 20 rows



# Model Pipeline

In [26]:
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer
from pyspark.ml.classification import LogisticRegression

# regular expression tokenizer
regexTokenizer = RegexTokenizer(inputCol="review_body", outputCol="words", pattern = "[^A-Za-z]+", toLowercase=True)


In [27]:
# define "irrelevant words
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/alexdziena/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [28]:
from nltk.tokenize import word_tokenize
import nltk.corpus
import re
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet as wn

list_adv = []
list_adj = []
list_n = []

for s in wn.all_synsets():
    if s.pos() in ['r']: # if synset is adverb
        for i in s.lemmas(): # iterate through lemmas for each synset
            list_adv.append(i.name())
    elif s.pos() in ['a']:
        for i in s.lemmas(): # iterate through lemmas for each synset
            list_adj.append(i.name())
    elif s.pos() in ['n']: # if synset is noun
        for i in s.lemmas(): # iterate through lemmas for each synset
            list_n.append(i.name())
       

In [29]:
# remove stop words and irrelevant words
add_stopwords = ["i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", "against", "between", "into", "through", "during", "before", "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", "t", "can", "will", "just", "don", "should", "now", "ve", "se", "didn", "hasn", "hadn", "hasnt", "isnt", "havent", "although", "despite", "however" ]
add_irrelevantwords = ["poor", "perfect", "good", "excellent", "excelent" ,"great", "horrible", "cheap", "expensive", "different", "awesome"]
single_alphabet = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z"]
word_filter = add_stopwords + add_irrelevantwords + single_alphabet + list_adv + list_adj
stopwordsRemover = StopWordsRemover(inputCol="words", outputCol="filtered").setStopWords(word_filter)



In [32]:
# bag of words count
countVectors = CountVectorizer(inputCol="filtered", outputCol="features", vocabSize=10000, minDF=5)

In [33]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
label_stringIdx = StringIndexer(inputCol = "product_title", outputCol = "label")
pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, countVectors, label_stringIdx])


In [34]:
pipelineFit_t = pipeline.fit(dft3)
dataset_t = pipelineFit_t.transform(dft3)
dataset_t.show(5)
dataset_final = dataset_t.select('*').where('label < 10')

+--------------------+--------------------+-----+--------------------+--------------------+--------------------+-----+
|       product_title|         review_body|count|               words|            filtered|            features|label|
+--------------------+--------------------+-----+--------------------+--------------------+--------------------+-----+
|Orgain Organic Nu...|this particular f...|  544|[this, particular...|[particular, flav...|(10000,[3,5,8,9,1...| 55.0|
|Orgain Organic Nu...|the orgain iced c...|  544|[the, orgain, ice...|[orgain, iced, ca...|(10000,[1,6,9,29,...| 55.0|
|Orgain Organic Nu...|my son has been c...|  544|[my, son, has, be...|[son, failure, th...|(10000,[3,17,19,2...| 55.0|
|Orgain Organic Nu...|i had gone throug...|  544|[i, had, gone, th...|[gone, numerous, ...|(10000,[3,8,34,53...| 55.0|
|Orgain Organic Nu...|this is easy to d...|  544|[this, is, easy, ...|[drink, taste, is...|(10000,[3,6,10,18...| 55.0|
+--------------------+--------------------+-----

In [35]:
dataset_final.show(3)

+--------------------+--------------------+-----+--------------------+--------------------+--------------------+-----+
|       product_title|         review_body|count|               words|            filtered|            features|label|
+--------------------+--------------------+-----+--------------------+--------------------+--------------------+-----+
|Ekobrew Coffee Re...|this worked fine ...| 1980|[this, worked, fi...|[worked, keurig, ...|(10000,[1,10,11,2...|  5.0|
|Ekobrew Coffee Re...|still using these...| 1980|[still, using, th...|[using, years, ek...|(10000,[4,10,12,1...|  5.0|
|Ekobrew Coffee Re...|i had to slightly...| 1980|[i, had, to, slig...|[break, plastic, ...|(10000,[0,1,4,28,...|  5.0|
+--------------------+--------------------+-----+--------------------+--------------------+--------------------+-----+
only showing top 3 rows



In [36]:
data = dataset_final.drop('count', 'words', 'filtered')
data.show(3)
(trainingData_t, testData_t) = data.randomSplit([0.7, 0.3], seed = 100)
print("Training Dataset 1 Count: " + str(trainingData_t.count()))
print("Test Dataset Count: " + str(testData_t.count()))

+--------------------+--------------------+-----+
|       product_title|            features|label|
+--------------------+--------------------+-----+
|Ekobrew Coffee Re...|(10000,[1,10,11,2...|  5.0|
|Ekobrew Coffee Re...|(10000,[4,10,12,1...|  5.0|
|Ekobrew Coffee Re...|(10000,[0,1,4,28,...|  5.0|
+--------------------+--------------------+-----+
only showing top 3 rows

Training Dataset 1 Count: 16808
Test Dataset Count: 6981


In [38]:
from pyspark.ml.classification import LogisticRegression

lrt = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0.8)
lrModelt = lrt.fit(trainingData_t)
predictions = lrModelt.transform(testData_t)
predictions.filter(predictions['prediction'] == 0) \
    .select("product_title","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 1000, truncate = 30)

+------------------------------+------------------------------+-----+----------+
|                 product_title|                   probability|label|prediction|
+------------------------------+------------------------------+-----+----------+
|Ekobrew Coffee Reusable Fil...|[0.24652785107194514,0.1468...|  5.0|       0.0|
|Ekobrew Coffee Reusable Fil...|[0.24652785107194514,0.1468...|  5.0|       0.0|
|Ekobrew Coffee Reusable Fil...|[0.24652785107194514,0.1468...|  5.0|       0.0|
|Ekobrew Coffee Reusable Fil...|[0.24652785107194514,0.1468...|  5.0|       0.0|
|Ekobrew Coffee Reusable Fil...|[0.24652785107194514,0.1468...|  5.0|       0.0|
|Ekobrew Coffee Reusable Fil...|[0.24652785107194514,0.1468...|  5.0|       0.0|
|Ekobrew Coffee Reusable Fil...|[0.24652785107194514,0.1468...|  5.0|       0.0|
|Ekobrew Coffee Reusable Fil...|[0.24652785107194514,0.1468...|  5.0|       0.0|
|Ekobrew Coffee Reusable Fil...|[0.24652785107194514,0.1468...|  5.0|       0.0|
|Ekobrew Coffee Reusable Fil

In [39]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)

0.09406403841080413

# Cross Validation for hyperparameter tuning

In [None]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
paramGrid = (ParamGridBuilder()
             .addGrid(lrt.regParam, [0.1, 0.3, 0.5]) # regularization parameter
             .addGrid(lrt.elasticNetParam, [0.0, 0.1, 0.2]) # Elastic Net Parameter (Ridge = 0)
#            .addGrid(model.maxIter, [10, 20, 50]) #Number of iterations
#            .addGrid(idf.numFeatures, [10, 100, 1000]) # Number of features
             .build())
# 5-fold CrossValidator
cv = CrossValidator(estimator=lrt, \
                    estimatorParamMaps=paramGrid, \
                    evaluator=evaluator, \
                    numFolds=5)
cvModel = cv.fit(trainingData_t)

predictions_cv = cvModel.transform(testData_t)

evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions_cv)

# Feature Selection

In [40]:
from pyspark.ml.feature import ChiSqSelector
from pyspark.ml.linalg import Vectors

In [41]:
selector = ChiSqSelector(numTopFeatures=5, featuresCol="features",
                         outputCol="selectedFeatures", labelCol="label")

result = selector.fit(data).transform(data)

print("ChiSqSelector output with top %d features selected" % selector.getNumTopFeatures())
result.show()

ChiSqSelector output with top 5 features selected
+--------------------+--------------------+-----+--------------------+--------------------+--------------------+-----+--------------------+
|       product_title|         review_body|count|               words|            filtered|            features|label|    selectedFeatures|
+--------------------+--------------------+-----+--------------------+--------------------+--------------------+-----+--------------------+
|Ekobrew Coffee Re...|this worked fine ...| 1980|[this, worked, fi...|[worked, keurig, ...|(10000,[1,10,11,2...|  5.0|       (5,[1],[1.0])|
|Ekobrew Coffee Re...|still using these...| 1980|[still, using, th...|[using, years, ek...|(10000,[4,10,12,1...|  5.0|       (5,[4],[1.0])|
|Ekobrew Coffee Re...|i had to slightly...| 1980|[i, had, to, slig...|[break, plastic, ...|(10000,[0,1,4,28,...|  5.0|(5,[0,1,4],[2.0,1...|
|Ekobrew Coffee Re...|works with my b30...| 1980|[works, with, my,...|[works, se, work,...|(10000,[1,4,55,96..

In [42]:
result.show(3)

+--------------------+--------------------+-----+--------------------+--------------------+--------------------+-----+--------------------+
|       product_title|         review_body|count|               words|            filtered|            features|label|    selectedFeatures|
+--------------------+--------------------+-----+--------------------+--------------------+--------------------+-----+--------------------+
|Ekobrew Coffee Re...|this worked fine ...| 1980|[this, worked, fi...|[worked, keurig, ...|(10000,[1,10,11,2...|  5.0|       (5,[1],[1.0])|
|Ekobrew Coffee Re...|still using these...| 1980|[still, using, th...|[using, years, ek...|(10000,[4,10,12,1...|  5.0|       (5,[4],[1.0])|
|Ekobrew Coffee Re...|i had to slightly...| 1980|[i, had, to, slig...|[break, plastic, ...|(10000,[0,1,4,28,...|  5.0|(5,[0,1,4],[2.0,1...|
+--------------------+--------------------+-----+--------------------+--------------------+--------------------+-----+--------------------+
only showing top 3 r

In [43]:
selected_data = result.drop('count', 'review_body', 'words', 'filtered', 'features')
selected_data = selected_data.selectExpr("product_title as product_title", "selectedFeatures as features", "label as label")
selected_data.show(3)
(trainingData_s, testData_s) = selected_data.randomSplit([0.7, 0.3], seed = 100)
print("Training Dataset Count: " + str(trainingData_s.count()))
print("Test Dataset Count: " + str(testData_s.count()))

+--------------------+--------------------+-----+
|       product_title|            features|label|
+--------------------+--------------------+-----+
|Ekobrew Coffee Re...|       (5,[1],[1.0])|  5.0|
|Ekobrew Coffee Re...|       (5,[4],[1.0])|  5.0|
|Ekobrew Coffee Re...|(5,[0,1,4],[2.0,1...|  5.0|
+--------------------+--------------------+-----+
only showing top 3 rows

Training Dataset Count: 16808
Test Dataset Count: 6981


In [44]:
from pyspark.ml.classification import LogisticRegression

lrs = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0.8)
lrModels = lrs.fit(trainingData_s)
predictions_s = lrModels.transform(testData_s)
predictions_s.filter(predictions['prediction'] == 0) \
    .select("product_title","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 1000, truncate = 30)

AnalysisException: 'Resolved attribute(s) prediction#1080 missing from rawPrediction#1557,probability#1562,label#1338,prediction#1568,product_title#1336,features#1337 in operator !Filter (prediction#1080 = cast(0 as double)). Attribute(s) with the same name appear in the operation: prediction. Please check if the right attribute(s) are used.;;\n!Filter (prediction#1080 = cast(0 as double))\n+- AnalysisBarrier\n      +- Project [product_title#1336, features#1337, label#1338, rawPrediction#1557, probability#1562, UDF(rawPrediction#1557) AS prediction#1568]\n         +- Project [product_title#1336, features#1337, label#1338, rawPrediction#1557, UDF(rawPrediction#1557) AS probability#1562]\n            +- Project [product_title#1336, features#1337, label#1338, UDF(features#1337) AS rawPrediction#1557]\n               +- Sample 0.7, 1.0, false, 100\n                  +- Sort [product_title#1336 ASC NULLS FIRST, features#1337 ASC NULLS FIRST, label#1338 ASC NULLS FIRST], false\n                     +- Project [product_title#280 AS product_title#1336, selectedFeatures#1212 AS features#1337, label#634 AS label#1338]\n                        +- Project [product_title#280, label#634, selectedFeatures#1212]\n                           +- Project [product_title#280, review_body#281, count#314L, words#616, filtered#621, features#627, label#634, UDF(features#627) AS selectedFeatures#1212]\n                              +- Filter (label#634 < cast(10 as double))\n                                 +- Project [product_title#280, review_body#281, count#314L, words#616, filtered#621, features#627, label#634]\n                                    +- Project [product_title#280, review_body#281, count#314L, words#616, filtered#621, features#627, UDF(cast(product_title#280 as string)) AS label#634]\n                                       +- Project [product_title#280, review_body#281, count#314L, words#616, filtered#621, UDF(filtered#621) AS features#627]\n                                          +- Project [product_title#280, review_body#281, count#314L, words#616, UDF(words#616) AS filtered#621]\n                                             +- Project [product_title#280, review_body#281, count#314L, UDF(review_body#281) AS words#616]\n                                                +- Filter (count#314L > cast(500 as bigint))\n                                                   +- Project [product_title#280, review_body#281, count#314L]\n                                                      +- Project [product_title#280, review_body#281, count#314L]\n                                                         +- Join Inner, (product_title#280 = product_title_tmp#313)\n                                                            :- Project [product_title#5 AS product_title#280, review_body#276 AS review_body#281]\n                                                            :  +- Project [product_title#5, review_body_raw#13, lower(review_body_raw#13) AS review_body#276]\n                                                            :     +- Filter (length(review_body_raw#13) > 200)\n                                                            :        +- Project [product_title#5, review_body_raw#13]\n                                                            :           +- Filter ((isnotnull(product_title#5) || isnotnull(review_body_raw#13)) || (NOT (product_title#5 = ) || NOT (review_body_raw#13 = )))\n                                                            :              +- Project [product_title#5, review_body_raw#13]\n                                                            :                 +- Project [product_title#5, review_body_raw#13]\n                                                            :                    +- Filter AtLeastNNulls(n, marketplace#0,customer_id#1,review_id#2,product_id#3,product_parent#4,product_title#5,product_category#6,star_rating#7,helpful_votes#8,total_votes#9,vine#10,verified_purchase#11,review_headline#12,review_body_raw#13,review_date#14)\n                                                            :                       +- Relation[marketplace#0,customer_id#1,review_id#2,product_id#3,product_parent#4,product_title#5,product_category#6,star_rating#7,helpful_votes#8,total_votes#9,vine#10,verified_purchase#11,review_headline#12,review_body_raw#13,review_date#14] csv\n                                                            +- Project [product_title#280 AS product_title_tmp#313, count#310L AS count#314L]\n                                                               +- Aggregate [product_title#280], [product_title#280, count(1) AS count#310L]\n                                                                  +- Project [product_title#5 AS product_title#280, review_body#276 AS review_body#281]\n                                                                     +- Project [product_title#5, review_body_raw#13, lower(review_body_raw#13) AS review_body#276]\n                                                                        +- Filter (length(review_body_raw#13) > 200)\n                                                                           +- Project [product_title#5, review_body_raw#13]\n                                                                              +- Filter ((isnotnull(product_title#5) || isnotnull(review_body_raw#13)) || (NOT (product_title#5 = ) || NOT (review_body_raw#13 = )))\n                                                                                 +- Project [product_title#5, review_body_raw#13]\n                                                                                    +- Project [product_title#5, review_body_raw#13]\n                                                                                       +- Filter AtLeastNNulls(n, marketplace#0,customer_id#1,review_id#2,product_id#3,product_parent#4,product_title#5,product_category#6,star_rating#7,helpful_votes#8,total_votes#9,vine#10,verified_purchase#11,review_headline#12,review_body_raw#13,review_date#14)\n                                                                                          +- Relation[marketplace#0,customer_id#1,review_id#2,product_id#3,product_parent#4,product_title#5,product_category#6,star_rating#7,helpful_votes#8,total_votes#9,vine#10,verified_purchase#11,review_headline#12,review_body_raw#13,review_date#14] csv\n'

In [None]:
from pyspark.mllib.classification import NaiveBayes, NaiveBayesModel
from pyspark.mllib.util import MLUtils

nbt = NaiveBayes.train(trainingData_t, 1.0)
#predictionAndLabel = test.map(lambda p: (nbt.predict(p.features), p.label))
#accuracy = 1.0 * predictionAndLabel.filter(lambda pl: pl[0] == pl[1]).count() / test.count()

In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

gbt = GBTClassifier(labelCol="label", featuresCol="features", maxIter=10)
model = gbt.fit(trainingData_t)


In [None]:
predictions = model.transform(testData_t)
predictions.select("prediction", "indexedLabel", "features").show(5)
