# Py4JNetworkError test

# FIXED

In [1]:
import os
import findspark
os.environ['PYSPARK_SUBMIT_ARGS'] = (
    "--repositories http://repo.hortonworks.com/content/groups/public/ "
    "--packages com.hortonworks:shc-core:1.1.1-2.1-s_2.11 "
    " pyspark-shell")
findspark.init()

In [2]:
from pyspark import SQLContext, SparkConf, SparkContext
from pyspark.sql import SparkSession
conf = SparkConf()
conf.setAppName("project1")
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

In [3]:
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType
spark = SparkSession.builder.appName("test").getOrCreate()

In [4]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, BooleanType

schema = StructType([
    StructField("marketplace", StringType(), True),
    StructField("customer_id", StringType(), True),
    StructField("review_id", StringType(), True),
    StructField("product_id", StringType(), True),
    StructField("product_parent", StringType(), True),
    StructField("product_title", StringType(), True), # "label" replaces "product_title"
    StructField("product_category", StringType(), True),
    StructField("star_rating", IntegerType(), True),
    StructField("helpful_votes", IntegerType(), True),
    StructField("total_votes", IntegerType(), True),
    StructField("vine", StringType(), True),
    StructField("verified_purchase", StringType(), True),
    StructField("review_headline", StringType(), True),
    StructField("review_body", StringType(), True),
    StructField("review_date", StringType(), True)])

#df_test = spark.read.csv('amazon_reviews_us_Grocery_v1_00.tsv', sep="\t", header=True, schema=schema)
df_test = spark.read.csv('sample.csv', header=True, schema=schema)

In [5]:
df_test.count()

6511

In [6]:
df_test = df_test.dropna()
df_test.count()
dft = df_test.drop('marketplace','customer_id','review_id','product_id','product_parent','product_category','star_rating','helpful_votes','total_votes','vine','verified_purchase','review_headline','review_date')

In [7]:
dft.count()

6496

In [8]:
dft2 = dft.select('*').where('product_title is not null or review_body is not null or product_title <> "" or review_body <> ""')

In [9]:
dft2.count()

6496

In [10]:
dft.show(50)

+--------------------+--------------------+
|       product_title|         review_body|
+--------------------+--------------------+
|The Cravings Plac...|As a family aller...|
|Mauna Loa Macadam...|My favorite nut. ...|
|Organic Matcha Gr...|This green tea ta...|
|15oz Raspberry Ly...|I love Melissa's ...|
|Stride Spark Kine...|                good|
|Herr's Popcorn Ho...|The popcorn was s...|
|Larabar uber, 1.4...|Love these bars, ...|
|Shirakiku Soba No...|Love the taste bu...|
|Jif Chocolate Nut...|I'm a member of t...|
|Orgain Organic Pl...|Used to be a dece...|
|Bragg - All Natur...|I cannot tell the...|
|Wholesome Sweeten...|Good flavor and s...|
|Kadoya Pure Sesam...|Great to use in r...|
|Nishiki Premium B...|It's rice. Have e...|
|Everly Passion Fr...|Very good tasting...|
|Charms Blue Razzb...|They were perfect...|
|Food Should Taste...|Wow, these are so...|
|Skippy Creamy Pea...|I bought this fro...|
|Celestial Seasoni...|I love this tea, ...|
|Nutiva Organic Vi...|I have use

In [11]:
dft2 = dft2.select('*').where('length(review_body)>30')

In [12]:
dft2.count()

4820

In [13]:
#dft2.select('*').where(dft2.product_title.startswith('Jif')).show()

In [14]:
dft2.show(3)

+--------------------+--------------------+
|       product_title|         review_body|
+--------------------+--------------------+
|The Cravings Plac...|As a family aller...|
|Mauna Loa Macadam...|My favorite nut. ...|
|Organic Matcha Gr...|This green tea ta...|
+--------------------+--------------------+
only showing top 3 rows



In [15]:
from pyspark.sql.functions import col
dft2.groupBy("product_title") \
    .count() \
    .orderBy(col("count").desc()) \
    .show(3)

+--------------------+-----+
|       product_title|count|
+--------------------+-----+
|Jif Chocolate Nut...|   80|
|San Francisco Bay...|   28|
|Viva Naturals Org...|   17|
+--------------------+-----+
only showing top 3 rows



In [16]:
dft3 = dft2
dft3.show(3)

+--------------------+--------------------+
|       product_title|         review_body|
+--------------------+--------------------+
|The Cravings Plac...|As a family aller...|
|Mauna Loa Macadam...|My favorite nut. ...|
|Organic Matcha Gr...|This green tea ta...|
+--------------------+--------------------+
only showing top 3 rows



In [17]:
import pyspark.sql.functions as F
from pyspark.sql import Window 
counts = dft3.groupBy('product_title').count()
counts = counts.selectExpr("product_title as product_title_tmp", "count as count")  

In [18]:
counts.show(3)

+--------------------+-----+
|   product_title_tmp|count|
+--------------------+-----+
|Eatsmart Naturals...|    1|
|Haribo Lakritz Ko...|    1|
|Del Monte Cherry ...|    1|
+--------------------+-----+
only showing top 3 rows



In [19]:
dft3 = dft3.join(counts, dft3.product_title == counts.product_title_tmp)
dft3 = dft3.drop('product_title_tmp')

In [20]:
dft3.count()

4820

In [21]:
dft3 = dft3.select('*').where('count > 5')
dft3.count()
dft3.show(20)

+--------------------+--------------------+-----+
|       product_title|         review_body|count|
+--------------------+--------------------+-----+
|Mauna Loa Macadam...|My favorite nut. ...|    6|
|Organic Matcha Gr...|This green tea ta...|    6|
|Jif Chocolate Nut...|I'm a member of t...|   80|
|Orgain Organic Pl...|Used to be a dece...|    8|
|Nutiva Organic Vi...|I have used servo...|    8|
|Mauna Loa Macadam...|What can I say, t...|    6|
|Jif Chocolate Nut...|Im a choosie Moth...|   80|
|Healthworks Cacao...|Great tasting...e...|   17|
|Nutiva Organic Vi...|Use instead of sh...|    8|
|Healthworks Chia ...|Much better value...|   12|
|YumEarth Organic ...|I don't know abou...|    6|
|Jif Chocolate Nut...|Iâ€™m a member of...|   80|
|Matcha Green Tea ...|I am enjoying my ...|   11|
|IZZE Fortified Sp...|Delicious fizz wi...|    6|
|San Francisco Bay...|Second purchase. ...|   28|
|Zevia Naturally S...|I loved the old f...|   17|
|Jif Chocolate Nut...|I reviewed this p...|   80|


In [22]:
dft3.drop('count')

DataFrame[product_title: string, review_body: string]

In [23]:
dft3.show(20)

+--------------------+--------------------+-----+
|       product_title|         review_body|count|
+--------------------+--------------------+-----+
|Mauna Loa Macadam...|My favorite nut. ...|    6|
|Organic Matcha Gr...|This green tea ta...|    6|
|Jif Chocolate Nut...|I'm a member of t...|   80|
|Orgain Organic Pl...|Used to be a dece...|    8|
|Nutiva Organic Vi...|I have used servo...|    8|
|Mauna Loa Macadam...|What can I say, t...|    6|
|Jif Chocolate Nut...|Im a choosie Moth...|   80|
|Healthworks Cacao...|Great tasting...e...|   17|
|Nutiva Organic Vi...|Use instead of sh...|    8|
|Healthworks Chia ...|Much better value...|   12|
|YumEarth Organic ...|I don't know abou...|    6|
|Jif Chocolate Nut...|Iâ€™m a member of...|   80|
|Matcha Green Tea ...|I am enjoying my ...|   11|
|IZZE Fortified Sp...|Delicious fizz wi...|    6|
|San Francisco Bay...|Second purchase. ...|   28|
|Zevia Naturally S...|I loved the old f...|   17|
|Jif Chocolate Nut...|I reviewed this p...|   80|


In [24]:
dft3.count()

397

In [25]:
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer
from pyspark.ml.classification import LogisticRegression

# regular expression tokenizer
regexTokenizer = RegexTokenizer(inputCol="review_body", outputCol="words", pattern = "[^A-Za-z]+", toLowercase=True)

# stop words
add_stopwords = ["i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", "against", "between", "into", "through", "during", "before", "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", "t", "can", "will", "just", "don", "should", "now"]
stopwordsRemover = StopWordsRemover(inputCol="words", outputCol="filtered").setStopWords(add_stopwords)

# bag of words count
countVectors = CountVectorizer(inputCol="filtered", outputCol="features", vocabSize=10000, minDF=5)

In [26]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
label_stringIdx = StringIndexer(inputCol = "product_title", outputCol = "label")
pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, countVectors, label_stringIdx])


In [27]:
pipelineFit_t = pipeline.fit(dft3)
dataset_t = pipelineFit_t.transform(dft3)
dataset_t.show(5)
dataset_final = dataset_t.select('*').where('label < 100')
(trainingData_t, testData_t) = dataset_final.randomSplit([0.7, 0.3], seed = 100)
print("Training Dataset 1 Count: " + str(trainingData_t.count()))
print("Test Dataset Count: " + str(testData_t.count()))

+--------------------+--------------------+-----+--------------------+--------------------+--------------------+-----+
|       product_title|         review_body|count|               words|            filtered|            features|label|
+--------------------+--------------------+-----+--------------------+--------------------+--------------------+-----+
|Mauna Loa Macadam...|My favorite nut. ...|    6|[my, favorite, nu...|[favorite, nut, c...|(366,[41,48,53,77...| 31.0|
|Organic Matcha Gr...|This green tea ta...|    6|[this, green, tea...|[green, tea, tast...|(366,[10,24,45,10...| 33.0|
|Jif Chocolate Nut...|I'm a member of t...|   80|[i, m, a, member,...|[m, member, crowd...|(366,[0,1,2,5,6,7...|  0.0|
|Orgain Organic Pl...|Used to be a dece...|    8|[used, to, be, a,...|[used, decent, pr...|(366,[3,13,33,46,...| 11.0|
|Nutiva Organic Vi...|I have used servo...|    8|[i, have, used, s...|[used, servo, dif...|(366,[3,21,22,23,...| 13.0|
+--------------------+--------------------+-----

In [28]:
dataset_final.show(100)
dataset_final.count()

+--------------------+--------------------+-----+--------------------+--------------------+--------------------+-----+
|       product_title|         review_body|count|               words|            filtered|            features|label|
+--------------------+--------------------+-----+--------------------+--------------------+--------------------+-----+
|Mauna Loa Macadam...|My favorite nut. ...|    6|[my, favorite, nu...|[favorite, nut, c...|(366,[41,48,53,77...| 31.0|
|Organic Matcha Gr...|This green tea ta...|    6|[this, green, tea...|[green, tea, tast...|(366,[10,24,45,10...| 33.0|
|Jif Chocolate Nut...|I'm a member of t...|   80|[i, m, a, member,...|[m, member, crowd...|(366,[0,1,2,5,6,7...|  0.0|
|Orgain Organic Pl...|Used to be a dece...|    8|[used, to, be, a,...|[used, decent, pr...|(366,[3,13,33,46,...| 11.0|
|Nutiva Organic Vi...|I have used servo...|    8|[i, have, used, s...|[used, servo, dif...|(366,[3,21,22,23,...| 13.0|
|Mauna Loa Macadam...|What can I say, t...|    6

397

In [29]:
from pyspark.ml.classification import LogisticRegression

lrt = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0.8)
lrModelt = lrt.fit(trainingData_t)
predictions = lrModelt.transform(testData_t)
predictions.filter(predictions['prediction'] == 0) \
    .select("review_body","product_title","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 100, truncate = 30)

+------------------------------+------------------------------+------------------------------+-----+----------+
|                   review_body|                 product_title|                   probability|label|prediction|
+------------------------------+------------------------------+------------------------------+-----+----------+
|â€œIâ€™m a member of the Cr...|Jif Chocolate Nut Butter Al...|[0.37608665052178947,0.0621...|  0.0|       0.0|
|The chocolate was is amazin...|Jif Chocolate Nut Butter Al...|[0.3749155375759375,0.06225...|  0.0|       0.0|
|These Jif peanut butter bar...|Jif Chocolate Nut Butter Al...|[0.3749155375759375,0.06225...|  0.0|       0.0|
|I was very excited to try t...|Jif Chocolate Nut Butter Al...|[0.3749155375759375,0.06225...|  0.0|       0.0|
|These bars are really good ...|Jif Chocolate Nut Butter Al...|[0.3749155375759375,0.06225...|  0.0|       0.0|
|I have to say that I alread...|Jif Chocolate Nut Butter Al...|[0.3749155375759375,0.06225...|  0.0|    

In [30]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)

0.0568075117370892