In [1]:
import os
import findspark
os.environ['PYSPARK_SUBMIT_ARGS'] = (
    "--repositories http://repo.hortonworks.com/content/groups/public/ "
    "--packages com.hortonworks:shc-core:1.1.1-2.1-s_2.11 "
    " pyspark-shell")
findspark.init()

In [2]:
from pyspark import SQLContext, SparkConf, SparkContext
from pyspark.sql import SparkSession
conf = SparkConf()
conf.setAppName("project1")
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

In [3]:
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType
spark = SparkSession.builder.appName("test").getOrCreate()

In [4]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, BooleanType

schema = StructType([
    StructField("marketplace", StringType(), True),
    StructField("customer_id", StringType(), True),
    StructField("review_id", StringType(), True),
    StructField("product_id", StringType(), True),
    StructField("product_parent", StringType(), True),
    StructField("product_title", StringType(), True), # "label" replaces "product_title"
    StructField("product_category", StringType(), True),
    StructField("star_rating", IntegerType(), True),
    StructField("helpful_votes", IntegerType(), True),
    StructField("total_votes", IntegerType(), True),
    StructField("vine", StringType(), True),
    StructField("verified_purchase", StringType(), True),
    StructField("review_headline", StringType(), True),
    StructField("review_body", StringType(), True),
    StructField("review_date", StringType(), True)])

df_test = spark.read.csv('amazon_reviews_us_Electronics_v1_00.tsv', sep="\t", header=True, schema=schema)
#df_test = spark.read.csv('sample.csv', header=True, schema=schema)

In [5]:
df_test.count()

3093869

In [6]:
df_test = df_test.dropna()
df_test.count()
dft = df_test.drop('marketplace','customer_id','review_id','product_id','product_parent','product_category','star_rating','helpful_votes','total_votes','vine','verified_purchase','review_headline','review_date')

In [7]:
dft.count()

3093660

In [8]:
dft2 = dft.select('*').where('product_title is not null or review_body is not null or product_title <> "" or review_body <> "" ')

In [9]:
dft2.count()

3093660

In [10]:
dft.show(50)

+--------------------+--------------------+
|       product_title|         review_body|
+--------------------+--------------------+
|yoomall 5M Antenn...|       As described.|
|Hosa GPM-103 3.5m...|It works as adver...|
|Channel Master Ti...|         Works pissa|
|LIMTECH Wall char...|Did not work at all.|
|Skullcandy Air Ra...|Works well. Bass ...|
|Pioneer SP-BS22-L...|The quality on th...|
|C2G/Cables to Go ...|Wish I could give...|
|COOLEAD-HDMI Swit...|         works great|
|Philips Wireless ...|Great sound and c...|
|PlayStation 3 3D ...|    It works well~~~|
|JVC HAFR201A Xtre...|           Alll good|
|Sylvania Alarm Cl...|Love clock radio ...|
|Coby 8 GB 1.8-Inc...|Breaks very easil...|
|Diamond (Original...|Excellent gain in...|
|Kingvom 8gb 50 Ho...|everything I expe...|
|JBL Ultra-Portabl...|Love this small s...|
|YIPBOWPT Surface ...|works as advertis...|
|StarTech.com Mini...|very good especia...|
|TEAC CD-P650-B Co...|It does not copy ...|
|Philips SHS8100/2...|Did not la

In [11]:
dft2 = dft2.select('*').where('length(review_body)>200')

In [12]:
dft2.count()

1481128

In [13]:
#dft2.select('*').where(dft2.product_title.startswith('Jif')).show()

In [14]:
dft2.show(20)

+--------------------+--------------------+
|       product_title|         review_body|
+--------------------+--------------------+
|TEAC CD-P650-B Co...|It does not copy ...|
|Brainwavz Hengja ...|Can't get any sim...|
|Anker 60W 6-Port ...|Makes a differenc...|
|Cable Matters (2-...|Great little devi...|
|Monoprice 108323 ...|Excellent sound. ...|
|Battery1inc World...|Great Energy Leve...|
|Bose Bluetooth Au...|Excellent product...|
|Status Audio HD O...|Good looking over...|
|FiiO D3 (D03K) Di...|Appears to work j...|
|Crosley CR8005D-B...|We bought this fo...|
|Sony MDRZX110 Noi...|Great pair of hea...|
|White EZCast i5+ ...|I've decided to k...|
|Sennheiser Headph...|Bought these head...|
|VideoSecu Full Mo...|I am having a har...|
|G.G.Martinsen 32 ...|Junk - will play ...|
|iDeaUSA Wireless ...|Our family loves ...|
|Sony  Digital Med...|I got these and u...|
|iGadgitz Al Serie...|[[VIDEOID:3bdd177...|
|Grace Digital GDI...|So, I wouldn't ha...|
|Zipbuds SLIDE Spo...|Awesome ea

In [15]:
from pyspark.sql.functions import col
dft2.groupBy("product_title") \
    .count() \
    .orderBy(col("count").desc()) \
    .show(20)

+--------------------+-----+
|       product_title|count|
+--------------------+-----+
|Panasonic ErgoFit...|10386|
|Clip Plus 4 GB MP...| 6855|
|Mediabridge ULTRA...| 6185|
|Sennheiser On-Ear...| 5327|
|Cheetah APTMM2B T...| 4470|
|Mohu Leaf 30 TV A...| 4246|
|High Speed HDMI C...| 4158|
|MEElectronics Spo...| 4102|
|VideoSecu ML531BE...| 4067|
|Sanyo NEW 1500 en...| 3510|
|     HDMI-High-Speed| 2978|
|Bluetooth Speaker...| 2970|
|AmazonBasics Ultr...| 2823|
|Apple iPod touch ...| 2813|
|Electrohome EAAC6...| 2791|
|JLab JBuds Hi-Fi ...| 2783|
|CABTE High speed ...| 2752|
|AmazonBasics High...| 2653|
|HomeSpot NFC-Enab...| 2532|
|     SDMX22-004G-PAR| 2466|
+--------------------+-----+
only showing top 20 rows



In [16]:
dft3 = dft2
dft3.show(20)

+--------------------+--------------------+
|       product_title|         review_body|
+--------------------+--------------------+
|TEAC CD-P650-B Co...|It does not copy ...|
|Brainwavz Hengja ...|Can't get any sim...|
|Anker 60W 6-Port ...|Makes a differenc...|
|Cable Matters (2-...|Great little devi...|
|Monoprice 108323 ...|Excellent sound. ...|
|Battery1inc World...|Great Energy Leve...|
|Bose Bluetooth Au...|Excellent product...|
|Status Audio HD O...|Good looking over...|
|FiiO D3 (D03K) Di...|Appears to work j...|
|Crosley CR8005D-B...|We bought this fo...|
|Sony MDRZX110 Noi...|Great pair of hea...|
|White EZCast i5+ ...|I've decided to k...|
|Sennheiser Headph...|Bought these head...|
|VideoSecu Full Mo...|I am having a har...|
|G.G.Martinsen 32 ...|Junk - will play ...|
|iDeaUSA Wireless ...|Our family loves ...|
|Sony  Digital Med...|I got these and u...|
|iGadgitz Al Serie...|[[VIDEOID:3bdd177...|
|Grace Digital GDI...|So, I wouldn't ha...|
|Zipbuds SLIDE Spo...|Awesome ea

In [17]:
import pyspark.sql.functions as F
from pyspark.sql import Window 
counts = dft3.groupBy('product_title').count()
counts = counts.selectExpr("product_title as product_title_tmp", "count as count")  

In [18]:
counts.show(1000)

+--------------------+-----+
|   product_title_tmp|count|
+--------------------+-----+
|iDeaUSA Wireless ...|   45|
|Photive Hydra Wir...|  847|
|MUZO Cobblestone ...|   16|
|Invision TV Wall ...|  572|
|Waterfi 100% Wate...|  738|
|Winegard Outdoor ...| 1307|
|Sony MDR-V6 Monit...| 1498|
|Skullcandy Ink'd ...| 1236|
|Belkin 6-Outlet H...| 1963|
|Monoprice Marker ...|   57|
|Headphones, Sound...|  112|
|AC Power Cord Cab...|    5|
|New Online Waistb...|    2|
|GranVela A809 Lig...|   30|
|Powerbeats Wired ...|   42|
|Sony HTCT260H Sou...|  491|
|Jantzen Audio 0.2...|    1|
|Black Cable with ...|    3|
|Skullcandy Chops ...|    2|
|Panasonic Tcm125 ...|    1|
|Emotiva Audio ERC...|   20|
|[UL Listed] Pwr+ ...|    1|
|Case AceTM Apple ...|    3|
|Yamaha CRX-332BL ...|   42|
|TUNES2GO ROCKTube...|    2|
|Sleepphones SC5BM...|    1|
|V-MODA VC-3SZ-BLA...|    3|
|Royal 29297W WES ...|  157|
|Panasonic eneloop...|   18|
|Pioneer VSX-1123 ...|   90|
|2PCS NCR 18650B 3...|    1|
|Latte DeLite 

In [19]:
dft3 = dft3.join(counts, dft3.product_title == counts.product_title_tmp)
dft3 = dft3.drop('product_title_tmp')

In [20]:
dft3.count()

1481128

In [21]:
dft3 = dft3.select('*').where("count > 500")
dft3.show(20)

+--------------------+--------------------+-----+
|       product_title|         review_body|count|
+--------------------+--------------------+-----+
|Belkin 6-Outlet H...|The price for the...| 1963|
|Belkin 6-Outlet H...|So far these seem...| 1963|
|Belkin 6-Outlet H...|As other reviewer...| 1963|
|Belkin 6-Outlet H...|A very good exten...| 1963|
|Belkin 6-Outlet H...|I haven't really ...| 1963|
|Belkin 6-Outlet H...|I've bought sever...| 1963|
|Belkin 6-Outlet H...|Frustrating but...| 1963|
|Belkin 6-Outlet H...|I have a bunch of...| 1963|
|Belkin 6-Outlet H...|Love these for th...| 1963|
|Belkin 6-Outlet H...|These were perfec...| 1963|
|Belkin 6-Outlet H...|Wow 12 outlets. I...| 1963|
|Belkin 6-Outlet H...|I just have an ol...| 1963|
|Belkin 6-Outlet H...|I originally gave...| 1963|
|Belkin 6-Outlet H...|Product arrived i...| 1963|
|Belkin 6-Outlet H...|The 2.5ft version...| 1963|
|Belkin 6-Outlet H...|Lightning struck ...| 1963|
|Belkin 6-Outlet H...|It's a good produ...| 1963|


In [22]:
dft3.count()

354451

In [23]:
from pyspark.sql.functions import col
dft3.groupBy("product_title") \
    .count() \
    .orderBy(col("count").desc()) \
    .show(20)

+--------------------+-----+
|       product_title|count|
+--------------------+-----+
|Panasonic ErgoFit...|10386|
|Clip Plus 4 GB MP...| 6855|
|Mediabridge ULTRA...| 6185|
|Sennheiser On-Ear...| 5327|
|Cheetah APTMM2B T...| 4470|
|Mohu Leaf 30 TV A...| 4246|
|High Speed HDMI C...| 4158|
|MEElectronics Spo...| 4102|
|VideoSecu ML531BE...| 4067|
|Sanyo NEW 1500 en...| 3510|
|     HDMI-High-Speed| 2978|
|Bluetooth Speaker...| 2970|
|AmazonBasics Ultr...| 2823|
|Apple iPod touch ...| 2813|
|Electrohome EAAC6...| 2791|
|JLab JBuds Hi-Fi ...| 2783|
|CABTE High speed ...| 2752|
|AmazonBasics High...| 2653|
|HomeSpot NFC-Enab...| 2532|
|     SDMX22-004G-PAR| 2466|
+--------------------+-----+
only showing top 20 rows



In [24]:
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer
from pyspark.ml.classification import LogisticRegression

# regular expression tokenizer
regexTokenizer = RegexTokenizer(inputCol="review_body", outputCol="words", pattern = "[^A-Za-z]+", toLowercase=True)

# stop words
add_stopwords = ["i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", "against", "between", "into", "through", "during", "before", "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", "t", "can", "will", "just", "don", "should", "now"]
add_irrelevantwords = ["poor", "perfect", "good", "excellent", "excelent" ,"great", "horrible", "cheap", "expensive", "different", "awesome"]
stopwordsRemover = StopWordsRemover(inputCol="words", outputCol="filtered").setStopWords(add_stopwords).setStopWords(add_irrelevantwords)

# bag of words count
countVectors = CountVectorizer(inputCol="filtered", outputCol="features", vocabSize=10000, minDF=5)

In [25]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
label_stringIdx = StringIndexer(inputCol = "product_title", outputCol = "label")
pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, countVectors, label_stringIdx])


In [26]:
pipelineFit_t = pipeline.fit(dft3)
dataset_t = pipelineFit_t.transform(dft3)
dataset_t.show(5)
dataset_final = dataset_t.select('*').where('label < 10')

+--------------------+--------------------+-----+--------------------+--------------------+--------------------+-----+
|       product_title|         review_body|count|               words|            filtered|            features|label|
+--------------------+--------------------+-----+--------------------+--------------------+--------------------+-----+
|Belkin 6-Outlet H...|The price for the...| 1963|[the, price, for,...|[the, price, for,...|(10000,[0,1,2,3,4...| 31.0|
|Belkin 6-Outlet H...|So far these seem...| 1963|[so, far, these, ...|[so, far, these, ...|(10000,[1,2,3,5,7...| 31.0|
|Belkin 6-Outlet H...|As other reviewer...| 1963|[as, other, revie...|[as, other, revie...|(10000,[0,1,2,3,4...| 31.0|
|Belkin 6-Outlet H...|A very good exten...| 1963|[a, very, good, e...|[a, very, extensi...|(10000,[0,2,3,4,5...| 31.0|
|Belkin 6-Outlet H...|I haven't really ...| 1963|[i, haven, t, rea...|[i, haven, t, rea...|(10000,[0,1,3,4,5...| 31.0|
+--------------------+--------------------+-----

In [27]:
dataset_final.show(3)

+--------------------+--------------------+-----+--------------------+--------------------+--------------------+-----+
|       product_title|         review_body|count|               words|            filtered|            features|label|
+--------------------+--------------------+-----+--------------------+--------------------+--------------------+-----+
|Mediabridge ULTRA...|I needed an HDMI ...| 6185|[i, needed, an, h...|[i, needed, an, h...|(10000,[0,1,2,3,4...|  2.0|
|Mediabridge ULTRA...|When I don't touc...| 6185|[when, i, don, t,...|[when, i, don, t,...|(10000,[0,1,2,4,5...|  2.0|
|Mediabridge ULTRA...|Purchased to conn...| 6185|[purchased, to, c...|[purchased, to, c...|(10000,[0,1,2,3,4...|  2.0|
+--------------------+--------------------+-----+--------------------+--------------------+--------------------+-----+
only showing top 3 rows



In [32]:
data = dataset_final.drop('count', 'review_body', 'words', 'filtered')
data.show(3)
(trainingData_t, testData_t) = data.randomSplit([0.7, 0.3], seed = 100)
print("Training Dataset 1 Count: " + str(trainingData_t.count()))
print("Test Dataset Count: " + str(testData_t.count()))

+--------------------+--------------------+-----+
|       product_title|            features|label|
+--------------------+--------------------+-----+
|Mediabridge ULTRA...|(10000,[0,1,2,3,4...|  2.0|
|Mediabridge ULTRA...|(10000,[0,1,2,4,5...|  2.0|
|Mediabridge ULTRA...|(10000,[0,1,2,3,4...|  2.0|
+--------------------+--------------------+-----+
only showing top 3 rows

Training Dataset 1 Count: 37453
Test Dataset Count: 15853


In [None]:
dataset_final.show(14107)
dataset_final.count()

+--------------------+--------------------+-----+--------------------+--------------------+--------------------+-----+
|       product_title|         review_body|count|               words|            filtered|            features|label|
+--------------------+--------------------+-----+--------------------+--------------------+--------------------+-----+
|Mediabridge ULTRA...|I needed an HDMI ...| 6185|[i, needed, an, h...|[i, needed, an, h...|(10000,[0,1,2,3,4...|  2.0|
|Mediabridge ULTRA...|When I don't touc...| 6185|[when, i, don, t,...|[when, i, don, t,...|(10000,[0,1,2,4,5...|  2.0|
|Mediabridge ULTRA...|Purchased to conn...| 6185|[purchased, to, c...|[purchased, to, c...|(10000,[0,1,2,3,4...|  2.0|
|Mediabridge ULTRA...|My cable provider...| 6185|[my, cable, provi...|[my, cable, provi...|(10000,[0,1,2,3,7...|  2.0|
|Mediabridge ULTRA...|Honestly, this is...| 6185|[honestly, this, ...|[honestly, this, ...|(10000,[0,1,2,4,5...|  2.0|
|Mediabridge ULTRA...|This cable is too...| 6185

53306

In [None]:
from pyspark.ml.classification import LogisticRegression

lrt = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0.8)
lrModelt = lrt.fit(trainingData_t)
predictions = lrModelt.transform(testData_t)
predictions.filter(predictions['prediction'] == 0) \
    .select("product_title","probability","label","prediction", "product_title") \
    .orderBy("probability", ascending=False) \
    .show(n = 1000, truncate = 30)

In [None]:
print(predictions.schema)

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)