### Chapter11_NLP_Ex2: Musical Instruments
- dataset: "Musical_Instrument_5.json"
- Yêu cầu: Build 1 bộ lọc reviewer. SỬ dụng các công cụ NLP và thuật toán Naive Bayes để đưa ra dự đoán nếu 1 review là like(overall>=4), not like(overall<=2) or neutral

In [1]:
!apt update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://archive.apache.org/dist/spark/spark-2.4.0/spark-2.4.0-bin-hadoop2.7.tgz
!tar -xvf spark-2.4.0-bin-hadoop2.7.tgz
!pip install -q findspark
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.0-bin-hadoop2.7"
import findspark
findspark.init()

[33m0% [Working][0m            Hit:1 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease
Ign:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Ign:3 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Hit:4 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release
Hit:5 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Hit:6 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease
Hit:7 http://archive.ubuntu.com/ubuntu bionic InRelease
Get:8 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
Get:10 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]
Hit:12 http://ppa.launchpad.net/cran/libgit2/ubuntu bionic InRelease
Hit:13 http://ppa.launchpad.net/deadsnakes/ppa/ubuntu bionic InRelease
Get:14 http://archive.ubuntu.com/ubuntu bionic-backports InRelease [74.6 kB]


In [2]:
from pyspark.sql import SparkSession

In [3]:
spark = SparkSession.builder.appName("NLP_ex2").getOrCreate()

In [4]:
path = "/content/drive/MyDrive/Classroom/LDS9_S7N_DoanMinhHieu/demo/LDS9 buoi 7/Musical_Instruments_5.json"
data = spark.read.json(path)

In [5]:
data.show(5)

+----------+--------+-------+--------------------+-----------+--------------+--------------------+--------------------+--------------+
|      asin| helpful|overall|          reviewText| reviewTime|    reviewerID|        reviewerName|             summary|unixReviewTime|
+----------+--------+-------+--------------------+-----------+--------------+--------------------+--------------------+--------------+
|1384719342|  [0, 0]|    5.0|Not much to write...|02 28, 2014|A2IBPI20UZIR0U|cassandra tu "Yea...|                good|    1393545600|
|1384719342|[13, 14]|    5.0|The product does ...|03 16, 2013|A14VAT5EAX3D9S|                Jake|                Jake|    1363392000|
|1384719342|  [1, 1]|    5.0|The primary job o...|08 28, 2013|A195EZSQDW3E21|Rick Bennette "Ri...|It Does The Job Well|    1377648000|
|1384719342|  [0, 0]|    5.0|Nice windscreen p...|02 14, 2014|A2C00NNG1ZQQG2|RustyBill "Sunday...|GOOD WINDSCREEN F...|    1392336000|
|1384719342|  [0, 0]|    5.0|This pop filter i...|02 21

In [6]:
from pyspark.sql.functions import *

In [7]:
data = data.withColumn("class", when(col("overall") >=4, "like").when(col("overall")<=2, "not_like").otherwise("neutral"))
data = data.select("reviewText", "overall", "class")

In [8]:
data.show(5)

+--------------------+-------+-----+
|          reviewText|overall|class|
+--------------------+-------+-----+
|Not much to write...|    5.0| like|
|The product does ...|    5.0| like|
|The primary job o...|    5.0| like|
|Nice windscreen p...|    5.0| like|
|This pop filter i...|    5.0| like|
+--------------------+-------+-----+
only showing top 5 rows



In [9]:
data = data.withColumn("length", length("reviewText"))
data.show(5)

+--------------------+-------+-----+------+
|          reviewText|overall|class|length|
+--------------------+-------+-----+------+
|Not much to write...|    5.0| like|   268|
|The product does ...|    5.0| like|   544|
|The primary job o...|    5.0| like|   436|
|Nice windscreen p...|    5.0| like|   206|
|This pop filter i...|    5.0| like|   159|
+--------------------+-------+-----+------+
only showing top 5 rows



In [10]:
data.groupby("class").mean().show()

+--------+------------------+-----------------+
|   class|      avg(overall)|      avg(length)|
+--------+------------------+-----------------+
|not_like|1.5353319057815846|579.2055674518201|
| neutral|               3.0|579.2111398963731|
|    like|4.7690090888938155|473.1188206606074|
+--------+------------------+-----------------+



In [11]:
data.groupby("class").count().show()

+--------+-----+
|   class|count|
+--------+-----+
|not_like|  467|
| neutral|  772|
|    like| 9022|
+--------+-----+



In [12]:
# => du lieu bị lệch, cần xử lý đây

In [13]:
# Transform du lieu 
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, IDF, StringIndexer, VectorAssembler
from pyspark.ml.linalg import Vectors
from pyspark.ml.classification import NaiveBayes
from pyspark.ml import Pipeline

In [14]:
tokenizer = Tokenizer(inputCol="reviewText", outputCol="token_text")
stopremove = StopWordsRemover(inputCol="token_text", outputCol="stop_tokens")
count_vec = CountVectorizer(inputCol="stop_tokens", outputCol="c_vec")
idf = IDF(inputCol="c_vec", outputCol="tf_idf")

class_to_num = StringIndexer(inputCol="class", outputCol="label")

clean_up = VectorAssembler(inputCols=["tf_idf", "length"], outputCol="features")

In [15]:
# build pipeline
data_pre_pipe = Pipeline(stages=[class_to_num, 
                                 tokenizer, 
                                 stopremove,
                                 count_vec,
                                 idf,
                                 clean_up])

In [16]:
cleaner = data_pre_pipe.fit(data)

In [17]:
clean_data = cleaner.transform(data)

In [18]:
clean_data.show(5)

+--------------------+-------+-----+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+
|          reviewText|overall|class|length|label|          token_text|         stop_tokens|               c_vec|              tf_idf|            features|
+--------------------+-------+-----+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+
|Not much to write...|    5.0| like|   268|  0.0|[not, much, to, w...|[much, write, her...|(51948,[3,12,14,3...|(51948,[3,12,14,3...|(51949,[3,12,14,3...|
|The product does ...|    5.0| like|   544|  0.0|[the, product, do...|[product, exactly...|(51948,[2,3,12,16...|(51948,[2,3,12,16...|(51949,[2,3,12,16...|
|The primary job o...|    5.0| like|   436|  0.0|[the, primary, jo...|[primary, job, de...|(51948,[11,19,44,...|(51948,[11,19,44,...|(51949,[11,19,44,...|
|Nice windscreen p...|    5.0| like|   206|  0.0|[nice, windscreen...|

In [19]:
# Train va danh gia model
clean_data = clean_data.select("label", "features")
clean_data.show(5, truncate=False)

+-----+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [20]:
train_data, test_data = clean_data.randomSplit([0.7,0.3])

In [21]:
train_data.groupby("label").count().show()

+-----+-----+
|label|count|
+-----+-----+
|  0.0| 6331|
|  1.0|  536|
|  2.0|  332|
+-----+-----+



In [22]:
test_data.groupby("label").count().show()

+-----+-----+
|label|count|
+-----+-----+
|  0.0| 2691|
|  1.0|  236|
|  2.0|  135|
+-----+-----+



In [23]:
# buil model
nb = NaiveBayes()
spam_predictor = nb.fit(train_data)

In [24]:
test_results = spam_predictor.transform(test_data)
test_results.show(5, truncate=False)

+-----+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [25]:
# xem cunfusion matrix
test_results.groupby("label", "prediction").count().show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  2.0|       0.0|   62|
|  1.0|       1.0|   74|
|  0.0|       1.0|  518|
|  1.0|       0.0|  141|
|  2.0|       2.0|   30|
|  2.0|       1.0|   43|
|  1.0|       2.0|   21|
|  0.0|       0.0| 1962|
|  0.0|       2.0|  211|
+-----+----------+-----+



In [26]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [27]:
acc_eval = MulticlassClassificationEvaluator()
acc = acc_eval.evaluate(test_results)
acc

0.7299239452042784

In [28]:
# do chinh xac dat khoang 73%, chua tot lam
# => thu thuc hien cac thuat toan khac

### Su dung logisticRegression/ Random Forest

In [29]:
from pyspark.ml.classification import RandomForestClassifier, LogisticRegression

In [30]:
lg = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)
predict_lg = lg.fit(train_data)
test_results_lg = predict_lg.transform(train_data)

In [31]:
# xem cunfusion matrix
test_results_lg.groupby("label", "prediction").count().show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  2.0|       0.0|   44|
|  1.0|       1.0|  446|
|  1.0|       0.0|   90|
|  2.0|       2.0|  287|
|  2.0|       1.0|    1|
|  0.0|       0.0| 6331|
+-----+----------+-----+



In [32]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [33]:
acc_eval = MulticlassClassificationEvaluator()
acc = acc_eval.evaluate(test_results_lg)
acc

0.9805454077950888

In [34]:
# Random forest

In [35]:
rfc = RandomForestClassifier(labelCol="label", featuresCol="features",
                             numTrees=500, maxDepth=5, maxBins=64)
predict_rfc = rfc.fit(train_data)
test_results_rfc = predict_rfc.transform(train_data)

In [36]:
# xem cunfusion matrix
test_results_rfc.groupby("label", "prediction").count().show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  2.0|       0.0|  332|
|  1.0|       0.0|  536|
|  0.0|       0.0| 6331|
+-----+----------+-----+



In [37]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [38]:
acc_eval = MulticlassClassificationEvaluator()
acc = acc_eval.evaluate(test_results_rfc)
acc

0.8230091290292741

In [39]:
# Do chinh xac cao do ty le du doan cua like tot, còn voi các loại đánh giá khác thì quá tệ
# Thử resample data

### Resampling data

In [40]:
like_df = train_data.filter(col("label")==0)
neutral_df = train_data.filter(col("label")==1)
not_like_df = train_data.filter(col("label")==2)

In [41]:
ratio_1 = int(like_df.count()/neutral_df.count())
ratio_2 = int(like_df.count()/not_like_df.count())

In [42]:
print("like_df/neutral_df", ratio_1)
print("like_df/not_like_df", ratio_2)

like_df/neutral_df 11
like_df/not_like_df 19


In [43]:
# resampling neural
a1 = range(ratio_1)
oversampled_neural_df = neutral_df.withColumn("dummy", explode(array([lit(x) for x in a1]))).drop("dummy")

In [44]:
combined_df = like_df.unionAll(oversampled_neural_df)
combined_df.show(5, truncate=False)

+-----+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [45]:
combined_df.groupby("label").count().show()

+-----+-----+
|label|count|
+-----+-----+
|  0.0| 6331|
|  1.0| 5896|
+-----+-----+



In [46]:
# resampling not like
a2 = range(ratio_2)
oversampled_not_like_df = not_like_df.withColumn("dummy", explode(array([lit(x) for x in a2]))).drop("dummy")

In [47]:
conbined_df = combined_df.unionAll(oversampled_not_like_df)
conbined_df.show(5, truncate = False)

+-----+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [48]:
conbined_df.groupby("label").count().show()

+-----+-----+
|label|count|
+-----+-----+
|  0.0| 6331|
|  1.0| 5896|
|  2.0| 6308|
+-----+-----+



In [49]:
# buil model
nb = NaiveBayes()
predictor_4 = nb.fit(conbined_df)

In [50]:
test_results_4 = predictor_4.transform(test_data)
test_results_4.show(5, truncate=False)

+-----+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [51]:
# xem cunfusion matrix
test_results_4.groupby("label", "prediction").count().show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  2.0|       0.0|  113|
|  1.0|       1.0|   22|
|  0.0|       1.0|  117|
|  1.0|       0.0|  200|
|  2.0|       2.0|   11|
|  2.0|       1.0|   11|
|  1.0|       2.0|   14|
|  0.0|       0.0| 2517|
|  0.0|       2.0|   57|
+-----+----------+-----+



In [52]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [53]:
acc_eval = MulticlassClassificationEvaluator()
acc = acc_eval.evaluate(test_results_4)
acc

0.8145717478999177

In [54]:
# do chinh xac dat khoang 82%, chua tot lam
# => thu thuc hien cac thuat toan khac

In [55]:
from pyspark.ml.classification import RandomForestClassifier, LogisticRegression

In [56]:
lg = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)
predict_lg_2 = lg.fit(conbined_df)
test_results_lg_2 = predict_lg_2.transform(conbined_df)

In [57]:
# xem cunfusion matrix
test_results_lg_2.groupby("label", "prediction").count().show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  1.0|       1.0| 5863|
|  0.0|       1.0|    8|
|  1.0|       0.0|   22|
|  2.0|       2.0| 6308|
|  1.0|       2.0|   11|
|  0.0|       0.0| 6322|
|  0.0|       2.0|    1|
+-----+----------+-----+



In [58]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [59]:
acc_eval = MulticlassClassificationEvaluator()
acc = acc_eval.evaluate(test_results_lg_2)
acc

0.9977328322117285

In [60]:
# Random forest

In [61]:
rfc = RandomForestClassifier(labelCol="label", featuresCol="features",
                             numTrees=500, maxDepth=5, maxBins=64)
predict_rfc_2 = rfc.fit(conbined_df)
test_results_rfc_2 = predict_rfc_2.transform(conbined_df)

In [62]:
# xem cunfusion matrix
test_results_rfc_2.groupby("label", "prediction").count().show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  2.0|       0.0| 1349|
|  1.0|       1.0|  869|
|  0.0|       1.0|    6|
|  1.0|       0.0| 4400|
|  2.0|       2.0| 4959|
|  1.0|       2.0|  627|
|  0.0|       0.0| 5915|
|  0.0|       2.0|  410|
+-----+----------+-----+



In [63]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [64]:
acc_eval = MulticlassClassificationEvaluator()
acc = acc_eval.evaluate(test_results_rfc_2)
acc

0.580533090611274

In [64]:
# => ko tốt kể cả độ chính xác và xem confusion matrix