## Deliverable 2

In [8]:
import os
import sys
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

# Find the latest versions of
#   Spark & Hadoop:  https://spark.apache.org/downloads.html (https://www.apache.org/dist/spark/)
#   Postgres driver: https://jdbc.postgresql.org/
os.environ['HADOOP_VERSION']   = hadoop_version   = 'hadoop3'
os.environ['SPARK_VERSION']    = spark_version    = 'spark-3.3.1'
os.environ['POSTGRES_VERSION'] = postgres_version = 'postgresql-42.5.1'

# Install Java
! apt install openjdk-11-jdk-headless > /dev/null
os.environ['JAVA_HOME']  = '/usr/lib/jvm/java-11-openjdk-amd64'

# Install Spark
! wget https://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-$HADOOP_VERSION.tgz
! tar xf $SPARK_VERSION-bin-$HADOOP_VERSION.tgz
os.environ['SPARK_HOME'] = f'/content/{spark_version}-bin-{hadoop_version}'
! pip install findspark

# Install Postgres driver
! wget https://jdbc.postgresql.org/download/$POSTGRES_VERSION.jar

# Install AWS's Boto3
! pip install boto3

import boto3
import findspark
findspark.init()
from   getpass     import getpass
from   pyspark.sql import SparkSession

spark = SparkSession.builder \
  .appName('M16-Vine-Challenge') \
  .config('spark.driver.extraClassPath', f'/content/{postgres_version}.jar') \
  .getOrCreate()
spark



--2022-11-30 11:21:17--  https://www.apache.org/dist/spark/spark-3.3.1/spark-3.3.1-bin-hadoop3.tgz
Resolving www.apache.org (www.apache.org)... 151.101.2.132, 2a04:4e42::644
Connecting to www.apache.org (www.apache.org)|151.101.2.132|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://downloads.apache.org/spark/spark-3.3.1/spark-3.3.1-bin-hadoop3.tgz [following]
--2022-11-30 11:21:18--  https://downloads.apache.org/spark/spark-3.3.1/spark-3.3.1-bin-hadoop3.tgz
Resolving downloads.apache.org (downloads.apache.org)... 135.181.214.104, 88.99.95.219, 2a01:4f8:10a:201a::2, ...
Connecting to downloads.apache.org (downloads.apache.org)|135.181.214.104|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 299350810 (285M) [application/x-gzip]
Saving to: ‘spark-3.3.1-bin-hadoop3.tgz.1’


2022-11-30 11:21:47 (9.98 MB/s) - ‘spark-3.3.1-bin-hadoop3.tgz.1’ saved [299350810/299350810]

Looking in indexes: https://pypi.org/simple, https://us-py

In [9]:
from pyspark import SparkFiles
url = "https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Video_DVD_v1_00.tsv.gz"
spark.sparkContext.addFile(url)
video_review_df = spark.read.option("encoding", "UTF-8").csv(SparkFiles.get(""), sep="\t", header=True, inferSchema=True)
video_review_df.show()

+-----------+-----------+--------------+----------+--------------+--------------------+----------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-------------------+
|marketplace|customer_id|     review_id|product_id|product_parent|       product_title|product_category|star_rating|helpful_votes|total_votes|vine|verified_purchase|     review_headline|         review_body|        review_date|
+-----------+-----------+--------------+----------+--------------+--------------------+----------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-------------------+
|         US|   27288431| R33UPQQUZQEM8|B005T4ND06|     400024643|Yoga for Movement...|       Video DVD|          5|            3|          3|   N|                Y|This was a gift f...|This was a gift f...|2015-08-31 00:00:00|
|         US|   13722556|R3IKTNQQPD9662|B004EPZ070|     685335564|  Something Borrowed| 

In [10]:
total_votes_df = video_review_df.filter(video_review_df["total_votes"] >= 20)
total_votes_df.show()

+-----------+-----------+--------------+----------+--------------+--------------------+----------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-------------------+
|marketplace|customer_id|     review_id|product_id|product_parent|       product_title|product_category|star_rating|helpful_votes|total_votes|vine|verified_purchase|     review_headline|         review_body|        review_date|
+-----------+-----------+--------------+----------+--------------+--------------------+----------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-------------------+
|         US|   44783596|R31EUR60FV3BU5|B00ZGPZR9I|     514136181|  Wodehouse in Exile|       Video DVD|          4|           32|         39|   N|                N|Unfair Accusation...|Amazed after Wode...|2015-08-31 00:00:00|
|         US|   20714119|R12Q1NO1HI9PP4|B00XUV1B4U|     881344665|Mad Max Anthology...| 

In [11]:
helpful_votes_df = total_votes_df.filter(total_votes_df["helpful_votes"]/total_votes_df["total_votes"] >= 0.5)
helpful_votes_df.show()

+-----------+-----------+--------------+----------+--------------+--------------------+----------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-------------------+
|marketplace|customer_id|     review_id|product_id|product_parent|       product_title|product_category|star_rating|helpful_votes|total_votes|vine|verified_purchase|     review_headline|         review_body|        review_date|
+-----------+-----------+--------------+----------+--------------+--------------------+----------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-------------------+
|         US|   44783596|R31EUR60FV3BU5|B00ZGPZR9I|     514136181|  Wodehouse in Exile|       Video DVD|          4|           32|         39|   N|                N|Unfair Accusation...|Amazed after Wode...|2015-08-31 00:00:00|
|         US|   12720421|R3MDX24QDAT0OW|B00ZCHTQGA|     779817400|Northmen - A Viki...| 

In [5]:
vine_df=helpful_votes_df.filter(helpful_votes_df["vine"]=="Y")
vine_df.show()

+-----------+-----------+--------------+----------+--------------+--------------------+----------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-------------------+
|marketplace|customer_id|     review_id|product_id|product_parent|       product_title|product_category|star_rating|helpful_votes|total_votes|vine|verified_purchase|     review_headline|         review_body|        review_date|
+-----------+-----------+--------------+----------+--------------+--------------------+----------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-------------------+
|         US|   27249597|R3PBQ665MBL8CP|B00E1LT41K|     592331046|The White Queen: ...|       Video DVD|          3|           66|         72|   Y|                N|An enjoyable watc...|I enjoyed this TV...|2014-03-27 00:00:00|
|         US|   37876368|R3EMD30K7MNOZ0|B00A27OMKU|     678954891|Shiva Rea: Yoga i...| 

In [12]:
not_vine_df=helpful_votes_df.filter(helpful_votes_df["vine"]=="N")
not_vine_df.show()

+-----------+-----------+--------------+----------+--------------+--------------------+----------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-------------------+
|marketplace|customer_id|     review_id|product_id|product_parent|       product_title|product_category|star_rating|helpful_votes|total_votes|vine|verified_purchase|     review_headline|         review_body|        review_date|
+-----------+-----------+--------------+----------+--------------+--------------------+----------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-------------------+
|         US|   44783596|R31EUR60FV3BU5|B00ZGPZR9I|     514136181|  Wodehouse in Exile|       Video DVD|          4|           32|         39|   N|                N|Unfair Accusation...|Amazed after Wode...|2015-08-31 00:00:00|
|         US|   12720421|R3MDX24QDAT0OW|B00ZCHTQGA|     779817400|Northmen - A Viki...| 

In [20]:
# Total number of paid reviews (enrolled in Vine program)
paid_count= vine_df.count()

In [19]:
# Total number of unpaid reviews (not enrolled in Vine program)
unpaid_count=not_vine_df.count()

In [32]:
# Number of 5 Star Paid Reviews
five_star_paid_count = vine_df.filter(vine_df["star_rating"]=="5").count()

In [33]:
# Number of 5 Star Unpaid Reviews
five_star_unpaid_count = not_vine_df.filter(not_vine_df["star_rating"]=="5").count()

In [66]:
# Total number of Paid Reviews
vine_count=vine_df.count()

In [67]:
# Total number of Unpaid Reviews
not_vine_count=not_vine_df.count()

In [45]:
# Percentage of Paid Reviews that are 5 Stars
five_star_paid_perc = (five_star_paid_count)/(paid_count)*100
five_star_paid_perc

18.367346938775512

In [35]:
# Percentage of Unpaid Reviews that are 5 Stars
five_star_unpaid_perc = (five_star_unpaid_count)/(unpaid_count)*10
five_star_unpaid_perc

5.155944517833554

In [36]:
print(f"Paid and Helpful Reviews: {paid_count}")

Paid and Helpful Reviews: 49


In [22]:
print(f"Unpaid and Helpful Reviews: {unpaid_count}")

Unpaid and Helpful Reviews: 151400


In [37]:
print("Paid 5 Star Reviews: {}".format(five_star_paid_count))

Paid 5 Star Reviews: 9


In [38]:
print(f"Unpaid 5 Star Reviews: {five_star_unpaid_count}")

Unpaid 5 Star Reviews: 78061


In [53]:
print(f"Percentage of Paid Reviews that have 5 Stars: {five_star_paid_perc}%")

Percentage of Paid Reviews that have 5 Stars: 18.367346938775512%


In [54]:
print(f"Percentage of Unpaid Reviews that have 5 Stars: {five_star_unpaid_perc}%")

Percentage of Unpaid Reviews that have 5 Stars: 5.155944517833554%


### Additional Analysis: 

Let us see jsut how many of the total reviews on Video DVDs is Paid and Unpaid

In [78]:
# Total number of reviews on Video DVDs
total_reviews = video_review_df.count()
total_reviews

5069140

In [79]:
# Total number of Paid (Vine) Reviews-- helpful or not 
total_vine = video_review_df.filter(video_review_df["vine"] =='Y').count()
total_vine

4340

In [77]:
# Total number of Unpaid (Not-Vine) Reviews-- helpful or not 
total_not_vine = video_review_df.filter(video_review_df["vine"] =='N').count()
total_not_vine

5064796

In [43]:
print(f"Total Number of Video DVD Reviews: {total_reviews}")

Total Number of Video DVD Reviews: 5069140


In [80]:
# Percentage of Paid Vine Reviews (in all)
vine_reviews_percent= total_vine/total_reviews *100

In [81]:
# Percentage of Unpaid Vine Reviews (in all)
not_vine_reviews_percent= total_not_vine/total_reviews*100

In [82]:
print(f"Percentage of  Paid Video DVD Reviews: {vine_reviews_percent:.2f}%")

Percentage of  Paid Video DVD Reviews: 0.09%


In [83]:
print(f"Percentage of Unpaid Video DVD Reviews: {not_vine_reviews_percent:.2f}%")

Percentage of Unpaid Video DVD Reviews: 99.91%
