In [2]:
!wget -q https://dlcdn.apache.org/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz
!tar xf spark-3.5.0-bin-hadoop3.tgz
!pip install -q findspark
!pip install pyspark
!pip install py4j
import os
import sys
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import tensorflow as tf
# os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
# os.environ["SPARK_HOME"] = "/content/spark-3.2.1-bin-hadoop3.2"
# Start a SparkSession
import findspark
findspark.init()

Collecting pyspark
  Downloading pyspark-3.5.0.tar.gz (316.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.9/316.9 MB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425345 sha256=5516f5f52d93a0237b8f0eef38bc9b8e16b20583bd5d423154c7edcb5dfcee5e
  Stored in directory: /root/.cache/pip/wheels/41/4e/10/c2cf2467f71c678cfc8a6b9ac9241e5e44a01940da8fbb17fc
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.0


In [3]:
# Spark entry point, plus the stdlib time module
import time
from pyspark.sql import SparkSession

# Reuse the active session if there is one; otherwise start a session named "MyApp"
spark = (
    SparkSession.builder
    .appName("MyApp")
    .getOrCreate()
)

In [89]:
# 1. Read in the csv as a DataFrame
from pyspark import SparkFiles  # not needed by this cell any more; import kept in case a later cell relies on it

# Absolute path of the processed product data.
# SparkFiles.get() is intended for files registered through SparkContext.addFile();
# no addFile() call is visible before this cell, and for an absolute path it simply
# hands the same path back, so the file is read directly instead.
DATA_PATH = "/amz_ca_total_products_data_processed.csv"

# header=True: the first row holds the column names.
# inferSchema=True: Spark derives the column types from the data.
# quote and escape are both '"', so a doubled quote inside a quoted field is a literal quote.
df = spark.read.csv(DATA_PATH, header=True, inferSchema=True, quote='"', escape='"')

df.show()

+----------+--------------------+--------------------+--------------------+-----+-------+-----+---------+--------------------+------------+-----------------+
|      asin|               title|              imgUrl|          productURL|stars|reviews|price|listPrice|        categoryName|isBestSeller|boughtInLastMonth|
+----------+--------------------+--------------------+--------------------+-----+-------+-----+---------+--------------------+------------+-----------------+
|B07CV4L6HX|Green Leaf WW3D W...|https://m.media-a...|https://www.amazo...|  4.4|   2876|47.69|      0.0|Industrial  Scien...|       false|                0|
|B09N1HGY74|8pcs Toilet Seat ...|https://m.media-a...|https://www.amazo...|  3.8|     55|10.99|      0.0|Industrial  Scien...|       false|              100|
|B087P7538J|YaeCCC 19 Pcs Hol...|https://m.media-a...|https://www.amazo...|  4.0|    126|25.99|    27.99|Industrial  Scien...|       false|               50|
|B0822FF7YQ|LLPT Butyl Putty ...|https://m.media-a..

In [90]:
# Print each column's name, type and nullability as a tree
df.printSchema()

root
 |-- asin: string (nullable = true)
 |-- title: string (nullable = true)
 |-- imgUrl: string (nullable = true)
 |-- productURL: string (nullable = true)
 |-- stars: double (nullable = true)
 |-- reviews: integer (nullable = true)
 |-- price: double (nullable = true)
 |-- listPrice: double (nullable = true)
 |-- categoryName: string (nullable = true)
 |-- isBestSeller: boolean (nullable = true)
 |-- boughtInLastMonth: integer (nullable = true)



In [91]:
# Columns the ML model does not need
unused_columns = ('asin', 'title', 'imgUrl', 'productURL', 'listPrice')
df = df.drop(*unused_columns)
df.show()

+-----+-------+-----+--------------------+------------+-----------------+
|stars|reviews|price|        categoryName|isBestSeller|boughtInLastMonth|
+-----+-------+-----+--------------------+------------+-----------------+
|  4.4|   2876|47.69|Industrial  Scien...|       false|                0|
|  3.8|     55|10.99|Industrial  Scien...|       false|              100|
|  4.0|    126|25.99|Industrial  Scien...|       false|               50|
|  4.5|   1936|21.99|Industrial  Scien...|       false|              100|
|  4.2|     46|18.99|Industrial  Scien...|       false|              100|
|  4.5|   2505|15.99|Industrial  Scien...|       false|                0|
|  4.3|    216|27.99|Industrial  Scien...|       false|               50|
|  4.0|     53| 9.99|Industrial  Scien...|       false|               50|
|  4.5|    164|15.95|Industrial  Scien...|       false|                0|
|  4.4|    366|29.99|Industrial  Scien...|       false|               50|
|  4.6|    874|12.99|Industrial  Scien

In [92]:
# Ensure all rows are still there: total row count after dropping the unused columns
df.count()

2165926

In [93]:
# Discard every row that has a missing value in any column
df = df.na.drop()
df.show()

+-----+-------+-----+--------------------+------------+-----------------+
|stars|reviews|price|        categoryName|isBestSeller|boughtInLastMonth|
+-----+-------+-----+--------------------+------------+-----------------+
|  4.4|   2876|47.69|Industrial  Scien...|       false|                0|
|  3.8|     55|10.99|Industrial  Scien...|       false|              100|
|  4.0|    126|25.99|Industrial  Scien...|       false|               50|
|  4.5|   1936|21.99|Industrial  Scien...|       false|              100|
|  4.2|     46|18.99|Industrial  Scien...|       false|              100|
|  4.5|   2505|15.99|Industrial  Scien...|       false|                0|
|  4.3|    216|27.99|Industrial  Scien...|       false|               50|
|  4.0|     53| 9.99|Industrial  Scien...|       false|               50|
|  4.5|    164|15.95|Industrial  Scien...|       false|                0|
|  4.4|    366|29.99|Industrial  Scien...|       false|               50|
|  4.6|    874|12.99|Industrial  Scien

In [94]:
# Count the rows left after removing rows with missing values;
# an unchanged count means no row contained a missing value
df.count()

2165926

In [95]:
# Sanity check: list any rows whose star rating is above 5 or below 0
df.where("stars > 5 OR stars < 0").show()

+-----+-------+-----+------------+------------+-----------------+
|stars|reviews|price|categoryName|isBestSeller|boughtInLastMonth|
+-----+-------+-----+------------+------------+-----------------+
+-----+-------+-----+------------+------------+-----------------+



In [96]:
# A price of 0 stands for an unknown price; keep only rows with a non-zero price
df = df.where("price != 0")
df.show()

+-----+-------+-----+--------------------+------------+-----------------+
|stars|reviews|price|        categoryName|isBestSeller|boughtInLastMonth|
+-----+-------+-----+--------------------+------------+-----------------+
|  4.4|   2876|47.69|Industrial  Scien...|       false|                0|
|  3.8|     55|10.99|Industrial  Scien...|       false|              100|
|  4.0|    126|25.99|Industrial  Scien...|       false|               50|
|  4.5|   1936|21.99|Industrial  Scien...|       false|              100|
|  4.2|     46|18.99|Industrial  Scien...|       false|              100|
|  4.5|   2505|15.99|Industrial  Scien...|       false|                0|
|  4.3|    216|27.99|Industrial  Scien...|       false|               50|
|  4.0|     53| 9.99|Industrial  Scien...|       false|               50|
|  4.5|    164|15.95|Industrial  Scien...|       false|                0|
|  4.4|    366|29.99|Industrial  Scien...|       false|               50|
|  4.6|    874|12.99|Industrial  Scien

In [97]:
# Count the rows left once rows with unknown (zero) prices have been removed
df.count()

1988016

In [98]:
# Number of distinct product categories
df.select('categoryName').distinct().count()

266

In [99]:
# Keep only the products that belong to the most populated categories.
TOP_N_CATEGORIES = 30  # how many categories to retain

# Rank categories by row count. categoryName is a secondary sort key so that
# categories tied on count are ordered deterministically and the cut-off is stable.
top_30_categories = (
    df.groupBy('categoryName')
    .count()
    .orderBy(['count', 'categoryName'], ascending=[False, True])
    .limit(TOP_N_CATEGORIES)
)

# Bring the selected category names to the driver as a plain Python list
# (at most TOP_N_CATEGORIES entries)
top_30_category_names = [row['categoryName'] for row in top_30_categories.collect()]

# Filter the DataFrame to the rows whose category is in that list
filtered_df = df.filter(df['categoryName'].isin(top_30_category_names))

filtered_df.show()

+-----+-------+------+--------------------+------------+-----------------+
|stars|reviews| price|        categoryName|isBestSeller|boughtInLastMonth|
+-----+-------+------+--------------------+------------+-----------------+
|  4.5|  43171| 29.99|Motorcycle Access...|        true|                0|
|  4.7|    215|  5.98|Motorcycle Access...|        true|              900|
|  3.9|   1006| 14.22|Motorcycle Access...|        true|             1000|
|  4.3|  17381| 12.99|Motorcycle Access...|        true|              900|
|  5.0|      1| 21.69|Motorcycle Access...|        true|              200|
|  4.7|   2441|  8.98|Motorcycle Access...|       false|             1000|
|  4.7|    646| 11.99|Motorcycle Access...|       false|              400|
|  4.7|   5311| 13.06|Motorcycle Access...|       false|              400|
|  4.8|    454|  5.97|Motorcycle Access...|       false|              500|
|  2.1|      5| 14.37|Motorcycle Access...|       false|              800|
|  4.6|  22466|  22.4|Mot

In [100]:
# Check that only the expected number of categories is left after filtering.
# The column is referenced by name so it is resolved against filtered_df itself
# rather than against its parent DataFrame df.
filtered_df.select('categoryName').distinct().count()

30

In [101]:
# Count the rows left after restricting the data to the top categories
filtered_df.count()

489909

In [102]:
# Confirm the target column holds only True/False values.
# For a null value the isin() test evaluates to null, and filter() drops such rows,
# so nulls are checked for explicitly. An empty result means the column is clean.
filtered_df.filter(
    filtered_df['isBestSeller'].isNull()
    | ~filtered_df['isBestSeller'].isin([True, False])
).show()

+-----+-------+-----+------------+------------+-----------------+
|stars|reviews|price|categoryName|isBestSeller|boughtInLastMonth|
+-----+-------+-----+------------+------------+-----------------+
+-----+-------+-----+------------+------------+-----------------+



In [103]:
# Change isBestSeller column to binary
from pyspark.sql.functions import when

# Map True -> 1 and every other value -> 0.
# The condition reads the column from filtered_df itself rather than from the parent
# df, so the expression only depends on columns that filtered_df actually exposes.
filtered_df = filtered_df.withColumn(
    'isBestSeller',
    when(filtered_df['isBestSeller'] == True, 1).otherwise(0),
)

filtered_df.show()

+-----+-------+------+--------------------+------------+-----------------+
|stars|reviews| price|        categoryName|isBestSeller|boughtInLastMonth|
+-----+-------+------+--------------------+------------+-----------------+
|  4.5|  43171| 29.99|Motorcycle Access...|           1|                0|
|  4.7|    215|  5.98|Motorcycle Access...|           1|              900|
|  3.9|   1006| 14.22|Motorcycle Access...|           1|             1000|
|  4.3|  17381| 12.99|Motorcycle Access...|           1|              900|
|  5.0|      1| 21.69|Motorcycle Access...|           1|              200|
|  4.7|   2441|  8.98|Motorcycle Access...|           0|             1000|
|  4.7|    646| 11.99|Motorcycle Access...|           0|              400|
|  4.7|   5311| 13.06|Motorcycle Access...|           0|              400|
|  4.8|    454|  5.97|Motorcycle Access...|           0|              500|
|  2.1|      5| 14.37|Motorcycle Access...|           0|              800|
|  4.6|  22466|  22.4|Mot

In [104]:
# Double-check that the row count is unchanged by the isBestSeller conversion
filtered_df.count()

489909

In [108]:
import pandas as pd

# Collect the filtered Spark DataFrame into a pandas DataFrame
pandas_df = filtered_df.toPandas()

# Preview the first five rows
pandas_df.head(n=5)

Unnamed: 0,stars,reviews,price,categoryName,isBestSeller,boughtInLastMonth
0,4.5,43171,29.99,Motorcycle Accessories Parts,1,0
1,4.7,215,5.98,Motorcycle Accessories Parts,1,900
2,3.9,1006,14.22,Motorcycle Accessories Parts,1,1000
3,4.3,17381,12.99,Motorcycle Accessories Parts,1,900
4,5.0,1,21.69,Motorcycle Accessories Parts,1,200


In [111]:
# Write the cleaned data to a CSV file (relative path, so it lands in the working directory).
# index=True is the pandas default: the row index is saved as an unnamed first column,
# so readers of this file should account for it (e.g. index_col=0).
pandas_df.to_csv(path_or_buf='amazon_data_cleaned.csv', index=True)