In [1]:
!wget -q https://dlcdn.apache.org/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz
!tar xf spark-3.5.0-bin-hadoop3.tgz
!pip install -q findspark
!pip install pyspark
!pip install py4j
import os
import sys
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import tensorflow as tf
# os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
# os.environ["SPARK_HOME"] = "/content/spark-3.2.1-bin-hadoop3.2"
# Start a SparkSession
import findspark
findspark.init()

Collecting pyspark
  Downloading pyspark-3.5.0.tar.gz (316.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.9/316.9 MB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425345 sha256=e4dafcee4927bfaf49870522d21f10933ec5c8b94de2b916d3cebd859d590347
  Stored in directory: /root/.cache/pip/wheels/41/4e/10/c2cf2467f71c678cfc8a6b9ac9241e5e44a01940da8fbb17fc
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.0


In [2]:
# Spark entry point plus the stdlib timing module
import time
from pyspark.sql import SparkSession

# Obtain the session for application "MyApp" (getOrCreate reuses an active one)
spark = (
    SparkSession.builder
    .appName("MyApp")
    .getOrCreate()
)

In [15]:
# 1. Read in the csv as a DataFrame
from pyspark import SparkFiles  # kept so the name stays available to other cells

# SparkFiles.get() resolves files registered through SparkContext.addFile();
# nothing is registered in this notebook, and the old call only worked because
# the leading "/" made the joined path absolute. Read that absolute path directly.
DATA_PATH = "/amz_ca_total_products_data_processed.csv"

df = spark.read.csv(
    DATA_PATH,
    header=True,       # first line holds the column names
    inferSchema=True,  # column types are inferred from the data
    quote='"',
    escape='"',        # the quote character also serves as the escape character
)

df.show()

+----------+--------------------+--------------------+--------------------+-----+-------+-----+---------+--------------------+------------+-----------------+
|      asin|               title|              imgUrl|          productURL|stars|reviews|price|listPrice|        categoryName|isBestSeller|boughtInLastMonth|
+----------+--------------------+--------------------+--------------------+-----+-------+-----+---------+--------------------+------------+-----------------+
|B07CV4L6HX|Green Leaf WW3D W...|https://m.media-a...|https://www.amazo...|  4.4|   2876|47.69|      0.0|Industrial  Scien...|       false|                0|
|B09N1HGY74|8pcs Toilet Seat ...|https://m.media-a...|https://www.amazo...|  3.8|     55|10.99|      0.0|Industrial  Scien...|       false|              100|
|B087P7538J|YaeCCC 19 Pcs Hol...|https://m.media-a...|https://www.amazo...|  4.0|    126|25.99|    27.99|Industrial  Scien...|       false|               50|
|B0822FF7YQ|LLPT Butyl Putty ...|https://m.media-a..

In [16]:
# Inspect the column names and the types that inferSchema assigned to them
df.printSchema()

root
 |-- asin: string (nullable = true)
 |-- title: string (nullable = true)
 |-- imgUrl: string (nullable = true)
 |-- productURL: string (nullable = true)
 |-- stars: double (nullable = true)
 |-- reviews: integer (nullable = true)
 |-- price: double (nullable = true)
 |-- listPrice: double (nullable = true)
 |-- categoryName: string (nullable = true)
 |-- isBestSeller: boolean (nullable = true)
 |-- boughtInLastMonth: integer (nullable = true)



In [17]:
# Remove the columns the ML model does not need (identifier, URLs, list price)
df = df.drop('asin', 'imgUrl', 'productURL', 'listPrice')
df.show()

+--------------------+-----+-------+-----+--------------------+------------+-----------------+
|               title|stars|reviews|price|        categoryName|isBestSeller|boughtInLastMonth|
+--------------------+-----+-------+-----+--------------------+------------+-----------------+
|Green Leaf WW3D W...|  4.4|   2876|47.69|Industrial  Scien...|       false|                0|
|8pcs Toilet Seat ...|  3.8|     55|10.99|Industrial  Scien...|       false|              100|
|YaeCCC 19 Pcs Hol...|  4.0|    126|25.99|Industrial  Scien...|       false|               50|
|LLPT Butyl Putty ...|  4.5|   1936|21.99|Industrial  Scien...|       false|              100|
|Lightbeam 16" Lon...|  4.2|     46|18.99|Industrial  Scien...|       false|              100|
|etguuds White USB...|  4.5|   2505|15.99|Industrial  Scien...|       false|                0|
|REAQER CPAP Hose ...|  4.3|    216|27.99|Industrial  Scien...|       false|               50|
|SAVITA 2pcs Magne...|  4.0|     53| 9.99|Industri

In [18]:
# Row count right after dropping columns; serves as the baseline for the
# cleaning steps that follow
df.count()

2165926

In [19]:
# Discard every row that has a missing value in any remaining column
df = df.na.drop()
df.show()

+--------------------+-----+-------+-----+--------------------+------------+-----------------+
|               title|stars|reviews|price|        categoryName|isBestSeller|boughtInLastMonth|
+--------------------+-----+-------+-----+--------------------+------------+-----------------+
|Green Leaf WW3D W...|  4.4|   2876|47.69|Industrial  Scien...|       false|                0|
|8pcs Toilet Seat ...|  3.8|     55|10.99|Industrial  Scien...|       false|              100|
|YaeCCC 19 Pcs Hol...|  4.0|    126|25.99|Industrial  Scien...|       false|               50|
|LLPT Butyl Putty ...|  4.5|   1936|21.99|Industrial  Scien...|       false|              100|
|Lightbeam 16" Lon...|  4.2|     46|18.99|Industrial  Scien...|       false|              100|
|etguuds White USB...|  4.5|   2505|15.99|Industrial  Scien...|       false|                0|
|REAQER CPAP Hose ...|  4.3|    216|27.99|Industrial  Scien...|       false|               50|
|SAVITA 2pcs Magne...|  4.0|     53| 9.99|Industri

In [20]:
# Count the rows remaining after dropna(); compare with the count taken before it
df.count()

2165926

In [21]:
# Replace the free-text 'title' column with a numeric 'titleLength' column
from pyspark.sql.functions import length

# Derive 'titleLength' from 'title', then discard the original text column
df = df.withColumn('titleLength', length('title')).drop('title')

In [24]:
# Show the DataFrame, most expensive rows first, without truncating cell text
df.sort('price', ascending=False).show(truncate=False)

+-----+-------+--------+------------------------------------------------------+------------+-----------------+-----------+
|stars|reviews|price   |categoryName                                          |isBestSeller|boughtInLastMonth|titleLength|
+-----+-------+--------+------------------------------------------------------+------------+-----------------+-----------+
|0.0  |0      |40900.0 |Outdoor Storage  Housing                              |false       |0                |117        |
|0.0  |0      |34552.5 |Outdoor Storage  Housing                              |false       |0                |114        |
|0.0  |0      |32079.9 |Television Projectors                                 |false       |0                |55         |
|0.0  |0      |29508.95|Salon  Spa Equipment                                  |false       |0                |73         |
|0.0  |0      |28012.45|Television Projectors                                 |false       |0                |62         |
|0.0  |0      |2

In [25]:
# Look for ratings outside the 0-5 range; an empty result means none exist
df.where((df['stars'] < 0) | (df['stars'] > 5)).show()

+-----+-------+-----+------------+------------+-----------------+-----------+
|stars|reviews|price|categoryName|isBestSeller|boughtInLastMonth|titleLength|
+-----+-------+-----+------------+------------+-----------------+-----------+
+-----+-------+-----+------------+------------+-----------------+-----------+



In [28]:
# A price of 0 is treated as "unknown": keep only rows with a non-zero price
df = df.where(df['price'] != 0)
df.sort('price', ascending=False).show(truncate=False)

+-----+-------+--------+------------------------------------------------------+------------+-----------------+-----------+
|stars|reviews|price   |categoryName                                          |isBestSeller|boughtInLastMonth|titleLength|
+-----+-------+--------+------------------------------------------------------+------------+-----------------+-----------+
|0.0  |0      |40900.0 |Outdoor Storage  Housing                              |false       |0                |117        |
|0.0  |0      |34552.5 |Outdoor Storage  Housing                              |false       |0                |114        |
|0.0  |0      |32079.9 |Television Projectors                                 |false       |0                |55         |
|0.0  |0      |29508.95|Salon  Spa Equipment                                  |false       |0                |73         |
|0.0  |0      |28012.45|Television Projectors                                 |false       |0                |62         |
|0.0  |0      |2

In [29]:
# Keep only rows whose title is non-empty (titleLength different from 0)
df = df.where(df['titleLength'] != 0)

In [30]:
# Row count after removing rows with unknown (zero) prices and empty titles
df.count()

1988016

In [31]:
# Confirm that no value in the target column falls outside {True, False};
# an empty result means every row passed.
# NOTE(review): the schema printed earlier reports isBestSeller as boolean, so
# this filter presumably cannot match any row as written (a null would compare
# to null and be excluded as well) — confirm whether a null check was intended.
df.filter(~df['isBestSeller'].isin([True, False])).show()

+-----+-------+-----+------------+------------+-----------------+-----------+
|stars|reviews|price|categoryName|isBestSeller|boughtInLastMonth|titleLength|
+-----+-------+-----+------------+------------+-----------------+-----------+
+-----+-------+-----+------------+------------+-----------------+-----------+



In [32]:
# Re-encode the target: isBestSeller becomes 1 for True and 0 otherwise
from pyspark.sql.functions import when

df = df.withColumn(
    'isBestSeller',
    when(df['isBestSeller'] == True, 1).otherwise(0),
)

# Preview the result, most expensive rows first
df.sort('price', ascending=False).show(truncate=False)

+-----+-------+--------+------------------------------------------------------+------------+-----------------+-----------+
|stars|reviews|price   |categoryName                                          |isBestSeller|boughtInLastMonth|titleLength|
+-----+-------+--------+------------------------------------------------------+------------+-----------------+-----------+
|0.0  |0      |40900.0 |Outdoor Storage  Housing                              |0           |0                |117        |
|0.0  |0      |34552.5 |Outdoor Storage  Housing                              |0           |0                |114        |
|0.0  |0      |32079.9 |Television Projectors                                 |0           |0                |55         |
|0.0  |0      |29508.95|Salon  Spa Equipment                                  |0           |0                |73         |
|0.0  |0      |28012.45|Television Projectors                                 |0           |0                |62         |
|0.0  |0      |2

In [33]:
# Re-count after the 0/1 conversion: withColumn only rewrites a column, so the
# total should match the count taken before it
df.count()

1988016

In [34]:
# Number of distinct category names present in the cleaned data
df.select('categoryName').distinct().count()

266

In [35]:
# How many of the most-populated categories to keep
TOP_N_CATEGORIES = 30

# Rank categories by row count. Ties on the count are broken alphabetically by
# name so that the set selected at the cutoff is the same on every run.
top_30_categories = (
    df.groupBy('categoryName')
    .count()
    .orderBy(['count', 'categoryName'], ascending=[False, True])
    .limit(TOP_N_CATEGORIES)
)

# Pull the selected category names back to the driver as a plain Python list
top_30_category_names = [row['categoryName'] for row in top_30_categories.collect()]

# Keep only the rows whose category is one of the selected names
filtered_df = df.filter(df['categoryName'].isin(top_30_category_names))

filtered_df.orderBy('price', ascending=False).show(truncate=False)

+-----+-------+--------+--------------------+------------+-----------------+-----------+
|stars|reviews|price   |categoryName        |isBestSeller|boughtInLastMonth|titleLength|
+-----+-------+--------+--------------------+------------+-----------------+-----------+
|0.0  |0      |29508.95|Salon  Spa Equipment|0           |0                |73         |
|0.0  |0      |27890.8 |Salon  Spa Equipment|0           |0                |156        |
|0.0  |0      |27504.41|Salon  Spa Equipment|0           |0                |142        |
|0.0  |0      |27409.95|Salon  Spa Equipment|0           |0                |123        |
|0.0  |0      |26975.67|Salon  Spa Equipment|0           |0                |125        |
|0.0  |0      |25488.12|Salon  Spa Equipment|0           |0                |120        |
|0.0  |0      |24572.7 |Salon  Spa Equipment|0           |0                |136        |
|0.0  |0      |24520.8 |Salon  Spa Equipment|0           |0                |144        |
|0.0  |0      |24328.

In [36]:
# Check to see if only 30 categories are left as expected.
# The column is selected by name so it is resolved against filtered_df itself
# rather than through a Column object that belongs to the parent df.
filtered_df.select('categoryName').distinct().count()

30

In [37]:
# Count the rows that belong to the retained top categories
filtered_df.count()

489909

In [None]:
import pandas as pd

# Convert both PySpark DataFrames to pandas DataFrames.
# NOTE(review): toPandas() is documented to collect all rows onto the driver,
# so both results are expected to need to fit in driver memory — confirm this
# is acceptable for the full (unfiltered) frame.
top_categories_df = filtered_df.toPandas()  # rows restricted to the top categories
all_categories_df = df.toPandas()  # every cleaned row, all categories

Unnamed: 0,stars,reviews,price,categoryName,isBestSeller,boughtInLastMonth,titleLength
0,4.5,43171,29.99,Motorcycle Accessories Parts,1,0,254
1,4.7,215,5.98,Motorcycle Accessories Parts,1,900,132
2,3.9,1006,14.22,Motorcycle Accessories Parts,1,1000,55
3,4.3,17381,12.99,Motorcycle Accessories Parts,1,900,195
4,5.0,1,21.69,Motorcycle Accessories Parts,1,200,189


Unnamed: 0,stars,reviews,price,categoryName,isBestSeller,boughtInLastMonth,titleLength
0,4.4,2876,47.69,Industrial Scientific,0,0,60
1,3.8,55,10.99,Industrial Scientific,0,100,152
2,4.0,126,25.99,Industrial Scientific,0,50,160
3,4.5,1936,21.99,Industrial Scientific,0,100,130
4,4.2,46,18.99,Industrial Scientific,0,100,153


In [None]:
# Render the last rows of each pandas frame: top-category subset first, then the full set
display(top_categories_df.tail(), all_categories_df.tail())

Unnamed: 0,stars,reviews,price,categoryName,isBestSeller,boughtInLastMonth,titleLength
489904,0.0,0,32.59,Shaving Hair Removal Products,0,0,188
489905,0.0,0,15.21,Shaving Hair Removal Products,0,0,188
489906,0.0,0,13.3,Shaving Hair Removal Products,0,0,135
489907,0.0,0,11.87,Shaving Hair Removal Products,0,0,136
489908,0.0,0,233.86,Shaving Hair Removal Products,0,0,118


Unnamed: 0,stars,reviews,price,categoryName,isBestSeller,boughtInLastMonth,titleLength
1988011,0.0,0,52.99,Play Sets Playground Equipment,0,0,190
1988012,0.0,0,15.99,Play Sets Playground Equipment,0,0,106
1988013,0.0,0,15.29,Play Sets Playground Equipment,0,0,84
1988014,0.0,0,21.09,Play Sets Playground Equipment,0,0,150
1988015,0.0,0,21.99,Play Sets Playground Equipment,0,0,142


In [None]:
# Write the cleaned frames to CSV files in the current working directory.
# index=False leaves the pandas row index out of the files. This cell only
# writes the files; nothing is downloaded here.
top_categories_df.to_csv('top_categories_cleaned_data.csv', index=False)
all_categories_df.to_csv('all_categories_cleaned_data.csv', index=False)