In [0]:
#Install Java and Apache Spark
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://www-us.apache.org/dist/spark/spark-2.4.5/spark-2.4.5-bin-hadoop2.7.tgz
!tar xf spark-2.4.5-bin-hadoop2.7.tgz

In [0]:
# Install spark-related depdencies for Python
!pip install -q findspark
!pip install pyspark

Collecting pyspark
[?25l  Downloading https://files.pythonhosted.org/packages/9a/5a/271c416c1c2185b6cb0151b29a91fff6fcaed80173c8584ff6d20e46b465/pyspark-2.4.5.tar.gz (217.8MB)
[K     |████████████████████████████████| 217.8MB 45kB/s 
[?25hCollecting py4j==0.10.7
[?25l  Downloading https://files.pythonhosted.org/packages/e3/53/c737818eb9a7dc32a7cd4f1396e787bd94200c3997c72c1dbe028587bd76/py4j-0.10.7-py2.py3-none-any.whl (197kB)
[K     |████████████████████████████████| 204kB 61.0MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-2.4.5-py2.py3-none-any.whl size=218257927 sha256=1a0d6435fa38158bdc0340db5550c890eaf0ad7108a84040740fb9c96c4f3e68
  Stored in directory: /root/.cache/pip/wheels/bf/db/04/61d66a5939364e756eb1c1be4ec5bdce6e04047fc7929a3c3c
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.7 pyspark-2.4.5


In [0]:
# Export the locations of the Java installation and the unpacked Spark
# distribution through the JAVA_HOME / SPARK_HOME environment variables.

import os

os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    "SPARK_HOME": "/content/spark-2.4.5-bin-hadoop2.7",
})

In [0]:
# Mount Google Drive into the Colab filesystem at /content/gdrive.
# The call is interactive: it prints an authorization URL and waits for the code.

from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


In [0]:
import json
import gzip
import pandas as pd
from urllib.request import urlopen

In [0]:
from pyspark import SparkContext
from pyspark.sql import SparkSession, SQLContext

from pyspark.sql.functions import UserDefinedFunction
from pyspark.sql.types import IntegerType

from pyspark.ml.feature import Tokenizer, Word2Vec

# these imports are how we build and manage our data science processes: cleaning data, preparing a model,
# executing the model, and evaluating the model.

from matplotlib import pyplot as plt
import numpy as np
import functools
%matplotlib inline

In [0]:
from pyspark.sql.functions import isnan, when, count, col

In [0]:
# Notebook-wide constants, kept in one cell so they are easy to find, confirm and adjust.

# JSON file with the ratings and product titles, read from the mounted Google Drive.
DATA_NAME = '/content/gdrive/My Drive/luxury_beauty_index_title.json'
# Spark application name; it now names the luxury-beauty dataset actually loaded
# (it previously said "Clothing").
APP_NAME = "Amazon Luxury Beauty Recommender"
# Spark master URL: run locally, using all available cores.
SPARK_URL = "local[*]"


In [0]:
# Memory size string passed to Spark as `spark.driver.memory` when the session is built.
MAX_MEMORY = "6g"

In [0]:
# Create the Spark session and an SQLContext on top of it.
# NOTE: `sc` keeps its original name for the benefit of any other cells that
# use it, but it holds a SparkSession, not a SparkContext.
sc = (
    SparkSession.builder
    .appName(APP_NAME)
    .master(SPARK_URL)
    .config('spark.driver.memory', MAX_MEMORY)  # previously set twice with the same value
    .getOrCreate()
)
sc.sparkContext.setCheckpointDir('/tmp')
# SQLContext takes a SparkContext as its first argument; hand it the session's
# context and pass the session explicitly so both share the same state.
sqlContext = SQLContext(sc.sparkContext, sparkSession=sc)

In [0]:
# Load the ratings file into a Spark DataFrame; the schema is inferred from the JSON records.
df = sqlContext.read.format("json").load(DATA_NAME)

In [0]:
# Report the dataset dimensions.
n_rows = df.count()
n_cols = len(df.columns)
print(f"Dataset shape is {n_rows:d} rows by {n_cols:d} columns.")

Dataset shape is 29187 rows by 4 columns.


In [0]:
# Print each column's name, type and nullability.
df.printSchema()

root
 |-- product_index: long (nullable = true)
 |-- rating: double (nullable = true)
 |-- reviewer_index: long (nullable = true)
 |-- title: string (nullable = true)



In [0]:
# Remove rows that are exact duplicates across all columns, then show the remaining row count.
# NOTE(review): a (reviewer_index, product_index) pair can still occur several times
# with different ratings after this step — confirm that is acceptable for training.
df = df.dropDuplicates()
df.count()

27823

In [0]:
# Count NULL entries per column; the result has one count column per input column.
# Only NULLs are counted here — NaN values are not tested.
null_counts = [
    count(when(col(name).isNull(), name)).alias(name)
    for name in df.columns
]
df.select(null_counts).show()

+-------------+------+--------------+-----+
|product_index|rating|reviewer_index|title|
+-------------+------+--------------+-----+
|            0|     0|             0|    0|
+-------------+------+--------------+-----+



In [0]:
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

In [0]:
# Randomly split the ratings into roughly 80% training / 20% test rows;
# the fixed seed keeps the split repeatable.
training_data, test_data = df.randomSplit(weights=[0.8, 0.2], seed=123)

In [0]:
# Baseline ALS model for explicit ratings.
# rank, maxIter and regParam are spelled out at Spark's default values.
# coldStartStrategy='drop' discards rows whose prediction would be NaN (users or
# products not seen during training) so the evaluator receives valid numbers;
# nonnegative=True constrains the learned factors to be non-negative.
# A fixed seed makes the random factor initialisation repeatable between runs,
# matching the seeded train/test split.
als_model = ALS(
    userCol='reviewer_index', itemCol='product_index', ratingCol='rating',
    rank=10, maxIter=10, regParam=0.1,
    coldStartStrategy='drop', nonnegative=True, implicitPrefs=False,
    seed=123,
)

In [0]:
# Train the ALS model on the training split.
model = als_model.fit(training_data)

In [0]:
# Score the test split; the result keeps the input columns and adds `prediction`.
predictions = model.transform(test_data)
predictions.show()

+-------------+------+--------------+--------------------+----------+
|product_index|rating|reviewer_index|               title|prediction|
+-------------+------+--------------+--------------------+----------+
|          148|   5.0|          1759|Crabtree & Evelyn...|  4.789463|
|          463|   5.0|          3097|Jouer Luminizing ...|  4.523949|
|          463|   5.0|           809|Jouer Luminizing ...| 3.8161159|
|          463|   4.0|          3360|Jouer Luminizing ...|  2.303919|
|          463|   5.0|          2215|Jouer Luminizing ...|  4.416069|
|          463|   4.0|          1411|Jouer Luminizing ...|  4.094961|
|          471|   5.0|          1930|Vichy Pureté Ther...| 4.3253007|
|          471|   4.0|           521|Vichy Pureté Ther...| 3.1159132|
|          833|   2.0|          2924|butter LONDON Ico...|  3.453354|
|          833|   4.0|          2994|butter LONDON Ico...| 3.5188751|
|          833|   3.0|           257|butter LONDON Ico...| 3.6329434|
|          833|   3.

In [0]:
# RMSE evaluator that compares the `prediction` column against the `rating` column.
from pyspark.ml.evaluation import RegressionEvaluator

evaluator = RegressionEvaluator(labelCol="rating", predictionCol="prediction", metricName="rmse")

# Echo the three settings back to confirm how the evaluator is configured.
for getter in (evaluator.getMetricName, evaluator.getLabelCol, evaluator.getPredictionCol):
    print(getter())

rmse
rating
prediction


In [0]:
# Compute the RMSE of the test-split predictions and print it.
RMSE = evaluator.evaluate(predictions)
print(RMSE)

1.0399496967110733


In [0]:
# Ask the fitted model for the 10 highest-scoring products for every user.
user_recs = model.recommendForAllUsers(10)

In [0]:
# Preview: one row per reviewer_index, with the recommendations packed into an array column.
user_recs.show()

+--------------+--------------------+
|reviewer_index|     recommendations|
+--------------+--------------------+
|          1580|[[209, 6.6506586]...|
|           471|[[1403, 5.855223]...|
|          1591|[[1463, 5.3505282...|
|          1342|[[1403, 7.0037026...|
|          2122|[[120, 5.9558034]...|
|          2142|[[1403, 6.918046]...|
|           463|[[120, 5.8547897]...|
|           833|[[256, 5.4818115]...|
|          3794|[[1403, 7.051875]...|
|          1645|[[2, 7.2931385], ...|
|          3175|[[518, 6.429821],...|
|           496|[[120, 5.522118],...|
|          2366|[[518, 6.341648],...|
|          2866|[[2, 4.3754506], ...|
|           148|[[2, 7.29276], [1...|
|          1088|[[1403, 6.0008698...|
|          1238|[[1403, 6.268956]...|
|          1829|[[1074, 5.155364]...|
|          1959|[[1403, 6.2203918...|
|          2659|[[2, 1.4583375], ...|
+--------------+--------------------+
only showing top 20 rows



In [0]:
# Register the recommendations as the SQL view `temp`.
# createOrReplaceTempView keeps this cell re-runnable: createTempView raises an
# error when a view with the same name already exists in the session.
user_recs.createOrReplaceTempView('temp')

In [0]:
# Flatten the recommendations array of view `temp`: explode() emits one row per
# array element, and the element's product_index / rating fields are selected as
# the columns product_index / prediction alongside reviewer_index.
clean_recs = sqlContext.sql("SELECT reviewer_index,\
                                    products_and_ratings.product_index AS product_index,\
                                     products_and_ratings.rating AS prediction\
                              FROM temp\
                              LATERAL VIEW explode(recommendations) exploded_table\
                              AS products_and_ratings")

In [0]:
# Preview the flattened recommendations (one row per reviewer/product pair).
clean_recs.show()

+--------------+-------------+----------+
|reviewer_index|product_index|prediction|
+--------------+-------------+----------+
|          1580|          209| 6.6506586|
|          1580|            2|  6.468758|
|          1580|          120| 6.4565763|
|          1580|         1205|  6.268751|
|          1580|         1463| 6.2304897|
|          1580|           88| 6.2075086|
|          1580|          518| 6.1629066|
|          1580|          105|  6.083294|
|          1580|          315|  6.069291|
|          1580|         1146|  6.068855|
|           471|         1403|  5.855223|
|           471|         1157|  5.811935|
|           471|          518|  5.738923|
|           471|          166|  5.634433|
|           471|          715| 5.3668146|
|           471|          114| 5.3165555|
|           471|          271|  5.222644|
|           471|         1443| 5.1858034|
|           471|         1146| 5.1572957|
|           471|         1430|  5.127748|
+--------------+-------------+----

In [0]:
# Product lookup columns only; still one row per rating, so product rows repeat.
df_products = df.select('product_index', 'title')

In [0]:
# Preview the product/title pairs (repeated rows are expected at this stage).
df_products.show()

+-------------+--------------------+
|product_index|               title|
+-------------+--------------------+
|         1001|Crabtree & Evelyn...|
|         1001|Crabtree & Evelyn...|
|         1001|Crabtree & Evelyn...|
|         1001|Crabtree & Evelyn...|
|         1001|Crabtree & Evelyn...|
|         1001|Crabtree & Evelyn...|
|         1001|Crabtree & Evelyn...|
|         1001|Crabtree & Evelyn...|
|         1001|Crabtree & Evelyn...|
|         1001|Crabtree & Evelyn...|
|         1001|Crabtree & Evelyn...|
|         1001|Crabtree & Evelyn...|
|         1001|Crabtree & Evelyn...|
|         1001|Crabtree & Evelyn...|
|         1001|Crabtree & Evelyn...|
|         1001|Crabtree & Evelyn...|
|          955|Supersmile Profes...|
|          955|Supersmile Profes...|
|          955|Supersmile Profes...|
|          955|Supersmile Profes...|
+-------------+--------------------+
only showing top 20 rows



In [0]:
# Attach product titles to the flattened recommendations.
# The cell previously stopped at a bare `recs_with_titles =`, which is a SyntaxError;
# the assignment is completed with a left join against the de-duplicated product list.
recs_with_titles = clean_recs.join(df_products.dropDuplicates(), on='product_index', how='left')

In [0]:
# Collapse the product list to distinct (product_index, title) rows.
df_products_unique = df_products.distinct()

In [0]:
# Preview the de-duplicated product list.
df_products_unique.show()

+-------------+--------------------+
|product_index|               title|
+-------------+--------------------+
|         1233|jane iredale Pure...|
|         1061|PHYTODÉFRISANT Bo...|
|          604|PHILIP B Drop Dea...|
|          525|Oscar Blandi Pron...|
|         1054|Nioxin Cleanser S...|
|         1093|Zwilling J.A. Hen...|
|         1472|PÜR 4-in-1 Presse...|
|         1448|Laura Geller New ...|
|          886|LORAC Tantalizer ...|
|          265|butter LONDON Tru...|
|         1501|Mario Badescu Gly...|
|         1121|MONTBLANC Star Wa...|
|          496|Bioelements SPF 5...|
|          722|FHI HEAT Techniqu...|
|         1420|Happy Buddha Body...|
|           36|Obagi Hydrate Lux...|
|          787|Bliss Crown Jewel...|
|          275|MAKE Cosmetics Cu...|
|          627|Mustela Stretch M...|
|          549|NUXE Prodigieux S...|
+-------------+--------------------+
only showing top 20 rows



In [0]:
# Look up the title of every recommended product; the left join keeps a
# recommendation even when no title row matches its product_index.
recs_with_products = clean_recs.join(df_products_unique, 'product_index', 'left')

In [0]:
# Preview the recommendations together with their product titles.
recs_with_products.show()

+-------------+--------------+----------+--------------------+
|product_index|reviewer_index|prediction|               title|
+-------------+--------------+----------+--------------------+
|          209|          1580| 6.6506586|Bioelements Keraf...|
|            2|          1580|  6.468758|Bioelements Cruci...|
|          120|          1580| 6.4565763| Anthony Shave Cream|
|         1205|          1580|  6.268751|Supergoop! Daily ...|
|         1463|          1580| 6.2304897|Mario Badescu Cle...|
|           88|          1580| 6.2075086|Bioelements Flash...|
|          518|          1580| 6.1629066|Pure Gentle Shamp...|
|          105|          1580|  6.083294|bliss Soapy Suds ...|
|          315|          1580|  6.069291|L'Occitane Gentle...|
|         1146|          1580|  6.068855|Mario Badescu Cer...|
|         1403|           471|  5.855223|ABBA Pure Moistur...|
|         1157|           471|  5.811935|PCA SKIN A&C Syne...|
|          518|           471|  5.738923|Pure Gentle Sh

In [0]:
# Keep only the product / reviewer / rating columns (drops `title`) and preview them.
rating_columns = ['product_index', 'reviewer_index', 'rating']
df_ratings = df.select(rating_columns)
df_ratings.show()

+-------------+--------------+------+
|product_index|reviewer_index|rating|
+-------------+--------------+------+
|         1001|            26|   5.0|
|         1001|          1134|   4.0|
|         1001|          3685|   5.0|
|         1001|             0|   5.0|
|         1001|          2057|   5.0|
|         1001|          2740|   5.0|
|         1001|          3792|   3.0|
|         1001|          3640|   5.0|
|         1001|            26|   3.0|
|         1001|          1409|   4.0|
|         1001|           859|   5.0|
|         1001|          1793|   5.0|
|         1001|          3641|   5.0|
|         1001|            84|   5.0|
|         1001|          1409|   5.0|
|         1001|           942|   5.0|
|          955|           748|   5.0|
|          955|          1539|   5.0|
|          955|           795|   3.0|
|          955|          2933|   3.0|
+-------------+--------------+------+
only showing top 20 rows



In [0]:
# Register the de-duplicated ratings DataFrame as the SQL view `ratings`.
# createOrReplaceTempView keeps this cell re-runnable: createTempView raises an
# error when a view with the same name already exists in the session.
df.createOrReplaceTempView('ratings')

In [0]:
# List the products that reviewer 1580 has reviewed, for comparison with the
# recommendations generated for that same reviewer.
sqlContext.sql('select reviewer_index, product_index, title from ratings where reviewer_index = 1580').show()

+--------------+-------------+--------------------+
|reviewer_index|product_index|               title|
+--------------+-------------+--------------------+
|          1580|         1462|essie Nail Color ...|
|          1580|         1051|essie Nail Color ...|
|          1580|         1217|EltaMD UV Sport S...|
|          1580|          915|OPI Nail Envy Nai...|
|          1580|         1284|butter LONDON Tre...|
|          1580|         1037|Klorane Dry Shamp...|
|          1580|          698|essie Nail Color ...|
|          1580|         1114|essie Nail Color ...|
|          1580|          506|essie Nail Color ...|
|          1580|           21|amika Perk Up Dry...|
|          1580|           21|amika Perk Up Dry...|
|          1580|          475|essie Nail Color ...|
|          1580|          778|essie Nail Color ...|
|          1580|          521|essie Nail Color ...|
|          1580|         1276|Patent Shine 10X ...|
|          1580|           60|butter LONDON Nai...|
+-----------

In [0]:
# Left-join the recommendations with the known ratings: `rating` is filled in where
# the reviewer already rated the recommended product and NULL otherwise.
# NOTE(review): df_ratings can contain several rows for the same
# (product_index, reviewer_index) pair, in which case this join repeats the
# recommendation row — confirm whether ratings should be aggregated first.
recommendation_list = recs_with_products.join(df_ratings, on=['product_index', 'reviewer_index'], how='left_outer')

In [0]:
# Show the recommendations whose product the reviewer has already rated
# (rows where the joined `rating` is not NULL).
recommendation_list.where(col('rating').isNotNull()).show()

+-------------+--------------+----------+--------------------+------+
|product_index|reviewer_index|prediction|               title|rating|
+-------------+--------------+----------+--------------------+------+
|          266|          1829|  4.775143|Revision Skincare...|   5.0|
|          444|          3749|  4.778608|Biolage Hydrasour...|   5.0|
|         1403|           243| 4.9723034|ABBA Pure Moistur...|   5.0|
|          166|           243| 4.8472805|ABBA Moisture Con...|   5.0|
|           67|          3179| 3.6533325|L'Occitane 15% Sh...|   4.0|
|         1285|          2996|  4.928733|EltaMD PM Therapy...|   5.0|
|          498|          1339| 2.8086095|Olaplex Hair Perf...|   3.0|
|         1045|          3490|  4.799543|Mario Badescu Dry...|   5.0|
|           88|          1975| 5.0095677|Bioelements Flash...|   5.0|
|         1224|          1223|   4.87154|Malin + Goetz Lip...|   5.0|
|          928|          1223| 4.8180695|LORAC Natural Per...|   5.0|
|         1139|     