# Recommendation - Amazon – Toys and Games
Pham Quoc Thai

## Import library

In [1]:
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark import SparkConf

In [3]:
from pyspark.sql.types import DoubleType, IntegerType
from pyspark.sql.functions import isnan, when, count, col, udf
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline
from pyspark.sql.functions import col

In [4]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

In [5]:
SparkContext.setSystemProperty('spark.executor.memory', '12g')
SparkContext.setSystemProperty('spark.driver.memory', '12g')
#SparkContext.setSystemProperty('spark.executor.memoryOverhead', '2048')

sc = SparkContext(master='local', appName='Recommendation')

24/03/14 12:39:40 WARN Utils: Your hostname, QT-Ubuntu resolves to a loopback address: 127.0.1.1; using 172.16.8.87 instead (on interface enp3s0)
24/03/14 12:39:40 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/03/14 12:39:46 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [6]:
spark = SparkSession(sc)

## Read data and overview

In [7]:
df_rating = spark.read.csv('cleaned_data/Products_ThoiTrangNam_rating_clean.csv', header=True)

In [30]:
df_product = spark.read.option("multiLine", "true").csv('cleaned_data/Products_ThoiTrangNam_clean.csv', header=True)

In [31]:
df_product.show(5, truncate=True)

+----------+-------+------+--------------------+----------------+--------------------+
|product_id|  price|rating|    product_name_new|sub_category_new|     description_new|
+----------+-------+------+--------------------+----------------+--------------------+
|       190|86250.0|   4.9|áo lỗ gân form th...|           áo lỗ|mục shopee thời n...|
|       191|26800.0|   4.9|áo lỗ nam trắng c...|           áo lỗ|mục shopee thời n...|
|       192|39500.0|   4.8|áo lỗ nam tyasuo ...|           áo lỗ|mục shopee thời n...|
|       193|16500.0|   4.8| áo lỗ hàng việt nam|           áo lỗ|mục shopee thời n...|
|       194|45000.0|   4.8|áo nam thể lỗ mẫu...|           áo lỗ|mục shopee thời n...|
+----------+-------+------+--------------------+----------------+--------------------+
only showing top 5 rows



In [32]:
#check nan/null values
df_product.select([count(when(col(c).isNull(), c)).alias(c) for c in
           df_product.columns]).toPandas().T

[Stage 67:>                                                         (0 + 1) / 1]                                                                                

Unnamed: 0,0
product_id,0
price,0
rating,0
product_name_new,0
sub_category_new,0
description_new,0


In [33]:
df_product = df_product.select('product_id','product_name_new')
df_product.show(5, truncate=True)

+----------+--------------------+
|product_id|    product_name_new|
+----------+--------------------+
|       190|áo lỗ gân form th...|
|       191|áo lỗ nam trắng c...|
|       192|áo lỗ nam tyasuo ...|
|       193| áo lỗ hàng việt nam|
|       194|áo nam thể lỗ mẫu...|
+----------+--------------------+
only showing top 5 rows



In [34]:
df_rating.show(5,truncate=True)

+----------+-------+------------------+------+
|product_id|user_id|              user|rating|
+----------+-------+------------------+------+
|       190|      1|      karmakyun2nd|     5|
|       190|      2|  tranquangvinh_vv|     5|
|       190|      3|nguyenquoctoan2005|     5|
|       190|      4|    nguyenthuyhavi|     5|
|       190|      5|      luonganh5595|     5|
+----------+-------+------------------+------+
only showing top 5 rows



In [35]:
data_sub = df_rating.join(df_product, on=['product_id'], how='left')
data_sub.show(5)

+----------+-------+------------------+------+--------------------+
|product_id|user_id|              user|rating|    product_name_new|
+----------+-------+------------------+------+--------------------+
|       190|      1|      karmakyun2nd|     5|áo lỗ gân form th...|
|       190|      2|  tranquangvinh_vv|     5|áo lỗ gân form th...|
|       190|      3|nguyenquoctoan2005|     5|áo lỗ gân form th...|
|       190|      4|    nguyenthuyhavi|     5|áo lỗ gân form th...|
|       190|      5|      luonganh5595|     5|áo lỗ gân form th...|
+----------+-------+------------------+------+--------------------+
only showing top 5 rows



[Stage 73:>                                                         (0 + 1) / 1]                                                                                

In [44]:
data_sub.count()

                                                                                

952548

## Preprocessing data

In [37]:
#make sure rating column is DoubleType, and product_id, user_id is numeric
data_sub = data_sub.withColumn("product_id", data_sub["product_id"].cast(IntegerType()))
data_sub = data_sub.withColumn("user_id", data_sub["user_id"].cast(IntegerType()))
data_sub = data_sub.withColumn("rating", data_sub["rating"].cast(DoubleType()))

In [38]:
# Print the Schema of the DataFrame
data_sub.printSchema()

root
 |-- product_id: integer (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- user: string (nullable = true)
 |-- rating: double (nullable = true)
 |-- product_name_new: string (nullable = true)



In [39]:
#inspect
data_sub.show(5, truncate=True)

+----------+-------+------------------+------+--------------------+
|product_id|user_id|              user|rating|    product_name_new|
+----------+-------+------------------+------+--------------------+
|       190|      1|      karmakyun2nd|   5.0|áo lỗ gân form th...|
|       190|      2|  tranquangvinh_vv|   5.0|áo lỗ gân form th...|
|       190|      3|nguyenquoctoan2005|   5.0|áo lỗ gân form th...|
|       190|      4|    nguyenthuyhavi|   5.0|áo lỗ gân form th...|
|       190|      5|      luonganh5595|   5.0|áo lỗ gân form th...|
+----------+-------+------------------+------+--------------------+
only showing top 5 rows



[Stage 82:>                                                         (0 + 1) / 1]                                                                                

In [43]:
#check nan/null values
data_sub.select([count(when(col(c).isNull(), c)).alias(c) for c in
           data_sub.columns]).toPandas().T

                                                                                

Unnamed: 0,0
product_id,0
user_id,0
user,0
rating,0
product_name_new,0


In [42]:
data_sub = data_sub.dropna(subset=["product_name_new"])

In [41]:
#Check classes
data_sub.groupBy('rating').count().show()

[Stage 96:>                                                         (0 + 1) / 1]

+------+------+
|rating| count|
+------+------+
|   1.0| 41447|
|   4.0|118212|
|   3.0| 63051|
|   2.0| 24110|
|   5.0|777662|
+------+------+



                                                                                

In [None]:
# Distinct users and movies
users = data_sub.select("user_id").distinct().count()
products = data_sub.select("product_id").distinct().count()
numerator = data_sub.count()

[Stage 126:>                                                        (0 + 1) / 1]

In [None]:
print('Number of users:', users)
print('Number of products:', products)
print('Number of records:', numerator)

In [None]:
# Number of ratings matrix could contain if no empty cells
denominator = users * products
denominator

In [None]:
#Calculating sparsity
sparsity = 1 - (numerator*1.0 / denominator)
print ("Sparsity: "), sparsity

High sparse

## Prepare data to ALS format

In [23]:
# Create an indexer
#indexer = StringIndexer(inputCol='user_id',
#                       outputCol='asin_idx')

# Indexer identifies categories in the data
#indexer_model = indexer.fit(data_sub)

# Indexer creates a new column with numeric index values
#data_indexed = indexer_model.transform(data_sub)

# Repeat the process for the other categorical feature
#indexer1 = StringIndexer(inputCol='reviewerID',
 #                        outputCol='reviewerID_idx')
#indexer1_model = indexer1.fit(data_indexed)
#data_indexed = indexer1_model.transform(data_indexed)

                                                                                

In [36]:
#inspect
data_indexed = data_sub
del data_sub
data_indexed.show(5, truncate=True)

+----------+-------+------+
|product_id|user_id|rating|
+----------+-------+------+
|       190|      1|   5.0|
|       190|      2|   5.0|
|       190|      3|   5.0|
|       190|      4|   5.0|
|       190|      5|   5.0|
+----------+-------+------+
only showing top 5 rows



In [37]:
#check null
data_indexed.select([count(when(col(c).isNull(), c)).alias(c) for c in
           data_indexed.columns]).toPandas().T

                                                                                

Unnamed: 0,0
product_id,0
user_id,0
rating,0


## Modeling

In [38]:
#split data into trainset, testset
# many data so we will use 0.8 / 0.2
(training, test) = data_indexed.randomSplit([0.8, 0.2])

In [39]:
#some parameter
#maxIter: maximum number of iterations that ALS will run during training
#regParam: controls regularization to prevent overfitting. higher value indicates stronger regularization, lower value allows more flexibility in fitting the data.
#rank: determines the number of latent factors used to represent users and items.
#higher ranker the lower error but more complexity compute
_maxIter = 20
_regParam = 0.01
_rank = 60
#train the model
als = ALS(maxIter=_maxIter,
          regParam=_regParam,
          rank = _rank,
          userCol="user_id",
          itemCol="product_id",
          ratingCol="rating",
          coldStartStrategy="drop",
          nonnegative=True)
model = als.fit(training)

24/03/13 21:00:52 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
                                                                                

## Evaluation

In [40]:
# Evaluate the model by computing the RMSE on the test data
predictions = model.transform(test)

In [41]:
#inspect
predictions.show(5)

                                                                                

+----------+-------+------+----------+
|product_id|user_id|rating|prediction|
+----------+-------+------+----------+
|      2142|     40|   5.0| 3.9790876|
|     25517|     40|   5.0| 3.8241038|
|    232473|     40|   5.0|  5.035574|
|     11317|     50|   5.0|  4.661109|
|     13623|     57|   5.0|  5.002939|
+----------+-------+------+----------+
only showing top 5 rows



In [42]:
#calcuate rmse score
evaluator = RegressionEvaluator(metricName="rmse",
                                labelCol="rating",
                                predictionCol="prediction")
rmse = evaluator.evaluate(predictions)

                                                                                

In [43]:
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 1.1477614413474846


The model can predict new overall with error nearly 1.0 

In [44]:
# save model to disk
#model.save("recommendation_model_amazon_toys_games")

### Providing Recommendations: for all users

In [45]:
# get 5 recommendations which have highest rating.
user_recs = model.recommendForAllUsers(5)

In [46]:
user_recs.show(5, truncate=False)

[Stage 277:>                                                        (0 + 1) / 1]

+-------+------------------------------------------------------------------------------------------------------+
|user_id|recommendations                                                                                       |
+-------+------------------------------------------------------------------------------------------------------+
|12     |[{19344, 6.25057}, {173345, 6.2220078}, {172416, 6.1911573}, {235012, 6.1029043}, {172944, 6.0931697}]|
|26     |[{12334, 6.008749}, {172944, 6.0044107}, {19344, 5.9870224}, {173345, 5.9641447}, {235012, 5.909465}] |
|28     |[{173345, 6.349544}, {172416, 6.240093}, {173325, 6.196763}, {231384, 6.1837873}, {151305, 6.1657777}]|
|31     |[{10489, 6.274107}, {111747, 6.2604423}, {12382, 6.2467313}, {173710, 6.0998383}, {174879, 6.099676}] |
|34     |[{12741, 6.9398265}, {10490, 6.916534}, {173345, 6.9019046}, {16155, 6.8827195}, {174436, 6.8670235}] |
+-------+---------------------------------------------------------------------------------------

                                                                                

In [35]:
#user_recs.count()

## Save data to file

In [49]:
df_product = df_product.select('product_id','product_name_new')

In [36]:
#get reviewerID for reviewerID_idx
df_reviewer_reviewer_id = data_indexed.select('reviewerID_idx', 'reviewerID').distinct()

In [37]:
#get product_id for asin_idx
df_product_idx = data_indexed.select('asin_idx', 'asin').distinct()

In [51]:
data_indexed.show(3)

+----------+-------+------+
|product_id|user_id|rating|
+----------+-------+------+
|       190|      1|   5.0|
|       190|      2|   5.0|
|       190|      3|   5.0|
+----------+-------+------+
only showing top 3 rows



In [55]:
#combine dataframe with reviewerID
new_user_recs = user_recs.join(df_product, on=['product_id'], how='left')

AnalysisException: [UNRESOLVED_USING_COLUMN_FOR_JOIN] USING column `product_id` cannot be resolved on the left side of the join. The left-side columns: [`recommendations`, `user_id`].

In [39]:
#inspect
new_user_recs.show(3, truncate=False)

23/08/04 14:54:47 WARN DAGScheduler: Broadcasting large task binary with size 1327.2 KiB
23/08/04 14:55:05 WARN DAGScheduler: Broadcasting large task binary with size 1324.1 KiB


+--------------+-----------------------------------------------------------------------------------------------+--------------+
|reviewerID_idx|recommendations                                                                                |reviewerID    |
+--------------+-----------------------------------------------------------------------------------------------+--------------+
|0             |[{9847, 6.2357802}, {6836, 5.9811497}, {9254, 5.9066997}, {7533, 5.8986154}, {7735, 5.8954325}]|AJGU56YG8G1DQ |
|1             |[{6696, 5.283102}, {4135, 5.2031755}, {4560, 5.169352}, {7244, 5.1690936}, {8672, 5.165695}]   |A1M8AYAL3L8ACP|
|2             |[{10415, 5.114091}, {2700, 5.0675197}, {7010, 5.048225}, {9254, 5.0290294}, {7735, 5.0211015}] |A1II2ZRPKZAQQD|
+--------------+-----------------------------------------------------------------------------------------------+--------------+
only showing top 3 rows



In [40]:
# Save to disk
new_user_recs.write.parquet('reviews_Toys_and_Games_5_User.parquet', mode='overwrite')
df_asin_asin_idx.write.parquet('reviews_Toys_and_Games_5_Product.parquet', mode='overwrite')

23/08/04 14:55:06 WARN DAGScheduler: Broadcasting large task binary with size 1327.2 KiB
23/08/04 14:55:23 WARN DAGScheduler: Broadcasting large task binary with size 1525.3 KiB
                                                                                

## Then make recommendations to some users: 
A3GJPLCZCDXXG6, A34U85WY8ZWBPV, A2VIY2TL6QPYLG

In [41]:
def make_recommendation(reviewerID):
    #create a dictionary for user recommendation
    find_user_rec = new_user_recs.filter(new_user_recs['reviewerID'] == reviewerID)
    user = find_user_rec.first()
    lst = []
    for row in user['recommendations']:
        row_f = df_asin_asin_idx.filter(df_asin_asin_idx.asin_idx == row['asin_idx'])
        row_f_first = row_f.first()
        lst.append((row['asin_idx'], row_f_first['asin'], row['rating']))
    dic_user_rec = {'reviewerID' : user.reviewerID, 'recommendations' :lst}
    return dic_user_rec

In [42]:
def make_result_dataframe(dic_user_rec, rating=4):
    #adjust rating parameter for filter higher rating
    #creating dataframe recommendations for new users
    import pandas as pd
    df_result = pd.DataFrame(dic_user_rec['recommendations'], columns=['asin_idx', 'productID', 'rating'])
    df_result['reviewerID'] = dic_user_rec['reviewerID']
    column_order = ['reviewerID',  'productID', 'rating', 'asin_idx']
    df_result = df_result.reindex(columns=column_order)
    df_result.drop('asin_idx', axis=1, inplace=True)
    result = df_result[df_result['rating']>rating]
    return result

In [43]:
#recommendation for each users
for user in ['A3GJPLCZCDXXG6', 'A34U85WY8ZWBPV', 'A2VIY2TL6QPYLG']:
    user_recom = make_recommendation(user)
    result = make_result_dataframe(user_recom)
    print("Recommendation for: ", user)
    display(result)    

23/08/04 14:55:28 WARN DAGScheduler: Broadcasting large task binary with size 1327.2 KiB
23/08/04 14:55:45 WARN DAGScheduler: Broadcasting large task binary with size 1323.3 KiB
[Stage 369:>                                                        (0 + 1) / 1]

Recommendation for:  A3GJPLCZCDXXG6


                                                                                

Unnamed: 0,reviewerID,productID,rating
0,A3GJPLCZCDXXG6,B00D3Y18WO,5.634512
1,A3GJPLCZCDXXG6,B0007DI63S,5.598936
2,A3GJPLCZCDXXG6,B008L264Q8,5.564395
3,A3GJPLCZCDXXG6,B004Y3U90K,5.558105
4,A3GJPLCZCDXXG6,B00GHPHRI0,5.535957


23/08/04 14:55:51 WARN DAGScheduler: Broadcasting large task binary with size 1327.2 KiB
23/08/04 14:56:08 WARN DAGScheduler: Broadcasting large task binary with size 1323.3 KiB
                                                                                

Recommendation for:  A34U85WY8ZWBPV


Unnamed: 0,reviewerID,productID,rating
0,A34U85WY8ZWBPV,B00BK3FA8I,5.089292
1,A34U85WY8ZWBPV,B0093HPI9E,5.066402
2,A34U85WY8ZWBPV,B0007DI63S,5.065325
3,A34U85WY8ZWBPV,B00739W6VM,5.064379
4,A34U85WY8ZWBPV,B0007XIZ0M,5.021018


23/08/04 14:56:12 WARN DAGScheduler: Broadcasting large task binary with size 1327.2 KiB
23/08/04 14:56:29 WARN DAGScheduler: Broadcasting large task binary with size 1323.3 KiB
[Stage 483:>                                                        (0 + 1) / 1]

Recommendation for:  A2VIY2TL6QPYLG


                                                                                

Unnamed: 0,reviewerID,productID,rating
0,A2VIY2TL6QPYLG,B00BY7YIPG,4.860012
1,A2VIY2TL6QPYLG,B009B7F6CA,4.828602
2,A2VIY2TL6QPYLG,B00ERZGLT8,4.795135
3,A2VIY2TL6QPYLG,B004SGOSI2,4.794385
4,A2VIY2TL6QPYLG,B00ERZVZZS,4.739208


#### Summary
* A3GJPLCZCDXXG6 user might likes these productIDs: B009Y943F6, B009Y94H1G, B007CB7X1E, B00D0GYMTG, B00ERZVZZS

* A34U85WY8ZWBPV user might likes these productIDs: B00D3Y18WO, B004Y3U90K, B003AVWOMS, B00AZQ2SYU, B000BX4TTO

* A2VIY2TL6QPYLG user might likes these productIDs: B00A0GNOVQ, B00D95E30G, B001L123JS, B000PC62EQ, B00CFELU30 	

## Or reading the saved file to get input data => recommendation

In [44]:
# Read the Parquet file into a new DataFrame
new_user_recs = spark.read.parquet('reviews_Toys_and_Games_5_User.parquet')

In [45]:
new_user_recs.printSchema()

root
 |-- reviewerID_idx: integer (nullable = true)
 |-- recommendations: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- asin_idx: integer (nullable = true)
 |    |    |-- rating: float (nullable = true)
 |-- reviewerID: string (nullable = true)



In [46]:
new_user_recs.show(2)

+--------------+--------------------+--------------+
|reviewerID_idx|     recommendations|    reviewerID|
+--------------+--------------------+--------------+
|             0|[{9847, 6.2357802...| AJGU56YG8G1DQ|
|             1|[{6696, 5.283102}...|A1M8AYAL3L8ACP|
+--------------+--------------------+--------------+
only showing top 2 rows



In [47]:
df_asin_asin_idx = spark.read.parquet('reviews_Toys_and_Games_5_Product.parquet')

In [48]:
df_asin_asin_idx.printSchema()

root
 |-- asin_idx: double (nullable = true)
 |-- asin: string (nullable = true)



In [49]:
df_asin_asin_idx.show(2)

+--------+----------+
|asin_idx|      asin|
+--------+----------+
|  9630.0|1603800689|
|    19.0|B00000K3BR|
+--------+----------+
only showing top 2 rows



In [50]:
#recommendation for each users
for user in ['A3GJPLCZCDXXG6', 'A34U85WY8ZWBPV', 'A2VIY2TL6QPYLG']:
    user_recom = make_recommendation(user)
    result = make_result_dataframe(user_recom)
    print("Recommendation for: ", user)
    display(result)    

Recommendation for:  A3GJPLCZCDXXG6


Unnamed: 0,reviewerID,productID,rating
0,A3GJPLCZCDXXG6,B00D3Y18WO,5.634512
1,A3GJPLCZCDXXG6,B0007DI63S,5.598936
2,A3GJPLCZCDXXG6,B008L264Q8,5.564395
3,A3GJPLCZCDXXG6,B004Y3U90K,5.558105
4,A3GJPLCZCDXXG6,B00GHPHRI0,5.535957


Recommendation for:  A34U85WY8ZWBPV


Unnamed: 0,reviewerID,productID,rating
0,A34U85WY8ZWBPV,B00BK3FA8I,5.089292
1,A34U85WY8ZWBPV,B0093HPI9E,5.066402
2,A34U85WY8ZWBPV,B0007DI63S,5.065325
3,A34U85WY8ZWBPV,B00739W6VM,5.064379
4,A34U85WY8ZWBPV,B0007XIZ0M,5.021018


Recommendation for:  A2VIY2TL6QPYLG


Unnamed: 0,reviewerID,productID,rating
0,A2VIY2TL6QPYLG,B00BY7YIPG,4.860012
1,A2VIY2TL6QPYLG,B009B7F6CA,4.828602
2,A2VIY2TL6QPYLG,B00ERZGLT8,4.795135
3,A2VIY2TL6QPYLG,B004SGOSI2,4.794385
4,A2VIY2TL6QPYLG,B00ERZVZZS,4.739208
