In [5]:
import os
import pyspark
from pyspark.sql import SQLContext, SparkSession

# Hostname of the Spark master node; replace "pierre.cs.colostate.edu" with
# the name of your own Spark master.
SPARK_NODE="pierre.cs.colostate.edu"

# Port of the Spark master; 31820 corresponds to SPARK_MASTER_PORT in
# $SPARK_HOME/conf/spark-env.sh
SPARK_PORT=31820

# Build the master URL once, then create the session (or reuse an existing one).
master_url = 'spark://{}:{}'.format(SPARK_NODE, SPARK_PORT)
session_builder = SparkSession.builder.master(master_url).appName('test')
spark = session_builder.getOrCreate()

# SQLContext bound to the same SparkContext and session as `spark`.
sqlContext = SQLContext(sparkContext=spark.sparkContext, sparkSession=spark)

# Print the Spark and PySpark versions in use.
print("Spark Version: " + spark.version)
print("PySpark Version: " + pyspark.__version__)

Spark Version: 3.0.3
PySpark Version: 3.0.3


In [6]:
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import  StandardScaler

## Remove users with few ratings

ratings.json -> output/filtered_ratings.json

Description: filtered_ratings contains only ratings from users with more than 4 ratings

In [7]:
# Read the raw ratings JSON into a DataFrame.
ratings_reader = spark.read
ratings = ratings_reader.json(path="/FP/ratings.json")

# Preview the first rows of the raw ratings.
ratings.show()

+-------+------+-------+
|item_id|rating|user_id|
+-------+------+-------+
|      5|   3.0| 997206|
|     10|   4.0| 997206|
|     13|   4.0| 997206|
|     17|   5.0| 997206|
|     21|   4.0| 997206|
|     28|   5.0| 997206|
|     31|   4.0| 997206|
|     39|   3.0| 997206|
|     40|   4.0| 997206|
|     45|   2.0| 997206|
|     46|   4.0| 997206|
|     50|   5.0| 997206|
|     62|   4.0| 997206|
|     74|   5.0| 997206|
|     85|   2.0| 997206|
|    110|   5.0| 997206|
|    112|   4.0| 997206|
|    141|   4.0| 997206|
|    150|   3.0| 997206|
|    161|   3.0| 997206|
+-------+------+-------+
only showing top 20 rows



In [8]:
# A user is kept only when they have strictly more than this many ratings.
MIN_RATINGS_PER_USER = 4

# Number of ratings per user (columns: user_id, count).
ratings_mod_user_id = ratings.groupby("user_id").count()
total_ratings=ratings.count()
total_users=ratings_mod_user_id.count()

# Guard against an empty dataset so the summary does not raise ZeroDivisionError.
avg_ratings_per_user = total_ratings / total_users if total_users else 0.0

print("total_ratings",total_ratings)
print("total_users",total_users)
print("avg_ratings_per_user",avg_ratings_per_user)

# Users with more than MIN_RATINGS_PER_USER ratings. Bracket access is needed
# because `.count` on a DataFrame is the count() method, not the column.
more_than_four = ratings_mod_user_id.filter(
    ratings_mod_user_id["count"] > MIN_RATINGS_PER_USER
)

# Keep only the ratings written by those users. `more_than_four` is derived
# from `ratings`, so a condition such as
# `more_than_four.user_id == ratings.user_id` compares a column with itself
# and only works through Spark's self-join special-casing. Joining on the
# column name avoids that ambiguity, and a left-semi join returns just the
# matching rows of `ratings` without carrying the `count` column along.
filtered_ratings = (
    ratings
    .join(more_than_four, on="user_id", how="left_semi")
    .select("user_id", "item_id", "rating")
)

total_ratings 28490116
total_users 247383
avg_ratings_per_user 115.16602191743168


In [13]:
# Compare the number of ratings before and after dropping low-activity users.
print("before", total_ratings)
filtered_total = filtered_ratings.count()
print("after", filtered_total)

before 28490116
after 28462184


In [11]:
# Persist the filtered ratings as JSON, replacing any previous output.
filtered_ratings.write.json("/FP/output/filtered_ratings.json", mode='overwrite')

## Normalize rating per user

output/filtered_ratings.json -> output/normalized_ratings.json

Description: normalized_ratings.json contains each rating normalized across the set of ratings given by the same user.


normalization: $r' = r/\mu_R - 1$, where $\mu_R$ is the mean of all ratings given by the same user

In [12]:
# To resume from the saved intermediate result instead of recomputing it,
# uncomment the next line:
#filtered_ratings=spark.read.json("/FP/output/filtered_ratings.json")

# Per-user sum of ratings (columns: user_id, sum(rating)).
ratings_sum_mod_userid = filtered_ratings.groupby("user_id").sum("rating")
# Per-user number of ratings (columns: user_id, count).
ratings_count_mod_userid = filtered_ratings.groupby("user_id").count()

# Combine both aggregates into one row per user. Joining on the column name
# (rather than on an equality expression) yields a single `user_id` column.
ratings_mod_userid = ratings_sum_mod_userid.join(
    ratings_count_mod_userid, on="user_id"
).select("user_id", "sum(rating)", "count")

# Attach each user's sum and count to every one of their ratings. Joining on
# the name keeps exactly one `user_id` column in the result, so later
# references to "user_id" are unambiguous.
joined_w_sum = filtered_ratings.join(ratings_mod_userid, on="user_id")

In [13]:
# Uncomment to inspect the joined rows before normalizing:
#joined_w_sum.show()

def _normalize_row(row):
    """Return (user_id, item_id, normalized rating) for one joined row.

    The normalized rating is rating / sum(rating) * count - 1, which is the
    rating divided by the user's mean rating, minus one.
    """
    user, item = row["user_id"], row["item_id"]
    scaled = row["rating"] / row["sum(rating)"] * row["count"] - 1
    return (user, item, scaled)

normalized_rows = joined_w_sum.rdd.map(_normalize_row)
normalized_ratings = normalized_rows.toDF(["user_id", "item_id", "rating"])

In [16]:
# Preview the normalized ratings, then persist them as JSON, replacing any
# previous output at that path.
normalized_ratings.show()
normalized_ratings.write.json("/FP/output/normalized_ratings.json", mode='overwrite')

+-------+-------+--------------------+
|user_id|item_id|              rating|
+-------+-------+--------------------+
|     26|    318| 0.06666666666666665|
|     26|   2959| 0.06666666666666665|
|     26|   3793|-0.04000000000000...|
|     26|   4226| -0.1466666666666666|
|     26|   4993| 0.06666666666666665|
|     26|   5952| 0.06666666666666665|
|     26|   7153| 0.06666666666666665|
|     26| 116823| -0.1466666666666666|
|   2529|    153|-0.34545454545454546|
|   2529|    165| 0.09090909090909083|
|   2529|    356| 0.09090909090909083|
|   2529|    434| 0.09090909090909083|
|   2529|    586| 0.09090909090909083|
|   2529|    587| 0.09090909090909083|
|   2529|    589| 0.09090909090909083|
|   2529|    590| 0.09090909090909083|
|   2529|    592|-0.34545454545454546|
|   2529|    593| 0.09090909090909083|
|   2529|    595| 0.09090909090909083|
|   2529|    597|-0.12727272727272732|
+-------+-------+--------------------+
only showing top 20 rows

