In [1]:
from pyspark.sql import SparkSession
from pyspark import Row
from pyspark.sql import SQLContext
import sys
import json
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import lit
from pyspark.ml.feature import MinMaxScaler
from pyspark.mllib.util import MLUtils
from pyspark.ml.feature import StandardScaler
from pyspark.sql.functions import mean as _mean, stddev as _stddev, col

In [2]:
spark = SparkSession \
    .builder \
    .appName("Python Spark SQL basic example") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()


In [4]:
df = spark.read.load('steam_dataset.csv', 
                      format='com.databricks.spark.csv', 
                      header='true', 
                      inferSchema='true')

In [5]:
df = df.withColumnRenamed("User ID", "userId").withColumnRenamed("_c0", "num").\
        withColumnRenamed("Game Title","gameTitle").withColumnRenamed("Hours Played","hoursPlayed").\
        withColumnRenamed("Game ID","gameId")

In [6]:
print((df.count(), len(df.columns)))

(70489, 5)


In [7]:
# df.show()

# Normalization

In [8]:
df_norm = df.select(
    _mean(col('hoursPlayed')).alias('mean'),
    _stddev(col('hoursPlayed')).alias('std')
).collect()

mean = df_norm[0]['mean']
std = df_norm[0]['std']

print(mean)
print(std)

48.878063243911484
229.33523599681345


In [9]:
df = df.withColumn("normalizedH", (df['hoursPlayed']-mean)/std)

In [11]:
# df.show()

# Baseline - Basic ALS

In [25]:
# df = spark.read.load('steam_data_w_game_id.csv', 
#                       format='com.databricks.spark.csv', 
#                       header='true', 
#                       inferSchema='true')
# df = df.withColumnRenamed("User ID", "userId").withColumnRenamed("_c0", "num").\
#         withColumnRenamed("Game Title","gameTitle").withColumnRenamed("Hours Played","hoursPlayed").\
#         withColumnRenamed("Game ID","gameId")
# df_norm = df.select(
#     _mean(col('hoursPlayed')).alias('mean'),
#     _stddev(col('hoursPlayed')).alias('std')
# ).collect()

# mean = df_norm[0]['mean']
# std = df_norm[0]['std']
# df = df.withColumn("normalizedH", (df['hoursPlayed']-mean)/std)
# (training, test) = df.randomSplit([0.8, 0.2], 123)

In [13]:
(training, test) = df.randomSplit([0.8, 0.2], 123)
als = ALS(maxIter=5, regParam=0.01, implicitPrefs = True, userCol="userId", itemCol="gameId", ratingCol="normalizedH",
          coldStartStrategy="drop").setSeed(123)

model = als.fit(training)

# Evaluate the model by computing the RMSE on the test data
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="normalizedH",predictionCol="prediction")

rmse = evaluator.evaluate(predictions)

#Result
rmse

0.8612171660889554

In [16]:
# df.show()

# ALS Global Average

In [12]:
# df = spark.read.load('steam_data_w_game_id.csv', 
#                       format='com.databricks.spark.csv', 
#                       header='true', 
#                       inferSchema='true')
# df = df.withColumnRenamed("User ID", "userId").withColumnRenamed("_c0", "num").\
#         withColumnRenamed("Game Title","gameTitle").withColumnRenamed("Hours Played","hoursPlayed").\
#         withColumnRenamed("Game ID","gameId")
# df_norm = df.select(
#     _mean(col('hoursPlayed')).alias('mean'),
#     _stddev(col('hoursPlayed')).alias('std')
# ).collect()

# mean = df_norm[0]['mean']
# std = df_norm[0]['std']
# df = df.withColumn("normalizedH", (df['hoursPlayed']-mean)/std)
# (training, test) = df.randomSplit([0.8, 0.2], 123)

In [18]:
test.show()

+---+---------+--------------------+-----------+------+--------------------+
|num|   userId|           gameTitle|hoursPlayed|gameId|         normalizedH|
+---+---------+--------------------+-----------+------+--------------------+
|  1|151603712|           Fallout 4|       87.0|  1162| 0.16622799627972654|
|  2|151603712|               Spore|       14.9|  2813|-0.14815893029357077|
|  3|151603712|   Fallout New Vegas|       12.1|  1163|-0.16036813132554348|
|  8|151603712|         Left 4 Dead|        3.3|  1732|-0.19873990599745772|
| 10|151603712|         Tomb Raider|        2.5|  3247| -0.2022282491494499|
| 16|151603712|SEGA Genesis & Me...|        0.8|  2535|-0.20964097834743337|
| 24|151603712|         Garry's Mod|        0.1|  1313| -0.2126932786054265|
| 27| 59945701|Ultra Street Figh...|      238.0|  3327|  0.8246527662182549|
| 29| 59945701|The Elder Scrolls...|       58.0|  3067|0.039775557020009185|
| 32| 59945701|Company of Heroes...|       10.2|   641|-0.16865294631152494|

In [20]:
training.show()

+---+---------+--------------------+-----------+------+--------------------+--------------------+
|num|   userId|           gameTitle|hoursPlayed|gameId|         normalizedH|      normalizedHAVG|
+---+---------+--------------------+-----------+------+--------------------+--------------------+
|  0|151603712|The Elder Scrolls...|      273.0|  3067|  0.9772677791179138|0.001253190493281...|
|  4|151603712|       Left 4 Dead 2|        8.9|  1733|-0.17432150393351228|0.001253190493281...|
|  5|151603712|            HuniePop|        8.5|  1535|-0.17606567550950838|0.001253190493281...|
|  6|151603712|       Path of Exile|        8.1|  2197|-0.17780984708550449|0.001253190493281...|
|  7|151603712|         Poly Bridge|        7.5|  2251|-0.18042610444949864|0.001253190493281...|
|  9|151603712|     Team Fortress 2|        2.8|  2994|-0.20092012046745283|0.001253190493281...|
| 11|151603712|     The Banner Saga|        2.0|  3024|-0.20440846361944504|0.001253190493281...|
| 12|151603712|Dead 

In [19]:
global_mean = training.groupBy().avg("normalizedH").collect()[0]['avg(normalizedH)']
training = training.withColumn('normalizedHAVG',lit(global_mean))
print(global_mean)

0.0012531904932816308


In [21]:
als = ALS(maxIter=5, regParam=0.01,implicitPrefs = True,userCol="userId", itemCol="gameId", ratingCol="normalizedHAVG",
          coldStartStrategy="drop").setSeed(123)

model = als.fit(training)

# Evaluate the model by computing the RMSE on the test dataI
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="normalizedH",
                                predictionCol="prediction")

rmse = evaluator.evaluate(predictions)

#Result
rmse

0.8753411684638526

# ALS with Bias

In [28]:
df = spark.read.load('steam_dataset.csv', 
                      format='com.databricks.spark.csv', 
                      header='true', 
                      inferSchema='true')
df = df.withColumnRenamed("User ID", "userId").withColumnRenamed("_c0", "num").\
        withColumnRenamed("Game Title","gameTitle").withColumnRenamed("Hours Played","hoursPlayed").\
        withColumnRenamed("Game ID","gameId")
df_norm = df.select(
    _mean(col('hoursPlayed')).alias('mean'),
    _stddev(col('hoursPlayed')).alias('std')
).collect()

mean = df_norm[0]['mean']
std = df_norm[0]['std']
df = df.withColumn("normalizedH", (df['hoursPlayed']-mean)/std)
(training, test) = df.randomSplit([0.8, 0.2], 123)

In [29]:
#################

#global average 
# global_mean = training.groupBy().avg("normalizedH").collect()[0]['avg(normalizedH)']

#User_mean
user_mean = training.groupBy("userId").agg({"normalizedH": "avg"})
user_mean = user_mean.withColumnRenamed('avg(normalizedH)', 'user_mean')

#item_mean
item_mean = training.groupBy('gameId').agg({"normalizedH": "avg"})
item_mean = item_mean.withColumnRenamed('avg(normalizedH)', 'item_mean')

#joining DFs
# mainDF = training.alias('mainDF')
userDF = user_mean.alias('userDF')
itemDF = item_mean.alias('itemDF')

training = training.join(userDF, training.userId == userDF.userId, 'outer')\
    .select(training.userId, training.gameId, training.normalizedH , userDF.user_mean)

training = training.join(itemDF, training.gameId == itemDF.gameId, 'outer')\
    .select(training.userId, training.gameId, training.normalizedH ,training.user_mean, itemDF.item_mean)

#user_item_interaction  
training = training.withColumn('user_item_interaction',training.normalizedH\
                               - (training.user_mean + training.item_mean - global_mean))


#################
#####test#######

test_user_mean = user_mean.alias('test_user_mean')
test = test.join(test_user_mean, test.userId == test_user_mean.userId, 'inner') \
    .select(test.userId, test.gameId, test.normalizedH, test_user_mean.user_mean)

test_item_mean = item_mean.alias('test_item_mean')
test = test.join(test_item_mean, test.gameId == test_item_mean.gameId, 'inner') \
    .select(test.userId, test.gameId, test.normalizedH, test.user_mean, test_item_mean.item_mean)

als = ALS( maxIter=5, regParam=0.01,implicitPrefs = True,userCol="userId", itemCol="gameId", ratingCol="user_item_interaction",
          coldStartStrategy="drop").setSeed(123)


model = als.fit(training)
predictions = model.transform(test)

predictions = predictions.withColumn('prediction_calculated',
                                     predictions.prediction + predictions.user_mean + predictions.item_mean - global_mean)

evaluator = RegressionEvaluator(metricName="rmse", labelCol="normalizedH", predictionCol="prediction_calculated")
rmse = evaluator.evaluate(predictions)

rmse


0.91869479589276

In [None]:
df1 = test.alias('df1')
dfu = user_mean.alias('dfu')
test = df1.join(dfu, df1.userId == dfu.userId, 'inner') \
    .select(df1.userId, df1.movieId, df1.rating, dfu.user_mean)

df1 = test.alias('df1')
dfm = movie_mean.alias('dfm')
test = df1.join(dfm, df1.movieId == dfm.movieId, 'inner') \
    .select(df1.userId, df1.movieId, df1.rating, df1.user_mean, dfm.item_mean)

# Normalization based on Game ID

In [68]:
df = spark.read.load('steam_dataset.csv', 
                      format='com.databricks.spark.csv', 
                      header='true', 
                      inferSchema='true')
df = df.withColumnRenamed("User ID", "userId").withColumnRenamed("_c0", "num").\
        withColumnRenamed("Game Title","gameTitle").withColumnRenamed("Hours Played","hoursPlayed").\
        withColumnRenamed("Game ID","gameId")
# df_norm = df.select(
#     _mean(col('hoursPlayed')).alias('mean'),
#     _stddev(col('hoursPlayed')).alias('std')
# ).collect()

# mean = df_norm[0]['mean']
# std = df_norm[0]['std']
# df = df.withColumn("normalizedH", (df['hoursPlayed']-mean)/std)


In [60]:
# df.show()

In [69]:
from pyspark.sql.functions import *

In [70]:
#item_mean
item_mean = df.groupBy('gameId').agg({"hoursPlayed": "avg"})
item_mean = item_mean.withColumnRenamed('avg(hoursPlayed)', 'item_mean')

#item_std
item_stddev = df.groupBy('gameId').agg(stddev_pop("hoursPlayed"))
item_stddev = item_stddev.withColumnRenamed('stddev_pop(hoursPlayed)', 'item_stddev')

#joining DFs
itemDF = item_mean.alias('itemDF')
itemsdDF = item_stddev.alias('itemsdDF')

from pyspark.sql.functions import * #for decs or asc

df = df.join(itemDF, df.gameId == itemDF.gameId, 'outer')\
    .select( df.num, df.userId, df.gameTitle, df.hoursPlayed ,df.gameId, itemDF.item_mean)#.orderBy(asc("num"))

df = df.join(itemsdDF, df.gameId == itemsdDF.gameId, 'outer')\
    .select( df.num, df.userId, df.gameTitle, df.hoursPlayed ,df.gameId, df.item_mean , itemsdDF.item_stddev)


In [71]:
df = df.withColumn("item_normalizedH", (df['hoursPlayed']-df['item_mean'])/df['item_stddev'])

In [72]:
df = df.na.fill(0)

In [47]:
# df.show()

In [48]:
print((df.count(), len(df.columns)))

(70489, 8)


In [49]:
from pyspark.sql.functions import isnan, when, count, col

# df.select([count(when(isnan(c), c)).alias(c) for c in df.columns]).count()
df.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in df.columns]).show()

+---+------+---------+-----------+------+---------+-----------+----------------+
|num|userId|gameTitle|hoursPlayed|gameId|item_mean|item_stddev|item_normalizedH|
+---+------+---------+-----------+------+---------+-----------+----------------+
|  0|     0|        0|          0|     0|        0|          0|               0|
+---+------+---------+-----------+------+---------+-----------+----------------+



# Basic ALS with new normalization

In [73]:
(training, test) = df.randomSplit([0.8, 0.2], 123)

In [51]:
als = ALS(maxIter=5, regParam=0.01, implicitPrefs = True, userCol="userId", itemCol="gameId", ratingCol="item_normalizedH",
          coldStartStrategy="drop").setSeed(123)

model = als.fit(training)

# Evaluate the model by computing the RMSE on the test data
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="item_normalizedH",predictionCol="prediction")

rmse = evaluator.evaluate(predictions)

#Result
rmse

0.9938771458503272

# Global average ALS with new normalization

In [52]:
global_item_mean = training.groupBy().avg("item_normalizedH").collect()[0]['avg(item_normalizedH)']
training = training.withColumn('item_normalizedHAVG',abs(lit(global_item_mean)))
print(global_item_mean)

-0.0004050004770339087


In [None]:
# Note: I should have used absolute value for global average to be able to use 

In [119]:
# training.show()

In [53]:
als = ALS(maxIter=5, regParam=0.01,implicitPrefs = True,userCol="userId", itemCol="gameId", ratingCol="item_normalizedHAVG",
          coldStartStrategy="drop").setSeed(123)

model = als.fit(training)

# Evaluate the model by computing the RMSE on the test dataI
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="item_normalizedH",
                                predictionCol="prediction")

rmse = evaluator.evaluate(predictions)

#Result
rmse

1.0074705380442321

In [57]:
training.show()

+----+---------+--------------------+-----------+------+-------------------+-------------------+--------------------+--------------------+
| num|   userId|           gameTitle|hoursPlayed|gameId|          item_mean|        item_stddev|    item_normalizedH| item_normalizedHAVG|
+----+---------+--------------------+-----------+------+-------------------+-------------------+--------------------+--------------------+
| 298| 97298878|            Mafia II|       15.9|  1829|  24.94310344827587|  30.57027298485587|-0.29581363086799095|4.050004770339087E-4|
|1430| 69857045|            Mafia II|       21.0|  1829|  24.94310344827587|  30.57027298485587| -0.1289848949085026|4.050004770339087E-4|
|1702|162649407|            Mafia II|       23.0|  1829|  24.94310344827587|  30.57027298485587|-0.06356186119889932|4.050004770339087E-4|
|1864| 42681063|            Mafia II|       14.4|  1829|  24.94310344827587|  30.57027298485587| -0.3448809061501934|4.050004770339087E-4|
|2031|234024191|The Way of 

In [66]:
test.show()

+-----+---------+--------------------+-----------+------+-------------------+-------------------+--------------------+
|  num|   userId|           gameTitle|hoursPlayed|gameId|          item_mean|        item_stddev|    item_normalizedH|
+-----+---------+--------------------+-----------+------+-------------------+-------------------+--------------------+
|  951| 32592631|            Mafia II|       16.8|  1829|  24.94310344827587|  30.57027298485587| -0.2663732656986695|
|  976| 57103808|    America's Army 3|       25.0|   148|           10.61875| 23.253957156524997|  0.6184431279028422|
| 1151|148510973|            Mafia II|        0.5|  1829|  24.94310344827587|  30.57027298485587| -0.7995709904319362|
| 2310| 92914917|          Stellar 2D|        2.4|  2866|               2.55|0.15000000000000013| -0.9999999999999986|
| 2936| 61676491|            Mafia II|        5.2|  1829|  24.94310344827587|  30.57027298485587| -0.6458268612143685|
| 4687| 93575977|            Mafia II|        4.

# Bias ALS with new normalization

In [89]:
#################

# #global average 
# global_mean = training.groupBy().avg("normalizedH").collect()[0]['avg(normalizedH)']
global_item_mean = -1*(global_item_mean)

#User_mean
user_mean = training.groupBy("userId").agg({"item_normalizedH": "avg"})
user_mean = user_mean.withColumnRenamed('avg(item_normalizedH)', 'user_mean')

#item_mean
item_mean = training.groupBy('gameId').agg({"item_normalizedH": "avg"})
item_mean = item_mean.withColumnRenamed('avg(item_normalizedH)', 'item_mean')

#joining DFs
# mainDF = training.alias('mainDF')
userDF = user_mean.alias('userDF')
itemDF = item_mean.alias('itemDF')

training = training.join(userDF, training.userId == userDF.userId, 'outer')\
    .select(training.userId, training.gameId, training.item_normalizedH , userDF.user_mean)

training = training.join(itemDF, training.gameId == itemDF.gameId, 'outer')\
    .select(training.userId, training.gameId, training.item_normalizedH ,training.user_mean, itemDF.item_mean)

#user_item_interaction  
training = training.withColumn('user_item_interaction',(training.item_normalizedH \
                                                        - (training.user_mean + training.item_mean - global_item_mean)))


#################
# #####test#######

test_user_mean = user_mean.alias('test_user_mean')
test = test.join(test_user_mean, test.userId == test_user_mean.userId, 'inner') \
    .select(test.userId, test.gameId, test.item_normalizedH, test_user_mean.user_mean)

test_itemmean = item_mean.alias('test_itemmean')
test = test.join(test_itemmean, test.gameId == test_itemmean.gameId, 'inner') \
    .select(test.userId, test.gameId, test.item_normalizedH, test.user_mean, test_itemmean.item_mean)

als = ALS( maxIter=5, regParam=0.01,implicitPrefs = True,userCol="userId", itemCol="gameId", ratingCol="user_item_interaction",
          coldStartStrategy="drop").setSeed(123)


model = als.fit(training)
predictions = model.transform(test)

predictions = predictions.withColumn('prediction_calculated',
                                     predictions.prediction + predictions.user_mean + predictions.item_mean - global_mean)

evaluator = RegressionEvaluator(metricName="rmse", labelCol="item_normalizedH", predictionCol="prediction_calculated")
rmse = evaluator.evaluate(predictions)

rmse

1.0618828292342712