In [1]:
# Tell Jupyter where pyspark is. findspark.init() is called here, ahead of
# the pyspark imports in the next cell, so that those imports can be resolved.
import findspark
findspark.init()

In [2]:
# import ALS and Linear Regression models
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.ml.regression import LinearRegression
from pyspark.sql import Row
from pyspark.sql import SparkSession

imports done


In [3]:
# Create (or reuse) the SparkSession for the ALS example.
# The SparkSession is the single entry point to the underlying Spark functionality.
spark = (
    SparkSession.builder
    .appName("ALSExample")
    .getOrCreate()
)

In [4]:
# Load the raw ratings file as an RDD of text rows, then convert it to a DataFrame.
# Each input line is split on "::" into userId, movieId, rating and timestamp.
lines = spark.read.text("data/sample_movielens_ratings.txt").rdd
parts = lines.map(lambda row: row.value.split("::"))
ratingsRDD = parts.map(lambda p: Row(userId=int(p[0]), movieId=int(p[1]),
                                     rating=float(p[2]), timestamp=int(p[3])))
ratings = spark.createDataFrame(ratingsRDD)

# Split the data 80/20 into a training part and a test part.
# The seed is fixed so the split is repeatable across Restart & Run All;
# without it every run evaluates the model on a different test set.
SPLIT_SEED = 42
(training, test) = ratings.randomSplit([0.8, 0.2], seed=SPLIT_SEED)

In [5]:
# Build the recommendation model using ALS on the training data.
# - coldStartStrategy="drop" is set to ensure we don't get NaN evaluation metrics.
# - seed is fixed so that repeated fits on the same training data are
#   reproducible instead of depending on a per-session default seed.
als = ALS(maxIter=5, regParam=0.01, userCol="userId", itemCol="movieId", ratingCol="rating",
          coldStartStrategy="drop", seed=42)
model = als.fit(training)

In [6]:
# Evaluate the fitted ALS model on the test data: compare the "rating" column
# with the model's "prediction" column and report the root-mean-square error.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="rating",
    predictionCol="prediction",
)
predictions = model.transform(test)
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 1.6600081422025192


In [7]:
# Top 5 movie recommendations for each user
userRecs = model.recommendForAllUsers(numItems=5)
# Top 5 user recommendations for each movie
movieRecs = model.recommendForAllItems(numUsers=5)

# Display both result tables: users first, then movies
userRecs.show()
movieRecs.show()

+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|    28|[[47, 5.458582], ...|
|    26|[[94, 5.227647], ...|
|    27|[[7, 4.780279], [...|
|    12|[[55, 5.5590134],...|
|    22|[[88, 5.110451], ...|
|     1|[[98, 4.290312], ...|
|    13|[[93, 3.5240116],...|
|     6|[[25, 4.617099], ...|
|    16|[[52, 7.210397], ...|
|     3|[[51, 4.8401504],...|
|    20|[[22, 4.7629094],...|
|     5|[[17, 4.5950394],...|
|    19|[[18, 4.800347], ...|
|    15|[[46, 5.0365653],...|
|    17|[[46, 5.1224556],...|
|     9|[[32, 5.5034], [7...|
|     4|[[93, 3.9633436],...|
|     8|[[47, 6.023553], ...|
|    23|[[98, 5.014476], ...|
|     7|[[83, 4.8010864],...|
+------+--------------------+
only showing top 20 rows

+-------+--------------------+
|movieId|     recommendations|
+-------+--------------------+
|     31|[[11, 5.123742], ...|
|     85|[[8, 4.8567004], ...|
|     65|[[23, 4.9459953],...|
|     53|[[8, 5.1482024], ...|
|     78|[[23, 1.1633878],...|
|     

In [8]:
# Top 5 movie recommendations for a single user.
# The user is whichever distinct userId limit(1) returns from the ratings data.
user = (
    ratings
    .select(als.getUserCol())
    .distinct()
    .limit(1)
)
userSubsetRecs = model.recommendForUserSubset(user, numItems=5)
userSubsetRecs.show()

# Top 5 user recommendations for a single movie, selected the same way
# from the distinct movieId values.
movie = (
    ratings
    .select(als.getItemCol())
    .distinct()
    .limit(1)
)
movieSubSetRecs = model.recommendForItemSubset(movie, numUsers=5)
movieSubSetRecs.show()

+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|    26|[[94, 5.227647], ...|
+------+--------------------+

+-------+--------------------+
|movieId|     recommendations|
+-------+--------------------+
|     26|[[12, 3.0687487],...|
+-------+--------------------+



In [9]:

# Obtain the SparkSession for the linear regression example.
# NOTE(review): a session was already created earlier in this notebook, so
# getOrCreate() presumably returns that existing session rather than a new
# one named "LinearRegressionWithElasticNet" - confirm if the name matters.
spark = (
    SparkSession.builder
    .appName("LinearRegressionWithElasticNet")
    .getOrCreate()
)

In [10]:
# Load the linear regression training data, stored in libsvm format.
# Note: this rebinds `training`, which earlier held the ALS training split.
training = (
    spark.read
    .format("libsvm")
    .load("data/sample_linear_regression_data.txt")
)

In [13]:
# Preview the training rows before fitting
training.show()

# Configure the linear regression estimator and fit it to the training data
lr = LinearRegression(
    maxIter=10,
    regParam=0.3,
    elasticNetParam=0.8,
)
lrModel = lr.fit(training)

# Report the fitted coefficients and intercept
print("Coefficients: %s" % str(lrModel.coefficients))
print("Intercept: %s" % str(lrModel.intercept))

+-------------------+--------------------+
|              label|            features|
+-------------------+--------------------+
| -9.490009878824548|(10,[0,1,2,3,4,5,...|
| 0.2577820163584905|(10,[0,1,2,3,4,5,...|
| -4.438869807456516|(10,[0,1,2,3,4,5,...|
|-19.782762789614537|(10,[0,1,2,3,4,5,...|
| -7.966593841555266|(10,[0,1,2,3,4,5,...|
| -7.896274316726144|(10,[0,1,2,3,4,5,...|
| -8.464803554195287|(10,[0,1,2,3,4,5,...|
| 2.1214592666251364|(10,[0,1,2,3,4,5,...|
| 1.0720117616524107|(10,[0,1,2,3,4,5,...|
|-13.772441561702871|(10,[0,1,2,3,4,5,...|
| -5.082010756207233|(10,[0,1,2,3,4,5,...|
|  7.887786536531237|(10,[0,1,2,3,4,5,...|
| 14.323146365332388|(10,[0,1,2,3,4,5,...|
|-20.057482615789212|(10,[0,1,2,3,4,5,...|
|-0.8995693247765151|(10,[0,1,2,3,4,5,...|
| -19.16829262296376|(10,[0,1,2,3,4,5,...|
|  5.601801561245534|(10,[0,1,2,3,4,5,...|
|-3.2256352187273354|(10,[0,1,2,3,4,5,...|
| 1.5299675726687754|(10,[0,1,2,3,4,5,...|
| -0.250102447941961|(10,[0,1,2,3,4,5,...|
+----------

In [12]:
# Pull the training summary from the fitted model and print, in order,
# the number of iterations and the RMSE over the training set.
trainingSummary = lrModel.summary
for template, value in (
    ("numIterations: %d", trainingSummary.totalIterations),
    ("RMSE: %f", trainingSummary.rootMeanSquaredError),
):
    print(template % value)


numIterations: 7
RMSE: 10.189077
