In [1]:
# Connect Google Drive Untuk Ambil Data
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.4.0.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.0-py2.py3-none-any.whl size=311317130 sha256=67b89575f65b93e3bf94394296ff7797a9a36d9be99a1a94b237f96df9f76b88
  Stored in directory: /root/.cache/pip/wheels/7b/1b/4b/3363a1d04368e7ff0d408e57ff57966fcdf00583774e761327
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.0


In [3]:
# Import Library
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml.recommendation import ALS

spark = SparkSession.builder.appName("Movie Lens").getOrCreate()

In [15]:
# Parse String Menjadi Objek Rating
def parseRating(str):
    fields = str.split("::")
    assert len(fields) == 4
    return (int(fields[0]), int(fields[1]), float(fields[2]), int(fields[3]))


In [16]:
raw = spark.read.text("/content/drive/MyDrive/Colab Notebooks/SPARK/ratings.dat").rdd.map(lambda x: x[0])
ratings = raw.map(parseRating).toDF(["userId", "movieId", "rating", "timestamp"])
ratings.show(5)

+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|     1|   1193|   5.0|978300760|
|     1|    661|   3.0|978302109|
|     1|    914|   3.0|978301968|
|     1|   3408|   4.0|978300275|
|     1|   2355|   5.0|978824291|
+------+-------+------+---------+
only showing top 5 rows



In [17]:
# Data Training 80% dan Test 20%
training, test = ratings.randomSplit([0.8, 0.2])

In [18]:
# Membuat Model
als = ALS(maxIter=5, regParam=0.01, userCol="userId", itemCol="movieId", ratingCol="rating")
model = als.fit(training)
model.save("mymodel")

In [19]:
# Prediksi Data
predictions = model.transform(test)
mse = predictions.withColumn("diff", col("rating") - col("prediction")).select((col("diff") ** 2).alias("squared_diff")).filter(~col("squared_diff").isNull()).agg({"squared_diff": "sum"}).collect()[0][0]
print("Mean Squared Error:", mse)

predictions.show(10)

Mean Squared Error: nan
+------+-------+------+---------+----------+
|userId|movieId|rating|timestamp|prediction|
+------+-------+------+---------+----------+
|     1|    594|   4.0|978302268|  4.369921|
|     1|    595|   5.0|978824268| 3.9790118|
|     1|    720|   3.0|978300760| 3.9435213|
|     1|    783|   4.0|978824291| 4.8502617|
|     1|    938|   4.0|978301752| 3.3182936|
|     1|   1022|   5.0|978300055| 4.1811028|
|     1|   1907|   4.0|978824330| 3.4290638|
|     1|   3114|   4.0|978302174| 4.3886657|
|     2|    110|   5.0|978298625| 4.3349752|
|     2|    163|   4.0|978299809| 3.1953938|
+------+-------+------+---------+----------+
only showing top 10 rows



In [20]:
# Menyimpan Hasil Prediksi
predictions.write.format("csv").save("ml-predictions.csv")