In [20]:
from pyspark.sql import SparkSession

# Start (or reuse) the Spark session that backs this notebook
spark = (
    SparkSession.builder
    .appName("MovieLens Exploration")
    .getOrCreate()
)

# Read the ratings CSV: the first line supplies the column names and the
# column types are inferred from the data rather than declared up front
ratings_df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("../data/ratings_small.csv")
)

# Preview the leading rows of the ratings table
ratings_df.show()

# Report each column's name, inferred type and nullability
ratings_df.printSchema()

from pyspark.sql import functions as F

# Mean rating received by each movie
avg_ratings = (
    ratings_df
    .groupBy("movieId")
    .agg(F.avg("rating").alias("average_rating"))
)
avg_ratings.show()

# How many ratings each user has submitted
user_ratings_count = (
    ratings_df
    .groupBy("userId")
    .agg(F.count("rating").alias("num_ratings"))
)
user_ratings_count.show()

                                                                                

+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
|     1|     31|   2.5|1260759144|
|     1|   1029|   3.0|1260759179|
|     1|   1061|   3.0|1260759182|
|     1|   1129|   2.0|1260759185|
|     1|   1172|   4.0|1260759205|
|     1|   1263|   2.0|1260759151|
|     1|   1287|   2.0|1260759187|
|     1|   1293|   2.0|1260759148|
|     1|   1339|   3.5|1260759125|
|     1|   1343|   2.0|1260759131|
|     1|   1371|   2.5|1260759135|
|     1|   1405|   1.0|1260759203|
|     1|   1953|   4.0|1260759191|
|     1|   2105|   4.0|1260759139|
|     1|   2150|   3.0|1260759194|
|     1|   2193|   2.0|1260759198|
|     1|   2294|   2.0|1260759108|
|     1|   2455|   2.5|1260759113|
|     1|   2968|   1.0|1260759200|
|     1|   3671|   3.0|1260759117|
+------+-------+------+----------+
only showing top 20 rows

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: integer (nullable = true)

                                                                                

+-------+------------------+
|movieId|    average_rating|
+-------+------------------+
|   1580| 3.663157894736842|
|   2659|               4.0|
|   3794|               3.4|
|   3175|3.5076923076923077|
|    471| 3.877551020408163|
|   1088| 3.358490566037736|
|   1342|3.0588235294117645|
|   1645|3.4583333333333335|
|   2366|3.9347826086956523|
|   6620|3.6470588235294117|
|   8638|3.8823529411764706|
|  96488|              3.75|
| 160563|               2.5|
|   7982|3.1666666666666665|
|   1238| 4.147058823529412|
|   1959|               3.8|
|    463|3.4285714285714284|
|   2122|2.3181818181818183|
|   1591|               2.7|
|   5518|               4.5|
+-------+------------------+
only showing top 20 rows



[Stage 27:>                                                         (0 + 1) / 1]

+------+-----------+
|userId|num_ratings|
+------+-----------+
|   148|        132|
|   463|        483|
|   471|        216|
|   496|        126|
|   243|        307|
|   392|         25|
|   540|         20|
|   623|        103|
|    31|         69|
|   516|        149|
|    85|        107|
|   137|         80|
|   251|        119|
|   451|         52|
|   580|        922|
|    65|         27|
|   458|         76|
|    53|         46|
|   255|        145|
|   481|        436|
+------+-----------+
only showing top 20 rows



                                                                                

In [22]:
# Data Preprocessing

# 1. Handle Missing Values
# Count the nulls of every column in a single aggregation (one Spark job)
# instead of launching a separate filter().count() job per column.
null_counts = ratings_df.agg(
    *[
        # when() without otherwise() yields NULL for rows that do not match,
        # and count() skips NULLs, so this counts exactly the null cells.
        F.count(F.when(ratings_df[name].isNull(), F.lit(1))).alias(name)
        for name in ratings_df.columns
    ]
).first()

# The aggregated row holds one count per column, in column order.
for name, n_nulls in zip(ratings_df.columns, null_counts):
    print(name, "\t", "with null values: ", n_nulls)

# Drop them if necessary (just an example)
# ratings_df = ratings_df.dropna()

# 2. Convert Data Types if necessary
# As an example, converting 'rating' column to float type
# ratings_df = ratings_df.withColumn("rating", ratings_df["rating"].cast("float"))

# 3. Split the Dataset
# 80/20 train/test split; the fixed seed keeps the split repeatable.
train_data, test_data = ratings_df.randomSplit([0.8, 0.2], seed=1234)

# Check the count of each dataset
print(f"Number of training records: {train_data.count()}")
print(f"Number of testing records : {test_data.count()}")


                                                                                

userId 	 with null values:  0


                                                                                

movieId 	 with null values:  0
rating 	 with null values:  0


                                                                                

timestamp 	 with null values:  0


                                                                                

Number of training records: 80149


[Stage 63:>                                                         (0 + 1) / 1]

Number of testing records : 19855


                                                                                

In [25]:
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

# Define the ALS model for explicit ratings (implicitPrefs=False).
# - nonnegative=True constrains the learned factors to be non-negative.
# - coldStartStrategy="drop" discards rows whose prediction is NaN, e.g.
#   users or movies in the test split that were never seen during training.
# - seed fixes the random initialisation so the fitted model, and every
#   metric computed from it, can be reproduced after a kernel restart.
als = ALS(
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    nonnegative=True,
    implicitPrefs=False,
    coldStartStrategy="drop",
    seed=1234,
)

# Train the model on the training split
model = als.fit(train_data)

# Predict on the test set
predictions = model.transform(test_data)


23/08/21 17:59:11 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
                                                                                

In [26]:
# Score the test-set predictions against the true ratings using RMSE
evaluator = RegressionEvaluator(
    predictionCol="prediction",
    labelCol="rating",
    metricName="rmse",
)
rmse = evaluator.evaluate(predictions)
print(f"Root-mean-square error = {rmse}")


[Stage 174:>                                                        (0 + 1) / 1]

Root-mean-square error = 0.9063129788369194


                                                                                