In [1]:
# Install PySpark
!pip install pyspark

# Import PySpark libraries
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
from pyspark.sql.functions import col




In [2]:
# Create the Spark session used by every later cell
# (getOrCreate returns the already-running session if there is one).
spark = (
    SparkSession.builder
    .appName("E-Learning Recommendation System")
    .getOrCreate()
)


In [3]:
# Download the MovieLens dataset
!wget http://files.grouplens.org/datasets/movielens/ml-latest-small.zip

# Unzip the dataset
!unzip -o ml-latest-small.zip


--2024-12-11 14:02:01--  http://files.grouplens.org/datasets/movielens/ml-latest-small.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 978202 (955K) [application/zip]
Saving to: ‘ml-latest-small.zip’


2024-12-11 14:02:02 (3.28 MB/s) - ‘ml-latest-small.zip’ saved [978202/978202]

Archive:  ml-latest-small.zip
   creating: ml-latest-small/
  inflating: ml-latest-small/links.csv  
  inflating: ml-latest-small/tags.csv  
  inflating: ml-latest-small/ratings.csv  
  inflating: ml-latest-small/README.txt  
  inflating: ml-latest-small/movies.csv  


In [4]:
# Read the ratings CSV: the first line is the header and column types are inferred
ratings_file = "ml-latest-small/ratings.csv"
raw_ratings = spark.read.csv(
    ratings_file,
    header=True,
    inferSchema=True,
)

# Keep only the columns the recommender needs, renamed to the e-learning
# vocabulary used in the rest of the notebook (source column -> new name)
column_aliases = {
    "userId": "student_id",
    "movieId": "course_id",
    "rating": "rating",
}
ratings_df = raw_ratings.select(
    [col(source).alias(target) for source, target in column_aliases.items()]
)

# Preview the first five rows
ratings_df.show(n=5)


+----------+---------+------+
|student_id|course_id|rating|
+----------+---------+------+
|         1|        1|   4.0|
|         1|        3|   4.0|
|         1|        6|   4.0|
|         1|       47|   5.0|
|         1|       50|   5.0|
+----------+---------+------+
only showing top 5 rows



In [5]:
# Split the dataset into 80% training / 20% test.
# A fixed seed makes the split reproducible: without it every run (and every
# Restart & Run All) produces different partitions, so row counts, predictions
# and recommendations cannot be compared between runs.
(training, test) = ratings_df.randomSplit([0.8, 0.2], seed=42)

# Show the number of rows in each split
print(f"Training set: {training.count()} rows, Test set: {test.count()} rows")


Training set: 80688 rows, Test set: 20148 rows


In [6]:
# Build the ALS (alternating least squares) collaborative-filtering estimator
als = ALS(
    maxIter=10,
    regParam=0.1,
    userCol="student_id",
    itemCol="course_id",
    ratingCol="rating",
    # Drop rows whose prediction is NaN, i.e. students/courses that were not
    # seen during training, so they do not poison downstream metrics.
    coldStartStrategy="drop",
    # ALS initialises its latent factors randomly; fix the seed so the fitted
    # model (and therefore every prediction below) is reproducible.
    seed=42,
)

# Train the model on the training dataset
model = als.fit(training)


In [7]:
# Score the held-out test ratings with the fitted model;
# the result carries an extra `prediction` column next to the true `rating`
predictions = model.transform(test)

# Preview the first five scored rows
predictions.show(n=5)


+----------+---------+------+----------+
|student_id|course_id|rating|prediction|
+----------+---------+------+----------+
|       148|     8368|   4.0| 3.8821478|
|       148|    54001|   4.0| 3.6937602|
|       148|    89745|   4.0| 3.3706663|
|       148|    98491|   5.0| 3.5977542|
|       148|   110102|   4.0| 3.5825524|
+----------+---------+------+----------+
only showing top 5 rows



In [8]:
# For every student, ask the model for its 5 highest-scoring courses
student_recommendations = model.recommendForAllUsers(5)

# Preview five students, printing the full recommendation arrays untruncated
student_recommendations.show(n=5, truncate=False)


+----------+------------------------------------------------------------------------------------------------+
|student_id|recommendations                                                                                 |
+----------+------------------------------------------------------------------------------------------------+
|1         |[{132333, 5.8284874}, {8542, 5.8284874}, {5490, 5.8284874}, {5915, 5.759862}, {3494, 5.5152655}]|
|2         |[{131724, 4.908405}, {33649, 4.844349}, {59814, 4.687559}, {4846, 4.6244664}, {33090, 4.551824}]|
|3         |[{6835, 4.90117}, {5746, 4.90117}, {5919, 4.864108}, {5181, 4.8599205}, {2851, 4.7668996}]      |
|4         |[{1254, 5.3492885}, {7700, 5.288615}, {2186, 5.0236835}, {26471, 4.98444}, {25825, 4.9810395}]  |
|5         |[{1254, 5.3015494}, {7700, 5.047809}, {25825, 5.0206337}, {26471, 4.9838924}, {2186, 4.929912}] |
+----------+------------------------------------------------------------------------------------------------+
only showing top 5 rows

In [9]:
# For every course, ask the model for the 5 students predicted to rate it highest
course_recommendations = model.recommendForAllItems(5)

# Preview five courses, printing the full recommendation arrays untruncated
course_recommendations.show(n=5, truncate=False)


+---------+--------------------------------------------------------------------------------------+
|course_id|recommendations                                                                       |
+---------+--------------------------------------------------------------------------------------+
|1        |[{53, 4.9882946}, {43, 4.7993546}, {452, 4.6958423}, {35, 4.6744957}, {236, 4.669444}]|
|3        |[{43, 4.8460155}, {12, 4.453474}, {543, 4.389518}, {594, 4.3619523}, {562, 4.2554255}]|
|5        |[{43, 4.6369643}, {594, 4.3792825}, {12, 4.124978}, {224, 4.071038}, {543, 4.0473347}]|
|6        |[{53, 5.5844626}, {171, 4.905902}, {93, 4.8478637}, {452, 4.832537}, {276, 4.8112035}]|
|9        |[{492, 4.2895117}, {337, 4.242257}, {53, 4.1915984}, {43, 4.118189}, {243, 4.117582}] |
+---------+--------------------------------------------------------------------------------------+
only showing top 5 rows

