In [1]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.2.1.tar.gz (281.4 MB)
[K     |████████████████████████████████| 281.4 MB 28 kB/s 
[?25hCollecting py4j==0.10.9.3
  Downloading py4j-0.10.9.3-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 44.0 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.2.1-py2.py3-none-any.whl size=281853642 sha256=30eb7f3120182520f59e4b9dd93208adc820d61195129a65454a6bf1a47cf029
  Stored in directory: /root/.cache/pip/wheels/9f/f5/07/7cd8017084dce4e93e84e92efd1e1d5334db05f2e83bcef74f
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.3 pyspark-3.2.1


In [2]:
pip install wget

Collecting wget
  Downloading wget-3.2.zip (10 kB)
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25l[?25hdone
  Created wheel for wget: filename=wget-3.2-py3-none-any.whl size=9675 sha256=c3210edc001ee94d826763d615763eccbf0489b9ee164b037661c05b90228d83
  Stored in directory: /root/.cache/pip/wheels/a1/b6/7c/0e63e34eb06634181c63adacca38b79ff8f35c37e3c13e3c02
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2


In [4]:
import os

import wget

ML25M_URL = 'https://files.grouplens.org/datasets/movielens/ml-25m.zip'
ML25M_ZIP = 'ml25.zip'

# Skip the download when the archive is already on disk: re-running the cell
# would otherwise fetch the whole file again, and wget does not overwrite an
# existing target (it stores the new copy under a suffixed name instead).
if not os.path.exists(ML25M_ZIP):
    wget.download(ML25M_URL, ML25M_ZIP)

# Echo the archive path as the cell result.
ML25M_ZIP

'ml25.zip'

In [5]:
from zipfile import ZipFile

In [8]:
# Bind the archive to `archive` rather than `zip`: a `with ... as zip` at cell
# level would replace the built-in zip() for the rest of the kernel session.
with ZipFile("ml25.zip", "r") as archive:
    archive.printdir()    # list the archive members
    archive.extractall()  # unpack everything into the current working directory

File Name                                             Modified             Size
ml-25m/                                        2019-11-21 16:41:38            0
ml-25m/tags.csv                                2019-11-21 15:55:50     38810332
ml-25m/links.csv                               2019-11-21 16:40:20      1368578
ml-25m/README.txt                              2019-11-21 16:41:38        10460
ml-25m/ratings.csv                             2019-11-21 16:26:42    678260987
ml-25m/genome-tags.csv                         2019-11-21 16:35:38        18103
ml-25m/genome-scores.csv                       2019-11-21 16:35:38    435164157
ml-25m/movies.csv                              2019-11-21 16:32:10      3038099


In [9]:
from pyspark.sql import SparkSession

# Obtain the Spark session for this notebook (an existing one is reused if
# present; otherwise a new one named 'recom' is created).
session = (
    SparkSession.builder
    .appName('recom')
    .getOrCreate()
)

In [10]:
# Declare the column types up front instead of using inferSchema=True, which
# makes Spark scan the whole ratings file once just to guess the types.
# The types match what the file contains: integer ids/timestamps, float ratings.
RATINGS_SCHEMA = "userId INT, movieId INT, rating DOUBLE, timestamp INT"

data = session.read.csv("ml-25m/ratings.csv", header=True, schema=RATINGS_SCHEMA)

In [11]:
# Peek at the first five rating rows.
data.show(n=5)

+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
|     1|    296|   5.0|1147880044|
|     1|    306|   3.5|1147868817|
|     1|    307|   5.0|1147868828|
|     1|    665|   5.0|1147878820|
|     1|    899|   3.5|1147868510|
+------+-------+------+----------+
only showing top 5 rows



In [12]:
# Total number of rating rows in the loaded file.
ratings_total = data.count()
ratings_total

25000095

In [16]:
from pyspark.sql.functions import countDistinct

# Number of distinct users that appear in the ratings.
distinct_users = data.select(countDistinct('userId'))
distinct_users.show()

+----------------------+
|count(DISTINCT userId)|
+----------------------+
|                162541|
+----------------------+



In [17]:
# Number of distinct movies that appear in the ratings.
distinct_movies = data.select(countDistinct("movieId"))
distinct_movies.show()

+-----------------------+
|count(DISTINCT movieId)|
+-----------------------+
|                  59047|
+-----------------------+



In [18]:
# Import the functions module under a namespace: importing `min` and `max` by
# name would replace Python's built-in min()/max() for every later cell.
from pyspark.sql import functions as F

# Lowest and highest rating values present in the data.
data.select(F.min("rating"), F.max("rating")).show()

+-----------+-----------+
|min(rating)|max(rating)|
+-----------+-----------+
|        0.5|        5.0|
+-----------+-----------+



In [19]:
# Fixed seed so the 75/25 train/test split is the same on every run.
SPLIT_SEED = 42

training, test = data.randomSplit([.75, .25], seed=SPLIT_SEED)
print(training.count())
print(test.count())

18752739
6247356


In [20]:
from pyspark.ml.recommendation import ALS

# Configure the ALS estimator.
# - coldStartStrategy="drop": users or movies that only occur in the test
#   split have no learned factors and would get NaN predictions; dropping
#   those rows keeps NaNs out of the results and out of sorted views.
# - seed: ALS initialises its factors randomly; fixing the seed makes the
#   fitted model repeatable.
model = ALS(
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    coldStartStrategy="drop",
    seed=42,
)

# Fit on the training split, then score the held-out split.
newmodel = model.fit(training)
results = newmodel.transform(test)
results.show()

+------+-------+------+----------+----------+
|userId|movieId|rating| timestamp|prediction|
+------+-------+------+----------+----------+
|     1|    306|   3.5|1147868817|  4.075927|
|     1|   1237|   5.0|1147868839|  3.951597|
|     1|   2692|   5.0|1147869100|  4.002228|
|     1|   3569|   5.0|1147879603| 3.6241555|
|     1|   3949|   5.0|1147868678| 3.8945453|
|     1|   5684|   2.0|1147879797| 3.5936627|
|     1|   7234|   4.5|1147868869| 3.8987718|
|     1|   7327|   3.5|1147868855| 3.9351058|
|     1|   7361|   5.0|1147880055|  4.097943|
|     1|   7365|   4.0|1147869033| 3.8021004|
|     1|   7939|   2.5|1147869183|  3.822856|
|     1|   8360|   4.0|1147868682| 3.2438219|
| 32906|   1193|   5.0| 965802125|  4.873412|
| 32906|   1198|   5.0| 965801230|  4.916242|
| 32906|   1203|   5.0| 965802215| 4.9936147|
| 32906|   1204|   5.0| 965798677|  4.980528|
| 32906|   1209|   5.0| 965799052|  4.682129|
| 32906|   1225|   4.0| 965802278| 4.8169355|
| 32906|   1269|   4.0| 965799388|

In [21]:
# The `prediction` column was produced from userId and movieId; the test
# set's own `rating` column is only shown alongside it for comparison.
# Show the five highest predictions for movie 1217.
movie_1217 = results.where(results['movieId'] == 1217)
movie_1217.sort('prediction', ascending=False).show(5)

+------+-------+------+----------+----------+
|userId|movieId|rating| timestamp|prediction|
+------+-------+------+----------+----------+
|140038|   1217|   5.0|1018348499|  5.062324|
| 49998|   1217|   5.0|1303171174|  5.014899|
| 35911|   1217|   5.0|1150793847|  4.987176|
|136967|   1217|   5.0| 962374039|  4.976087|
|  5420|   1217|   5.0|1453484404| 4.9728584|
+------+-------+------+----------+----------+
only showing top 5 rows



In [22]:
# Test-set rows for user 4377, highest predicted rating first.
user_4377 = results.where(results['userId'] == 4377)
user_4377.sort('prediction', ascending=False).show()

+------+-------+------+----------+----------+
|userId|movieId|rating| timestamp|prediction|
+------+-------+------+----------+----------+
|  4377|   1212|   5.0| 974760416|  5.169819|
|  4377|    923|   5.0| 974760495|  5.141238|
|  4377|   1260|   5.0| 974760725| 5.1149497|
|  4377|   3022|   5.0|1071474623| 5.0955753|
|  4377|   1233|   5.0| 974761512|  5.080318|
|  4377|   1219|   5.0| 974760891| 5.0516453|
|  4377|    111|   5.0| 974761309|  5.045123|
|  4377|    924|   5.0| 974760753| 5.0211706|
|  4377|   1228|   5.0| 974761612| 5.0163136|
|  4377|   1213|   5.0| 974804074|  4.990537|
|  4377|   1267|   5.0| 974760970| 4.9730606|
|  4377|   3470|   5.0| 974760330| 4.9635687|
|  4377|   1230|   5.0| 974761512| 4.9611435|
|  4377|   6787|   5.0|1070504398| 4.9382296|
|  4377|   2010|   5.0| 974761674|  4.925714|
|  4377|   1244|   5.0| 974762049|  4.922358|
|  4377|    928|   5.0| 974761309|   4.91721|
|  4377|   1283|   5.0| 974762262| 4.8887625|
|  4377|   1262|   5.0| 974761560|