In [2]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.ml.recommendation import ALS

In [3]:
spark = SparkSession.builder.appName('mov_ratings').getOrCreate()



Reading the Data

In [12]:
ratings = spark.read.option("header","true").option("inferSchema","true").csv("ratings.csv").select("Userid","Movieid","Rating","Timestamp").cache()
ratings.show(5)
ratings.printSchema()

movies = spark.read.option("header","true").option("inferSchema","true").csv("movies.csv").select("movieid","title","genres").cache()
movies.show(5)
movies.printSchema()

+------+-------+------+---------+
|Userid|Movieid|Rating|Timestamp|
+------+-------+------+---------+
|     1|      1|   4.0|964982703|
|     1|      3|   4.0|964981247|
|     1|      6|   4.0|964982224|
|     1|     47|   5.0|964983815|
|     1|     50|   5.0|964982931|
+------+-------+------+---------+
only showing top 5 rows

root
 |-- Userid: integer (nullable = true)
 |-- Movieid: integer (nullable = true)
 |-- Rating: double (nullable = true)
 |-- Timestamp: integer (nullable = true)

+-------+--------------------+--------------------+
|movieid|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|
|      5|Father of the Bri...|              Comedy|
+-------+--------------------+--------------------+
only showing top 5 rows

root
 |-- m

### A - Apply ALS Algorithm. Display Userid with Title and Genre

### B - Find RMSE

In [93]:
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

training_data,validation_data = ratings.randomSplit([8.0,2.0])

als = ALS(userCol="Userid",itemCol="Movieid",ratingCol="Rating",rank=10,maxIter=5,regParam=0.01,coldStartStrategy="drop")
evaluator = RegressionEvaluator(metricName="rmse",labelCol="Rating",predictionCol="prediction")
 
model = als.fit(training_data)
predictions=model.transform(validation_data)
predictions.show(15,False)

rmse = evaluator.evaluate(predictions)
print(f"Root Mean Squared Error (RMSE) = {rmse}")

# Userid, title and genre
df_joined = ratings.join(movies, ratings["Movieid"] == movies["movieid"], "inner").select(ratings["Userid"], ratings["Movieid"], ratings["Rating"], ratings["Timestamp"], movies["title"], movies["genres"])
df_joined.select('Userid','title','genres').show()

+------+-------+------+----------+----------+
|Userid|Movieid|Rating|Timestamp |prediction|
+------+-------+------+----------+----------+
|148   |4308   |4.0   |1482548613|3.1101894 |
|148   |4886   |3.0   |1482548505|3.104631  |
|148   |4896   |4.0   |1482548717|3.4539232 |
|148   |5952   |3.0   |1482548769|3.3775644 |
|148   |40629  |5.0   |1482548578|4.0114675 |
|148   |40815  |4.0   |1482548512|3.6738138 |
|148   |50872  |3.0   |1482548504|3.2952244 |
|148   |68954  |4.0   |1482548482|3.8991573 |
|148   |69844  |4.0   |1482548500|4.497712  |
|148   |79702  |4.0   |1482548751|2.4387648 |
|148   |81847  |4.5   |1482549236|3.591402  |
|148   |89745  |4.0   |1482548710|2.8142447 |
|148   |110102 |4.0   |1482548669|3.1342027 |
|148   |115617 |3.5   |1482548495|4.649155  |
|148   |122886 |3.5   |1482548686|4.53813   |
+------+-------+------+----------+----------+
only showing top 15 rows

Root Mean Squared Error (RMSE) = 1.0632168601108882
+------+--------------------+-------------------

### C - Separate all genres individually and display every unique genre along wiht the count of their occurrences

In [94]:
from pyspark import SparkContext
from pyspark import SparkConf

sc = SparkContext.getOrCreate(SparkConf().setMaster("local[*]"))

In [95]:
genres = movies.select('genres')
all_genres = []

genres_rdd = genres.rdd
x = genres_rdd.collect()

for row in genres.rdd.collect():
    genres_all = row["genres"]
    all_genres.append(genres_all.split("|"))

#print(len(all_genres))
    
all_genres = sc.parallelize(all_genres).flatMap(lambda x: x)
#all_genres.count()

counts = all_genres.map(lambda genre: (genre, 1)).reduceByKey(lambda x, y: x + y)

output = counts.collect()
for (word, count) in output:
    print("%s: %i" % (word, count))

Romance: 1596
War: 382
IMAX: 158
Drama: 4361
Documentary: 440
Sci-Fi: 980
Western: 167
Horror: 978
Mystery: 573
Fantasy: 779
Musical: 334
Children: 664
Action: 1828
Animation: 611
(no genres listed): 34
Thriller: 1894
Adventure: 1263
Comedy: 3756
Film-Noir: 87
Crime: 1199


### D - Display total rating of each movie and total rating of each genre

In [96]:
df_joined = ratings.join(movies, ratings["Movieid"] == movies["movieid"], "inner").select(ratings["Userid"], ratings["Movieid"], ratings["Rating"], ratings["Timestamp"], movies["title"], movies["genres"])

df_joined.groupBy('title').sum('Rating').show()

df_joined.groupBy('genres').sum('Rating').show()

+--------------------+-----------+
|               title|sum(Rating)|
+--------------------+-----------+
|       Psycho (1960)|      335.0|
|Men in Black (a.k...|      575.5|
|Gulliver's Travel...|        9.0|
|Heavenly Creature...|       82.5|
|    Elizabeth (1998)|       84.5|
|Before Night Fall...|       21.5|
|O Brother, Where ...|      358.0|
|Snow White and th...|      278.5|
| Three Wishes (1995)|        3.0|
|When We Were King...|       39.0|
|   Annie Hall (1977)|      224.5|
| If Lucy Fell (1996)|        5.0|
|First Blood (Ramb...|      106.5|
|Don't Tell Mom th...|       30.5|
| Nut Job, The (2014)|       13.0|
|22 Jump Street (2...|       70.0|
|   Deadpool 2 (2018)|       46.5|
|Starship Troopers...|        3.0|
|Voices from the L...|       21.5|
|Night of the Livi...|      105.0|
+--------------------+-----------+
only showing top 20 rows

+--------------------+-----------+
|              genres|sum(Rating)|
+--------------------+-----------+
|Adventure|Sci-Fi|...|      1

### E - 1) Top 10 best performing movies

### 2) 5 worst performing movies

### 3) Average rating of all movies

### 4) Average rating of all users

In [97]:
from pyspark.sql.functions import col,sum,avg

#Top 10 best performing
df_joined.groupBy('title').agg(sum('Rating').alias('total_rating')).orderBy(col('total_rating').cast('float'), ascending=False).show(10)

#5 worst performing
df_joined.groupBy('title').agg(sum('Rating').alias('total_rating')).orderBy(col('total_rating').cast('float')).show(5)

#Average rating of all movies
df_joined.groupBy('title').agg(avg('Rating').alias('avergae_rating')).show()

#Average rating of all users
df_joined.groupBy('Userid').agg(avg('Rating').alias('avergae_rating')).show()

+--------------------+------------+
|               title|total_rating|
+--------------------+------------+
|Shawshank Redempt...|      1404.0|
| Forrest Gump (1994)|      1370.0|
| Pulp Fiction (1994)|      1288.5|
|  Matrix, The (1999)|      1165.5|
|Silence of the La...|      1161.0|
|Star Wars: Episod...|      1062.0|
|   Braveheart (1995)|       955.5|
|   Fight Club (1999)|       931.5|
|Schindler's List ...|       929.5|
|Jurassic Park (1993)|       892.5|
+--------------------+------------+
only showing top 10 rows

+--------------------+------------+
|               title|total_rating|
+--------------------+------------+
|  Legionnaire (1998)|         0.5|
|       Sorrow (2015)|         0.5|
|     Iron Man (1931)|         0.5|
|Pokémon Heroes (2...|         0.5|
|Gods of Egypt (2016)|         0.5|
+--------------------+------------+
only showing top 5 rows

+--------------------+------------------+
|               title|    avergae_rating|
+--------------------+---------------