<a href="https://colab.research.google.com/github/SharWarr/ML_Projects/blob/main/ML_Project2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import libraries:


In [24]:
import pandas as pd

In [36]:
from sklearn.model_selection import train_test_split

In [4]:
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.1.tar.gz (281.4 MB)
[K     |████████████████████████████████| 281.4 MB 47 kB/s 
[?25hCollecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[K     |████████████████████████████████| 199 kB 57.1 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.1-py2.py3-none-any.whl size=281845512 sha256=ef4b821f4864ab4a42fa85c673b05f471fe78598f7b4d4230d7c840330a79fdd
  Stored in directory: /root/.cache/pip/wheels/42/59/f5/79a5bf931714dcd201b26025347785f087370a10a3329a899c
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.5 pyspark-3.3.1


In [5]:
import pyspark
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row
from pyspark.sql import SparkSession

In [6]:
spark = SparkSession.builder.appName("als").getOrCreate()

## Load Movie Ratings Data

In [8]:
# Load the ratings data into a dataframe
ratings = spark.read.csv("/content/ratings.csv",inferSchema=True, header=True)

In [11]:
# Print the ratings dataframe
ratings

DataFrame[userId: int, movieId: int, rating: double, timestamp: int]

In [12]:
# Drop the timestamp column from the ratings dataframe.
ratings = ratings.drop('timestamp')

In [109]:
# Print the rating dataframe after dropping the timestamp column.
ratings
ratings.count()

372362

## Load New User Data

In [15]:
# Load the New user's ratings data into a dataframe New_user_rating
New_user_rating = spark.read.csv("/content/New_user_rating.csv",inferSchema=True, header=True)

In [108]:
New_user_rating
New_user_rating.count()

50

## Concatenate the Movie Rating data with new user Data.

In [104]:
# Concatenate the ratings and New_user_rating dataframes into single dataframe.
rating_final = ratings.union(New_user_rating)

In [107]:
# Describe the "rating_final" dataframe.
rating_final.count()

372412

In [110]:
from pyspark.sql.functions import mean
rating_final.describe().show()

+-------+------------------+------------------+------------------+
|summary|            userId|           movieId|            rating|
+-------+------------------+------------------+------------------+
|  count|            372412|            372412|            372412|
|   mean| 1336.377920152949|20936.295890035766|3.5561125849865203|
| stddev|2005.2416952567514| 38743.81856613659|1.0494478377380205|
|    min|                 1|                 1|               0.5|
|    max|            162542|            208793|               5.0|
+-------+------------------+------------------+------------------+



## Load Movies Data

In [31]:
# Load the Movies data into a dataframe
movies = spark.read.csv("/content/movies.csv",inferSchema=True, header=True)

In [33]:
# Print the Movies data.
movies.head(5)

[Row(movieId=1, title='Toy Story (1995)', genres='Adventure|Animation|Children|Comedy|Fantasy'),
 Row(movieId=2, title='Jumanji (1995)', genres='Adventure|Children|Fantasy'),
 Row(movieId=3, title='Grumpier Old Men (1995)', genres='Comedy|Romance'),
 Row(movieId=4, title='Waiting to Exhale (1995)', genres='Comedy|Drama|Romance'),
 Row(movieId=5, title='Father of the Bride Part II (1995)', genres='Comedy')]

In [34]:
ratings.head(5)

[Row(userId=1, movieId=296, rating=5.0),
 Row(userId=1, movieId=306, rating=3.5),
 Row(userId=1, movieId=307, rating=5.0),
 Row(userId=1, movieId=665, rating=5.0),
 Row(userId=1, movieId=899, rating=3.5)]

In [35]:
New_user_rating.head(5)

[Row(userId=162542, movieId=85788, rating=3.5),
 Row(userId=162542, movieId=104908, rating=3.0),
 Row(userId=162542, movieId=109487, rating=4.5),
 Row(userId=162542, movieId=8533, rating=4.0),
 Row(userId=162542, movieId=4995, rating=3.5)]

## Split the Rating data to Train & Test set

In [47]:
x = rating_final.drop('rating')
y = rating_final.drop('userId','movieId')

In [48]:
x.head(5)

[Row(userId=1, movieId=296),
 Row(userId=1, movieId=306),
 Row(userId=1, movieId=307),
 Row(userId=1, movieId=665),
 Row(userId=1, movieId=899)]

In [49]:
y.head(5)

[Row(rating=5.0),
 Row(rating=3.5),
 Row(rating=5.0),
 Row(rating=5.0),
 Row(rating=3.5)]

In [72]:
# Split the "rating_final" data into training and test data.
(training,validation,test) = rating_final.randomSplit([0.6, 0.2,0.2], seed=26)

In [73]:
training.show()


+------+-------+------+
|userId|movieId|rating|
+------+-------+------+
|     1|    296|   5.0|
|     1|    307|   5.0|
|     1|    665|   5.0|
|     1|    899|   3.5|
|     1|   1175|   3.5|
|     1|   1217|   3.5|
|     1|   1250|   4.0|
|     1|   1260|   3.5|
|     1|   1653|   4.0|
|     1|   2011|   2.5|
|     1|   2068|   2.5|
|     1|   2351|   4.5|
|     1|   2573|   4.0|
|     1|   2632|   5.0|
|     1|   2692|   5.0|
|     1|   3448|   4.0|
|     1|   3569|   5.0|
|     1|   3949|   5.0|
|     1|   4144|   5.0|
|     1|   4308|   3.0|
+------+-------+------+
only showing top 20 rows



In [74]:
validation.show()

+------+-------+------+
|userId|movieId|rating|
+------+-------+------+
|     1|   1088|   4.0|
|     1|   1237|   5.0|
|     1|   2012|   2.5|
|     1|   2843|   4.5|
|     1|   5269|   0.5|
|     1|   5684|   2.0|
|     1|   6711|   5.0|
|     1|   6954|   3.5|
|     1|   7318|   2.0|
|     1|   7361|   5.0|
|     1|   7820|   2.5|
|     1|   7938|   2.5|
|     1|   8685|   1.0|
|     1|   8873|   3.0|
|     1|   8973|   4.0|
|     1|  27193|   3.0|
|     1|  32591|   5.0|
|     2|     62|   0.5|
|     2|    150|   4.0|
|     2|    333|   5.0|
+------+-------+------+
only showing top 20 rows



In [75]:
test.show()

+------+-------+------+
|userId|movieId|rating|
+------+-------+------+
|     1|    306|   3.5|
|     1|   2161|   3.5|
|     1|   5147|   4.0|
|     1|   6377|   4.0|
|     1|   7323|   3.5|
|     1|   7939|   2.5|
|     1|   7940|   4.5|
|     2|    110|   5.0|
|     2|    151|   4.5|
|     2|    349|   4.5|
|     2|    553|   2.0|
|     2|    653|   3.0|
|     2|    858|   3.5|
|     2|    914|   4.0|
|     2|   1035|   1.0|
|     2|   1197|   5.0|
|     2|   1210|   5.0|
|     2|   1246|   4.0|
|     2|   1291|   5.0|
|     2|   1485|   3.0|
+------+-------+------+
only showing top 20 rows



## ALS Model Building with iteration value = 3

In [76]:
# Build the ALS model.
als = ALS(
         userCol="userId", 
         itemCol="movieId",
         ratingCol="rating",
         maxIter=5, 
         regParam=0.05,
         nonnegative = True, 
         implicitPrefs = False,
         coldStartStrategy="drop"
)

In [77]:
# Print the RMSE value.
model = als.fit(training)

In [78]:
prediction = model.transform(test)

In [79]:
prediction.show()

+------+-------+------+----------+
|userId|movieId|rating|prediction|
+------+-------+------+----------+
|   148|     19|   3.0| 2.4790854|
|   148|    318|   5.0|  4.385628|
|   148|    541|   4.0| 4.0531034|
|   148|    858|   4.5| 4.5477877|
|   148|    908|   4.0|  4.132942|
|   148|    912|   4.0| 4.2696757|
|   148|   1178|   5.0| 4.1429768|
|   148|   1196|   3.5| 4.1865396|
|   148|   1212|   4.0|  4.238729|
|   148|   1217|   3.5| 4.3237085|
|   148|   1254|   4.0|  3.999802|
|   148|   1284|   4.0|  4.119061|
|   148|   1617|   4.0| 4.0856586|
|   148|   2329|   4.0| 4.2353964|
|   148|   2502|   4.0|   3.97389|
|   148|   2571|   5.0|  4.051012|
|   148|   3307|   4.0| 4.5806866|
|   148|   4886|   4.0| 3.8758981|
|   148|  27773|   4.5|  4.138972|
|   148|  54286|   3.5| 3.7024207|
+------+-------+------+----------+
only showing top 20 rows



In [81]:
evaluator = RegressionEvaluator(metricName='rmse', labelCol='rating',predictionCol='prediction')
RMSE = evaluator.evaluate(prediction)
print(RMSE)

0.8840049731501671


## Recommend the top 20 movies to the new user with user id = 162542

In [82]:
# Get the top 20 movie recommendations for all the users.
userRecs = model.recommendForAllUsers(20)

In [83]:
# Print the userRecs.
userRecs.show()

+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|     1|[{93040, 6.019327...|
|     3|[{144580, 5.63226...|
|     5|[{42094, 6.096114...|
|     6|[{42094, 6.629612...|
|     9|[{7193, 8.283019}...|
|    12|[{110675, 5.84665...|
|    13|[{42094, 6.515002...|
|    15|[{93265, 7.735002...|
|    16|[{6837, 6.121942}...|
|    17|[{2281, 6.8287845...|
|    19|[{110675, 5.33438...|
|    20|[{60103, 6.170853...|
|    22|[{7193, 8.407098}...|
|    26|[{26231, 6.044614...|
|    27|[{127130, 7.35465...|
|    28|[{47904, 7.093816...|
|    31|[{60103, 3.885210...|
|    34|[{8125, 6.1509423...|
|    35|[{47904, 6.394623...|
|    37|[{32840, 6.255139...|
+------+--------------------+
only showing top 20 rows



In [111]:
# Filter the recommendations of top 10 movies for userId 162542 from "userRecs".
df= userRecs.where(userRecs.userId == 162542).collect()
print(type(df))

<class 'list'>


In [87]:
# Convert the 'df' object into the dataframe.
df = pd.DataFrame(df)

In [88]:
# Print the df dataframe.
df.head()

Unnamed: 0,0,1
0,162542,"[(6783, 5.838325500488281), (7817, 5.787448883..."


In [89]:
# Get the recommended movies for the userId 162542 into df1 list.
df1 = df.head()[1]

#convert the df1 into dataframe.
df1= spark.createDataFrame(df1)
df1.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                  _1|                  _2|                  _3|                  _4|                  _5|                  _6|                  _7|                  _8|                  _9|                 _10|                 _11|                 _12|                 _13|                 _14|                 _15|                 _16|                 _17|                 _18|                 _19|                 _20|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------

In [92]:
# Join the df1 dataframe with movies data to see the name of the recommended movies.
userRecs_joined = df1.join(movies)

In [93]:
# Print the recommended movies for the new user with userId 162542.
userRecs_joined.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-------+--------------------+--------------------+
|                  _1|                  _2|                  _3|                  _4|                  _5|                  _6|                  _7|                  _8|                  _9|                 _10|                 _11|                 _12|                 _13|                 _14|                 _15|                 _16|                 _17|                 _18|                 _19|                 _20|movieId|               title|              genres|
+--------------------+--------------------+-------------