## To run pyspark in notebook, import these

In [4]:
import findspark
findspark.init()
findspark.find()
import pyspark
findspark.find()

'C:\\Spark\\spark-3.0.1-bin-hadoop2.7'

## Read File as csv format and take only required fields (uid, animeid, rating_score)

In [5]:
import csv
lines = open('AnimeData/reviews.csv',encoding='utf-8')
fcsv = csv.reader(lines)
result = list(fcsv)
rates = []
for i in result[1::]:  #remove header row 0
    rates.append((int(i[0]),int(i[2]),float(i[4])))

In [6]:
rates[:10:]

[(255938, 34096, 8.0),
 (259117, 34599, 10.0),
 (253664, 28891, 7.0),
 (8254, 2904, 9.0),
 (291149, 4181, 10.0),
 (10046, 2904, 10.0),
 (247454, 16664, 6.0),
 (140903, 2904, 8.0),
 (23791, 2904, 10.0),
 (25115, 4181, 4.0)]

## Create spark session with app name 'Anime Recommendation' and create dataframe with respective column names (UID, AnimeID, Score)

In [7]:
from pyspark.sql import SparkSession,functions,Row

spark = SparkSession.builder.appName('Anime Recomendation').getOrCreate()
data = spark.createDataFrame(rates,['UID', 'AnimeID', 'Score'])

In [35]:
data.show(10)

+------+-------+-----+
|   UID|AnimeID|Score|
+------+-------+-----+
|255938|  34096|  8.0|
|259117|  34599| 10.0|
|253664|  28891|  7.0|
|  8254|   2904|  9.0|
|291149|   4181| 10.0|
| 10046|   2904| 10.0|
|247454|  16664|  6.0|
|140903|   2904|  8.0|
| 23791|   2904| 10.0|
| 25115|   4181|  4.0|
+------+-------+-----+
only showing top 10 rows



## Remove deplicate values of same uid rates same animeID many times by using avg

In [36]:
#remove deplicate value of same uid rates same animeID many times
ratings = data.groupBy('UID','AnimeID').avg('score')
ratings.show(10)

+------+-------+----------+
|   UID|AnimeID|avg(score)|
+------+-------+----------+
|287614|   5680|       7.0|
| 28120|   5680|      10.0|
| 94151|     97|      10.0|
| 31995|   4472|       7.0|
|297731|  38958|      10.0|
|217503|   1571|       8.0|
|275482|  19319|       9.0|
|283922|  35860|       8.0|
|240022|  28999|      10.0|
|212633|  20057|      10.0|
+------+-------+----------+
only showing top 10 rows



## Create ALS model

In [37]:
from pyspark.ml.recommendation import ALS

als = ALS(rank=10, regParam=0.01, userCol='UID', itemCol='AnimeID', ratingCol='avg(score)')
model = als.fit(ratings)

In [38]:
model

ALSModel: uid=ALS_84d3a49ac7d6, rank=10

## Get anime that are more than 50 ratings

In [39]:
mostRate = ratings.groupBy('AnimeID').count().filter('count > 50')
mostRate.show(10)

+-------+-----+
|AnimeID|count|
+-------+-----+
|     26|   79|
|  16742|  186|
|  30276|  414|
|  11771|  149|
|  35120|  314|
|  34383|   56|
|  13659|   53|
|   2581|   73|
|    270|   75|
|   1690|   70|
+-------+-----+
only showing top 10 rows



## Create testing dataset with UID 34

In [41]:
from pyspark.sql.functions import lit

dataForRecom = mostRate.select('AnimeID').withColumn('UID', lit(34))
dataForRecom.show(10)

+-------+---+
|AnimeID|UID|
+-------+---+
|     26| 34|
|  16742| 34|
|  30276| 34|
|  11771| 34|
|  35120| 34|
|  34383| 34|
|  13659| 34|
|   2581| 34|
|    270| 34|
|   1690| 34|
+-------+---+
only showing top 10 rows



## Get anime names and add it to dictionary with key as anime id and value as anime name

In [8]:
file = open('AnimeData/animes.csv',encoding='utf-8')
names = list(csv.reader(file))
anime_names = {}
for name in names[1::]:
    anime_names[int(name[0])] = name[1]

## Predict the data of UID 34

In [43]:
result = model.transform(dataForRecom)
recomemded = result.sort('prediction',ascending=False)
recomemded.show(10)

+-------+---+----------+
|AnimeID|UID|prediction|
+-------+---+----------+
|  28891| 34|  4.599096|
|  10495| 34| 3.3498428|
|   8426| 34| 3.2341256|
|  16774| 34| 3.1218092|
|   6802| 34| 2.9618676|
|  34984| 34|  2.934247|
|  38329| 34| 2.9225383|
|     45| 34|  2.904904|
|     57| 34|  2.898753|
|  21877| 34| 2.5509806|
+-------+---+----------+
only showing top 10 rows



## Get top 10 prediction with anime names

In [52]:
show = recomemded.take(10)

In [59]:
print('Recomended Anime for userid 34:\n')

for anime in show:
    print(anime_names[anime[0]],round(anime[2],2))

Recomended Anime for userid 34:

Haikyuu!! Second Season 4.6
Yuru Yuri 3.35
Hourou Musuko 3.23
Inferno Cop 3.12
So Ra No Wo To 2.96
Koi wa Ameagari no You ni 2.93
Seishun Buta Yarou wa Yumemiru Shoujo no Yume wo Minai 2.92
Rurouni Kenshin: Meiji Kenkaku Romantan 2.9
Beck 2.9
High Score Girl 2.55
