In [2]:
%pip install pandas
%pip install scikit-learn

Collecting pandasNote: you may need to restart the kernel to use updated packages.

  Obtaining dependency information for pandas from https://files.pythonhosted.org/packages/bf/2c/a0cee9c392a4c9227b835af27f9260582b994f9a2b5ec23993b596e5deb7/pandas-2.2.2-cp39-cp39-win_amd64.whl.metadata
  Downloading pandas-2.2.2-cp39-cp39-win_amd64.whl.metadata (19 kB)
Collecting numpy>=1.22.4 (from pandas)
  Obtaining dependency information for numpy>=1.22.4 from https://files.pythonhosted.org/packages/b5/42/054082bd8220bbf6f297f982f0a8f5479fcbc55c8b511d928df07b965869/numpy-1.26.4-cp39-cp39-win_amd64.whl.metadata
  Downloading numpy-1.26.4-cp39-cp39-win_amd64.whl.metadata (61 kB)
     ---------------------------------------- 0.0/61.0 kB ? eta -:--:--
     -------------------- ------------------- 30.7/61.0 kB ? eta -:--:--
     -------------------- ------------------- 30.7/61.0 kB ? eta -:--:--
     -------------------------------------- 61.0/61.0 kB 466.1 kB/s eta 0:00:00
Collecting pytz>=2020.1 (fro

In [20]:
import pandas as pd
from itertools import combinations
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [51]:
class Helpers:
    @staticmethod
    def extractGenres(animeDataframe):
        genres = animeDataframe['Genres'].str.split(', ').tolist()
        genresList = [genre for sublist in genres for genre in sublist]
        uniqueGenres = set(genresList)
        return uniqueGenres
    
    @staticmethod
    def extractTypes(animeDataframe):
        typesList = animeDataframe['Type']
        uniqueTypes = set(typesList)
        return uniqueTypes
    
    @staticmethod
    def extractRatingCategories(animeDataframe):
        ratingsList = animeDataframe["Rating"]
        uniqueRatings = set(ratingsList)
        return uniqueRatings
    
    @staticmethod
    def extractProducers(animeDataframe):
        producers = animeDataframe["Producers"].str.split(', ').tolist()
        producersList = [producer for sublist in producers for producer in sublist]
        uniqueProducers = set(producersList)
        return uniqueProducers
    
    @staticmethod
    def extractLicensors(animeDataframe):
        licensors = animeDataframe["Licensors"].str.split(", ").tolist()
        licensorsList = [licensor for sublist in licensors for licensor in sublist]
        uniqueLicensors = set(licensorsList)
        return uniqueLicensors

    @staticmethod
    def extractStudios(animeDataframe):
        studios = animeDataframe["Studios"].str.split(", ").tolist()
        studiosList = [studio for sublist in studios for studio in sublist]
        uniqueStudios = set(studiosList)
        return uniqueStudios

    @staticmethod
    def extractSources(animeDataframe):
        sourcesList = animeDataframe["Source"]
        uniqueSources = set(sourcesList)
        return uniqueSources
    
    @staticmethod
    def extractNames(animeDataframe):
        namesList = animeDataframe["Name"].tolist()
        return namesList
    
    @staticmethod
    def extractEnglishNames(animeDataframe):
        englishNamesList = animeDataframe["English name"].tolist()
        uniqueEnglishNames = set(englishNamesList)
        return uniqueEnglishNames
    
    @staticmethod
    def extractJapaneseNames(animeDataframe):
        japaneseNamesList = animeDataframe["Japanese name"].tolist()
        uniqueJapeneseNames = set(japaneseNamesList)
        return uniqueJapeneseNames
    
    @staticmethod
    def extractEpisodeIntervals(animeDataframe):
        intervalCount = 5
        animeDataframe['Episodes'] = pd.to_numeric(animeDataframe['Episodes'], errors='coerce')
        minimumEpisodes = animeDataframe['Episodes'].min()
        maximumEpisodes = animeDataframe['Episodes'].max()
        intervalWidth = (maximumEpisodes - minimumEpisodes) / intervalCount
        intervals = []
        for i in range(intervalCount):
            lowerBound = minimumEpisodes + i * intervalWidth
            upperBound = lowerBound + intervalWidth
            intervals.append(f"{upperBound:.0f}")
        return intervals
        
    @staticmethod
    def extractPremieredTime(animeDataframe):
        premiereTimes = animeDataframe["Premiered"].tolist()
        uniquePremiereTimes = set(premiereTimes)
        return uniquePremiereTimes
    
    @staticmethod
    def createUserAnimeMatrix(userPreferenceInformation):
        userAnimeMatrix = {'animeID': userPreferenceInformation["userAnimeIDs"], 'rating': userPreferenceInformation["userRatings"]}
        userAnimeMatrix = pd.DataFrame(userAnimeMatrix)
        userAnimeMatrix.set_index('animeID', inplace=True)
        return userAnimeMatrix

    @staticmethod
    def createUserProfileMatrix(userPreferenceInformation, animeDataframe, contentAttributes, filters):
        columns = [element for set_item in contentAttributes for element in set_item]
        userProfileMatrix = pd.DataFrame(0, index=userPreferenceInformation["userAnimeIDs"], columns=columns)
        
        for animeID in userPreferenceInformation["userAnimeIDs"]:
            if "animeGenres" in filters:
                animeGenres = animeDataframe[animeDataframe['MAL_ID'] == animeID]['Genres'].str.split(', ').tolist()[0]
                userProfileMatrix.loc[animeID, animeGenres] = 1
            if "producers" in filters:
                animeProducers = animeDataframe[animeDataframe['MAL_ID'] == animeID]['Producers'].str.split(', ').tolist()[0]
                userProfileMatrix.loc[animeID, animeProducers] = 1
            if "licensors" in filters:
                animeLicensors = animeDataframe[animeDataframe['MAL_ID'] == animeID]['Licensors'].str.split(', ').tolist()[0]
                userProfileMatrix.loc[animeID, animeLicensors] = 1
            if "studios" in filters:
                animeStudios = animeDataframe[animeDataframe['MAL_ID'] == animeID]['Studios'].str.split(', ').tolist()[0]
                userProfileMatrix.loc[animeID, animeStudios] = 1
        userProfileMatrix = userProfileMatrix.fillna(0)
        return userProfileMatrix

    @staticmethod
    def scaleUserProfileMatrix(userAnimeMatrix, userProfileMatrix):
        scaledUserProfileMatrix = userAnimeMatrix.values * userProfileMatrix.values
        scaledUserProfileMatrix = pd.DataFrame(scaledUserProfileMatrix, columns=userProfileMatrix.columns, index=userAnimeMatrix.index)
        return scaledUserProfileMatrix

    @staticmethod
    def normalizeUserProfileMatrix(scaledUserProfileMatrix, userProfileMatrix):
        columnSums = scaledUserProfileMatrix.sum(axis=0)
        normalizedUserProfileMatrix = columnSums / columnSums.sum()
        normalizedUserProfileMatrix.index = userProfileMatrix.columns
        return normalizedUserProfileMatrix

    @staticmethod
    def createUnratedAnimeMatrix(animeDataframe, contentAttributes):
        columns = [element for set_item in contentAttributes for element in set_item]
        unratedAnimeMatrix = pd.DataFrame(0, index=animeDataframe['MAL_ID'], columns=columns)
        return unratedAnimeMatrix
    
    @staticmethod
    def createUnratedOneHotEncodingMatrix(unratedAnimeMatrix, animeDataframe, filters):
        for animeID in unratedAnimeMatrix.index:
            if "animeGenres" in filters:
                animeGenres = animeDataframe[animeDataframe['MAL_ID'] == animeID]['Genres'].str.split(', ').tolist()[0]
                unratedAnimeMatrix.loc[animeID, animeGenres] = 1
            if "producers" in filters:
                animeProducers = animeDataframe[animeDataframe['MAL_ID'] == animeID]['Producers'].str.split(', ').tolist()[0]
                unratedAnimeMatrix.loc[animeID, animeProducers] = 1
            if "licensors" in filters:
                animeLicensors = animeDataframe[animeDataframe['MAL_ID'] == animeID]['Licensors'].str.split(', ').tolist()[0]
                unratedAnimeMatrix.loc[animeID, animeLicensors] = 1
            if "studios" in filters:
                animeStudios = animeDataframe[animeDataframe['MAL_ID'] == animeID]['Studios'].str.split(', ').tolist()[0]
                unratedAnimeMatrix.loc[animeID, animeStudios] = 1
        return unratedAnimeMatrix

    @staticmethod
    def createUnratedUserAnimeMatrix(filters, userPreferenceInformation):
        fileName = "-".join(filters)
        fileName += ".csv"
        unratedAnimeMatrix = pd.read_csv("../UnratedEncodings/"+fileName)
        unratedAnimeMatrix.drop(userPreferenceInformation['userAnimeIDs'], inplace=True)
        unratedAnimeMatrix = unratedAnimeMatrix.drop("MAL_ID", axis = 1)
        return unratedAnimeMatrix

    @staticmethod
    def createCollaborativeMatrix(animeRatingsDataset):
        users = animeRatingsDataset["user_id"].unique()
        anime = animeRatingsDataset["anime_id"].unique()
        collaborativeMatrix = pd.DataFrame(0, columns=anime, index=users)
        for row in animeRatingsDataset.itertuples():
            user_id = row.user_id
            anime_id = row.anime_id
            rating = row.rating
            collaborativeMatrix.loc[user_id, anime_id] = rating
        return collaborativeMatrix
    
    @staticmethod
    def findSimilarUsers(k, activeUser, collaborativeMatrix):
        k+=1
        similarityMatrix = pd.DataFrame(cosine_similarity(collaborativeMatrix))  
        similarityScores = np.array(similarityMatrix.loc[activeUser])
        similarUsers= np.array((-similarityScores).argsort()[:k])
        similarUsers = np.delete(similarUsers, activeUser)
        return similarUsers, similarityScores[similarUsers]
    
    @staticmethod
    def predictScore(ratings, scores, k):
        result = 0
        for i in range(k):
            result += scores[i] * ratings[i]
        result = result / np.sum(abs(scores))
        return result


In [52]:
class AnimeRecommenderSystem:
    def __init__(self):
        self.animeDataset = pd.read_csv("../Dataset/anime.csv")
        self.ratingsDataset = pd.read_csv("../Dataset/rating_complete.csv")
        self.ratingsDataset = self.ratingsDataset.sample(n=2000)
        self.animeGenres = Helpers.extractGenres(self.animeDataset)
        self.producers = Helpers.extractProducers(self.animeDataset)
        self.licensors = Helpers.extractLicensors(self.animeDataset)
        self.studios = Helpers.extractStudios(self.animeDataset)
        self.dataMap = [
            "animeGenres",
            "producers",
            "licensors",
            "studios"
        ]

    def permuteAndCreateUnratedAnimeMatrices(self):
        categories = self.dataMap
        powerSet = []
        for r in range(1, len(categories) + 1):
            combinationsList = combinations(categories, r)
            for combination in combinationsList:
                powerSet.append("-".join(combination))
        for classifiers in powerSet:
            classifier = classifiers.split("-")
            filteredList = [getattr(self, filter) for filter in classifier]
            unratedAnimeMatrix = Helpers.createUnratedAnimeMatrix(self.animeDataset, filteredList)
            oneHotEncodedUnratedMatrix = Helpers.createUnratedOneHotEncodingMatrix(unratedAnimeMatrix, self.animeDataset, classifier)
            oneHotEncodedUnratedMatrix.to_csv("./UnratedEncodings/"+str(classifiers)+".csv")
    
    def contentRecommender(self, userPreferenceInformation, filters, relevantResultsCount=5):
        userAnimeMatrix = Helpers.createUserAnimeMatrix(userPreferenceInformation)
        filteredList = [getattr(self, filter) for filter in filters]
        userProfileMatrix = Helpers.createUserProfileMatrix(userPreferenceInformation, self.animeDataset, filteredList, filters)
        scaledUserProfileMatrix = Helpers.scaleUserProfileMatrix(userAnimeMatrix, userProfileMatrix)
        normalizedUserProfileMatrix = Helpers.normalizeUserProfileMatrix(scaledUserProfileMatrix, userProfileMatrix)
        unratedUserAnimeMatrix = Helpers.createUnratedUserAnimeMatrix(filters, userPreferenceInformation)
        cosineSimilarities = cosine_similarity([normalizedUserProfileMatrix], unratedUserAnimeMatrix)
        animeCosineSimilarities = pd.Series(cosineSimilarities[0], index=unratedUserAnimeMatrix.index)
        topCosineSimilarities = animeCosineSimilarities.nlargest(relevantResultsCount)
        return topCosineSimilarities
    
    def userBasedCollaborativeFiltering(self):
        collaborativeMatrix = Helpers.createCollaborativeMatrix(self.ratingsDataset)
        print(collaborativeMatrix)
        activeUser = int(input("Enter Active user ID: "))
        users, scores = Helpers.findSimilarUsers(2,activeUser,collaborativeMatrix)
        ratings = self.ratingsDataset.iloc[users]
        activeItem = int(input("Enter Active Item ID: "))
        ratings = np.array(ratings.iloc[:, activeItem])
        print('k most similar users: ' + str(users))
        print('Similarity scores of similar users : ' + str(scores))
        print('Raw ratings of similar users for Item ' + str(activeItem) + ' : ' + str(ratings))
        predictedRating = int(Helpers.predictScore(ratings, scores, 2))
        print("Predicted Rating for Item " + str(activeItem) + ' is : '  + str(predictedRating))


In [53]:
ARS = AnimeRecommenderSystem()
categories = {
    "1": "animeGenres",
    "2": "producers",
    "3": "licensors",
    "4": "studios"
}

In [None]:
ARS.permuteAndCreateUnratedAnimeMatrices()

In [25]:
userAnimeList = input("Enter comma separated MAL_IDs for the anime you want to rate: ")
userAnimeList = userAnimeList.split(",")
for i in range(len(userAnimeList)):
    userAnimeList[i] = int(userAnimeList[i])

In [26]:
userAnimeRatings = input("Enter comma separated ratings for the anime: ")
userAnimeRatings = userAnimeRatings.split(",")
for i in range(len(userAnimeRatings)):
    userAnimeRatings[i] = int(userAnimeRatings[i])

In [20]:
contentFilterCategories = input("Enter comma separated categories: 1-animeGenres 2-producers 3-licensors 4-studios")
contentFilterCategories = contentFilterCategories.split(",")
for i in range(len(contentFilterCategories)):
    contentFilterCategories[i] = categories.get(contentFilterCategories[i])

In [43]:
cosineSimilarities = ARS.contentRecommender({"userAnimeIDs":userAnimeList, "userRatings":userAnimeRatings},contentFilterCategories)
cosineSimilarities

0        0.269189
2        0.166264
3        0.000000
4        0.179586
7        0.179586
           ...   
17557    0.393452
17558    0.000000
17559    0.318457
17560    0.219947
17561    0.000000
Length: 17559, dtype: float64


13479    0.659840
10119    0.646386
9958     0.590179
1991     0.538757
3695     0.538757
dtype: float64

In [46]:
print("Top Recommendations:")
recommendedAnime = []
for animeID in cosineSimilarities.index:
    recommendation = {}
    recommendation["MAL_ID"] = ARS.animeDataset[ARS.animeDataset['MAL_ID'] == animeID]['MAL_ID'].tolist()[0]
    recommendation["title"] = ARS.animeDataset[ARS.animeDataset['MAL_ID'] == animeID]['Name'].tolist()[0]
    recommendedAnime.append(recommendation)
print("Anime ID | Title")
for anime in recommendedAnime:
    print(anime["MAL_ID"], " | ", anime["title"])

Top Recommendations:
Anime ID | Title
13479  |  Uchuu Kyoudai: Apo's Dream
10119  |  Seitokai Yakuindomo OVA
9958  |  Hayate no Gotoku! Heaven Is a Place on Earth
1991  |  Juusou Kikou Dancougar Nova
3695  |  Spectral Force Chronicle Divergence


In [54]:
ARS.userBasedCollaborativeFiltering()

        31043  33795  32262  16035  8408   37569  11759  11285  30091  21105  \
208765      9      0      0      0      0      0      0      0      0      0   
127763      0      5      0      0      0      0      0      0      0      0   
232262      0      0      9      0      0      0      0      0      0      0   
271499      0      0      0      8      0      0      0      0      0      0   
325141      0      0      0      0      5      0      0      0      0      0   
...       ...    ...    ...    ...    ...    ...    ...    ...    ...    ...   
41429       0      0      0      0      0      0      0      0      0      0   
321888      0      0      0      0      0      0      0      0      0      0   
57540       0      0      0      0      0      0      0      0      0      0   
115642      0      0      0      0      0      0      0      0      0      0   
185217      0      0      0      0      0      0      0      0      0      0   

        ...  19369  19111  20931  9888 