In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load Anime and Ratings Data
animes = pd.read_csv('anime.csv')  # Anime metadata
# Work on a 10,000-row sample of the ratings. The fixed seed makes the sample, and
# every result derived from it, identical on each Restart & Run All.
ratings = pd.read_csv('rating1.csv').sample(10000, random_state=13)

# Merge the anime and ratings datasets. The join key is stated explicitly so the
# merge does not silently change if the two files ever share another column name.
ratings = pd.merge(animes, ratings, on='anime_id')
# Drop the metadata columns that are not needed for collaborative filtering
ratings = ratings.drop(['genre', 'type', 'episodes', 'new_rating', 'members'], axis=1)

In [None]:
# Inspect the merged ratings frame
ratings

Unnamed: 0,anime_id,name,user_id,rating
0,32281,Kimi no Na wa.,9278,10
1,32281,Kimi no Na wa.,6021,9
2,5114,Fullmetal Alchemist: Brotherhood,1209,9
3,5114,Fullmetal Alchemist: Brotherhood,3017,9
4,5114,Fullmetal Alchemist: Brotherhood,1048,1
...,...,...,...,...
9995,5560,Aoi Kokuhaku,6268,1
9996,1639,Boku no Pico,9676,1
9997,1639,Boku no Pico,219,7
9998,1639,Boku no Pico,8628,1


In [None]:
# Keep only the user, the anime title and the rating
ratings = ratings.loc[:, ['user_id', 'name', 'rating']]

In [None]:
# Pivot into a user x anime matrix. pivot_table is used instead of pivot because pivot
# raises ValueError when the same (user_id, name) pair occurs more than once; repeated
# ratings are averaged instead. Titles a user has not rated are then filled with 0.
ratings = (
    ratings
    .pivot_table(index='user_id', columns='name', values='rating', aggfunc='mean')
    .fillna(0)
)

In [None]:
#  Define a function to standardize data
def standardize(row):
  """Mean-normalize a vector: subtract its mean and divide by its range (max - min).

  Args:
    row: a pandas Series (or any object offering .mean(), .max() and .min()).

  Returns:
    An object of the same shape as `row`. If every value is identical the range is
    zero; zeros are returned in that case instead of the NaN that 0/0 would produce.
  """
  value_range = row.max() - row.min()
  centered = row - row.mean()
  # Only a scalar range can be tested directly; non-scalar ranges (e.g. when a whole
  # DataFrame is passed) fall through to the plain element-wise division.
  if np.ndim(value_range) == 0 and value_range == 0:
    return centered * 0.0
  return centered / value_range

In [None]:
# Standardize the matrix one column at a time: axis=0 (the default of DataFrame.apply)
# hands each anime's column of ratings to standardize()
ratings = ratings.apply(standardize, axis=0)

In [None]:
# Preview the first rows of the standardized matrix
ratings.head()

name,&quot;Bungaku Shoujo&quot; Memoire,&quot;Bungaku Shoujo&quot; Movie,.hack//G.U. Trilogy,.hack//Gift,.hack//Intermezzo,.hack//Liminality,.hack//Quantum,.hack//Roots,.hack//Sign,.hack//Tasogare no Udewa Densetsu,...,ef: A Tale of Melodies. - Prologue,ef: A Tale of Memories.,ef: A Tale of Memories. - Prologue,ef: A Tale of Memories. - Recollections,s.CRY.ed,xxxHOLiC,xxxHOLiC Kei,xxxHOLiC Movie: Manatsu no Yoru no Yume,xxxHOLiC Rou,xxxHOLiC Shunmuki
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,-0.000223,-0.000245,-0.000223,-0.000312,-0.000223,-0.000223,-0.000418,-0.000473,-0.00047,-0.000573,...,-0.000223,-0.001248,-0.000379,-0.000223,-0.000371,-0.001381,-0.001634,-0.000255,-0.000223,-0.000624
3,-0.000223,-0.000245,-0.000223,-0.000312,-0.000223,-0.000223,-0.000418,-0.000473,-0.00047,-0.000573,...,-0.000223,-0.001248,-0.000379,-0.000223,-0.000371,-0.001381,-0.001634,-0.000255,-0.000223,-0.000624
4,-0.000223,-0.000245,-0.000223,-0.000312,-0.000223,-0.000223,-0.000418,-0.000473,-0.00047,-0.000573,...,-0.000223,-0.001248,-0.000379,-0.000223,-0.000371,-0.001381,-0.001634,-0.000255,-0.000223,-0.000624
5,-0.000223,-0.000245,-0.000223,-0.000312,-0.000223,-0.000223,-0.000418,-0.000473,-0.00047,-0.000573,...,-0.000223,-0.001248,-0.000379,-0.000223,-0.000371,-0.001381,-0.001634,-0.000255,-0.000223,-0.000624
7,-0.000223,-0.000245,-0.000223,-0.000312,-0.000223,-0.000223,-0.000418,-0.000473,-0.00047,-0.000573,...,-0.000223,-0.001248,-0.000379,-0.000223,-0.000371,-0.001381,-0.001634,-0.000255,-0.000223,-0.000624


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Item-item cosine similarity: the matrix is transposed so each anime is one row vector
item_similarity = cosine_similarity(ratings.transpose())

# Label both axes with the anime titles so scores can be looked up by name
item_sim_df = pd.DataFrame(
    data=item_similarity,
    index=ratings.columns,
    columns=ratings.columns,
)

In [None]:
# Define a function to get similar anime based on a given anime and user rating
def get_similar_anime(anime_name,user_rating):
  """Rank every anime by its similarity to `anime_name`, scaled by `user_rating`.

  Args:
    anime_name: label of a column in `item_sim_df`.
    user_rating: the user's rating of that anime; used as a multiplier.

  Returns:
    The selected column of `item_sim_df` multiplied by `user_rating`, sorted in
    descending order.

  Raises:
    KeyError: if `anime_name` is not a column of `item_sim_df`.
  """
  try:
    similarity = item_sim_df[anime_name]
  except KeyError as exc:
    # Same exception type as before, but with a message that explains the cause
    raise KeyError(
      f"{anime_name!r} is not in the similarity matrix; check the exact title spelling"
    ) from exc

  # Calculate the similarity scores for the given anime, weighted by the user's rating
  similar_score = similarity * user_rating
  # Sort the similarity scores in descending order and return the result
  return similar_score.sort_values(ascending=False)

In [None]:
# All titles ranked by similarity to 'Death Note', weighted by a user rating of 6
print(get_similar_anime('Death Note',6))

name
Death Note                                      6.000000
Ookiku Furikabutte Special                      1.044791
Pokemon: Pikachu no Obake Carnival              1.044791
Dragon Ball Movie 2: Majinjou no Nemuri Hime    1.044791
Gallery Fake                                    1.035677
                                                  ...   
Steins;Gate                                    -0.046590
Fullmetal Alchemist: Brotherhood               -0.046615
Tokyo Ghoul                                    -0.046734
Code Geass: Hangyaku no Lelouch                -0.048754
Fullmetal Alchemist                            -0.053387
Name: Death Note, Length: 2732, dtype: float64


In [None]:
# Ratings supplied by an example user, as (anime title, rating) pairs
temp_user = [
    ('Death Note', 7),
    ('Naruto', 7),
    ('Fullmetal Alchemist: Brotherhood', 7),
    ('Kimi no Na wa.', 7),
]

In [None]:
# Get one Series of weighted similarity scores per anime the user has rated.
# The Series are gathered in a list and combined once, which avoids the private
# DataFrame._append API and re-copying the frame on every loop iteration.
score_rows = [get_similar_anime(anime, rating) for anime, rating in temp_user]

# concat aligns the Series on anime title; after the transpose there is one row per
# rated anime (numbered 0..n-1 by ignore_index) and one column per title.
similar_animes = (
    pd.concat(score_rows, axis=1, ignore_index=True).T if score_rows else pd.DataFrame()
)

In [None]:
# Sum the weighted scores per title and show the ten highest. Titles the user has
# already rated are removed first: each is perfectly similar to itself, so it would
# otherwise sit at the top of the list without being a recommendation.
(
    similar_animes.sum()
    .drop(labels=[title for title, _ in temp_user], errors='ignore')
    .sort_values(ascending=False)
    .head(10)
)

Unnamed: 0_level_0,0
name,Unnamed: 1_level_1
Kimi no Na wa.,6.962526
Naruto,6.893424
Fullmetal Alchemist: Brotherhood,6.890538
Death Note,6.879328
World Destruction: Sekai Bokumetsu no Rokunin,3.686471
Nanatsu no Taizai: Seisen no Shirushi,3.640114
Umineko no Naku Koro ni,2.55565
Hatsukoi Limited: Gentei Shoujo,1.42744
Resort Boin,1.42744
Panchira Teacher,1.387883


**The same procedure as above, repeated for a user who has rated only a single anime**

In [None]:
# Example user who has rated a single anime
temp_user=[('Kimi no Na wa.',5)]

# One Series of weighted similarity scores per rated anime, combined in a single
# concat (this avoids the private DataFrame._append API and growing a frame in a loop)
score_rows = [get_similar_anime(anime, rating) for anime, rating in temp_user]
similar_animes = (
    pd.concat(score_rows, axis=1, ignore_index=True).T if score_rows else pd.DataFrame()
)

# Rank the summed scores, leaving out the titles the user has already rated
(
    similar_animes.sum()
    .drop(labels=[title for title, _ in temp_user], errors='ignore')
    .sort_values(ascending=False)
    .head(10)
)

Unnamed: 0_level_0,0
name,Unnamed: 1_level_1
Kimi no Na wa.,5.0
World Destruction: Sekai Bokumetsu no Rokunin,2.661483
Nanatsu no Taizai: Seisen no Shirushi,2.626886
Umineko no Naku Koro ni,1.326851
Samurai Champloo,0.886905
Manie-Manie: Meikyuu Monogatari,-0.001574
Sasameki Koto,-0.001574
Slayers Revolution,-0.001574
The iDOLM@STER: 765 Pro to Iu Monogatari,-0.001574
Bakusou Kyoudai Let&#039;s &amp; Go,-0.001574


**ALS Analysis**

In [None]:
from sklearn.model_selection import train_test_split
df=pd.read_csv('rating1.csv')
df=df[['user_id','anime_id','rating']]

# Drop any rows with missing values. dropna() returns a new frame, so the result
# must be assigned back; calling it without assignment leaves df unchanged.
df=df.dropna()
# Split the data into training and test sets (90% training, 10% testing)
train, test = train_test_split(df, test_size=0.1, random_state=13)

In [None]:
# Preview the first six rows of the training split
train.head(6)

Unnamed: 0,user_id,anime_id,rating
335812,3426,985,7
493513,5026,3783,8
492811,5016,255,9
637897,6116,1149,1
834183,7654,11033,8
494261,5031,21105,10


In [None]:
!pip install pyspark



In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

In [None]:
# Initialize a Spark session
spark = SparkSession.builder.appName("app").getOrCreate()

# Convert the pandas DataFrames (train and test) to PySpark DataFrames
train_spark = spark.createDataFrame(train)
test_spark = spark.createDataFrame(test)

# Initialize the ALS (Alternating Least Squares) model for collaborative filtering
als = ALS(
    maxIter=5,
    regParam=0.01,
    userCol="user_id",
    itemCol="anime_id",
    ratingCol="rating",
    # Users or anime that occur only in the test split have no learned factors and
    # receive a NaN prediction under the default strategy; "drop" removes those rows
    # from the output of transform() so they cannot contaminate the metrics.
    coldStartStrategy="drop",
    # Fixed seed so the random factor initialization is repeatable between runs
    seed=13,
)

# Fit the ALS model using the training data
model = als.fit(train_spark)

In [None]:
predictions = model.transform(test_spark) # Use the trained ALS model to add a 'prediction' column to the test data
predictions.show(10) # Display the first 10 rows (no ordering by prediction is applied)

+-------+--------+------+----------+
|user_id|anime_id|rating|prediction|
+-------+--------+------+----------+
|   4372|    2025|     1| 1.0155635|
|   1623|   17074|     7|  7.924421|
|   1993|    1378|     8| 2.9944267|
|   4725|     384|     8|  8.667171|
|   2547|   20047|    10|  8.240953|
|   4414|   22117|     1| 6.8485713|
|   2706|     237|    10|  7.080355|
|   3489|   19221|     1| 1.5296133|
|    776|   24703|     5| 6.2348204|
|   3949|    1817|     7| 5.1313233|
+-------+--------+------+----------+
only showing top 10 rows



In [None]:
# Bring the predictions into pandas and discard any row with a missing value
predictions_pd = predictions.toPandas().dropna()

In [None]:
# Sanity check: list any rows that still contain a missing value (NaN)
predictions_pd[predictions_pd.isnull().any(axis=1)]

Unnamed: 0,user_id,anime_id,rating,prediction


In [None]:
# Actual ratings and the model's predictions, as aligned columns
actual_ratings = predictions_pd.loc[:, 'rating']
predicted_ratings = predictions_pd.loc[:, 'prediction']

# Mean Squared Error (MSE): the average of the squared prediction errors
squared_errors = (actual_ratings - predicted_ratings) ** 2
mse = np.mean(squared_errors)

print(f"Mean Squared Error (MSE): {mse}")

Mean Squared Error (MSE): 3.670681545056706


In [None]:
# Reload the anime metadata, keyed by 'anime_id' so rows can be looked up by id
animes = pd.read_csv('anime.csv').set_index('anime_id')

In [None]:
def predict_animes(id_user):
  """Print the titles of up to 10 anime with the highest predicted rating for a user.

  Only the rows of `predictions_pd` that belong to `id_user` are considered, so the
  candidates are limited to the (user, anime) pairs present in that frame.

  Args:
    id_user: value matched against the 'user_id' column of `predictions_pd`.

  Returns:
    None. One title is printed per line, highest prediction first.
  """
  user_df = predictions_pd[predictions_pd['user_id'] == id_user]  # Filter predictions for the given user ID

  # Get the top 10 anime recommendations based on the predicted ratings
  top_10_animes = user_df.nlargest(10, 'prediction')[['anime_id', 'prediction']]

  # Print the names of the top 10 recommended animes
  for x in list(top_10_animes['anime_id']):
    # `animes` is indexed by anime_id, so the lookup has to be label-based (.loc).
    # Positional .iloc would return whichever row sits at offset x: an unrelated
    # title, or an IndexError when x exceeds the number of rows.
    if x in animes.index:
      print(animes.loc[x, 'name'])
    else:
      print(f"<unknown anime_id {x}>")

In [None]:
# Print the highest-predicted titles for user 108
predict_animes(108)

gdgd Fairies 2 Episode 0
Hyper ERT
Rizelmine
Choubatsu Yobikou
Kakutou Bijin Wulong: Rebirth
Neo Ranga
Soukyuu no Fafner: Right of Left - Single Program


**SVD Analysis**

In [None]:
# Load the raw ratings and the anime metadata for the SVD section
ratings_data = pd.read_csv("rating1.csv")
anime_data = pd.read_csv("anime.csv")

In [None]:
!pip install surprise



In [None]:
from surprise import Dataset
from surprise import Reader

# Read the ratings and the anime metadata from disk
ratings_data = pd.read_csv("rating1.csv")
movies_data = pd.read_csv("anime.csv")

# Smallest and largest rating present in the data
min_rating, max_rating = ratings_data["rating"].min(), ratings_data["rating"].max()

# Tell Surprise the rating scale observed in the data
reader = Reader(rating_scale=(min_rating, max_rating))

In [None]:
from surprise import SVD
from surprise.model_selection import cross_validate

try:
    data = Dataset.load_from_df(ratings_data[['user_id', 'anime_id', 'rating']], reader)
except ValueError as e:
    # Loading failed: report the frame's structure and null counts, then retry after
    # dropping missing values and casting the id columns to int.
    print(f"Error loading data: {e}")
    ratings_data.info()  # info() prints its report itself and returns None
    print(ratings_data.isnull().sum())
    ratings_data.dropna(inplace=True)
    ratings_data['user_id'] = ratings_data['user_id'].astype(int)
    ratings_data['anime_id'] = ratings_data['anime_id'].astype(int)
    data = Dataset.load_from_df(ratings_data[['user_id', 'anime_id', 'rating']], reader)

# 10-fold cross-validation of an SVD model trained for 10 epochs
svd = SVD(n_epochs=10)
results = cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=10, verbose=True)

if np.isnan(results['test_rmse']).any() or np.isnan(results['test_mae']).any():
    print("Warning: NaN values encountered in cross-validation results.")

# nanmean leaves NaN folds out of the average. Replacing them with 0 would count a
# failed fold as a perfect one and make the reported error look better than it is.
print("Average MAE: ", np.nanmean(results["test_mae"]))
print("Average RMSE: ", np.nanmean(results["test_rmse"]))

Evaluating RMSE, MAE of algorithm SVD on 10 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Fold 6  Fold 7  Fold 8  Fold 9  Fold 10 Mean    Std     
RMSE (testset)    1.7253  1.7274  1.7233  1.7180  1.7272  1.7199  1.7268  1.7154  1.7202  1.7184  1.7222  0.0041  
MAE (testset)     1.1623  1.1627  1.1604  1.1595  1.1644  1.1589  1.1624  1.1519  1.1582  1.1584  1.1599  0.0033  
Fit time          15.47   11.50   11.43   11.97   11.83   11.98   11.58   11.90   11.79   11.81   12.13   1.13    
Test time         1.48    1.16    2.08    1.55    1.13    1.53    2.99    1.18    1.51    1.58    1.62    0.53    
Average MAE:  1.1599206668167854
Average RMSE:  1.7221959364139245


In [None]:
from surprise.model_selection import cross_validate
from surprise.model_selection import GridSearchCV

# Candidate hyperparameters: every combination of the values below is tried (3 x 3 = 9)
param_grid = {
  'n_factors': [10, 30, 50],
  'n_epochs': [5, 10, 20]
}

# Evaluate each SVD configuration on RMSE and MAE with 10-fold cross-validation
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=10)
gs.fit(data)

# Best RMSE found and the hyperparameter combination recorded for it
print(gs.best_score['rmse'])
print(gs.best_params['rmse'])

1.7203700989419874
{'n_factors': 50, 'n_epochs': 10}


In [None]:
from surprise.model_selection import train_test_split

# Hyperparameters recorded by the grid search for the best RMSE
best_factor, best_epoch = (
    gs.best_params['rmse'][key] for key in ('n_factors', 'n_epochs')
)

# Hold out a random 20% of the ratings as the test set.
# train_test_split here is Surprise's function (imported in this cell), not sklearn's.
trainset, testset = train_test_split(data, test_size=0.20)

# Build an SVD model with the tuned hyperparameters
svd = SVD(n_epochs=best_epoch, n_factors=best_factor)

# Fit it on the training portion
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7d904ce4ac50>

In [None]:
def generate_recommendation(model, user_id, ratings_df, animes_df, n_items):
    """Print the `n_items` anime with the highest predicted rating that the user has not rated.

    Args:
        model: object with a `test(list_of_[user, item, rating])` method returning
            predictions that expose an `est` attribute (e.g. the fitted SVD above).
        user_id: user to recommend for; matched against `ratings_df["user_id"]`.
        ratings_df: DataFrame with "user_id" and "anime_id" columns.
        animes_df: DataFrame with "anime_id" and "name" columns.
        n_items: maximum number of recommendations to print.

    Returns:
        None. A header line is printed, then one "<name> <predicted rating>" line per
        recommendation. An anime id that has no row in `animes_df` is printed as
        "<unknown anime_id N>" instead of raising IndexError.
    """
    # Get a list of all anime IDs from dataset
    anime_ids = ratings_df["anime_id"].unique()

    # Get a list of all anime IDs that have been rated by the user
    anime_ids_user = ratings_df.loc[ratings_df["user_id"] == user_id, "anime_id"]
    # Get a list of all anime IDs that have not been rated by the user
    anime_ids_to_pred = np.setdiff1d(anime_ids, anime_ids_user)

    # The rating of 4 is only a placeholder to satisfy the (user, item, rating) format
    # that model.test() is given; it is not used when reading the estimates below
    test_set = [[user_id, anime_id, 4] for anime_id in anime_ids_to_pred]

    # Predict the ratings and generate recommendations
    predictions = model.test(test_set)
    pred_ratings = np.array([pred.est for pred in predictions])
    print("Top {0} item recommendations for user {1}:".format(n_items, user_id))

    # Build the id -> name lookup once; the first row wins if an id is repeated
    name_by_id = animes_df.drop_duplicates("anime_id").set_index("anime_id")["name"]

    # Rank top-n animes based on the predicted ratings
    index_max = (-pred_ratings).argsort()[:n_items]
    for i in index_max:
        anime_id = anime_ids_to_pred[i]
        anime_name = name_by_id.get(anime_id, f"<unknown anime_id {anime_id}>")
        print(anime_name, pred_ratings[i])

# define which user ID that we want to give recommendation
userID = 23
# define how many top-n animes that we want to recommend
n_items = 10
# generate recommendation using the model that we have trained.
# The anime metadata frame loaded in this section is named `anime_data`; the name
# `animes_data` is not defined anywhere in the notebook and fails on a fresh kernel.
generate_recommendation(svd,userID,ratings_data,anime_data,n_items)

Top 10 item recommendations for user 23:
Giant Killing 10.0
Cardcaptor Sakura 10.0
Ghost in the Shell: Stand Alone Complex 10.0
Major S6 10.0
Eureka Seven 10.0
Saint Seiya 10.0
Ore no Nounai Sentakushi ga, Gakuen Love Comedy wo Zenryoku de Jama Shiteiru 10.0
Suzumiya Haruhi no Shoushitsu 10.0
Kyou kara Maou! 10.0
Gundam Build Fighters 10.0
