In a classification setting, the task is to predict which genre or categories a movie belongs to based on certain features such as plot, actors, directors, etc. This type of recommendation is known as content-based filtering.

In a regression setting, the task is to predict the rating a user would give to a movie based on their past ratings and demographic information. This type of recommendation is known as collaborative filtering.

The choice between classification and regression largely depends on the data and the problem being addressed. Both techniques have their own advantages and disadvantages, and it is up to the data scientist to choose the appropriate model based on their understanding of the data and the problem.


# Test 11

In [2]:
df = pd.read_csv(processed_data + "/" + "anime_final.csv")# load anime df
df

Unnamed: 0,anime_id,name,english_title,japanses_title,genre,type,source,duration,episodes,rating,score,rank,members,synopsis,cover
0,1,Cowboy Bebop,Cowboy Bebop,カウボーイビバップ,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",TV,Original,24 min per ep,26,R - 17+ (violence & profanity),8.75,40.0,486824,"Crime is timeless. By the year 2071, humanity ...",https://cdn.myanimelist.net/images/anime/4/196...
1,5,Cowboy Bebop: Tengoku no Tobira,Cowboy Bebop: Tengoku no Tobira,カウボーイビバップ 天国の扉,"Action, Drama, Mystery, Sci-Fi, Space",Movie,Original,1 hr 55 min,1,R - 17+ (violence & profanity),8.38,185.0,137636,"Another day, another bounty—such is the life o...",https://cdn.myanimelist.net/images/anime/1439/...
2,6,Trigun,Trigun,トライガン,"Action, Comedy, Sci-Fi",TV,Manga,24 min per ep,26,PG-13 - Teens 13 or older,8.22,315.0,283069,"Vash the Stampede is the man with a $$60,000,0...",https://cdn.myanimelist.net/images/anime/7/203...
3,7,Witch Hunter Robin,Witch Hunter Robin,Witch Hunter ROBIN (ウイッチハンターロビン),"Action, Drama, Magic, Mystery, Police, Superna...",TV,Original,25 min per ep,26,PG-13 - Teens 13 or older,7.25,2791.0,64905,Robin Sena is a powerful craft user drafted in...,https://cdn.myanimelist.net/images/anime/10/19...
4,8,Beet the Vandel Buster,Bouken Ou Beet,冒険王ビィト,"Adventure, Fantasy, Shounen, Supernatural",TV,Manga,23 min per ep,52,PG - Children,6.94,4310.0,9848,It is the dark century and the people are suff...,https://cdn.myanimelist.net/images/anime/7/215...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12196,34514,Pokemon Generations,Pokemon Generations,ポケモンジェネレーションズ,"Action, Adventure, Fantasy, Game, Kids",ONA,Game,4 min per ep,18,PG - Children,7.46,1855.0,295,Pokémon Generations revisits each generation o...,https://cdn.myanimelist.net/images/anime/11/83...
12197,34519,Mobile Suit Gakuen: G-Reco Koushien,Kidou Senshi Gakuen: G-Reco Koushien,モビルスーツ学園「Ｇ-レコ甲子園」,Comedy,Special,Original,4 min per ep,9,PG - Children,5.27,12388.0,94,Short specials bundled with the Blu-ray volumes.,https://cdn.myanimelist.net/images/anime/2/832...
12198,34522,"Wake Up, Girls! Shin Shou","Wake Up, Girls! Shin Shou","Wake Up, Girls！新章","Drama, Music",TV,Original,23 min per ep,Unknown,PG-13 - Teens 13 or older,6.52,6499.0,381,"""Wake Up, Girls!"" is the story of growth and f...",https://cdn.myanimelist.net/images/anime/10/87...
12199,34525,Centaur no Nayami,Centaur no Nayami,セントールの悩み,"Comedy, Fantasy, Slice of Life, Supernatural",TV,Manga,23 min per ep,Unknown,R+ - Mild Nudity,6.43,7025.0,108,"Himeno is a sweet, shy little centaur girl. In...",https://cdn.myanimelist.net/images/anime/2/867...


In [25]:
import pandas as pd
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import numpy as np
import os
import sys
import joblib

#Preparing folder variables
#Preparing folder variables
os.chdir(os.path.dirname(sys.path[0])) # This command makes the notebook the main path and can work in cascade.
main_folder = sys.path[0]
data_folder = (main_folder + "\data")
saved_models_folder = (data_folder + "\saved_models")
raw_data = (data_folder + "\_raw")
processed_data = (data_folder + "\processed")
content_based_supervised_data = (data_folder + "\processed\content_based_supervised")


'''
This version of the function takes two lists as inputs: genres and types. 
If both lists have at least one value, the function filters the DataFrame 
to include only rows where the genre column matches one of the genres 
in the list and the type column matches one of the types in the list.
'''
def filtering_su(genres, types):
    df = pd.read_csv(processed_data + "/" + "_anime_to_compare_with_name.csv")# load anime df
    df['genre'] = df['genre'].str.split(', ')
    df = df.explode('genre')

    if "All" in genres:
        return df

    if genres and types:
        # If both lists are empty, the original DataFrame is returned without any filtering.
        filtered = df[df['genre'].isin(genres)]
        filtered = filtered[filtered['type'].isin([t for t in types])]
        return filtered

    # Add "All" option to type list
    if "All" in types:
        return df
    
    elif genres:
        # If only the genres list has values, the function filters the DataFrame 
        # to include only rows where the genre column matches one of the genres in the list.
        filtered = df[df['genre'].isin(genres)]
        return filtered
        
    elif types:
        # If only the types list has values, the function filters the DataFrame 
        # to include only rows where the type column matches one of the types in the list.
        filtered = df[df['type'].isin([t for t in types])]
        return filtered
        
    else:
        return df

'''
Create dict of records with the filters selected - each row becomes a dictionary where key is column name and value is the data in the cell.
'''
def create_dict_su(final_df,gen,typ,n=100):
    final_df = filtering_su(gen,typ)
    final_df = final_df.head(n)
    if final_df.empty:
        sentence = print('WOW!!!! Sorry, there is no matches for the anime and options selected! \n Try again, you might have mroe luck')
        return sentence
    else:
        final_dict = final_df.to_dict('records')

        return final_dict



def sort_it(id):
    algo = joblib.load(saved_models_folder + "\SVD_samople_fit.pkl")
    df = pd.read_csv(processed_data + "/" + "anime_final.csv")# load anime df
    df['Estimate_Score'] = df['anime_id'].apply(lambda x: algo.predict(id, x).est)
    df = df.sort_values('Estimate_Score', ascending=False).drop(['anime_id'], axis = 1)
    blankIndex=[''] * len(df)
    df.index=blankIndex 
    return df
# Define the options for the multiselects
option_genre = ['Drama', 'Romance']
option_type = ['Movie', 'TV']

create_dict(sort_it(200),option_genre,option_type,100)

[{'anime_id': 1,
  'name': 'cowboy bebop',
  'english_title': 'Cowboy Bebop',
  'japanses_title': 'カウボーイビバップ',
  'genre': 'Drama',
  'type': 'TV',
  'source': 'Original',
  'duration': '24 min per ep',
  'episodes': 26.0,
  'rating': 'R - 17+ (violence & profanity)',
  'score': 8.75,
  'rank': 40.0,
  'members': 486824.0,
  'synopsis': "Crime is timeless. By the year 2071, humanity has expanded across the galaxy, filling the surface of other planets with settlements like those on Earth. These new societies are plagued by murder, drug use, and theft, and intergalactic outlaws are hunted by a growing number of tough bounty hunters.\n\nSpike Spiegel and Jet Black pursue criminals throughout space to make a humble living. Beneath his goofy and aloof demeanor, Spike is haunted by the weight of his violent past. Meanwhile, Jet manages his own troubled memories while taking care of Spike and the Bebop, their ship. The duo is joined by the beautiful con artist Faye Valentine, odd child Edward 

In [21]:
create_dict(sort_it(200),"Parody","Movie",100)

ValueError: The truth value of a DataFrame is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

SVD-based anime recommendation system using Truncated SVD with PCA, 
perform hyperparameter tuning with GridSearchCV to find the best parameters for the model, evaluate the model using mean squared error, 
and predict N number of animes for each user. The predictions will be saved in a dataframe sorted from higher to lower and saved to a csv file.

In [9]:
import pandas as pd
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import numpy as np
import os
import sys

#Preparing folder variables
os.chdir(os.path.dirname(sys.path[0])) # This command makes the notebook the main path and can work in cascade.
main_folder = sys.path[0]
data_folder = (main_folder + "\data")
saved_models_folder = (data_folder + "\saved_models")
sounds_folder = (main_folder + "\sounds")
saved_models = (main_folder + "\saved_models")
processed_data = (data_folder + "\processed")
raw_data = (data_folder + "\_raw")
user_based_unsupervised_data = (data_folder + "\processed\_user_based_unsupervised")
content_based_unsupervised_data = (data_folder + "\processed\content_based_unsupervised")
content_based_supervised_data = (data_folder + "\processed\content_based_supervised")

In [32]:
# Load the anime ratings dataset into a pandas dataframe
df = pd.read_csv(raw_data + "/" + "rating.csv.zip")

# Merge the ratings dataframe with the anime names dataframe
df = pd.merge(df, pd.read_csv(raw_data + "/" + "anime.csv"), on='anime_id')

In [33]:
# Reducin the df using a sample to test the model faster to see if it works.
size = 100000
rating_sample = df.groupby("rating_x", group_keys=False).apply(lambda x: x.sample(int(np.rint(size*len(x)/len(df))))).sample(frac=1).reset_index(drop=True)

In [34]:
# Create a user-item matrix from the ratings dataframe
matrix = df.pivot_table(index='user_id', columns='name', values='rating_x')

In [35]:
# Impute missing values with the mean rating for each anime
matrix = matrix.fillna(matrix.mean())


In [43]:
# Saving the matrix to pickle
import joblib
import pickle
joblib.dump(matrix,processed_data + "/" + "tests_matrix.pkl")

['c:\\Users\\christiandda\\Documents\\GitHub\\Anime_recommendation_systems-1\\src\\data\\processed/tests_matrix.pkl']

In [36]:
# Perform Truncated SVD with PCA
svd = TruncatedSVD(n_components=50, random_state=42)
anime_matrix = svd.fit_transform(matrix)

In [37]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(anime_matrix, matrix, test_size=0.2, random_state=42)

In [38]:
# Perform hyperparameter tuning with GridSearchCV
param_grid = {'n_components': [50, 100, 200], 'random_state': [42]}
grid_search = GridSearchCV(TruncatedSVD(), param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

Traceback (most recent call last):
  File "c:\Users\christiandda\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\christiandda\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\metrics\_scorer.py", line 220, in __call__
    return self._score(
           ^^^^^^^^^^^^
  File "c:\Users\christiandda\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\metrics\_scorer.py", line 262, in _score
    y_pred = method_caller(estimator, "predict", X)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\christiandda\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\metrics\_scorer.py", line 72, in _cached_call
    return getattr(estimator, method)(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
AttributeError: 'TruncatedSVD' object has no attribute 'pre

In [41]:
# Evaluate the best model on the test set
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)


AttributeError: 'TruncatedSVD' object has no attribute 'predict'

In [42]:
# Predict N number of animes for each user
N = 10
user_predictions = best_model.predict(anime_matrix)

AttributeError: 'TruncatedSVD' object has no attribute 'predict'

In [None]:
# Create a dataframe of anime names and predicted ratings
anime_names = matrix.columns
prediction_df = pd.DataFrame(user_predictions, columns=anime_names)

In [None]:
# Sort the predictions for a specific anime
input_anime = input("Enter an anime name: ")
sorted_predictions = prediction_df[input_anime].sort_values(ascending=False)

In [None]:
# Display the top N recommendations
print("Top", N, "recommendations for", input_anime, ":")
for i in range(N):
    print(sorted_predictions.index[i], sorted_predictions.values[i])

# Test 13

In [44]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

import numpy as np
import os
import sys

#Preparing folder variables
os.chdir(os.path.dirname(sys.path[0])) # This command makes the notebook the main path and can work in cascade.
main_folder = sys.path[0]
data_folder = (main_folder + "\data")
saved_models_folder = (data_folder + "\saved_models")
sounds_folder = (main_folder + "\sounds")
saved_models = (main_folder + "\saved_models")
processed_data = (data_folder + "\processed")
raw_data = (data_folder + "\_raw")
user_based_unsupervised_data = (data_folder + "\processed\_user_based_unsupervised")
content_based_unsupervised_data = (data_folder + "\processed\content_based_unsupervised")
content_based_supervised_data = (data_folder + "\processed\content_based_supervised")

In [None]:
# Load the anime ratings dataset into a pandas dataframe
df = pd.read_csv(raw_data + "/" + "rating.csv.zip")

# Merge the ratings dataframe with the anime names dataframe
df = pd.merge(df, pd.read_csv(raw_data + "/" + "anime.csv"), on='anime_id')

In [None]:
df.head()

Unnamed: 0,user_id,anime_id,rating_x,name,genre,type,episodes,rating_y,members
0,1,20,-1,Naruto,"Action, Comedy, Martial Arts, Shounen, Super P...",TV,220,7.81,683297
1,3,20,8,Naruto,"Action, Comedy, Martial Arts, Shounen, Super P...",TV,220,7.81,683297
2,5,20,6,Naruto,"Action, Comedy, Martial Arts, Shounen, Super P...",TV,220,7.81,683297
3,6,20,-1,Naruto,"Action, Comedy, Martial Arts, Shounen, Super P...",TV,220,7.81,683297
4,10,20,-1,Naruto,"Action, Comedy, Martial Arts, Shounen, Super P...",TV,220,7.81,683297


In [None]:
# Reducin the df using a sample to test the model faster to see if it works.
size = 100000
df = df.groupby("rating_x", group_keys=False).apply(lambda x: x.sample(int(np.rint(size*len(x)/len(df))))).sample(frac=1).reset_index(drop=True)

In [None]:
df.head(3)

Unnamed: 0,user_id,anime_id,rating_x,name,genre,type,episodes,rating_y,members
0,27997,194,7,Macross Zero,"Adventure, Mecha, Military, Sci-Fi, Shounen",OVA,5,7.69,23568
1,16880,4938,8,Tsubasa: Shunraiki,"Action, Adventure, Drama, Fantasy, Magic, Myst...",OVA,2,8.23,40420
2,39151,9989,7,Ano Hi Mita Hana no Namae wo Bokutachi wa Mada...,"Drama, Slice of Life, Supernatural",TV,11,8.62,463835


In [None]:
# Create a user-item matrix from the ratings dataframe
matrix = df.pivot_table(index='user_id', columns='name', values='rating_x')

In [None]:
# Impute missing values with the mean rating for each anime
matrix = matrix.fillna(matrix.mean())

KeyboardInterrupt: 

In [45]:
# Loading saved matrix
matrix = joblib.load(processed_data + "/" + "tests_matrix.pkl")

In [46]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(matrix, matrix, test_size=0.2, random_state=42)

MemoryError: Unable to allocate 4.91 GiB for an array with shape (11196, 58812) and data type float64

In [None]:
# Define a function for model training and evaluation
def train_and_evaluate_model(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    return mse

In [None]:
# Train and evaluate seven different models
models = [TruncatedSVD(), PCA(), NearestNeighbors(), Lasso(), Ridge(), RandomForestRegressor()]
model_names = ['Truncated SVD', 'PCA', 'Nearest Neighbors', 'Lasso', 'Ridge', 'Random Forest Regressor']
best_mse = np.inf
best_model = None
best_model_name = None
for i, model in enumerate(models):
    mse = train_and_evaluate_model(model, X_train, y_train, X_test, y_test)
    print("Model:", model_names[i], "MSE:", mse)
    if mse < best_mse:
        best_mse = mse
        best_model = model
        best_model_name = model_names[i]

In [None]:
# Choose the best model and predict N number of animes for each user
N = 10
best_model.fit(matrix, matrix)
user_predictions = best_model.predict(matrix)

In [None]:
# Create a dataframe of anime names and predicted ratings
anime_names = matrix.columns
prediction_df = pd.Data

# Test 14

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

import numpy as np
import os
import sys

#Preparing folder variables
os.chdir(os.path.dirname(sys.path[0])) # This command makes the notebook the main path and can work in cascade.
main_folder = sys.path[0]
data_folder = (main_folder + "\data")
saved_models_folder = (data_folder + "\saved_models")
sounds_folder = (main_folder + "\sounds")
saved_models = (main_folder + "\saved_models")
processed_data = (data_folder + "\processed")
raw_data = (data_folder + "\_raw")
user_based_unsupervised_data = (data_folder + "\processed\_user_based_unsupervised")
content_based_unsupervised_data = (data_folder + "\processed\content_based_unsupervised")
content_based_supervised_data = (data_folder + "\processed\content_based_supervised")

In [None]:
# Load the anime ratings dataset into a pandas dataframe
df = pd.read_csv(raw_data + "/" + "rating.csv.zip")

# Merge the ratings dataframe with the anime names dataframe
df = pd.merge(df, pd.read_csv(raw_data + "/" + "anime.csv"), on='anime_id')


In [None]:
# Create a user-item matrix from the ratings dataframe
matrix = df.pivot_table(index='user_id', columns='anime_name', values='rating')

In [None]:
# Loading saved matrix
matrix = joblib.load(processed_data + "/" + "tests_matrix.pkl")

In [None]:
# Impute missing values with the mean rating for each anime
matrix = matrix.fillna(matrix.mean())

In [None]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(matrix, matrix, test_size=0.2, random_state=42)

In [None]:
# Define a function for model training and evaluation
def train_and_evaluate_model(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    return accuracy

In [None]:
# Train and evaluate different models
models = [KNeighborsClassifier(), SVC(), DecisionTreeClassifier(), RandomForestClassifier()]
model_names = ['KNeighbors Classifier', 'SVM Classifier', 'Decision Tree Classifier', 'Random Forest Classifier']
best_accuracy = 0
best_model = None
best_model_name = None
for i, model in enumerate(models):
    accuracy = train_and_evaluate_model(model, X_train, y_train, X_test, y_test)
    print("Model:", model_names[i], "Accuracy:", accuracy)
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = model
        best_model_name = model_names[i]

In [None]:
# Choose the best model and predict N number of animes for each user
N = 
input_anime = input("Enter an anime name:")
anime_index = np.where(matrix.columns == input_anime)[0][0]
user_ratings = matrix.iloc[:, anime_index]

In [None]:
# Predict the top N similar animes for the input anime
best_model.fit(matrix, user_ratings)
anime_similarities = best_model.predict(matrix)

In [None]:
# Save the predictions in a dataframe sorted from higher to lower
prediction_df = df.iloc[most_similar]
prediction_df = prediction_df.sort_values(by="rating", ascending=False)
prediction_df.to_csv("anime_recommendations.csv", index=False)

# Test 16

Anime recommendation system using an SVC model with :
- hyperparameter tuning
- cross-validation
- evaluation
- StandardScaler
- StratifiedKFold

In [None]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

import numpy as np
import os
import sys

#Preparing folder variables
os.chdir(os.path.dirname(sys.path[0])) # This command makes the notebook the main path and can work in cascade.
main_folder = sys.path[0]
data_folder = (main_folder + "\data")
saved_models_folder = (data_folder + "\saved_models")
sounds_folder = (main_folder + "\sounds")
saved_models = (main_folder + "\saved_models")
processed_data = (data_folder + "\processed")
raw_data = (data_folder + "\_raw")
user_based_unsupervised_data = (data_folder + "\processed\_user_based_unsupervised")
content_based_unsupervised_data = (data_folder + "\processed\content_based_unsupervised")
content_based_supervised_data = (data_folder + "\processed\content_based_supervised")

In [None]:
# Load the anime dataset
df_name = pd.read_csv(raw_data + "/" + "anime.csv")

In [None]:
# Prepare the data for modeling
X = df.drop(columns=["anime_name", "rating"])
y = df["rating"]

In [None]:
# Scale the features using StandardScaler
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [None]:
# Split the data into training and testing sets using StratifiedKFold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

In [None]:
# Define the parameter grid for hyperparameter tuning
param_grid = {'C': [1, 5, 10, 50], 'kernel': ['linear', 'rbf']}

# Create a SVC model
svc = SVC()

In [None]:
# Perform hyperparameter tuning using GridSearchCV
grid_search = GridSearchCV(svc, param_grid, cv=skf, scoring='accuracy')
grid_search.fit(X, y)

In [None]:
# Select the best hyperparameters
best_params = grid_search.best_params_

In [None]:
# Train the SVC model using the best hyperparameters
svc = SVC(C=best_params["C"], kernel=best_params["kernel"])
svc.fit(X, y)

In [None]:
# Predict the ratings for a specific anime
input_anime = "Naruto"
input_index = df[df["anime_name"] == input_anime].index[0]
input_features = X[input_index, :].reshape(1, -1)
input_rating = svc.predict(input_features)[0]

In [None]:
# Find the N most similar animes
N = 10
predictions = svc.predict(X)
similarity_scores = np.abs(predictions - input_rating)
most_similar = np.argsort(similarity_scores)[:N]

In [None]:
# Save the predictions in a dataframe sorted from higher to lower
prediction_df = df.iloc[most_similar]
prediction_df = prediction_df.sort_values(by="rating", ascending=False)
prediction_df.to_csv("anime_recommendations.csv", index=False)

# Test 18

Anime recommendation system using a regression model with:
- hyperparameter tuning
- cross-validation
- evaluation
- StandardScaler
- StratifiedKFold

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso

import numpy as np
import os
import sys

#Preparing folder variables
os.chdir(os.path.dirname(sys.path[0])) # This command makes the notebook the main path and can work in cascade.
main_folder = sys.path[0]
data_folder = (main_folder + "\data")
saved_models_folder = (data_folder + "\saved_models")
sounds_folder = (main_folder + "\sounds")
saved_models = (main_folder + "\saved_models")
processed_data = (data_folder + "\processed")
raw_data = (data_folder + "\_raw")
user_based_unsupervised_data = (data_folder + "\processed\_user_based_unsupervised")
content_based_unsupervised_data = (data_folder + "\processed\content_based_unsupervised")
content_based_supervised_data = (data_folder + "\processed\content_based_supervised")

In [None]:
# Load the anime dataset
df = pd.read_csv(raw_data + "/" + "anime.csv")

# Preprocessing
X = df.drop("rating", axis=1)
y = df["rating"]

In [None]:
# Split the data into training and testing sets using StratifiedKFold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

In [None]:
# Define the pipeline for scaling and regression
pipe = Pipeline([("scaler", StandardScaler()),
                 ("reg", LinearRegression())])

In [None]:
# Define the grid search with cross-validation
grid = GridSearchCV(pipe, param_grid, cv=skf, n_jobs=-1, return_train_score=False)

In [None]:
# Fit the grid search to the data
grid.fit(X, y)

In [None]:
# Find the best hyperparameters and model
best_params = grid.best_params_
best_model = grid.best_estimator_

In [None]:
# Predict the ratings for the input anime name
input_anime = "One Punch Man"
input_df = df[df["name"] == input_anime].drop("rating", axis=1)
predictions = best_model.predict(input_df)

In [None]:
# Get the indices of the N number of similar animes
N = 10
top_N = np.argsort(predictions)[-N:]

In [None]:
# Get the names of the top N similar animes
top_N_animes = df.iloc[top_N]["name"]

In [None]:
# Create a dataframe with the predictions and names of the top N similar animes
prediction_df = pd.DataFrame({"anime_name": top_N_animes,
                              "prediction": predictions[top_N]})

In [None]:
# Sort the dataframe from higher to lower based on the predictions
prediction_df.sort_values("prediction", ascending=False, inplace=True)

In [None]:
# Reset the index of the dataframe
prediction_df.reset_index(drop=True, inplace=True)

In [None]:
# Print the dataframe
print(prediction_df)

# Test 17

Anime recommendation system using XGBoost with:
- hyperparameter tuning
- cross-validation
- evaluation:

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, train_test_split
import xgboost as xgb

import numpy as np
import os
import sys

#Preparing folder variables
os.chdir(os.path.dirname(sys.path[0])) # This command makes the notebook the main path and can work in cascade.
main_folder = sys.path[0]
data_folder = (main_folder + "\data")
saved_models_folder = (data_folder + "\saved_models")
sounds_folder = (main_folder + "\sounds")
saved_models = (main_folder + "\saved_models")
processed_data = (data_folder + "\processed")
raw_data = (data_folder + "\_raw")
user_based_unsupervised_data = (data_folder + "\processed\_user_based_unsupervised")
content_based_unsupervised_data = (data_folder + "\processed\content_based_unsupervised")
content_based_supervised_data = (data_folder + "\processed\content_based_supervised")

In [None]:
# Load the anime data
anime_data = pd.read_csv(raw_data + "/" + "anime.csv")

# Preprocess the data
anime_data.drop(["anime_id", "name", "genres"], axis=1, inplace=True)

# Get the target variable
target = anime_data["rating"]
anime_data.drop(["rating"], axis=1, inplace=True)

In [None]:
# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(anime_data, target, test_size=0.2, random_state=42)

In [None]:
# Define the XGBoost model
xgb_model = xgb.XGBRegressor()

In [None]:
# Define the hyperparameters to tune
parameters = {"learning_rate": [0.1, 0.01, 0.001],
              "n_estimators": [100, 200, 300],
              "max_depth": [3, 5, 7]}

In [None]:
# Perform grid search with cross-validation
xgb_grid = GridSearchCV(xgb_model, parameters, cv=5, scoring="neg_mean_squared_error")
xgb_grid.fit(X_train, y_train)


In [None]:
# Get the best hyperparameters
best_params = xgb_grid.best_params_

In [None]:
# Train the final model with the best hyperparameters
xgb_final = xgb.XGBRegressor(learning_rate=best_params["learning_rate"],
                             n_estimators=best_params["n_estimators"],
                             max_depth=best_params["max_depth"])
xgb_final.fit(X_train, y_train)

In [None]:
# Evaluate the model on the test set
mse = mean_squared_error(y_test, xgb_final.predict(X_test))

In [None]:
# Predict the ratings for a specific anime
input_anime = "One Piece"
input_anime_index = anime_data[anime_data["name"] == input_anime].index[0]
input_anime_features = anime_data.iloc[input_anime_index].values.reshape(1, -1)
input_anime_rating = xgb_final.predict(input_anime_features)

In [None]:
# Predict the ratings for N number of animes
N = 10
anime_ratings = xgb_final.predict(anime_data)
top_N_indexes = np.argsort(anime_ratings)[::-1][:N]
top_N_animes = anime_data.iloc[top_N_indexes]["name"].values

In [None]:
# Save the predictions in a dataframe sorted from higher to lower
results = pd.DataFrame({"anime": top_N_animes, "rating": anime_ratings[top_N_indexes]})

In [None]:
# Sort the dataframe from higher to lower based on the predictions
prediction_df.sort_values("prediction", ascending=False, inplace=True)

In [None]:
# Reset the index of the dataframe
prediction_df.reset_index(drop=True, inplace=True)

In [None]:
# Print the dataframe
print(prediction_df)

# Test 22

A matrix factorization-based recommendation system, using the Singular Value Decomposition (SVD) algorithm, and evaluates its performance using precision, recall, and F1-score metrics-

This code uses the Surprise library, which provides an easy-to-use implementation of matrix factorization-based recommendation algorithms. The user-item matrix is loaded into the Surprise library's Dataset object, which can handle missing values and sparse matrices. The data is then split into training and test sets, and the SVD model is trained on the training set. The performance of the model is evaluated on the test set by computing precision, recall, and F1-score metrics, which are commonly used metrics for evaluating recommendation systems.

This code uses the Surprise library's SVD model to make predictions for each user and each anime in the dataset. The predictions are stored in a dictionary, with the user ID as the key and a list of tuples of anime ID and predicted rating as the value. The dictionary is then converted to a Pandas DataFrame and saved to a CSV file. The code defines the number of anime recommendations to predict for each user as N = 10.

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support
from surprise import SVD
from surprise import Dataset
from surprise import Reader

import numpy as np
import os
import sys

#Preparing folder variables
os.chdir(os.path.dirname(sys.path[0])) # This command makes the notebook the main path and can work in cascade.
main_folder = sys.path[0]
data_folder = (main_folder + "\data")
saved_models_folder = (data_folder + "\saved_models")
sounds_folder = (main_folder + "\sounds")
saved_models = (main_folder + "\saved_models")
processed_data = (data_folder + "\processed")
raw_data = (data_folder + "\_raw")
user_based_unsupervised_data = (data_folder + "\processed\_user_based_unsupervised")
content_based_unsupervised_data = (data_folder + "\processed\content_based_unsupervised")
content_based_supervised_data = (data_folder + "\processed\content_based_supervised")

In [None]:
# Load the anime ratings dataset into a pandas DataFrame
df = pd.read_csv(raw_data + "/" + "rating.csv.zip")

# Create a user-item matrix from the ratings dataframe
reader = Reader(rating_scale=(0, 10))
data = Dataset.load_from_df(df[['user_id', 'anime_id', 'rating']], reader)

In [None]:
# Split the data into training and test sets
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

In [None]:
# Train the SVD model
model = SVD()
model.fit(trainset)

In [None]:
# Predict ratings for the test set
predictions = model.test(testset)

In [None]:
# Define the number of anime recommendations to predict for each user
N = 10

In [None]:
# Compute precision, recall, and F1-score metrics
y_true = [pred.r_ui for pred in predictions]
y_pred = [pred.est for pred in predictions]
precision, recall, f1_score, _ = precision_recall_fscore_support(y_true, y_pred, average='weighted')

In [None]:
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1_score)

In [None]:
# Make predictions for each user
user_predictions = {}
for user_id in df['user_id'].unique():
    anime_ratings = []
    for anime_id in df['anime_id'].unique():
        pred = model.predict(user_id, anime_id)
        anime_ratings.append((anime_id, pred[3]))
    anime_ratings.sort(key=lambda x: x[1], reverse=True)
    user_predictions[user_id] = anime_ratings[:N]

In [None]:
# Save the predictions to a DataFrame and write to a CSV file
pred_df = pd.DataFrame.from_dict(user_predictions, orient='index')
pred_df.to_csv('user_predictions.csv')

# Test 23

Recommendations for a specific user by user ID using the SVD (Singular Value Decomposition) model

The SVD model from the surprise library is used to make predictions about the ratings for the test set. The function get_user_predictions is used to get the predictions for a specific user ID by filtering the ratings for that user from the trainset and passing them to the test method of the SVD model. The predictions are sorted from highest to lowest, and N animes are recommended to the user. The recommendations are merged with the anime names dataframe and saved to a csv file.

In [None]:
from surprise import SVD
from surprise import Dataset
from surprise import accuracy
from surprise.model_selection import train_test_split

In [None]:
# Load the ratings dataset into a pandas dataframe
df = pd.read_csv(raw_data + "/" + "rating.csv.zip")

In [None]:
# Convert the dataframe into a Surprise dataset format
reader = Reader(rating_scale=(1, 10))
data = Dataset.load_from_df(df[['user_id', 'anime_id', 'rating']], reader)

In [None]:
# Split the data into training and test sets
trainset, testset = train_test_split(data, test_size=0.2)

In [None]:
# Train the SVD model
algo = SVD()
algo.fit(trainset)

In [None]:
# Use the trained model to make predictions for the test set
predictions = algo.test(testset)

In [None]:
# Evaluate the accuracy of the predictions
accuracy.rmse(predictions)

In [None]:
# Get the predictions for a specific user ID
def get_user_predictions(user_id, algo, trainset):
    user_inner_id = algo.trainset.to_inner_uid(user_id)
    user_ratings = algo.trainset.ur[user_inner_id]
    user_predictions = algo.test(user_ratings)
    return user_predictions

In [None]:
user_id = 123
user_predictions = get_user_predictions(user_id, algo, trainset)

In [None]:
# Sort the predictions from highest to lowest
user_predictions.sort(key=lambda x: x.est, reverse=True)

In [None]:
# Recommend N animes for the user
N = 10
user_recommendations = [prediction.iid for prediction in user_predictions[:N]]

In [None]:
# Merge the recommendations with the anime names dataframe
recommendations_df = pd.merge(df[['anime_id', 'anime_name']], 
                              pd.DataFrame({'anime_id': user_recommendations}), 
                              on='anime_id', 
                              how='inner')

In [None]:
# Save the recommendations to a csv file
recommendations_df.to_csv('user_{}_recommendations.csv'.format(user_id), index=False)

# Test 24

Surprise library to recommend animes to a user using the Singular Value Decomposition (SVD) model, and predicts the rating that the user would give to each recommended anime:

In [None]:
from surprise import Reader, Dataset, SVD
from surprise.model_selection import train_test_split

# Load the ratings dataset into a pandas dataframe
df = pd.read_csv(raw_data + "/" + "rating.csv.zip")

# Create a reader object
reader = Reader(rating_scale=(1, 10))

# Load the data into a dataset using the reader
data = Dataset.load_from_df(df[['user_id', 'anime_id', 'rating']], reader)

# Split the data into training and test sets
trainset, testset = train_test_split(data, test_size=0.2)

# Train the SVD model
algo = SVD()
algo.fit(trainset)


# Load the ratings dataset into a pandas dataframe
df = pd.read_csv(raw_data + "/" + "rating.csv.zip")

# Get the anime names dataframe
anime_names = pd.read_csv(raw_data + "/" + "anime.csv.zip")

# Function to recommend animes to a user
def recommend_animes(user_id, N):
    # Get the user's rated animes
    user_ratings = df[df['user_id'] == user_id]
    
    # Get the user's anime IDs
    user_anime_ids = user_ratings['anime_id'].tolist()
    
    # Get the anime names for the user's rated animes
    user_anime_names = anime_names[anime_names['anime_id'].isin(user_anime_ids)].anime_name.tolist()
    
    # Get the predicted ratings for the animes that the user has not rated
    predictions = []
    for anime_id in anime_names['anime_id']:
        if anime_id not in user_anime_ids:
            prediction = algo.predict(user_id, anime_id)
            predictions.append((prediction.est, anime_id))
    
    # Sort the predictions from highest to lowest
    predictions = sorted(predictions, key=lambda x: x[0], reverse=True)
    
    # Get the anime names for the recommended animes
    recommendations = [anime_names[anime_names['anime_id'] == anime_id].anime_name.values[0] for _, anime_id in predictions[:N]]
    
    # Get the predicted ratings for the recommended animes
    prediction_ratings = [prediction for prediction, _ in predictions[:N]]
    
    # Return the recommendations and prediction ratings
    return recommendations, prediction_ratings

# Recommend N animes to a user
user_id = 12345
N = 10
recommendations, prediction_ratings = recommend_animes(user_id, N)

# Print the recommendations and prediction ratings
print("Recommended Animes:")
for i in range(N):
    print(f"{i+1}. {recommendations[i]} ({prediction_ratings[i]:.2f})")
