# BHT Data Applications project
# Automatic Anime recommendation Algorithm
### This project aims to create an algorithm that can determine what anime to recommend to a user.
##### Authors: Rashmi Di Michino and Antonin Mathubert

The dataset, which covers about 320,000 users and 16,000 animes, was taken from https://www.kaggle.com/datasets/hernan4444/anime-recommendation-database-2020 <br>
We are going to use this dataset to build a model that can recommend an anime based on the animes that the user is watching, has dropped, has kept on hold or put on their watching list.

### 1. Importing and parsing the data
First, we import all of the available data in a form that is easy to work with in the next steps of the project.<br><br>
We read the large csv file in chunks so that loading it is more efficient. Then we convert the columns from their default types to smaller ones to save memory.

In [None]:
from mlxtend.frequent_patterns import apriori, association_rules
from itertools import combinations
import plotly.graph_objs as go
from tqdm.notebook import tqdm
import networkx as nx
import pandas as pd
import numpy as np
import itertools

In [None]:
# Absolute location of the dataset folder on a Windows machine.
# NOTE(review): the following cell assigns `path` again, so this value only
# applies when that cell is skipped.
path = "C:/Users/rashm/OneDrive/Desktop/data_applications_project/julius/anime_dataset/"

In [None]:
# Relative location of the dataset folder; the csv file names are appended to
# it when the data is loaded.
path = "dataset/anime/"

In [None]:
# Narrower integer types than the pandas defaults, to reduce memory usage.
column_types = {'user_id': "int32", 'anime_id': 'int32', "watching_status": "int16", "rating": "int16"}

# Anime metadata, read in one go.
animes_df = pd.read_csv(path+"anime.csv")

# Read the user/anime list 20000 rows at a time and downcast every chunk
# *before* concatenating, so the complete table is never held in memory with
# the default column types.
dataset_chunks = pd.read_csv(path+"animelist.csv", chunksize=20000)
dataset = pd.concat(
    (chunk.astype(column_types) for chunk in dataset_chunks),
    ignore_index=True,
)

# Release the reader; `chunks` is kept as a name (set to None) for
# compatibility with the previous version of this cell.
dataset_chunks = None
chunks = None

### 2. Recommendation system based on the watched animes
In this first version we're going to implement a recommendation system based on which animes the users have seen. For example, someone who has watched Cowboy Bebop might be recommended Death Note.
#### Reducing the dataset
As the dataset we're working with is too large, we're going to reduce it

In [None]:
# Keep the rows with an anime id above 1, a user id below 20000 (and different
# from 61960) and a watching status other than 4, then discard the two columns
# that are not needed afterwards.
dataset = dataset.query(
    "anime_id > 1 and user_id < 20000 and user_id != 61960 and watching_status != 4"
).drop(columns=['watched_episodes', 'watching_status'])

Here we can see a sample of how the dataset is now structured

In [None]:
# Preview the first ten rows of the reduced dataset, then report its row count.
display(dataset.iloc[:10])
dataset.shape[0]

### 3. Graphs


In [None]:
# Sizes used throughout this cell.
TOP_ANIME_COUNT = 250                   # most-listed animes kept as candidate nodes
TOP_EDGE_COUNT = 250                    # heaviest links kept for the plot
MIN_NODE_SIZE, MAX_NODE_SIZE = 10, 50   # marker size range
MAX_EDGE_WIDTH = 5                      # line width given to the heaviest link


def _rescale(value, low, high, new_low, new_high):
    """Map ``value`` linearly from [low, high] onto [new_low, new_high].

    When ``low == high`` every value is tied, so ``new_high`` is returned
    instead of dividing by zero.
    """
    span = high - low
    if span == 0:
        return new_high
    return new_low + (value - low) * (new_high - new_low) / span


# Restrict the dataset to the animes that appear in the most rows.
top_animes = dataset['anime_id'].value_counts().nlargest(TOP_ANIME_COUNT).index
df = dataset[dataset['anime_id'].isin(top_animes)]
anime_counts = df["anime_id"].value_counts()

# Create a graph
G = nx.Graph()

# Group by user_id
user_groups = df.groupby('user_id')

# Link every pair of animes found in the same user's list. Each pair adds the
# sum of the two ratings divided by 20 to the weight of the link.
# NOTE(review): presumably ratings are on a 0-10 scale, so one user adds at
# most 1 per link, and a rating of 0 may mean "not rated" -- confirm.
for user_id, group in tqdm(user_groups):
    animes = group['anime_id'].tolist()
    ratings = group['rating'].tolist()
    for (anime1, rating1), (anime2, rating2) in combinations(zip(animes, ratings), 2):
        link_weight = (rating1 + rating2) / 20
        if G.has_edge(anime1, anime2):
            G[anime1][anime2]['weight'] += link_weight
        else:
            G.add_edge(anime1, anime2, weight=link_weight)

# Keep only the heaviest links
top_edges = sorted(G.edges(data=True), key=lambda x: x[2]['weight'], reverse=True)[:TOP_EDGE_COUNT]

# Create a new graph that contains only those links
top_G = nx.Graph()
top_G.add_edges_from([(u, v, {'weight': d['weight']}) for u, v, d in top_edges])

# Range of the link weights (0.0 when there is no link at all)
weights = [d['weight'] for u, v, d in top_G.edges(data=True)]
min_weight = min(weights, default=0.0)
max_weight = max(weights, default=0.0)

min_count = anime_counts.min()
max_count = anime_counts.max()

# Marker size grows linearly with the number of rows that mention the anime
node_sizes = [
    _rescale(anime_counts[node], min_count, max_count, MIN_NODE_SIZE, MAX_NODE_SIZE)
    for node in top_G.nodes()
]

# Normalize the weights between 0 and MAX_EDGE_WIDTH
for u, v, d in top_G.edges(data=True):
    d['normalized_weight'] = _rescale(d['weight'], min_weight, max_weight, 0, MAX_EDGE_WIDTH)



In [None]:
# Get positions for all nodes with the force-directed spring layout.
# The layout starts from random positions, so a fixed seed is passed to make
# the figure reproducible from one run to the next.
pos = nx.spring_layout(top_G, k=10, iterations=100, seed=42)

In [None]:
# MAL_ID -> English name table, built once instead of filtering animes_df for
# every node and every neighbor. drop_duplicates keeps the first row of each
# id, which is the row a `.values[0]` lookup on the filtered frame returns.
english_names = (
    animes_df.drop_duplicates(subset='MAL_ID')
    .set_index('MAL_ID')['English name']
    .to_dict()
)

# One hover text per node, in the iteration order of top_G.nodes(): the name of
# the anime followed by its (at most) five strongest neighbors.
hover_texts = []
for node in top_G.nodes():
    neighbor_weights = {neighbor: attrs['weight'] for neighbor, attrs in top_G[node].items()}
    total_weight = sum(neighbor_weights.values())

    # Share of the node's total link weight carried by each neighbor, in
    # percent. A total of zero would divide by zero, so it yields 0% shares.
    percentages = [
        (neighbor, weight / total_weight * 100 if total_weight else 0.0)
        for neighbor, weight in neighbor_weights.items()
    ]
    percentages = sorted(percentages, key=lambda x: x[1], reverse=True)[:5]

    hover_text = f"{english_names[node]}<br>" + "<br>".join(
        f"{english_names[neighbor]}: {weight:.2f}%" for neighbor, weight in percentages
    )
    hover_texts.append(hover_text)

In [None]:
# One line trace per link, so that every link can be drawn with its own width
# (the normalized weight stored on the edge).
edge_trace = [
    go.Scatter(
        x=[pos[source][0], pos[target][0], None],
        y=[pos[source][1], pos[target][1], None],
        line=dict(width=attrs['normalized_weight'], color='gray'),
        hoverinfo="none",
        mode='lines',
    )
    for source, target, attrs in top_G.edges(data=True)
]

# A single marker trace for all the nodes, labelled with the English name of
# the anime and sized with the precomputed node sizes.
node_ids = list(top_G.nodes())
node_labels = [
    animes_df.loc[animes_df['MAL_ID'] == node_id, 'English name'].values[0]
    for node_id in node_ids
]
node_marker = dict(
    size=node_sizes,
    color='skyblue',
    line=dict(width=2, color='black'),
)
node_trace = go.Scatter(
    x=[pos[node_id][0] for node_id in node_ids],
    y=[pos[node_id][1] for node_id in node_ids],
    hovertext=hover_texts,
    text=node_labels,
    mode='markers+text',
    textposition='top center',
    marker=node_marker,
)


In [None]:
# Create the figure
fig = go.Figure(
    data=edge_trace + [node_trace],
    layout=go.Layout(
        # The title reports the number of links actually drawn instead of a
        # hard-coded figure. Its font size is set through title.font, which
        # replaces the deprecated `titlefont` shortcut.
        title=dict(
            text=f'Top {top_G.number_of_edges()} Most Frequent Anime Watching Links',
            font=dict(size=16),
        ),
        showlegend=False,
        hovermode='closest',
        height=800,
        margin=dict(b=20, l=5, r=5, t=40),
        annotations=[dict(
            text="",
            showarrow=False,
            xref="paper", yref="paper"
        )],
        # Hide the grid, the zero line and the tick labels on both axes.
        xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
        yaxis=dict(showgrid=False, zeroline=False, showticklabels=False)
    )
)

# Show the figure
fig.show()

### 4. Transforming the data into binary matrix

In [None]:
# Reshape the long table into a wide matrix: one row per user_id, one column
# per anime_id, with the rating as the cell value. User/anime pairs that do not
# appear in the table become NaN.
dataset = dataset.pivot(index='user_id', columns='anime_id', values='rating')

We are now converting our matrix into a binary matrix in order to be able to retrieve the association rules: we only take into account the ratings that are above 3.

In [None]:
# Boolean matrix: True where the rating is strictly greater than 3. Missing
# ratings (NaN) compare as False.
dataset = dataset.gt(3)

#### Retrieving the association rules
Finally, we are exploiting the mlxtend library to build the recommendation system and we're retrieving the association rules

In [None]:
# Minimum share of users in which an itemset must appear to be kept.
# Getting under 0.1 support takes too much computation time / memory and lacks meaning.
min_support = 0.15
frequent_itemsets = apriori(dataset, min_support=min_support, use_colnames=True)

# Derive the association rules and keep those with a lift of at least 1.1.
rules = association_rules(frequent_itemsets, min_threshold=1.1, metric="lift")

In [None]:
# Turn the antecedent and consequent collections into plain lists.
for column in ("antecedents", "consequents"):
    rules[column] = rules[column].apply(list)

# Keep the rules with a confidence above 0.2, renumber them, then order them by
# decreasing lift.
confident = rules["confidence"] > 0.2
rules = rules[confident].reset_index(drop=True).sort_values("lift", ascending=False)

Here are some of the rules detected by the algorithm.

In [None]:
# Show the first 20 rules and the total number of rules.
display(rules.iloc[:20])
print(f"{len(rules)} rules found.")

#### Parsing the rules
These functions are designed to parse and filter the results of the detected rules, so we can understand them more easily.

<hr>

```find_recommendations_precise``` will compute every possible combination of the watched anime ids, and try to find them in the rules dataset.
<hr>

```find_recommendations_free``` will look for every occurrence of each anime id in the rules, even when the antecedents of the rule also contain other ids.

It will return one dataset per seen anime, ordered from highest to lowest weight. The weight of a recommended anime is computed by adding up lift × confidence for every rule that contains this anime in its consequents.

In [None]:
def generate_combinations(ids):
    """Return every ordered arrangement of every non-empty subset of ``ids``.

    Despite the name, the arrangements are permutations: for ``[1, 2]`` the
    result is ``[[1], [2], [1, 2], [2, 1]]``. Shorter arrangements come first.
    The number of arrangements is printed before returning.
    """
    arrangements = [
        list(arrangement)
        for size in range(1, len(ids) + 1)
        for arrangement in itertools.permutations(ids, size)
    ]
    print(f"Found {len(arrangements)} possible combinations.")
    return arrangements

def find_recommendations_precise(anime_ids):
    """Find the rules whose antecedents are all animes from ``anime_ids``.

    Reads the module-level ``rules`` DataFrame, whose ``antecedents`` and
    ``consequents`` columns hold lists of anime ids. A rule is used when every
    one of its antecedents is in ``anime_ids`` and none of its consequents is.

    The rules are scanned once and compared as sets. This finds the same rules
    as testing every ordering of every subset of ``anime_ids`` against the
    antecedent lists, without the factorial number of comparisons.

    Args:
        anime_ids: ids of the animes that have already been seen.

    Returns:
        A list of ``(antecedents, consequents, confidences, lifts)`` tuples,
        one per distinct set of antecedents, ordered by decreasing best lift.
        ``antecedents`` is a list of ids; the three other items are arrays
        with one entry per matching rule.
    """
    seen = set(anime_ids)

    # Positions of the usable rules, grouped by their set of antecedents.
    positions_by_antecedents = {}
    for position, (antecedents, consequents) in enumerate(zip(rules["antecedents"], rules["consequents"])):
        if antecedents and seen.issuperset(antecedents) and seen.isdisjoint(consequents):
            positions_by_antecedents.setdefault(frozenset(antecedents), []).append(position)

    recommendations = []
    for positions in positions_by_antecedents.values():
        matched = rules.iloc[positions]
        recommendations.append((
            list(matched["antecedents"].iloc[0]),
            matched["consequents"].values,
            matched["confidence"].values,
            matched["lift"].values,
        ))

    # The lifts of a group are an array. Sorting on the arrays themselves
    # raises as soon as a group holds several rules, so the groups are ordered
    # by their highest lift instead.
    return sorted(recommendations, key=lambda recommendation: recommendation[3].max(), reverse=True)

def find_recommendations_free(anime_ids):
    """Weight the animes recommended by every rule that mentions a seen anime.

    Reads the module-level ``rules`` and ``animes_df`` DataFrames. A rule
    counts for a seen anime when that anime is among its antecedents (other
    antecedents are allowed) and none of its consequents is in ``anime_ids``.
    Each such rule adds ``lift * confidence`` to the weight of every one of
    its consequents.

    Args:
        anime_ids: ids of the animes that have already been seen. Repeated ids
            are counted once.

    Returns:
        A pandas groupby object over the ``source`` column of a DataFrame with
        the columns ``source`` (English name of the seen anime),
        ``recommended_id`` (English name of the recommended anime, despite the
        column name) and ``weight``, sorted by decreasing weight. Iterating it
        yields nothing when no rule matches.

    Raises:
        KeyError: if a seen or recommended id has no row in ``animes_df``.
    """
    seen = set(anime_ids)
    # dict.fromkeys keeps the caller's order while dropping repeated ids, so a
    # repeated id cannot count its rules twice.
    weights_by_source = {anime: {} for anime in dict.fromkeys(anime_ids)}

    # Single pass over the rules instead of one filtering pass per seen anime.
    rule_columns = zip(rules["antecedents"], rules["consequents"], rules["confidence"], rules["lift"])
    for antecedents, consequents, confidence, lift in rule_columns:
        if not seen.isdisjoint(consequents):
            continue  # the rule recommends something that was already seen
        score = lift * confidence
        for anime in antecedents:
            weights = weights_by_source.get(anime)
            if weights is None:
                continue  # this antecedent is not one of the seen animes
            for recommended in consequents:
                weights[recommended] = weights.get(recommended, 0) + score

    # MAL_ID -> English name; the first row wins when an id is repeated.
    english_names = (
        animes_df.drop_duplicates(subset="MAL_ID")
        .set_index("MAL_ID")["English name"]
        .to_dict()
    )
    rows = [
        [english_names[anime], english_names[recommended], weight]
        for anime, weights in weights_by_source.items()
        for recommended, weight in weights.items()
    ]

    # Building the frame from `rows` (possibly empty) avoids the error that
    # concatenating an empty list of frames raises when no rule matches.
    return (
        pd.DataFrame(rows, columns=['source', 'recommended_id', 'weight'])
        .sort_values(by="weight", ascending=False)
        .groupby("source")
    )

Here we use the previously defined function and parse the results to print them and link them with the anime infos dataset.

In [None]:
# Ids of the animes the user has already seen; passed to the recommendation
# functions in the following cells.
seen_animes = [23273] #More than 7 at a time takes forever.
print(seen_animes)

In [None]:
def _titles_with_ids(ids):
    """Join the English names of ``ids`` with " and ", each followed by its id in parentheses."""
    labelled = []
    for anime_id in ids:
        english_name = animes_df[animes_df["MAL_ID"] == anime_id]["English name"].values[0]
        labelled.append(english_name + f" ({str(anime_id)})")
    return " and ".join(labelled)


# One sentence per matching rule: the seen animes, the recommended animes, the
# confidence as a percentage and the lift expressed as a percentage increase.
message = "Because you have seen %s, we think you would like %s with %.3f%% confidence. You are also %.3f%% more likely to watch this/these anime(s)."
for antecedents, consequents, confidences, lifts in find_recommendations_precise(seen_animes):
    for consequent, confidence, lift in zip(consequents, confidences, lifts):
        print(message % (
            _titles_with_ids(antecedents),
            _titles_with_ids(consequent),
            confidence * 100,
            lift * 100 - 100,
        ))

In [None]:
# Display the first five rows of the recommendations found for each seen anime.
# This part needs improvement, because the results are not concise enough.
# We could create a list of every anime that has been watched by people who have seen a certain anime, and return the most common ones instead of every one.
for index, recommendation_df in find_recommendations_free(seen_animes):
	display(recommendation_df.head(5))