# BHT Data Applications project
# Automatic Anime recommendation Algorithm
### This project aims to create an algorithm that can determine what anime to recommend to a user.
##### Authors: Rashmi Di Michino and Antonin Mathubert

The dataset (320,000 users and 16,000 anime) was taken from https://www.kaggle.com/datasets/hernan4444/anime-recommendation-database-2020 <br>
We are going to use this dataset to build a model that can recommend an anime based on the anime that the user is watching, has dropped, has put on hold or has added to their watch list.

### 1. Importing and parsing the data
First, we want to import all of the available data in a form that is easy to process in the next steps of the project.<br><br>
Here, we propose two methods to do that. The first one is the "clean" way to read JSON data, but it is very slow. Because we don't actually need all the data from the files, we are going to use a second, hand-made method that is significantly faster.

In [None]:
from tqdm.notebook import tqdm
import pandas as pd
import numpy as np
import random
import json
import time
import os
import re

# os.listdir returns entries in arbitrary order; sort them so every run walks
# the dataset files in the same sequence (resuming an interrupted import
# relies on a stable order to find where it stopped).
files = sorted(os.listdir("dataset/data"))
print(files[:10])

#### Method 1 
~4h on i7 10th gen (8 cores)

This method is the clean one. We use Python libraries (pandas and json) that parse ALL of the data for us; we just keep what is interesting.


The downside of this method is that it is very slow because we have to parse 100% of the data.

In [None]:
# Method 1: parse every dataset file completely with the json module and
# accumulate one row per (playlist, track) in tracks_df. The table is written
# to saved_data after each file so an interrupted run can be resumed.
saved_data = "dataset/data.json"
track_columns = ['Track name', 'Track album', 'Track artist', 'Playlist name', 'Playlist pid', 'Slice origin']

# Resume from the checkpoint of a previous run when there is one; otherwise
# start from an empty table so that the very first run works as well.
if os.path.exists(saved_data):
    tracks_df = pd.read_json(saved_data)
else:
    tracks_df = pd.DataFrame(columns=track_columns)

# Only skip files when a checkpoint actually holds data. The flag is decided
# once, before the loop, so a fresh run does not start skipping after its
# first imported file.
initial_skip = len(tracks_df) > 0
last_imported_file = tracks_df['Slice origin'].iloc[-1] if initial_skip else None

for file in tqdm(files, desc="Importing files..."):
    if initial_skip:
        # The checkpoint is written only after a file has been imported in
        # full, so the last recorded file is complete: skip it as well and
        # resume with the file that follows it.
        if file == last_imported_file:
            initial_skip = False
        continue

    with open(os.path.join("dataset/data", file), 'r', encoding="utf-8") as json_file:
        playlists = json.load(json_file)["playlists"]

    # Collect plain rows first and build a single DataFrame per file; adding
    # rows to a DataFrame one at a time is slow and easy to get wrong.
    rows = []
    for playlist in playlists:
        playlist_name = playlist["name"]
        playlist_pid = playlist["pid"]
        for track in playlist["tracks"]:
            rows.append([track['track_name'], track['album_name'], track['artist_name'], playlist_name, playlist_pid, file])

    file_df = pd.DataFrame(rows, columns=track_columns)
    if len(tracks_df) > 0:
        tracks_df = pd.concat([tracks_df, file_df], ignore_index=True)
    else:
        tracks_df = file_df
    tracks_df.to_json(saved_data, orient="records", indent=4)


#### Method 2
~ 0.167h on i7 10th gen 8 cores

This method is the fast one, and the one we used in our case. With this method, we don't need to parse the entirety of a file. We just focus on the data that we want to extract, and parse it according to our needs.

This is significantly faster because we only have to parse ~50% of the data, and it is not converted into a DataFrame until a chunk is saved.

In [None]:
# Directory where the converted data slices are written and later read back.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
# Keep the trailing "/" so that file names can be appended by plain string concatenation.
save_dir = "/media/fiddle/Data/OneDrive - Université de Technologie de Troyes/Dev/Projets/Python/bht-data/data/"

In [None]:
# Accumulator for the parsed track rows of the current chunk, and the index of
# the next data slice to be written.
tracks, slice_i = [], 0

def getStrValue(line):
    r"""Return the string value found on a ``"key": "value"`` line.

    The value is the text between the first ``: "`` and the last ``"`` of
    *line*. JSON escape sequences such as ``\"`` or ``\u00e9`` are decoded so
    the result matches what ``json.loads`` gives for the same value; if the
    captured text is not a valid JSON string body it is returned unchanged.

    Raises:
        ValueError: if *line* contains no ``: "..."`` pattern.
    """
    match = re.search(r'.*?: "(.*)"', line)
    if match is None:
        raise ValueError(f"no quoted value found in line: {line!r}")

    raw_value = match.group(1)
    try:
        # Re-wrap the captured text in quotes and let the JSON decoder handle
        # the escapes instead of keeping the backslashes in the data.
        return json.loads(f'"{raw_value}"')
    except ValueError:
        # Not a single well-formed JSON string (e.g. several values on one
        # line): keep the raw capture.
        return raw_value

def getIntValue(line):
    """Return, as text, everything that follows the first ``": "`` on *line*.

    Despite the name, the result is a string and is not cleaned up: a trailing
    comma present on the line is part of the returned value.
    """
    found = re.search(r'.*?: (.*)', line)
    return found.group(1)

def _save_slice(rows, index):
    """Write *rows* as ``dataslice_<index>.json`` inside ``save_dir``."""
    slice_df = pd.DataFrame(rows, columns=['Track name', 'Track album', 'Track artist', 'Playlist name', 'Playlist pid', 'Slice origin'])
    slice_df.to_json(os.path.join(save_dir, f"dataslice_{index}.json"), orient="records", indent=4)


# Method 2: scan every dataset file line by line and pick out only the fields
# of interest, instead of parsing the whole JSON document.
for file in tqdm(files, desc="Importing files..."):
    with open(os.path.join("dataset/data", file), 'r', encoding="utf-8") as data_file:
        playlist_name = ""
        playlist_pid = ""

        # [track name, album name, artist name]; False marks a field that has
        # not been seen yet for the track currently being read.
        track_info = [False] * 3

        # Iterate over the file lazily rather than loading and splitting it.
        for line in data_file:
            if '"name"' in line:
                playlist_name = getStrValue(line)

            if '"pid"' in line:
                playlist_pid = getIntValue(line)

            if '"artist_name"' in line:
                track_info[2] = getStrValue(line)

            if '"track_name"' in line:
                track_info[0] = getStrValue(line)

            if '"album_name"' in line:
                track_info[1] = getStrValue(line)

            if False not in track_info:
                # All three fields collected: the track is complete.
                tracks.append(track_info + [playlist_name, playlist_pid, str(file)])
                track_info = [False] * 3
                if len(tracks) > 2 ** 21:  # Creating chunked data to be sure to have enough RAM for the whole process
                    _save_slice(tracks, slice_i)
                    slice_i += 1
                    tracks = []

# Write the last, partially filled chunk as well; without this the tracks
# collected after the final full chunk would never reach the disk.
if tracks:
    _save_slice(tracks, slice_i)
    slice_i += 1
    tracks = []

#### Importing the converted data

In [None]:
# Load the converted slices: gather the distinct artists, albums and track
# names over ALL slices, but keep the rows of only a few random slices in
# memory (main_df).
# Ignore anything in the folder that is not a JSON slice file.
slices = [name for name in os.listdir(save_dir) if name.endswith(".json")]
# random.sample raises ValueError when asked for more items than available.
keep_slices = random.sample(slices, min(5, len(slices)))

# Sets de-duplicate while loading, which avoids re-copying an ever-growing
# array for every slice.
all_artists = set()
all_albums = set()
all_tracks = set()
kept_frames = []

for df_slice_item in tqdm(slices, desc="Loading slices..."):
    df_slice = pd.read_json(os.path.join(save_dir, df_slice_item))

    all_artists.update(df_slice['Track artist'].unique())
    all_albums.update(df_slice['Track album'].unique())
    all_tracks.update(df_slice['Track name'].unique())

    if df_slice_item in keep_slices:
        kept_frames.append(df_slice)

# Concatenate once at the end instead of once per kept slice.
if kept_frames:
    main_df = pd.concat(kept_frames)
else:
    main_df = pd.DataFrame(columns=["Track name", "Track album", "Track artist", "Playlist name", "Playlist pid", "Slice origin"])

In [None]:
# Remove the commas from the playlist ids, then regroup the rows per playlist.
# From here on main_df is a GroupBy object keyed by "Playlist pid".
raw_pids = main_df["Playlist pid"]
main_df['Playlist pid'] = raw_pids.map(lambda pid: pid.replace(",", ""))
main_df = main_df.groupby("Playlist pid")
len(main_df)

In [None]:
# Refresh the slice listing and create the two (still empty) relation tables:
# one column per known album and one column per known artist.
slices = os.listdir(save_dir)
df_tracks_album_relation_count = pd.DataFrame(dtype='int16', columns=[*all_albums])
df_tracks_artist_relation_count = pd.DataFrame(dtype='int16', columns=[*all_artists])

In [None]:
# For a random subset of playlists, count for every track how often each
# album appears in the playlists that contain the track.
# Never ask for more playlists than exist: random.sample would raise ValueError.
playlist_pids = list(main_df.groups.keys())
sampled_pids = random.sample(playlist_pids, min(1000, len(playlist_pids)))

for playlist_pid in tqdm(sampled_pids, desc="Loading data..."):
    playlist = main_df.get_group(playlist_pid)
    albums = playlist['Track album'].value_counts()
    artists = playlist['Track artist'].value_counts()
    for track in playlist['Track name'].unique():
        if track not in df_tracks_album_relation_count.index:
            # First time this track is seen: add a new row holding the album
            # counts of the current playlist.
            row = pd.DataFrame(albums, dtype='int16').fillna(0).transpose()
            row.index = [track]
            start_df = time.time()
            df_tracks_album_relation_count = pd.concat([df_tracks_album_relation_count, row], axis=0)
            print("%.3fs to concat" % (time.time() - start_df))
            continue

        # Track already known: add this playlist's album counts to its row.
        # .loc is the row accessor; .at only addresses a single scalar cell.
        df_tracks_album_relation_count.loc[track] = df_tracks_album_relation_count.loc[track].add(albums, fill_value=0)

        # TODO: fill df_tracks_artist_relation_count the same way from `artists`
        # (new row on first sight, otherwise add the counts to the existing row).


In [None]:
# Scratch check: is the `track` value left over from the counting loop a row label of the album table?
track in df_tracks_album_relation_count.index

In [None]:
# Scratch cell: appends the leftover `row` frame to the album table once more and displays the result.
# NOTE(review): every run adds another row with the same label - presumably debugging only; confirm before keeping.
df_tracks_album_relation_count = pd.concat([df_tracks_album_relation_count, row], axis=0)
df_tracks_album_relation_count

In [None]:
# Toy example: concatenating two frames that share neither row labels nor
# columns gives the union of both, with NaN wherever a frame has no data.
df1 = pd.DataFrame({"A": [1, 2, 3], "B": [2, 3, 5]}, index=['a', 'b', 'c'])
df2 = pd.DataFrame({"C": [2], "D": [2]}, index=['d'])

df = pd.concat([df1, df2])
df

hello there