## Package

In [None]:
import warnings
# Suppress warning printouts for the whole session. No category is given, so
# every warning is ignored, including any raised by pandas in later cells.
warnings.filterwarnings("ignore")

In [None]:
import pandas as pd
from utils import threshold_interactions_df
from scipy import sparse
import pickle

In [None]:
# Load the album table from albumData1.txt. Lines may carry a varying number
# of '|'-separated fields, so each line is split by hand and the DataFrame
# constructor pads shorter rows with missing values.
with open('../ydata1/albumData1.txt', 'r') as f:
    data_albumId = [line.strip().split('|') for line in f]
df_album = pd.DataFrame(data_albumId)
# Two leading id columns; every remaining column is labelled 'GenreId'. The
# number of 'GenreId' labels is derived from the frame's actual width instead
# of being hard-coded, so a file with more or fewer trailing fields still loads.
album_id_columns = ['AlbumId', 'ArtistId']
df_album.columns = album_id_columns + ['GenreId'] * (df_album.shape[1] - len(album_id_columns))
num_rows_album = len(df_album)

# Read artistData1.txt into a single 'ArtistId' column.
df_artist = pd.read_csv('../ydata1/artistData1.txt', delimiter='|', header=None, names=['ArtistId'])
num_rows_artist = len(df_artist)

# Read genreData1.txt into a single 'GenreId' column.
df_genre = pd.read_csv('../ydata1/genreData1.txt', delimiter='|', header=None, names=['GenreId'])
num_rows_genre = len(df_genre)

# Load the track table from trackData1.txt, parsed the same way as the albums.
with open('../ydata1/trackData1.txt', 'r') as f:
    data_trackId = [line.strip().split('|') for line in f]
df_track = pd.DataFrame(data_trackId)
# Three leading id columns followed by repeated 'GenreId' labels. The repeated
# label is relied upon later: looking up row['GenreId'] returns all of those
# columns at once.
track_id_columns = ['TrackId', 'AlbumId', 'ArtistId']
df_track.columns = track_id_columns + ['GenreId'] * (df_track.shape[1] - len(track_id_columns))
num_rows_track = len(df_track)

# Total number of rows (items) across the four metadata files.
total_sum = num_rows_track + num_rows_album + num_rows_artist + num_rows_genre
print(f"Total sum of the number of items for all files: {total_sum}")

In [None]:
# Load the three interaction tables. The genre and artist tables store their
# item identifier under 'item_id'; give each a table-specific column name.
data_track = pd.read_csv('../ydata1/filtered_data_track_50.csv')
data_genre = pd.read_csv('../ydata1/filtered_data_genre_50.csv').rename(
    columns={'item_id': 'genre_id'}
)
data_artist = pd.read_csv('../ydata1/filtered_data_artist_50.csv').rename(
    columns={'item_id': 'artist_id'}
)

In [None]:
# Quick summary of the raw track table: distinct users, the user id in the
# last row, and distinct items.
raw_track_users = data_track['user_id']
print("User count: ", raw_track_users.nunique())
print("Last user value: ", raw_track_users.iloc[-1])
print("Item count: ", data_track['item_id'].nunique())

In [None]:
# Check that the track, genre and artist tables contain exactly the same users.
raw_track_user_set = set(data_track['user_id'])
raw_genre_user_set = set(data_genre['user_id'])
raw_artist_user_set = set(data_artist['user_id'])
raw_track_user_set == raw_genre_user_set and raw_genre_user_set == raw_artist_user_set

In [None]:
# Count the users that have at least 50 track interactions. The comparison is
# inclusive (>= 50), so the printed label says "at least" rather than
# "more than".
x_id_counts = data_track['user_id'].value_counts()
print("Length of user_id interactions with count of at least 50:", int((x_id_counts >= 50).sum()))

In [None]:
# Count the items that have at least 50 track interactions. The comparison is
# inclusive (>= 50), so the printed label says "at least" rather than
# "more than".
x_id_counts = data_track['item_id'].value_counts()
print("Length of item_id interactions with count of at least 50:", int((x_id_counts >= 50).sum()))

In [None]:
# Apply the project helper threshold_interactions_df (imported from utils) to
# the track table with both thresholds set to 50.
# NOTE(review): presumably this drops users and items with fewer than 50
# interactions -- confirm against utils.threshold_interactions_df.
filtered_data_track = threshold_interactions_df(data_track,'user_id','item_id',50,50)
# Renumber the rows 0..N-1 in place, then print the frame summary.
filtered_data_track.reset_index(drop=True, inplace=True)
filtered_data_track.info()

In [None]:
# Same summary as for the raw table, now on the thresholded track table.
kept_track_users = filtered_data_track['user_id']
print("User count: ", kept_track_users.nunique())
print("Last user value: ", kept_track_users.iloc[-1])
print("Item count: ", filtered_data_track['item_id'].nunique())

In [None]:
# Users that appear in the thresholded track table and in both the genre and
# artist tables.
common_user_ids = set(filtered_data_track['user_id']).intersection(
    data_genre['user_id'],
    data_artist['user_id'],
)

In [None]:
# Keep only the genre rows whose user is in common_user_ids.
# The result is built with a non-inplace reset_index so filtered_data_genre is
# an independent frame rather than a slice of data_genre: later cells assign
# columns into it, which on a slice triggers pandas' SettingWithCopyWarning.
genre_user_mask = data_genre['user_id'].isin(common_user_ids)
filtered_data_genre = data_genre.loc[genre_user_mask].reset_index(drop=True)
print("User count: ", filtered_data_genre['user_id'].nunique())
print("Last user value: ", filtered_data_genre['user_id'].iloc[-1])
print("Item count: ", filtered_data_genre['genre_id'].nunique())

In [None]:
# Keep only the artist rows whose user is in common_user_ids.
# The result is built with a non-inplace reset_index so filtered_data_artist is
# an independent frame rather than a slice of data_artist: later cells assign
# columns into it, which on a slice triggers pandas' SettingWithCopyWarning.
artist_user_mask = data_artist['user_id'].isin(common_user_ids)
filtered_data_artist = data_artist.loc[artist_user_mask].reset_index(drop=True)
print("User count: ", filtered_data_artist['user_id'].nunique())
print("Last user value: ", filtered_data_artist['user_id'].iloc[-1])
print("Item count: ", filtered_data_artist['artist_id'].nunique())

In [None]:
# Verify that the three filtered tables now cover exactly the same users.
kept_track_user_set = set(filtered_data_track['user_id'])
kept_genre_user_set = set(filtered_data_genre['user_id'])
kept_artist_user_set = set(filtered_data_artist['user_id'])
kept_track_user_set == kept_genre_user_set and kept_genre_user_set == kept_artist_user_set

In [None]:
# Re-label user ids as consecutive integers starting at 0. One mapping is
# shared by the track, genre and artist tables so a user receives the same
# label in all three.
user_frames = (filtered_data_track, filtered_data_genre, filtered_data_artist)
unique_user_ids = pd.concat([frame['user_id'] for frame in user_frames]).unique()
print(len(unique_user_ids))
factorization_labels = range(len(unique_user_ids))
# Labels follow the order in which ids first appear in the concatenated column.
mapping = {raw_user_id: label for label, raw_user_id in enumerate(unique_user_ids)}
# Rewrite the user_id column of each table and renumber its rows.
for frame in user_frames:
    frame['user_id'] = frame['user_id'].map(mapping)
    frame.reset_index(drop=True, inplace=True)

In [None]:
# TrackId was parsed as text; convert it to int64 so it can be compared with
# the item_id values of the interaction table.
df_track['TrackId'] = df_track['TrackId'].astype('int64')
# Keep the metadata rows of tracks that survived the interaction filtering.
kept_track_mask = df_track['TrackId'].isin(filtered_data_track['item_id'])
df_track_subset = df_track[kept_track_mask].reset_index(drop=True)

In [None]:
# Map each track id to the list of genre ids found in its metadata row.
track_genre_dict = {}
for _, track_row in df_track_subset.iterrows():
    track_id = int(track_row['TrackId'])
    # 'GenreId' labels several columns of df_track, so this lookup yields all
    # genre slots of the row; slots that are None are skipped.
    genre_ids = [int(raw_genre) for raw_genre in track_row['GenreId'] if raw_genre is not None]
    # A track id seen more than once accumulates the genres of every row.
    track_genre_dict.setdefault(track_id, []).extend(genre_ids)
print(len(track_genre_dict))

In [None]:
# Distinct genre ids attached to any track in track_genre_dict.
all_values = {
    genre_id
    for track_genres in track_genre_dict.values()
    for genre_id in track_genres
}
len(all_values)

In [None]:
# Genre ratings restricted to genres that occur in the track metadata.
genre_seen_in_tracks = filtered_data_genre['genre_id'].isin(set(all_values))
f_filtered_data_genre = filtered_data_genre[genre_seen_in_tracks].reset_index(drop=True)

In [None]:
# Number of distinct genres before and after restricting to track genres.
for genre_frame in (filtered_data_genre, f_filtered_data_genre):
    print(genre_frame['genre_id'].nunique())

In [None]:
# Number of most-rated genres to keep.
# NOTE(review): the result is still named top_50_genres because later cells
# reference that name, although 168 (not 50) genres are selected -- confirm
# the intended cut-off.
NUM_TOP_GENRES = 168

# Step 1: Count the ratings each genre received
genre_counts = f_filtered_data_genre.groupby('genre_id')['rating'].count()
# Step 2: Order genres from most to least rated
sorted_counts = genre_counts.sort_values(ascending=False)
# Step 3: Keep the NUM_TOP_GENRES most-rated genres
top_50_genres = sorted_counts.head(NUM_TOP_GENRES)
# Print how many distinct genres were selected
print(len(set(top_50_genres.index)))

In [None]:
# Number of most-rated tracks to keep.
# NOTE(review): the result is still named top_50_items because later cells
# reference that name, although 5754 (not 50) items are selected -- confirm
# the intended cut-off.
NUM_TOP_ITEMS = 5754

# Step 1: Count the ratings each track received
item_counts = filtered_data_track.groupby('item_id')['rating'].count()
# Step 2: Order tracks from most to least rated
sorted_counts = item_counts.sort_values(ascending=False)
# Step 3: Keep the NUM_TOP_ITEMS most-rated tracks
top_50_items = sorted_counts.head(NUM_TOP_ITEMS)
print(type(top_50_items))
# Print how many distinct tracks were selected
print(len(set(top_50_items.index)))

In [None]:
# For every selected track, record each of its genres that is also a selected
# genre. The two lists are filled in lockstep: one (genre, track) pair per hit,
# so both may contain duplicates.
item_ids_in_top_genres = []
genre_ids_in_top_items = []
for top_item_id in set(top_50_items.index):
    genres_of_item = track_genre_dict.get(top_item_id)
    if genres_of_item is None:
        # The track has no entry in the metadata-derived dictionary.
        print("FALSE")
        continue
    for candidate_genre in genres_of_item:
        # Keep the pair only when the genre is among the selected top genres.
        if candidate_genre in top_50_genres.index:
            genre_ids_in_top_items.append(candidate_genre)
            item_ids_in_top_genres.append(top_item_id)
print(len(set(item_ids_in_top_genres)))
print(len(set(genre_ids_in_top_items)))

In [None]:
# Compare the genres reached through the selected tracks with the selected
# genres; the last line is their difference in count.
covered_genre_count = len(set(genre_ids_in_top_items))
selected_genre_count = len(set(top_50_genres.index))
print(covered_genre_count)
print(selected_genre_count)
print(covered_genre_count - selected_genre_count)

In [None]:
# Compare the tracks that have at least one selected genre with the selected
# tracks; the last line is their difference in count.
covered_item_count = len(set(item_ids_in_top_genres))
selected_item_count = len(set(top_50_items.index))
print(covered_item_count)
print(selected_item_count)
print(covered_item_count - selected_item_count)

In [None]:
# Distinct tracks / covered tracks, then distinct genres / covered genres.
summary_counts = (
    filtered_data_track['item_id'].nunique(),
    len(set(item_ids_in_top_genres)),
    filtered_data_genre['genre_id'].nunique(),
    len(set(genre_ids_in_top_items)),
)
for summary_count in summary_counts:
    print(summary_count)

In [None]:
# Re-label track item ids as consecutive integers starting at 0, in order of
# first appearance. This rebinds `mapping`, replacing the user-id mapping
# built in an earlier cell.
unique_item_ids = filtered_data_track['item_id'].unique()
print(len(unique_item_ids))
factorization_item_labels = range(len(unique_item_ids))
mapping = {raw_item_id: label for label, raw_item_id in enumerate(unique_item_ids)}
# Rewrite the item_id column with the new labels and renumber the rows.
filtered_data_track['item_id'] = filtered_data_track['item_id'].map(mapping)
filtered_data_track.reset_index(drop=True, inplace=True)

In [None]:
# Translate the distinct covered track ids to their new labels. `mapping` must
# still be the track item-id mapping here; other cells rebind the same name.
# An id missing from the mapping is passed through unchanged.
updated_item_ids_in_top_genres = [
    mapping.get(raw_item_id, raw_item_id)
    for raw_item_id in set(item_ids_in_top_genres)
]

# Report how many ids were translated.
print(len(sorted(updated_item_ids_in_top_genres)))

In [None]:
#print(all(item in filtered_data_track['item_id'].unique() for item in updated_item_ids_in_top_genres))
#print(all(item in updated_item_ids_in_top_genres for item in filtered_data_track['item_id'].unique()))

In [None]:
# Keep only the genre ratings whose genre was collected in
# genre_ids_in_top_items.
# The result is built with a non-inplace reset_index so filtered_data_genre is
# an independent frame: the genre_id column is reassigned afterwards, which on
# a slice triggers pandas' SettingWithCopyWarning.
genre_covered_mask = filtered_data_genre['genre_id'].isin(set(genre_ids_in_top_items))
filtered_data_genre = filtered_data_genre.loc[genre_covered_mask].reset_index(drop=True)
print(filtered_data_genre['genre_id'].nunique())

In [None]:
# Re-label genre ids as consecutive integers starting at 0, in order of first
# appearance. This rebinds `mapping` (and the helper names reused from the
# track relabelling cell) to the genre mapping.
unique_item_ids = filtered_data_genre['genre_id'].unique()
print(len(unique_item_ids))
factorization_item_labels = range(len(unique_item_ids))
mapping = {raw_genre_id: label for label, raw_genre_id in enumerate(unique_item_ids)}
# Rewrite the genre_id column with the new labels and renumber the rows.
filtered_data_genre['genre_id'] = filtered_data_genre['genre_id'].map(mapping)
filtered_data_genre.reset_index(drop=True, inplace=True)

In [None]:
# Translate the distinct covered genre ids to their new labels. `mapping` must
# still be the genre-id mapping here; other cells rebind the same name.
# An id missing from the mapping is passed through unchanged.
updated_genre_ids_in_top_items = [
    mapping.get(raw_genre_id, raw_genre_id)
    for raw_genre_id in set(genre_ids_in_top_items)
]

# Report how many ids were translated.
print(len(sorted(updated_genre_ids_in_top_items)))

In [None]:
# Check in both directions that the remapped genre labels and the genre_id
# values left in filtered_data_genre are the same collection. The unique
# values are computed once and compared as sets instead of being recomputed
# and scanned for every element.
remaining_genre_labels = set(filtered_data_genre['genre_id'].unique())
top_genre_labels = set(updated_genre_ids_in_top_items)
# Every remapped label occurs in the table ...
print(top_genre_labels <= remaining_genre_labels)
# ... and every label in the table is one of the remapped labels.
print(remaining_genre_labels <= top_genre_labels)

In [None]:
# Re-label artist ids as consecutive integers starting at 0, in order of first
# appearance. This rebinds `mapping` (and the helper names reused from the
# earlier relabelling cells) to the artist mapping.
unique_item_ids = filtered_data_artist['artist_id'].unique()
print(len(unique_item_ids))
factorization_item_labels = range(len(unique_item_ids))
mapping = {raw_artist_id: label for label, raw_artist_id in enumerate(unique_item_ids)}
# Rewrite the artist_id column with the new labels and renumber the rows.
filtered_data_artist['artist_id'] = filtered_data_artist['artist_id'].map(mapping)
filtered_data_artist.reset_index(drop=True, inplace=True)

In [None]:
# After the genre filtering, check that the track and genre tables still
# contain the same users.
final_track_user_set = set(filtered_data_track['user_id'])
final_genre_user_set = set(filtered_data_genre['user_id'])
final_track_user_set == final_genre_user_set

In [None]:
import os

# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing.
os.makedirs('data', exist_ok=True)
# Persist the three relabelled interaction tables without the row index.
filtered_data_genre.to_csv('data/filtered_data_genre.csv', index=False)
filtered_data_artist.to_csv('data/filtered_data_artist.csv', index=False)
filtered_data_track.to_csv('data/filtered_data_track.csv', index=False)

In [None]:
# Pickle the remapped track labels that have at least one selected genre.
with open("data/top_items.pkl", "wb") as items_file:
    pickle.dump(updated_item_ids_in_top_genres, items_file)
# Pickle the remapped genre labels reached through the selected tracks.
with open("data/top_genres.pkl", "wb") as genres_file:
    pickle.dump(updated_genre_ids_in_top_items, genres_file)