In [40]:
# Core genre -> list of lower-case tag spellings that count as that genre.
#
# NOTE: a few tags are listed under two genres: "folk rock" (Rock, Folk),
# "punk rock" (Rock, Punk), "glitch" (Electronic, Experimental),
# "ambient" (Electronic, New Age), "americana" and "bluegrass" (Country, Folk),
# and "celtic" (Folk, World). When this mapping is flattened into a
# tag -> genre lookup by plain assignment (as map_tag_to_core_genre does),
# the genre listed LAST in this dict wins for those tags,
# e.g. "punk rock" -> "Punk" and "ambient" -> "New Age".
STANDARD_GENRE_DICT = {
    "Rock": [
        "rock", "classic rock", "alternative rock", "indie rock", "hard rock", "soft rock",
        "psychedelic rock", "garage rock", "punk rock", "folk rock", "glam rock",
        "southern rock", "arena rock", "progressive rock", "art rock", "grunge"
    ],
    "Pop": [
        "pop", "dance pop", "electropop", "teen pop", "pop rock", "synthpop", "indie pop",
        "dream pop", "k-pop", "j-pop", "bubblegum pop", "power pop"
    ],
    "Hip-Hop": [
        "hip-hop", "hip hop", "rap", "trap", "gangsta rap", "conscious hip hop",
        "alternative hip hop", "old school hip hop", "boom bap", "east coast rap",
        "west coast rap", "dirty south", "drill", "mumble rap"
    ],
    "Electronic": [
        "electronic", "edm", "house", "techno", "trance", "drum and bass", "dnb",
        "dubstep", "electro", "idm", "ambient", "glitch", "breakbeat", "future bass",
        "synthwave", "chillwave", "deep house", "progressive house", "hardstyle"
    ],
    "Jazz": [
        "jazz", "smooth jazz", "bebop", "cool jazz", "hard bop", "free jazz",
        "fusion", "latin jazz", "swing", "jazz funk", "vocal jazz", "avant-garde jazz"
    ],
    "Classical": [
        "classical", "baroque", "romantic", "modern classical", "orchestral", "chamber music",
        "opera", "symphony", "piano", "early music", "contemporary classical", "minimalism"
    ],
    "Metal": [
        "metal", "heavy metal", "black metal", "death metal", "thrash metal", "doom metal",
        "power metal", "symphonic metal", "folk metal", "nu metal", "progressive metal",
        "metalcore", "grindcore", "industrial metal"
    ],
    "R&B": [
        "r&b", "rnb", "soul", "neo soul", "contemporary r&b", "quiet storm",
        "motown", "funk", "new jack swing", "blue-eyed soul"
    ],
    "Reggae": [
        "reggae", "roots reggae", "dub", "dancehall", "ska", "rocksteady", "ragga"
    ],
    "Country": [
        "country", "alt-country", "country rock", "bluegrass", "americana", "honky tonk",
        "country pop", "traditional country", "outlaw country", "neo-traditional country"
    ],
    "Blues": [
        "blues", "delta blues", "electric blues", "chicago blues", "country blues",
        "blues rock", "texas blues", "rhythm and blues"
    ],
    "Folk": [
        "folk", "indie folk", "folk rock", "contemporary folk", "traditional folk",
        "acoustic", "singer-songwriter", "americana", "celtic", "bluegrass"
    ],
    "Latin": [
        "latin", "reggaeton", "latin pop", "latin rock", "salsa", "bachata",
        "merengue", "cumbia", "tango", "bossa nova", "mariachi", "norteño", "latin trap"
    ],
    "World": [
        "world", "world music", "afrobeat", "afropop", "balkan", "celtic", "arabic",
        "flamenco", "fado", "indian", "klezmer", "traditional", "ethnic", "tribal"
    ],
    "Soundtrack": [
        "soundtrack", "ost", "original soundtrack", "score", "film score", "video game music",
        "musical", "broadway", "tv soundtrack", "anime soundtrack"
    ],
    "Experimental": [
        "experimental", "avant-garde", "noise", "glitch", "no wave", "industrial",
        "dark ambient", "sound art", "field recording", "drone", "electroacoustic"
    ],
    "Punk": [
        "punk", "punk rock", "pop punk", "hardcore punk", "post-punk", "emo", "skate punk",
        "anarcho-punk", "garage punk", "crust punk"
    ],
    "Religious": [
        "gospel", "christian", "christian rock", "worship", "praise", "contemporary christian",
        "ccm", "hymns", "sacred", "spiritual"
    ],
    "New Age": [
        "new age", "meditation", "relaxation", "healing", "ambient", "space music",
        "yoga music", "nature sounds"
    ],
    "Children": [
        "children", "kids", "nursery rhymes", "disney", "lullabies", "children's music"
    ],
}

In [57]:
import difflib

def map_tag_to_core_genre(user_tag, genre_dict, cutoff=0.8):
    """
    Map a free-form user tag to a core genre from a standard genre dictionary.

    Matching is attempted in three stages and stops at the first hit:
      1. exact match against the lower-cased known tags,
      2. fuzzy match via ``difflib.get_close_matches`` using ``cutoff``,
      3. substring match in either direction (the user tag contains a known
         tag, or a known tag contains the user tag), scanning known tags in
         dictionary order.

    Parameters:
        user_tag (str): The input tag from a user (e.g., 'dream pop').
            Leading/trailing whitespace and letter case are ignored.
        genre_dict (dict): A dictionary mapping core genres to lists of tags.
            If a tag is listed under several genres, the genre that appears
            last in the dictionary wins.
        cutoff (float): Similarity threshold for fuzzy matching.

    Returns:
        str: The matched core genre, or 'Other' when no stage matches, when
        the tag is empty / whitespace-only, or when it is not a string
        (for example a NaN produced by pandas for a missing value).
    """
    # Missing values arrive from pandas as float NaN / None; they carry no
    # genre information, so classify them as 'Other' instead of crashing.
    if not isinstance(user_tag, str):
        return 'Other'

    user_tag = user_tag.strip().lower()

    # An empty string is a substring of every tag, so without this guard the
    # substring stage below would map blank tags to the first known genre.
    if not user_tag:
        return 'Other'

    # Flatten genre dictionary into tag -> core genre (later genres override).
    flat_tag_to_genre = {}
    for core_genre, tags in genre_dict.items():
        for tag in tags:
            flat_tag_to_genre[tag.lower()] = core_genre

    # Exact match
    if user_tag in flat_tag_to_genre:
        return flat_tag_to_genre[user_tag]

    # Fuzzy match: best single candidate at or above the similarity cutoff.
    all_tags = list(flat_tag_to_genre.keys())
    close_matches = difflib.get_close_matches(user_tag, all_tags, n=1, cutoff=cutoff)
    if close_matches:
        return flat_tag_to_genre[close_matches[0]]

    # Substring match: first known tag that contains, or is contained in, the user tag.
    for tag in all_tags:
        if user_tag in tag or tag in user_tag:
            return flat_tag_to_genre[tag]

    return 'Other'  # No match found

In [58]:
import pandas as pd

# Read tags.dat (tab-separated, decoded as latin1) into a DataFrame.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
df = pd.read_csv('/Users/sohamchatterjee/Documents/RecSYS_TA/LastFM_Raw/hetrec2011-lastfm-2k/tags.dat', delimiter='\t',encoding='latin1')

# Show the contents
print(df)

       tagID           tagValue
0          1              metal
1          2  alternative metal
2          3          goth rock
3          4        black metal
4          5        death metal
...      ...                ...
11941  12644              suomi
11942  12645          symbiosis
11943  12646            sverige
11944  12647               eire
11945  12648     electro latino

[11946 rows x 2 columns]


In [59]:
df.head()

Unnamed: 0,tagID,tagValue
0,1,metal
1,2,alternative metal
2,3,goth rock
3,4,black metal
4,5,death metal


In [60]:
# Classify every tag string into a core genre; the genre dictionary is passed
# through `args` so no lambda wrapper is needed.
df['category'] = df['tagValue'].apply(map_tag_to_core_genre, args=(STANDARD_GENRE_DICT,))

In [61]:
# Rows whose tag could not be assigned to any core genre.
d = df.loc[df['category'].eq('Other')]

In [62]:
df['category'].value_counts()

category
Other           9020
Rock             558
Metal            361
Pop              334
Electronic       245
Hip-Hop          185
Punk             181
Soundtrack       178
Folk             104
R&B               96
Classical         87
Jazz              86
Experimental      74
Reggae            70
World             68
Country           68
Blues             66
New Age           53
Latin             45
Religious         44
Children          23
Name: count, dtype: int64

In [63]:
len(df)

11946

In [64]:
df['category'].value_counts().sum()

11946

# Artist - category mappings

In [67]:
import pandas as pd

# Read user_taggedartists-timestamps.dat (tab-separated, decoded as latin1).
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
artists = pd.read_csv('/Users/sohamchatterjee/Documents/RecSYS_TA/LastFM_Raw/hetrec2011-lastfm-2k/user_taggedartists-timestamps.dat', delimiter='\t',encoding='latin1')



In [68]:
artists.head()

Unnamed: 0,userID,artistID,tagID,timestamp
0,2,52,13,1238536800000
1,2,52,15,1238536800000
2,2,52,18,1238536800000
3,2,52,21,1238536800000
4,2,52,41,1238536800000


In [69]:
df.head()

Unnamed: 0,tagID,tagValue,category
0,1,metal,Metal
1,2,alternative metal,Metal
2,3,goth rock,Rock
3,4,black metal,Metal
4,5,death metal,Metal


In [70]:
# Attach the tag text and its core genre to every (user, artist, tag) assignment;
# the inner join keeps only tagIDs present in both frames.
result = df.merge(artists, on='tagID', how='inner')

In [71]:
result.head()

Unnamed: 0,tagID,tagValue,category,userID,artistID,timestamp
0,1,metal,Metal,4,918,1209592800000
1,1,metal,Metal,12,181,1272664800000
2,1,metal,Metal,12,198,1264978800000
3,1,metal,Metal,12,500,1264978800000
4,1,metal,Metal,12,503,1267398000000


In [90]:
# Keep only the genre of each tag assignment together with the artist it was applied to.
artist_genre = result.loc[:, ['category', 'artistID']]

# Collect, per artist, the list of genres of all tags that artist received.
categories_per_artist = artist_genre.groupby('artistID')['category'].apply(list)
artist_genre_lists = categories_per_artist.reset_index()


In [91]:
artist_genre_lists

Unnamed: 0,artistID,category
0,1,"[Rock, Rock, Rock, Rock, Rock, Other, Other, O..."
1,2,"[New Age, Electronic, Other, Other, Other, Oth..."
2,3,"[Metal, Metal, Metal, Metal, Metal, Metal, Oth..."
3,4,"[Metal, Metal, Metal, Rock, Rock, Rock, Rock, ..."
4,5,"[Other, Rock, Other, Other, Other, Rock, Other]"
...,...,...
12518,18737,"[Other, Experimental, Rock, Electronic, Other]"
12519,18739,"[Electronic, Hip-Hop, Rock, Rock, Rock, Other,..."
12520,18740,"[Experimental, Other]"
12521,18741,"[Experimental, Other]"


In [122]:
from collections import defaultdict
def select_max_freq_category(cat_list):
    """
    Pick the most frequent category in ``cat_list``, ignoring 'Other'.

    'Other' is only returned when the list is empty or contains nothing but
    'Other'. Ties between equally frequent categories are resolved in favour
    of the category that appears first in ``cat_list``.

    Parameters:
        cat_list (iterable of str): Categories, possibly with repeats.

    Returns:
        str: The dominant non-'Other' category, or 'Other' as a fallback.
    """
    # Count only the informative categories; 'Other' is purely a fallback.
    counts = {}
    for item in cat_list:
        if item != 'Other':
            counts[item] = counts.get(item, 0) + 1

    # Empty input, or every entry was 'Other'.
    if not counts:
        return 'Other'

    # max() returns the first maximal key in insertion order, which keeps the
    # first-seen category on ties.
    return max(counts, key=counts.get)

        

In [123]:
# Reduce each artist's list of tag genres to a single representative genre.
artist_genre_lists['artist_category'] = artist_genre_lists['category'].apply(select_max_freq_category)

In [126]:
artist_genre_lists.head()

Unnamed: 0,artistID,category,artist_category
0,1,"[Rock, Rock, Rock, Rock, Rock, Other, Other, O...",Rock
1,2,"[New Age, Electronic, Other, Other, Other, Oth...",Experimental
2,3,"[Metal, Metal, Metal, Metal, Metal, Metal, Oth...",Metal
3,4,"[Metal, Metal, Metal, Rock, Rock, Rock, Rock, ...",Rock
4,5,"[Other, Rock, Other, Other, Other, Rock, Other]",Rock


In [128]:
artist_genre_lists['artist_category'].value_counts()

artist_category
Rock            2897
Pop             1445
Other           1334
Electronic      1289
Metal           1086
Punk             647
Hip-Hop          638
Experimental     589
Folk             525
Soundtrack       306
R&B              297
Jazz             253
New Age          253
Classical        176
Reggae           164
World            162
Country          148
Blues            117
Religious         92
Latin             86
Children          19
Name: count, dtype: int64

In [129]:
artist_genre_lists.head()

Unnamed: 0,artistID,category,artist_category
0,1,"[Rock, Rock, Rock, Rock, Rock, Other, Other, O...",Rock
1,2,"[New Age, Electronic, Other, Other, Other, Oth...",Experimental
2,3,"[Metal, Metal, Metal, Metal, Metal, Metal, Oth...",Metal
3,4,"[Metal, Metal, Metal, Rock, Rock, Rock, Rock, ...",Rock
4,5,"[Other, Rock, Other, Other, Other, Rock, Other]",Rock


In [130]:
# Persist the artist -> genre mapping as a tab-separated file without the index.
# The 'category' column holds Python lists, so it is written as their text
# representation and comes back as plain strings when the file is re-read.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
artist_genre_lists.to_csv('/Users/sohamchatterjee/Documents/RecSYS_TA/LastFM_Raw/hetrec2011-lastfm-2k/artist_category_mapping.dat', sep='\t', index=False)


# Entropy calculation

In [1]:
import pandas as pd
import re

# Path to the prediction file
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
file_path = '/Users/sohamchatterjee/Documents/RecSYS_TA/Pre-Processed_Dataset/BSARec_LastFM_predictions.txt'

# Lines of interest look like "User <id>: [<item>, <item>, ...]".
# Compile the pattern once instead of on every line.
user_line_pattern = re.compile(r'User (\d+): \[(.*)\]')

# One row per matching line: [user_id, item_1, item_2, ...]
data = []

with open(file_path, 'r') as f:
    for line in f:
        match = user_line_pattern.match(line.strip())
        if match:
            user_id = int(match.group(1))
            # Skip blank tokens so an empty list ("User 7: []") or a stray
            # trailing comma yields no items instead of raising ValueError
            # from int('').
            items = [int(token) for token in match.group(2).split(',') if token.strip()]
            data.append([user_id] + items)


In [5]:
import pickle


# Load the pickled `data_maps` object (indexed by key in the next cell).
# NOTE(review): pickle.load can execute arbitrary code embedded in the file —
# only load pickles from a trusted source.
with open('/Users/sohamchatterjee/Documents/RecSYS_TA/Pre-Processed_Dataset/data_maps.pkl', 'rb') as f:
    data_maps = pickle.load(f)


In [7]:
# Lookup with string keys and string values (e.g. '1' -> '52'); used below to
# translate the ids found in the prediction file into raw artist ids.
id2item = data_maps['id2item']

In [8]:
id2item

{'1': '52',
 '2': '63',
 '3': '73',
 '4': '96',
 '5': '995',
 '6': '6160',
 '7': '6177',
 '8': '9322',
 '9': '152',
 '10': '181',
 '11': '217',
 '12': '920',
 '13': '1870',
 '14': '1873',
 '15': '170',
 '16': '183',
 '17': '1009',
 '18': '1385',
 '19': '167',
 '20': '193',
 '21': '2961',
 '22': '64',
 '23': '72',
 '24': '151',
 '25': '156',
 '26': '173',
 '27': '174',
 '28': '175',
 '29': '188',
 '30': '1824',
 '31': '163',
 '32': '1917',
 '33': '162',
 '34': '2370',
 '35': '3416',
 '36': '53',
 '37': '70',
 '38': '80',
 '39': '154',
 '40': '157',
 '41': '165',
 '42': '184',
 '43': '192',
 '44': '463',
 '45': '467',
 '46': '632',
 '47': '706',
 '48': '726',
 '49': '1248',
 '50': '1745',
 '51': '3863',
 '52': '171',
 '53': '1713',
 '54': '51',
 '55': '99',
 '56': '153',
 '57': '180',
 '58': '185',
 '59': '1295',
 '60': '1810',
 '61': '3988',
 '62': '918',
 '63': '182',
 '64': '190',
 '65': '195',
 '66': '202',
 '67': '215',
 '68': '222',
 '69': '232',
 '70': '233',
 '71': '236',
 '72': 

In [9]:
# For every prediction row, drop the leading user id and translate each
# remaining item id into its raw artist id (id2item is keyed by strings).
raw_artists = [
    [id2item[str(item_id)] for item_id in row[1:]]
    for row in data
]
    
        
    

In [10]:
raw_artists

[['6522',
  '6150',
  '5803',
  '7352',
  '5713',
  '5358',
  '8582',
  '9974',
  '7344',
  '82',
  '9322',
  '7821',
  '10616',
  '7029',
  '9255',
  '44',
  '5550',
  '5299',
  '11193',
  '7021'],
 ['4312',
  '4317',
  '3191',
  '8781',
  '8789',
  '3978',
  '30',
  '4609',
  '4036',
  '3190',
  '3856',
  '187',
  '75',
  '3921',
  '4493',
  '155',
  '67',
  '3197',
  '3570',
  '5841'],
 ['416',
  '431',
  '434',
  '433',
  '3068',
  '14534',
  '629',
  '1633',
  '231',
  '859',
  '11234',
  '425',
  '602',
  '1732',
  '2402',
  '9823',
  '1075',
  '622',
  '212',
  '2871'],
 ['5079',
  '288',
  '5772',
  '4291',
  '6193',
  '5988',
  '89',
  '7639',
  '5266',
  '4262',
  '67',
  '289',
  '4031',
  '3719',
  '3889',
  '8392',
  '3795',
  '265',
  '16164',
  '6249'],
 ['1090',
  '533',
  '424',
  '433',
  '441',
  '622',
  '615',
  '757',
  '689',
  '425',
  '863',
  '1106',
  '618',
  '629',
  '602',
  '969',
  '1098',
  '1504',
  '604',
  '606'],
 ['498',
  '461',
  '466',
  '681',


In [25]:
import pandas as pd

# Load the artist -> genre mapping written earlier (tab-separated, decoded as latin1).
# The 'category' column comes back as the text form of the original lists.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
artist_genre = pd.read_csv('/Users/sohamchatterjee/Documents/RecSYS_TA/LastFM_Raw/hetrec2011-lastfm-2k/artist_category_mapping.dat', delimiter='\t',encoding='latin1')
print(artist_genre.head())

   artistID                                           category artist_category
0         1  ['Rock', 'Rock', 'Rock', 'Rock', 'Rock', 'Othe...            Rock
1         2  ['New Age', 'Electronic', 'Other', 'Other', 'O...    Experimental
2         3  ['Metal', 'Metal', 'Metal', 'Metal', 'Metal', ...           Metal
3         4  ['Metal', 'Metal', 'Metal', 'Rock', 'Rock', 'R...            Rock
4         5  ['Other', 'Rock', 'Other', 'Other', 'Other', '...            Rock


In [26]:
# Keep only the artist id and its single assigned genre; the per-tag
# 'category' list column is dropped. Note this rebinds `artist_genre`.
artist_genre = artist_genre[['artistID','artist_category']]

In [27]:
len(artist_genre)

12523

In [36]:
# Convert the two-column frame into a list of [artistID, artist_category] pairs.
artist_genre_list = artist_genre.values.tolist()

In [37]:
artist_genre_list

[[1, 'Rock'],
 [2, 'Experimental'],
 [3, 'Metal'],
 [4, 'Rock'],
 [5, 'Rock'],
 [6, 'Metal'],
 [7, 'Metal'],
 [8, 'Rock'],
 [9, 'Experimental'],
 [10, 'Electronic'],
 [11, 'Electronic'],
 [12, 'Metal'],
 [13, 'Electronic'],
 [14, 'Experimental'],
 [15, 'Metal'],
 [16, 'Experimental'],
 [17, 'Experimental'],
 [18, 'Electronic'],
 [19, 'Experimental'],
 [20, 'Metal'],
 [22, 'Experimental'],
 [23, 'Metal'],
 [24, 'Experimental'],
 [25, 'Metal'],
 [26, 'Electronic'],
 [27, 'Experimental'],
 [28, 'Metal'],
 [29, 'Electronic'],
 [30, 'Pop'],
 [31, 'Electronic'],
 [32, 'Pop'],
 [33, 'Electronic'],
 [34, 'Pop'],
 [35, 'Experimental'],
 [36, 'Experimental'],
 [37, 'Experimental'],
 [38, 'Experimental'],
 [39, 'Electronic'],
 [40, 'Rock'],
 [41, 'Electronic'],
 [43, 'Experimental'],
 [44, 'Experimental'],
 [45, 'Experimental'],
 [46, 'Classical'],
 [47, 'Metal'],
 [48, 'Metal'],
 [49, 'Electronic'],
 [50, 'Metal'],
 [51, 'Pop'],
 [52, 'Hip-Hop'],
 [53, 'Electronic'],
 [54, 'Hip-Hop'],
 [55, 'Pop

In [38]:
# artistID -> assigned genre, built from the [artistID, genre] pairs.
artist_genre_dict = {pair[0]: pair[1] for pair in artist_genre_list}
    

# Entropy

In [64]:
import math

def compute_entropy(category_counts):
    """
    Return the Shannon entropy, in bits, of a distribution given as counts.

    Args:
        category_counts (dict): Maps each category name to the number of
            times it occurred.

    Returns:
        float: Base-2 entropy of the normalised counts. Categories with a
        count of zero contribute nothing; an empty dict yields 0.0.
    """
    counts = category_counts.values()
    total = sum(counts)

    # Normalise the non-zero counts into probabilities first ...
    probabilities = [count / total for count in counts if count != 0]

    # ... then accumulate -p * log2(p) term by term, in dictionary order.
    entropy = 0.0
    for p_i in probabilities:
        entropy = entropy - p_i * math.log2(p_i)
    return entropy

# # Example usage
# category_counts = {
#     'Action': 5,
#     'Comedy': 3,
#     'Drama': 2
# }

# entropy = compute_entropy(category_counts)



In [66]:
from collections import defaultdict
# Sum, over all users, the genre entropy of each user's first six recommended artists.
# An explicit running total keeps the floating-point accumulation order fixed.
total_entropy = 0
for item in raw_artists:
    category_counts = defaultdict(int)
    # Raw artist ids are strings here, while artist_genre_dict is keyed by int.
    for artist in item[:6]:
        category_counts[artist_genre_dict[int(artist)]] += 1
    total_entropy += compute_entropy(category_counts)
    
    
    
    

In [67]:
total_entropy

1357.0013214672936

In [69]:
print("Average Entropy for Top-6 recommendations:",total_entropy/len(raw_artists))

Average Entropy for Top-6 recommendations: 1.2449553407956822


In [70]:
len(raw_artists)

1090