## Personality Traits Derivation Pipeline

### Import all needed packages 

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import math
import pickle

### Dataset
Load the main dataset with implicit ratings and apply simple modifications to it (dropping the users with fewer than 30 ratings).

In [None]:
# Load the main dataset. The TSV is read without a header row, so the four
# column names are supplied directly to the reader.
df = pd.read_csv(
    "usersha1-artmbid-artname-plays.tsv",
    sep='\t',
    header=None,
    names=['userId', 'artistId', 'artist', 'plays'],
)

In [None]:
# Report the size of the raw dataset: rows, distinct users, distinct artist ids
print("Total number of records: ", df.shape[0])
print("Number of unique users: ", df['userId'].unique().size)
print("Number of unique artists: ", df['artistId'].unique().size)

In [None]:
# Keep only the users that have at least 30 artist records.
# Rows with any missing value are removed first, so the per-user count of
# artistId equals that user's number of remaining rows.
df = df.dropna()
data = df.copy()
user_rc = (
    data.groupby("userId", as_index=False)
    .count()
    .rename(columns={"artistId": "count"})
)
enough_records = user_rc['count'] >= 30
selectedu = user_rc['userId'][enough_records]
data = data[data['userId'].isin(selectedu)]

remaining_users = data['userId'].unique()
remaining_artists = data['artistId'].unique()
print("Number of unique userId after dropping out the one with less than 30 different artists: ",
      len(remaining_users))
print("Number of unique artists:", len(remaining_artists))
print("Number of records:", len(data))


### Get a list of artists in the data for future artist profiles building

In [None]:
# Distinct artist ids and distinct artist names present in the filtered data
artist_id_list, artist_name_list = (
    data[column].unique() for column in ('artistId', 'artist')
)

In [None]:
# Compare how many distinct ids vs. distinct names were collected
id_count = len(artist_id_list)
name_count = len(artist_name_list)
print("The length of the artist id list is: ", id_count)
print("The length of the artist name list is: ", name_count)


### Match Artist Names

Next, we replace every artist name associated with the same artist id with the first name that appears for that id.


In [None]:
# One row per artistId, holding the first entry of each remaining column within
# that artistId group; the 'artist' column is used below as the canonical name.
checkdata = data.groupby('artistId').first()

In [None]:
# Build the lookup artistId -> first artist name seen for that id
key_id_list = checkdata.index.tolist()
value_name_list = checkdata['artist']
dictionary = {
    artist_id: first_name
    for artist_id, first_name in zip(key_id_list, value_name_list)
}

In [None]:
# Overwrite every row's artist name with the name registered for its artistId
# in `dictionary` (None when an id has no entry).
data['artist'] = [dictionary.get(artist_id) for artist_id in data['artistId']]

In [None]:
# Recompute the distinct artist ids and names after the names were matched
# to their artist id.
artist_id_list = data['artistId'].unique()
artist_name_list = data['artist'].unique()

In [None]:
# Compare the number of distinct ids and distinct names after name matching
id_count = len(artist_id_list)
name_count = len(artist_name_list)
print("The length of the artist id list is: ", id_count)
print("The length of the artist name list is: ", name_count)

The number of unique artist ids does not match the number of unique artist names because artist names are recorded inconsistently in the dataset: the same artist may appear with special characters (for foreign artists) or under an abbreviation. Because the Spotify API search is tolerant of such variations, we can still retrieve the information related to an artist despite the variations in their name. Therefore, we continue using the names instead of the ids. (We do not use the ids because MusicBrainz ids do not match Spotify artist ids.)

### Utilize Spotify API to retrieve artist genres 

In [None]:
# Create the Spotify Web API client used for the artist searches below.
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
# Placeholders: replace with your own Spotify API client id and client secret.
# NOTE(review): avoid committing real credentials together with the notebook.
cid = 'your own cid codes for Spotify API'
secret = 'your own credentials for Spotify API'
# Client-credentials manager built from the id/secret pair above
client_credentials_manager = SpotifyClientCredentials(client_id=cid, client_secret=secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [None]:
# Search Spotify once per artist name and record
#   old_new_name: dataset name -> name returned by Spotify (or the dataset
#                 name itself when the search returns nothing)
#   name_genre:   resulting name -> genres returned by Spotify ('' when the
#                 search returns nothing)
old_new_name = {}
name_genre = {}
for name in artist_name_list:
    matches = sp.search(q=name, type='artist', limit=1)['artists']['items']
    if matches:
        # The search is limited to one result; the last returned item is used.
        best_match = matches[-1]
        genre_ind, artist_name = best_match['genres'], best_match['name']
    else:
        genre_ind, artist_name = '', name
    old_new_name[name] = artist_name
    name_genre[artist_name] = genre_ind

In [None]:
# Save the results as pickle files for future use
with open('old_new_name.pkl', 'wb') as f:
    pickle.dump(old_new_name, f)
# name_genre has to be written as well: it is read back below, and without
# this dump the load fails on a first run (or silently picks up a stale file).
with open('name_genre.pkl', 'wb') as f:
    pickle.dump(name_genre, f)
# Restore the stored variables of old_new_name and name_genre dictionaries
# NOTE: unpickling can execute arbitrary code; only load files that were
# produced by this notebook.
with open('old_new_name.pkl', 'rb') as f:
    old_new_name = pickle.load(f)
with open('name_genre.pkl', 'rb') as f:
    name_genre = pickle.load(f)

### Standardization of names & Attach corresponding genres

In [None]:
# Replace each name variation with the standard name recorded in old_new_name.
# Names without an entry become None, and those rows are removed by dropna().
data['artist'] = [old_new_name.get(old_name) for old_name in data['artist']]
data = data.dropna()

#### Genre Generalization
Map the fine-grained genres returned by the Spotify API to a small set of general genres for the later personality derivation.

In [None]:
# Collect every distinct genre string that occurs in any artist's genre list
unique_genre = {
    genre
    for artist_genres in name_genre.values()
    for genre in artist_genres
}

In [None]:
# Report how many distinct genre strings were collected into unique_genre
print("Number of unique genres in current dataset: ", len(unique_genre))

In [None]:
# Keywords that are folded into another general genre label (keyword -> label)
large_genre_dict = {
    'classical': 'classic',
    'film': 'movie',
    'background': 'movie',
    'soul': 'r&b',
    'rap': 'hip hop',
}
# Live view of the keywords above, used for membership tests
large_key = large_genre_dict.keys()

In [None]:
# Keywords searched for inside the Spotify genre strings. Each keyword is
# listed once; a repeated entry would only cause a redundant scan.
standard_genre = ['blues', 'jazz', 'classic', 'classical', 'rock',
                  'metal', 'alternative', 'pop',
                  'film', 'movie', 'background', 'soul', 'r&b', 'rap', 'hip hop',
                  'electronic']
# The 11 general genres that remain after the keywords are generalized
modified_genre = ['blues', 'jazz', 'classic', 'rock', 'metal', 'alternative', 'pop', 'movie',
                  'r&b', 'hip hop', 'electronic']

In [None]:
# Replace each artist's Spotify genre list with the general genres it matches.
# A keyword matches when it occurs as a substring of a Spotify genre string;
# keywords present in large_key are translated through large_genre_dict.
for artist_name, spotify_genres in name_genre.items():
    matched = [
        large_genre_dict.get(keyword) if keyword in large_key else keyword
        for spotify_genre in spotify_genres
        for keyword in standard_genre
        if keyword in spotify_genre
    ]
    # Duplicates are removed; artists without any match end up with []
    name_genre[artist_name] = list(set(matched))

In [None]:
# Attach to every row the general genre list of its artist
# (None when the artist name has no entry in name_genre).
data['genres'] = [name_genre.get(artist_name) for artist_name in data['artist']]

In [None]:
# Drop the rows with no artist genre assignment
has_genre = data['genres'].map(len) > 0
data = data[has_genre]

In [None]:
# Weight each row's play count by the number of genres attached to its artist
genres_per_row = data['genres'].str.len()
data['total'] = data['plays'].mul(genres_per_row)

In [None]:
# Per-user sum of the weighted play counts; the helper column is removed afterwards
total = data['total'].groupby(data['userId']).sum()
data = data.drop(columns='total')

#### Save the dataframe of user-genre-profile as csv for future use

In [None]:
# Save the data with the genres attached to each record for later reuse
# (written without the index column).
data.to_csv("genre_data.csv", index = False)

In [None]:
# Expand the genre lists to one row per (record, genre), then add up the play
# counts per (user, genre). Only 'plays' is aggregated: summing the remaining
# string columns is not meaningful and depends on the pandas version's
# numeric_only default.
expand_data = data.explode('genres')
ed1 = expand_data.groupby(['userId', 'genres'])[['plays']].sum()

In [None]:
# Derive the genre scores for each user: the share of the user's weighted
# plays that falls into each of the general genres in modified_genre.
# Computed for all users at once instead of one .loc lookup per user.
user_list = list(data['userId'].unique())

# Divide every (user, genre) play count by that user's total
genre_share = ed1['plays'].div(total, level='userId')

# One row per user and one column per general genre; genres a user never
# played get a score of 0
genre_table = (
    genre_share.unstack('genres', fill_value=0)
    .reindex(columns=modified_genre, fill_value=0)
)

# {userId: {genre: score}}
user_dict = genre_table.to_dict(orient='index')

### Derive Music Preferences for All Users

In [None]:
# Derive Music Preference from Genres
music_preference = ['R&C', 'I&R', 'U&C', 'E&R']
user_music_pref = {}
dominant_pref = {}

# Standardized regression weights of the genres that make up each preference
# dimension. A dimension's score is the weighted sum of its genre scores
# divided by the number of genres in the dimension.
dimension_weights = {
    'R&C': (('blues', 0.93), ('jazz', 0.73), ('classic', 0.51)),
    'I&R': (('rock', 0.83), ('metal', 0.74), ('alternative', 0.58)),
    'U&C': (('pop', 0.63), ('movie', 0.4)),
    'E&R': (('r&b', 0.95), ('hip hop', 0.71), ('electronic', 0.18)),
}

for user in user_list:
    genre_scores = user_dict[user]
    scores = {}
    for dimension in music_preference:
        weights = dimension_weights[dimension]
        weighted_sum = sum(weight * genre_scores[genre] for genre, weight in weights)
        scores[dimension] = weighted_sum / len(weights)
    user_music_pref[user] = scores
    # The dominant preference is the dimension with the highest score
    # (the first one in music_preference order on ties)
    best = np.argmax([scores[dimension] for dimension in music_preference])
    dominant_pref[user] = music_preference[best]

In [None]:
# Count how many users have each music preference as their dominant one;
# positions follow the order of music_preference.
dominant_count = [0] * 4
for preference in dominant_pref.values():
    dominant_count[music_preference.index(preference)] += 1

In [None]:
# Print the number of users per dominant music preference
print("The dominant music preferences count is: ")
for label, count in zip(("R&C: ", "I&R: ", "U&C: ", "E&R: "), dominant_count):
    print(label, count)

# Re-declares the 11 general genres (the same values that modified_genre is
# given in the genre generalization step).
modified_genre = ['blues', 'jazz', 'classic', 'rock', 'metal', 'alternative', 'pop', 'movie',
                 'r&b', 'hip hop', 'electronic']

In [None]:
# Persist the userId -> dominant music preference mapping as a pickle file
with open('selected_user.pkl', 'wb') as f:
    pickle.dump(dominant_pref, f)