In [5]:
from email.mime import audio
import numpy as np
import os
import spotipy
from spotipy.oauth2 import SpotifyOAuth,SpotifyClientCredentials
import tqdm
import pandas as pd

In [3]:
def get_name_and_artist_from_txt(txt_file):
    """Parse a colon-separated info file into a dict of song records.

    Every line is split on ``:``. Lines that yield fewer than four fields are
    skipped; otherwise the first four fields are used as audio id, song name,
    artist and duration (only the duration has trailing whitespace stripped).

    Args:
        txt_file: Path of the text file. It is opened as UTF-8 and bytes that
            cannot be decoded are ignored.

    Returns:
        dict mapping audio id -> ``{'id', 'name', 'artist', 'duration'}``, all
        values being strings. If an id occurs twice the later line wins.
    """
    records = {}
    with open(txt_file, encoding="utf8", errors='ignore') as handle:
        for raw_line in handle:
            fields = raw_line.split(":")
            if len(fields) >= 4:
                track_id, title, performer, length = fields[:4]
                records[track_id] = dict(
                    id=track_id,
                    name=title,
                    artist=performer,
                    duration=length.rstrip(),
                )
    return records

In [5]:
# Parse the Train/Test info.txt file of each emotion folder of the
# NJU_MusicMood_v1.0 dataset into {audio id -> song record} dicts
# (one dict per emotion and split).
sad_train_dicts = get_name_and_artist_from_txt('./NJU_MusicMood_v1.0/Sad/Train/info.txt')
sad_test_dicts = get_name_and_artist_from_txt('./NJU_MusicMood_v1.0/Sad/Test/info.txt')

angry_train_dicts = get_name_and_artist_from_txt('./NJU_MusicMood_v1.0/Angry/Train/info.txt')
angry_test_dicts = get_name_and_artist_from_txt('./NJU_MusicMood_v1.0/Angry/Test/info.txt')

relax_train_dicts = get_name_and_artist_from_txt('./NJU_MusicMood_v1.0/Relaxed/Train/info.txt')
relax_test_dicts = get_name_and_artist_from_txt('./NJU_MusicMood_v1.0/Relaxed/Test/info.txt')

happy_train_dicts = get_name_and_artist_from_txt('./NJU_MusicMood_v1.0/Happy/Train/info.txt')
happy_test_dicts = get_name_and_artist_from_txt('./NJU_MusicMood_v1.0/Happy/Test/info.txt')

In [6]:
def search_matching_song(sp,data):
    """Search Spotify for one song and pair its local id with the top hit.

    Args:
        sp: Client object exposing ``search(q=..., limit=..., offset=...)``.
        data: Song record with at least the keys ``'id'``, ``'name'`` and
            ``'artist'``.

    Returns:
        ``(data['id'], spotify_track_id)`` for the first search result, or
        ``(data['id'], 'invalid')`` when the search returns no items.
    """
    query = f"track:{data['name']}, artist:{data['artist']}"
    response = sp.search(q=query, limit=1, offset=0)
    hits = response['tracks']['items']
    # Only the first (and, with limit=1, only) result is considered.
    spotify_id = hits[0]['id'] if len(hits) != 0 else 'invalid'
    return (data['id'], spotify_id)

In [6]:
# Spotify client using the client-credentials flow.
# The credentials are read from the environment instead of being committed in
# the notebook: export SPOTIPY_CLIENT_ID and SPOTIPY_CLIENT_SECRET before
# starting the kernel (a missing variable raises KeyError here, early and
# visibly). Any secret that was previously stored in this cell should be
# treated as leaked and rotated in the Spotify developer dashboard.
sp = spotipy.Spotify(
    auth_manager=SpotifyClientCredentials(
        client_id=os.environ["SPOTIPY_CLIENT_ID"],
        client_secret=os.environ["SPOTIPY_CLIENT_SECRET"],
    )
)

In [8]:
def get_spotify_id(all_data):
    """Look up the Spotify id of every song record in ``all_data``.

    Uses the notebook-level ``sp`` client and issues one
    ``search_matching_song`` call per entry, in iteration order.

    Args:
        all_data: Mapping of audio id -> song record.

    Returns:
        List of ``(audio id, spotify id or 'invalid')`` tuples, one per entry.
    """
    return [search_matching_song(sp, all_data[key]) for key in all_data]
#  # List to store the URIs
# for idx, (sId,data) in enumerate(a_test.items()):
# # for idx,_ in tqdm(a_train[0:10].iterrows()):
#     tracks = search_matching_song(sp,data)
#     track_list.append(tracks)

In [9]:
# Resolve every "Sad" song to a Spotify track id (one search request per song).
sad_test_list = get_spotify_id(sad_test_dicts)
sad_train_list = get_spotify_id(sad_train_dicts)

In [10]:
# Resolve every "Angry" song to a Spotify track id (one search request per song).
angry_test_list = get_spotify_id(angry_test_dicts)
angry_train_list = get_spotify_id(angry_train_dicts)

In [11]:
# Resolve every "Relaxed" song to a Spotify track id (one search request per song).
relaxed_test_list = get_spotify_id(relax_test_dicts)
relaxed_train_list = get_spotify_id(relax_train_dicts)

In [12]:
# Resolve every "Happy" song to a Spotify track id (one search request per song).
happy_train_list = get_spotify_id(happy_train_dicts)
happy_test_list = get_spotify_id(happy_test_dicts)

In [13]:
def gen_dataframde(id_list,data): #id, spotify id
    """Build a metadata DataFrame for the songs that were found on Spotify.

    Args:
        id_list: Iterable of ``(audio id, spotify id)`` pairs; pairs whose
            spotify id is the string ``'invalid'`` are dropped.
        data: Mapping of audio id -> song record with the keys ``'name'``,
            ``'artist'`` and ``'duration'``.

    Returns:
        DataFrame with the columns ``musicId`` (audio id converted with
        ``int``), ``spotifyId``, ``name``, ``artist`` and ``duration``, one
        row per kept pair, in input order.
    """
    matched = [(idx, sid) for (idx, sid) in id_list if sid != 'invalid']
    columns = {
        'musicId': [int(idx) for idx, _ in matched],
        'spotifyId': [sid for _, sid in matched],
        'name': [data[idx]['name'] for idx, _ in matched],
        'artist': [data[idx]['artist'] for idx, _ in matched],
        'duration': [data[idx]['duration'] for idx, _ in matched],
    }
    return pd.DataFrame.from_dict(columns)

In [14]:
# Build one metadata DataFrame per emotion and split. Songs whose Spotify
# lookup returned 'invalid' are dropped inside gen_dataframde.
df_sad_test = gen_dataframde(sad_test_list,sad_test_dicts)
df_sad_train = gen_dataframde(sad_train_list,sad_train_dicts)

df_angry_test = gen_dataframde(angry_test_list,angry_test_dicts)
df_angry_train = gen_dataframde(angry_train_list,angry_train_dicts)

df_relaxed_test = gen_dataframde(relaxed_test_list,relax_test_dicts)
df_relaxed_train = gen_dataframde(relaxed_train_list,relax_train_dicts)

df_happy_test = gen_dataframde(happy_test_list,happy_test_dicts)
df_happy_train = gen_dataframde(happy_train_list,happy_train_dicts)

In [16]:
# Tag every per-emotion frame (test and train alike) with its emotion label.
for label, frames in (
    ('Sad', (df_sad_test, df_sad_train)),
    ('Angry', (df_angry_test, df_angry_train)),
    ('Relaxed', (df_relaxed_test, df_relaxed_train)),
    ('Happy', (df_happy_test, df_happy_train)),
):
    for frame in frames:
        frame['emotion'] = label

In [71]:
# Stack the four per-emotion frames into one train and one test frame.
# The original row labels are kept, so index labels repeat across emotions.
df_train = pd.concat([df_sad_train, df_angry_train,df_happy_train,df_relaxed_train])
df_test = pd.concat([df_sad_test, df_angry_test,df_happy_test,df_relaxed_test])

<h2>Reindex</h2>

In [72]:
# Shuffle the training rows with a fixed seed so that Restart & Run All
# reproduces the same row order and therefore the same musicId assignment.
idx = np.random.default_rng(42).permutation(len(df_train))
# .copy() makes the shuffled frame independent of the concatenated one before
# new columns are assigned to it.
df_train = df_train.iloc[idx].copy()
# Preserve the dataset's own per-emotion id ("<emotion>_<original id>") before
# musicId is overwritten with a global running index on the next line.
# NOTE: run this cell once per freshly built df_train; re-running it would
# derive NJU_idx from the already re-numbered musicId.
df_train['NJU_idx'] = df_train['emotion'] +'_'+ df_train['musicId'].astype(str)
df_train['musicId'] = np.arange(len(df_train))

In [73]:
# Shuffle the test rows with a fixed seed so that Restart & Run All reproduces
# the same row order and therefore the same musicId assignment.
idx = np.random.default_rng(42).permutation(len(df_test))
# .copy() makes the shuffled frame independent of the concatenated one before
# new columns are assigned to it.
df_test = df_test.iloc[idx].copy()
# Preserve the dataset's own per-emotion id ("<emotion>_<original id>") before
# musicId is overwritten below.
# NOTE: run this cell once per freshly built df_test; re-running it would
# derive NJU_idx from the already re-numbered musicId.
df_test['NJU_idx'] = df_test['emotion'] +'_'+ df_test['musicId'].astype(str)
# Test ids continue after the training ids so musicId is unique across both.
df_test['musicId'] = np.arange(len(df_test))+len(df_train)

In [75]:
# Write both re-indexed tables into the dataset folder using relative paths, so
# the notebook does not depend on one user's home directory and the later
# pd.read_csv('./NJU_MusicMood_v1.0/train_reindex.csv') finds the file written
# here.
df_train.to_csv('./NJU_MusicMood_v1.0/train_reindex.csv',index = False)
df_test.to_csv('./NJU_MusicMood_v1.0/test_reindex.csv',index = False)

In [26]:
# Save the combined train/test metadata tables as CSV (index column omitted).
df_train.to_csv('./NJU_MusicMood_v1.0/train_all.csv',index=False)
df_test.to_csv('./NJU_MusicMood_v1.0/test_all.csv',index=False)

In [50]:
def segment_info_list(segs,track_info):
    """Flatten audio-analysis segments into one row (list) per segment.

    Args:
        segs: Iterable of segment dicts with the keys ``start``, ``duration``,
            ``confidence``, ``loudness_start``, ``loudness_max``,
            ``loudness_max_time``, ``loudness_end``, ``pitches`` and
            ``timbre`` (the last two being sequences).
        track_info: Sequence whose first four items are repeated at the start
            of every row.

    Returns:
        List of rows: the four track items, the seven scalar segment fields in
        the order above, then all pitch values followed by all timbre values.
    """
    scalar_keys = (
        "start",
        "duration",
        "confidence",
        "loudness_start",
        "loudness_max",
        "loudness_max_time",
        "loudness_end",
    )
    return [
        [
            track_info[0], track_info[1], track_info[2], track_info[3],
            *(segment[key] for key in scalar_keys),
            *segment["pitches"],
            *segment["timbre"],
        ]
        for segment in segs
    ]
    
def get_audio_analysis(data):
    """Fetch the Spotify audio analysis of one track as per-segment rows.

    Uses the notebook-level ``sp`` client.

    Args:
        data: Row-like object with the attributes ``spotifyId`` and
            ``musicId``.

    Returns:
        The rows produced by ``segment_info_list``: each one starts with
        musicId, spotifyId, the track's ``analysis_sample_rate`` and
        ``analysis_channels``, followed by the segment fields.
    """
    track_id = data.spotifyId
    analysis = sp.audio_analysis(track_id)
    segments = analysis['segments']
    track_meta = analysis['track']
    # Track-level values that get repeated in front of every segment row.
    header = [
        data.musicId,
        track_id,
        track_meta['analysis_sample_rate'],
        track_meta['analysis_channels'],
    ]
    return segment_info_list(segments, header)

In [83]:
def get_segment_info(data,sp):
    """Collect the segment rows of every track in ``data`` into one flat list.

    Args:
        data: DataFrame whose rows are passed one by one to
            ``get_audio_analysis``.
        sp: Accepted for call compatibility but not used here;
            ``get_audio_analysis`` reads the notebook-level ``sp`` client.

    Returns:
        List of segment rows for all tracks, in row order of ``data``.
    """
    return [
        segment_row
        for _, track in data.iterrows()
        for segment_row in get_audio_analysis(track)
    ]

In [84]:
# Fetch the segment-level audio analysis for every test track
# (one audio_analysis request per row of df_test).
test_segments = get_segment_info(df_test,sp)

In [85]:
# One row per audio segment: 4 track-level fields, 7 segment scalars, then
# 12 pitch and 12 timbre columns (assumes every segment carries exactly 12
# pitch and 12 timbre values - otherwise the column count will not match).
df_test_segs = pd.DataFrame(test_segments, columns=['musicId','spotifyId','analysis_sample_rate','analysis_sample_channel',
'start','duration','confidence',
'loudness_start','loudness_max','loudness_max_time','loudness_end',
'pitches_0','pitches_1','pitches_2','pitches_3','pitches_4','pitches_5',
'pitches_6','pitches_7','pitches_8','pitches_9','pitches_10','pitches_11',
'timbre_0','timbre_1','timbre_2','timbre_3','timbre_4','timbre_5','timbre_6',
'timbre_7','timbre_8','timbre_9','timbre_10','timbre_11'])

In [65]:
# Fetch the segment-level audio analysis for every training track
# (one audio_analysis request per row of df_train).
train_segments = get_segment_info(df_train,sp)

In [66]:
# Column layout shared by both segment tables: 4 track-level fields,
# 7 segment scalars, then pitches_0..pitches_11 and timbre_0..timbre_11.
segment_columns = (
    ['musicId','spotifyId','analysis_sample_rate','analysis_sample_channel',
     'start','duration','confidence',
     'loudness_start','loudness_max','loudness_max_time','loudness_end']
    + [f'pitches_{k}' for k in range(12)]
    + [f'timbre_{k}' for k in range(12)]
)

df_test_segs = pd.DataFrame(test_segments, columns=segment_columns)

df_train_segs = pd.DataFrame(train_segments, columns=segment_columns)

In [87]:
# NOTE: df_test.head() is not the last expression of the cell, so its result
# is computed and discarded rather than displayed.
df_test.head()
# Save the test segment table as CSV (index column omitted).
df_test_segs.to_csv('./NJU_MusicMood_v1.0/test_segments.csv',index = False)

In [67]:
# Save both segment tables as CSV (index column omitted).
df_test_segs.to_csv('./NJU_MusicMood_v1.0/test_segments.csv',index = False)
df_train_segs.to_csv('./NJU_MusicMood_v1.0/train_segments.csv',index = False)

In [89]:
# Sanity check: number of tracks vs. number of distinct musicIds that have
# segment rows; the two numbers on each line should match.
print(len(df_train),len(df_train_segs.musicId.unique()))
print(len(df_test),len(df_test_segs.musicId.unique()))

328 328
320 320


In [10]:
def get_audio_features(spotifyID,sp):
    """Fetch a fixed set of track-level audio features for one Spotify track.

    Args:
        spotifyID: Spotify track id, passed unchanged to
            ``sp.audio_features``.
        sp: Client object exposing ``audio_features``; the first element of
            its return value is used.

    Returns:
        ``[spotifyID, key, mode, energy, danceability, loudness, tempo,
        valence]``.
    """
    wanted = ('key', "mode", "energy", "danceability", "loudness", "tempo", "valence")
    track_features = sp.audio_features(spotifyID)[0]
    return [spotifyID, *(track_features[name] for name in wanted)]

In [23]:
# Load the re-indexed train (df) and test (df2) metadata tables from CSV.
df = pd.read_csv('./NJU_MusicMood_v1.0/train_reindex.csv')
df2 = pd.read_csv('./NJU_MusicMood_v1.0/test_reindex.csv')

In [12]:
# Inspect which columns the re-indexed metadata table provides.
df.columns

Index(['musicId', 'spotifyId', 'name', 'artist', 'duration', 'emotion',
       'NJU_idx'],
      dtype='object')

In [20]:
# Fetch the track-level audio features of every training track, in row order.
# df.spotifyId[i] is a label lookup; it follows row order here because df was
# read with pd.read_csv and therefore has the default 0..n-1 index.
all_feat = []
for i in range(len(df)):
    feats = get_audio_features(df.spotifyId[i],sp)
    all_feat.append(feats)

In [24]:
# Append the test tracks' features after the training ones.
# NOTE: this extends all_feat in place, so re-running only this cell appends
# the test features a second time.
for i in range(len(df2)):
    feats = get_audio_features(df2.spotifyId[i],sp)
    all_feat.append(feats)

In [27]:
# Stack train metadata on top of test metadata - the same order in which the
# features were appended to all_feat.
df_total = pd.concat((df,df2),axis=0)

In [29]:
# Peek at the last three rows of the combined metadata table.
df_total.tail(3)

Unnamed: 0,musicId,spotifyId,name,artist,duration,emotion,NJU_idx
317,645,6iq27lWBX3zvaX1ank9M2V,Nothing Compares,Pixie Lott,217,Sad,Sad_28
318,646,5Om4eWWZh9j05pEQVdxQ8b,Sad News,Chris Garneau,364,Relaxed,Relaxed_39
319,647,3dKFxCpVtK3g9KAB8S1FwY,Never Know,Jack Johnson,212,Happy,Happy_34


In [36]:
# Build the per-track feature table in a single DataFrame construction instead
# of creating one single-row DataFrame per track and concatenating them.
# Each entry of all_feat is [spotifyId, key, mode, energy, danceability,
# loudness, tempo, valence]; musicId is the position of the entry in all_feat.
# As in the previous version of this cell, the last element (valence) is not
# written to the table - zip() stops after the seven listed columns.
feature_columns = ['spotifyId', 'key', 'mode', 'energy', 'danceability', 'loudness', 'tempo']
feature_records = [
    {'musicId': i, **dict(zip(feature_columns, feat))}
    for i, feat in enumerate(all_feat)
]
result = pd.DataFrame(feature_records, columns=['musicId', *feature_columns])

In [37]:
# Preview the first rows of the feature table.
result.head()

Unnamed: 0,musicId,spotifyId,key,mode,energy,danceability,loudness,tempo
0,0,7Kpv6QtVDSNBJYTgwUTOAu,10,1,0.448,0.634,-8.468,121.933
1,1,35SJCzSCA6RFY7VysNNN2i,1,0,0.575,0.65,-5.604,81.693
2,2,3vIVCdRx0jaxegLrtuGYvH,2,1,0.722,0.628,-5.693,118.004
3,3,4KacUpvbA3Mfo05gttTjhN,9,1,0.914,0.402,-5.215,196.505
4,4,0GgN4MhR5GKn5IcKN0e0rG,4,1,0.515,0.457,-4.31,74.984


In [39]:
# Sanity check before saving: the feature table must line up row-for-row with
# the metadata table. A failure stops the notebook with a descriptive message
# instead of printing a bare marker, and a length mismatch (e.g. all_feat was
# extended twice) is detected as well.
assert len(result) == len(df_total), (
    f"feature table has {len(result)} rows but metadata has {len(df_total)}"
)
mismatched = df_total['spotifyId'].to_numpy() != result['spotifyId'].to_numpy()
assert not mismatched.any(), (
    f"{int(mismatched.sum())} rows have a different spotifyId in df_total and result"
)

In [40]:
# Save the per-track Spotify feature table as CSV (index column omitted).
result.to_csv('./nju_spotify_features.csv',index=False)

In [41]:
# Reload the train and test segment tables from CSV.
train_seg = pd.read_csv('./NJU_MusicMood_v1.0/train_segments.csv')
test_seg = pd.read_csv('./NJU_MusicMood_v1.0/test_segments.csv')

In [42]:
# Stack the train segments on top of the test segments into one table.
dd = pd.concat((train_seg,test_seg),axis=0)

In [44]:
# Save the combined segment table as CSV (index column omitted).
dd.to_csv('./NJU_MusicMood_v1.0/nju_segment_features.csv',index=False)

In [54]:
def emotion_to_arousal(r):
    """Map a row's emotion label to its arousal class.

    Args:
        r: Object with an ``emotion`` attribute (e.g. a DataFrame row).

    Returns:
        ``'Low Arousal'`` for ``'Relaxed'`` and ``'Sad'``, otherwise
        ``'High Arousal'``.
    """
    low_arousal_emotions = ('Relaxed', 'Sad')
    return 'Low Arousal' if r.emotion in low_arousal_emotions else 'High Arousal'

def emotion_to_valence(r):
    """Map a row's emotion label to its valence class.

    Args:
        r: Object with an ``emotion`` attribute (e.g. a DataFrame row).

    Returns:
        ``'Low Valence'`` for ``'Angry'`` and ``'Sad'``, otherwise
        ``'High Valence'``.
    """
    low_valence_emotions = ('Angry', 'Sad')
    return 'Low Valence' if r.emotion in low_valence_emotions else 'High Valence'

In [49]:
# Work on a copy of the metadata and add the arousal class of every row.
df_total_final = df_total.assign(
    arousal=df_total.apply(emotion_to_arousal, axis=1)
)

In [55]:
# Add the valence class of every row, derived from its emotion label.
valence_labels = df_total_final.apply(emotion_to_valence, axis=1)
df_total_final['valence'] = valence_labels

In [59]:
# Save the metadata with arousal/valence labels as CSV (index column omitted).
df_total_final.to_csv('./nju_meta.csv',index=False)

In [61]:
# Class balance of the valence label.
df_total_final.valence.value_counts()

High Valence    365
Low Valence     283
Name: valence, dtype: int64

In [75]:
# One sub-frame per emotion label, rows kept in their df_total_final order.
df_happy = df_total_final.loc[df_total_final['emotion'] == 'Happy']
df_sad = df_total_final.loc[df_total_final['emotion'] == 'Sad']
df_angry = df_total_final.loc[df_total_final['emotion'] == 'Angry']
df_relaxed = df_total_final.loc[df_total_final['emotion'] == 'Relaxed']

In [85]:
def split_set(df, train_frac=0.8, test_frac=0.1):
    """Split a frame, in row order, into train / validation / test parts.

    The first ``int(len(df) * train_frac)`` rows form the training part, the
    last ``int(len(df) * test_frac)`` rows form the test part and everything
    in between is the validation part. No shuffling is done here.

    Args:
        df: DataFrame to split.
        train_frac: Fraction of rows for the training part (default 0.8).
        test_frac: Fraction of rows for the test part (default 0.1).

    Returns:
        Tuple ``(df_train, df_valid, df_test)`` of ``iloc`` slices of ``df``.

    Raises:
        ValueError: If a fraction is negative or the two fractions sum to
            more than 1.
    """
    if train_frac < 0 or test_frac < 0 or train_frac + test_frac > 1:
        raise ValueError(
            "train_frac and test_frac must be non-negative and sum to at most 1"
        )
    n_rows = len(df)
    train_len = int(n_rows * train_frac)
    test_len = int(n_rows * test_frac)
    # Slice the test part from an explicit start position: df.iloc[-test_len:]
    # would return the WHOLE frame when test_len is 0 (small inputs).
    test_start = n_rows - test_len
    df_train = df.iloc[:train_len]
    df_valid = df.iloc[train_len:test_start]
    df_test = df.iloc[test_start:]
    return df_train,df_valid,df_test

In [86]:
# Split each emotion's rows separately into train / validation / test parts.
train_happy,valid_happy,test_happy = split_set(df_happy)
train_sad,valid_sad,test_sad = split_set(df_sad)
train_relaxed,valid_relaxed,test_relaxed = split_set(df_relaxed)
train_angry,valid_angry,test_angry = split_set(df_angry)

In [87]:
# Recombine the per-emotion parts into the final train / validation / test sets.
train_final = pd.concat((train_happy,train_sad,train_angry,train_relaxed),axis = 0)
valid_final = pd.concat((valid_happy,valid_sad,valid_angry,valid_relaxed),axis = 0)
test_final = pd.concat((test_happy,test_sad,test_angry,test_relaxed),axis = 0)

In [None]:
# Order the rows of each split by musicId.
train_final = train_final.sort_values(by='musicId')
valid_final = valid_final.sort_values(by='musicId')
test_final = test_final.sort_values(by='musicId')

In [88]:
# Sanity check: total number of rows across the three splits.
len(train_final)+len(valid_final)+len(test_final)

648

In [89]:
# Save the final train / validation / test splits as CSV (index column omitted).
train_final.to_csv('NJU_MusicMood_v1.0/nju_train.csv',index = False)
valid_final.to_csv('NJU_MusicMood_v1.0/nju_valid.csv',index = False)
test_final.to_csv('NJU_MusicMood_v1.0/nju_test.csv',index = False)