# Raw Data Getting Merged and Cleaned In order to start Feature Selection

In [72]:
import pandas as pd
import os
import glob

## Merge the data into one dataframe

In [73]:
folder_path = 'C:/Users/mlar5/OneDrive/Desktop/Code Folder/Python Projects/IRL projects/Aspire - Affective Computing Project/Playlists Data/Audio Features/emotions'
df_list = []
for foldername in os.listdir(folder_path):
    folder = os.path.join(folder_path, foldername)
    for file in glob.glob(os.path.join(folder, '*.csv')):
        csv_data = pd.read_csv(file)
        df_list.append(csv_data)
df = pd.concat(df_list)

In [74]:
df.shape

(10655, 22)

In [75]:
#change the affect column to mood
df.rename(columns={'affect':'mood'}, inplace=True)

In [76]:
df.head(1)

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,...,id,uri,track_href,analysis_url,duration_ms,time_signature,song,playlist,mood,genre
0,0.548,0.833,7,-3.294,1,0.0497,0.199,0.0,0.305,0.467,...,6dDIOU6AqgLkrjFNPgvKY6,spotify:track:6dDIOU6AqgLkrjFNPgvKY6,https://api.spotify.com/v1/tracks/6dDIOU6AqgLk...,https://api.spotify.com/v1/audio-analysis/6dDI...,229693,3,FU (feat. French Montana),angry break up songs to scream to.,angry,pop


## Check for duplicate uris in the data by mood

In [77]:
#for each mood, check for duplicate uri's
for mood in df.mood.unique():
    print(mood)
    print(df[df.mood==mood].uri.duplicated().sum())

angry
13
calm
19
content
6
depressed
392
energetic
1
excited
57
happy
81
sad
40


In [78]:
#drop the duplicates that have the same uri and mood
df.drop_duplicates(subset=['uri','mood'], inplace=True)
# determine the size value count for each mood
df.mood.value_counts()

depressed    4185
sad          1319
calm         1304
content       955
excited       855
energetic     646
happy         481
angry         301
Name: mood, dtype: int64

## Apply Label Encoding to the affect label

In [79]:
df['mood'] = df['mood'].astype('category')

## Check for intersection of Uri's in multiple moods

In [80]:
df.uri.duplicated().sum()

638

In [81]:
638*8

5104

In [82]:
#From the uri duplicates, determine which ones are the same song but different moods
uriDup=df[df.uri.duplicated(keep=False)].sort_values(by='uri')
print(uriDup.shape)
#determine all the songs that are in multiple moods, and return a list of all the mood combinations, and the number of times they occur
uriFreqs =uriDup.uri.value_counts()

(1209, 22)


In [83]:
#make a dataframe of the rows containing the uri's that are present in 3 or more rows of uriDup
uriManyDup = uriDup[uriDup.uri.isin(uriFreqs[uriFreqs>2].index.get_level_values(0))]
uriManyDup.shape

(187, 22)

In [84]:
#remove the rows in uriDup2 from uriDup
uriDup = uriDup[~uriDup.uri.isin(uriManyDup.uri)]

In [85]:
uriDup.shape

(1022, 22)

## Remove Intersection songs from the data

In [86]:
print(df.shape)
#remove the rows in uriDup from df
df = df[~df.uri.isin(uriDup.uri)]
#remove the rows in uriDup2 from df
df = df[~df.uri.isin(uriManyDup.uri)]
df.shape

(10046, 22)


(8837, 22)

## Analyze the overlapping data (did not include in the final dataset yet)

### Duplicates that only occur in two moods

In [17]:
uriDup.head()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,...,id,uri,track_href,analysis_url,duration_ms,time_signature,song,playlist,mood,genre
36,0.686,0.574,8,-8.868,1,0.0272,0.0706,0.0647,0.0894,0.545,...,01FnTmSe5zIE0LmqPISajN,spotify:track:01FnTmSe5zIE0LmqPISajN,https://api.spotify.com/v1/tracks/01FnTmSe5zIE...,https://api.spotify.com/v1/audio-analysis/01Fn...,232195,4,Imaginary Friend,Happy Vibes 2023 ☀️,happy,
127,0.686,0.574,8,-8.868,1,0.0272,0.0706,0.0647,0.0894,0.545,...,01FnTmSe5zIE0LmqPISajN,spotify:track:01FnTmSe5zIE0LmqPISajN,https://api.spotify.com/v1/tracks/01FnTmSe5zIE...,https://api.spotify.com/v1/audio-analysis/01Fn...,232195,4,Imaginary Friend,Chill Vibes 2023 🌙,content,pop
78,0.414,0.416,11,-8.024,0,0.0251,0.1,0.179,0.131,0.196,...,01TnVDiet1DFTsyWKUKovl,spotify:track:01TnVDiet1DFTsyWKUKovl,https://api.spotify.com/v1/tracks/01TnVDiet1DF...,https://api.spotify.com/v1/audio-analysis/01Tn...,186268,4,NUMB,sad rap vibes 2023,sad,rap
168,0.414,0.416,11,-8.024,0,0.0251,0.1,0.179,0.131,0.196,...,01TnVDiet1DFTsyWKUKovl,spotify:track:01TnVDiet1DFTsyWKUKovl,https://api.spotify.com/v1/tracks/01TnVDiet1DF...,https://api.spotify.com/v1/audio-analysis/01Tn...,186268,4,NUMB,depressed as fuck,depressed,rap
73,0.716,0.748,8,-6.06,1,0.0356,0.235,0.849,0.114,0.338,...,01kfSdF9zfcDLri5sSWEoL,spotify:track:01kfSdF9zfcDLri5sSWEoL,https://api.spotify.com/v1/tracks/01kfSdF9zfcD...,https://api.spotify.com/v1/audio-analysis/01kf...,169245,4,RAVE,AGRESSIVE PHONK,energetic,EDM


In [42]:
#make a datafram of only the uri and mood columns of uriDup
uriDupMood = uriDup[['uri','mood']]

#map the mood values to numbers
affect = {'sad':0,'angry':1,'energetic':2,'excited':3,'happy':4,'content':5,'calm':6,'depressed':7}
uriDupMood['mood'] = uriDupMood['mood'].map(affect)
uriDupMood.head()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  uriDupMood['mood'] = uriDupMood['mood'].map(affect)


Unnamed: 0,uri,mood
36,spotify:track:01FnTmSe5zIE0LmqPISajN,4
127,spotify:track:01FnTmSe5zIE0LmqPISajN,5
78,spotify:track:01TnVDiet1DFTsyWKUKovl,0
168,spotify:track:01TnVDiet1DFTsyWKUKovl,7
73,spotify:track:01kfSdF9zfcDLri5sSWEoL,2


In [47]:
moodsToCompare= list(uriDup.mood.cat.codes)

In [70]:
# for each pair of numbers in the list, check if the first number is a diference of -1 or 1 from the second number or if the two numbers are 7 and 0 or 0 and 7
# if the condition is not met, print the index of the pair of numbers
count =0
for i in range(0,len(moodsToCompare)-1,2):
    if (moodsToCompare[i] - moodsToCompare[i+1] == -1) or (moodsToCompare[i] - moodsToCompare[i+1] == 1)  or (moodsToCompare[i] == 7 and moodsToCompare[i+1] == 0) or (moodsToCompare[i] == 0 and moodsToCompare[i+1] == 7):
        continue
    else:
        count+=1
count/len(moodsToCompare)

0.19765166340508805

### Duplicates that occur in three or more moods

## Export the data to a csv file

### Removing unnecessary columns before feature selection

In [87]:
df.head()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,...,id,uri,track_href,analysis_url,duration_ms,time_signature,song,playlist,mood,genre
0,0.548,0.833,7,-3.294,1,0.0497,0.199,0.0,0.305,0.467,...,6dDIOU6AqgLkrjFNPgvKY6,spotify:track:6dDIOU6AqgLkrjFNPgvKY6,https://api.spotify.com/v1/tracks/6dDIOU6AqgLk...,https://api.spotify.com/v1/audio-analysis/6dDI...,229693,3,FU (feat. French Montana),angry break up songs to scream to.,angry,pop
1,0.556,0.661,6,-5.052,0,0.204,0.3,0.0,0.101,0.668,...,6PERP62TejQjgHu81OHxgM,spotify:track:6PERP62TejQjgHu81OHxgM,https://api.spotify.com/v1/tracks/6PERP62TejQj...,https://api.spotify.com/v1/audio-analysis/6PER...,178148,4,good 4 u,angry break up songs to scream to.,angry,pop
2,0.816,0.677,11,-4.057,0,0.197,0.358,0.0,0.165,0.571,...,7A9rdAz2M6AjRwOa34jxIP,spotify:track:7A9rdAz2M6AjRwOa34jxIP,https://api.spotify.com/v1/tracks/7A9rdAz2M6Aj...,https://api.spotify.com/v1/audio-analysis/7A9r...,172747,4,Ur So F**kInG cOoL,angry break up songs to scream to.,angry,pop
3,0.528,0.831,8,-3.607,1,0.0429,9.4e-05,0.0,0.141,0.632,...,1JdKrFyoU05abww0Zv0ayQ,spotify:track:1JdKrFyoU05abww0Zv0ayQ,https://api.spotify.com/v1/tracks/1JdKrFyoU05a...,https://api.spotify.com/v1/audio-analysis/1JdK...,220480,3,That's What You Get,angry break up songs to scream to.,angry,pop
4,0.706,0.544,2,-7.275,1,0.0306,0.0273,0.0,0.087,0.477,...,4ka1FkKAMde6dQAFFMXKac,spotify:track:4ka1FkKAMde6dQAFFMXKac,https://api.spotify.com/v1/tracks/4ka1FkKAMde6...,https://api.spotify.com/v1/audio-analysis/4ka1...,177583,4,i hope ur miserable until ur dead,angry break up songs to scream to.,angry,pop


In [88]:
#drop analysis_url, track_href, id, and playlist columns
df.drop(columns=['analysis_url','track_href','id','playlist'], inplace=True)

In [90]:
df.drop(columns=['type'], inplace=True)

In [91]:
df.head()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,uri,duration_ms,time_signature,song,mood,genre
0,0.548,0.833,7,-3.294,1,0.0497,0.199,0.0,0.305,0.467,190.029,spotify:track:6dDIOU6AqgLkrjFNPgvKY6,229693,3,FU (feat. French Montana),angry,pop
1,0.556,0.661,6,-5.052,0,0.204,0.3,0.0,0.101,0.668,168.56,spotify:track:6PERP62TejQjgHu81OHxgM,178148,4,good 4 u,angry,pop
2,0.816,0.677,11,-4.057,0,0.197,0.358,0.0,0.165,0.571,78.979,spotify:track:7A9rdAz2M6AjRwOa34jxIP,172747,4,Ur So F**kInG cOoL,angry,pop
3,0.528,0.831,8,-3.607,1,0.0429,9.4e-05,0.0,0.141,0.632,131.094,spotify:track:1JdKrFyoU05abww0Zv0ayQ,220480,3,That's What You Get,angry,pop
4,0.706,0.544,2,-7.275,1,0.0306,0.0273,0.0,0.087,0.477,115.935,spotify:track:4ka1FkKAMde6dQAFFMXKac,177583,4,i hope ur miserable until ur dead,angry,pop


### Exporting the data to a csv file

In [99]:
df.to_csv('Merged Emotions Data.csv', index=False)