In [41]:
import pandas as pd

In [65]:
# Survey Data Cleaning

# Load the raw survey responses and keep each row's position as a
# respondent id (the index is appended as the last column).
survey = pd.read_csv('data/survey.csv')
survey['id'] = survey.index

# Columns are renamed purely by position, so the assignment below raises
# a ValueError if the frame does not have exactly this many columns.
_profile_cols = ['timestamp', 'gender', 'age', 'amount_music']
_health_cols = ['life_enjoyment', 'resilience', 'balanced_life',
                'emotional_flex', 'self_actualization']
_song_cols = ['song1', 'song2', 'song3']
survey.columns = _profile_cols + _health_cols + _song_cols + ['trauma', 'id']

# total_health is the plain sum of the five health columns, accumulated
# left to right.
_health_sum = survey[_health_cols[0]]
for _col in _health_cols[1:]:
    _health_sum = _health_sum + survey[_col]
survey['total_health'] = _health_sum

# Clean song strings for sending to Spotify API
def _clean_song_query(songs):
    """Strip separator noise from a Series of free-text "song by artist" answers.

    Removes the standalone word ``by``, hyphens and commas, then collapses
    any run of spaces to a single space. Non-string entries (e.g. NaN) are
    passed through unchanged by the ``.str`` accessor.

    Parameters
    ----------
    songs : pandas.Series
        Series of song strings.

    Returns
    -------
    pandas.Series
        A new Series with the cleaned strings; the input is not modified.
    """
    return (
        songs
        # Word boundaries keep titles such as "Baby" or "Goodbye" intact;
        # a plain substring replace would turn them into "Ba" / "Goode".
        .str.replace(r'\bby\b', '', regex=True)
        # regex=False makes the literal match explicit, independent of the
        # pandas version's default for the `regex` argument.
        .str.replace('-', '', regex=False)
        .str.replace(',', '', regex=False)
        # Collapse runs of any length; a single '  ' -> ' ' pass leaves
        # double spaces behind when three or more were adjacent.
        .str.replace(r' {2,}', ' ', regex=True)
    )


for _song_col in ['song1', 'song2', 'song3']:
    survey[_song_col] = _clean_song_query(survey[_song_col])

# Arrange songs by ID for Spotify API
# Build one (id, song) record per answer in a single pass, then construct
# the frame once. DataFrame.append was removed in pandas 2.0, and growing a
# frame row by row copies it on every iteration.
# Row order is the same as before: for each respondent, song1, song2, song3.
_song_records = [
    {'id': respondent_id, 'songs': song}
    for respondent_id, *respondent_songs in
    survey[['id', 'song1', 'song2', 'song3']].itertuples(index=False, name=None)
    for song in respondent_songs
]

# Passing `columns` keeps the header even when there are no survey rows.
songs_only = pd.DataFrame(_song_records, columns=['id', 'songs'])

songs_only.to_csv('data/songs_only.csv', index=False)

## Retrieve audio features using spotipy.py

# Cleaning audio features from Spotify API output

song_metrics = pd.read_csv('data/song_metrics.csv')

# Average every numeric audio feature per respondent id.
# numeric_only=True is explicit because pandas >= 2.0 raises a TypeError when
# mean() meets a non-numeric column, whereas older versions dropped such
# columns silently. NOTE(review): assumes any text columns in
# song_metrics.csv (e.g. song names) are not needed downstream - confirm.
avg_metrics = song_metrics.groupby('id').mean(numeric_only=True)

# pd.merge defaults to an inner join, so respondents without any row in
# song_metrics are dropped from `master`.
master = pd.merge(survey, avg_metrics, on='id')

# 'Unnamed: 0' is presumably the index column written when song_metrics.csv
# was saved; errors='ignore' keeps this cell working if the file was written
# without it.
master = master.drop(columns=['Unnamed: 0'], errors='ignore')

# Creating categorical variable for mental health
# Scores above 15 are labelled 1 and scores of 15 or below are labelled 0.
# Rows matching neither mask (total_health is NaN) are left unassigned.
_above_cutoff = master['total_health'] > 15
_at_or_below_cutoff = master['total_health'] <= 15
master.loc[_above_cutoff, 'health_categorical'] = 1
master.loc[_at_or_below_cutoff, 'health_categorical'] = 0

master.to_csv('data/master.csv', index=False)

# master_categorical is a second name for the same DataFrame object,
# not a copy; the bare expression below displays it as the cell output.
master_categorical = master
master_categorical

Unnamed: 0,timestamp,gender,age,amount_music,life_enjoyment,resilience,balanced_life,emotional_flex,self_actualization,song1,...,total_health,energy,dance,liveness,valence,tempo,instrumental,acoustic,popularity,health_categorical
0,05/11/2016 14:34:00,Male,18 - 30,1 - 2,4,3,5,3,6,Too Good Drake,...,21,0.633000,0.655000,0.110000,0.571500,98.368000,2.535000e-05,0.281600,71.500000,1.0
1,05/11/2016 14:45:26,Male,18 - 30,1 - 2,6,6,3,5,5,Put that on my set asap rocky ft skepta,...,25,0.655000,0.526000,0.116000,0.156000,121.127000,0.000000e+00,0.056600,65.000000,1.0
2,05/11/2016 14:45:41,Female,18 - 30,0 - 1,5,5,4,4,5,Fireproof Coleman Hell,...,23,0.745500,0.742000,0.183500,0.618500,125.009000,2.305000e-05,0.053800,63.500000,1.0
3,05/11/2016 14:45:55,Male,18 - 30,2+,3,6,5,5,5,Eyes Nose Lips TaeYang,...,24,0.739000,0.681667,0.148900,0.596000,113.576000,1.280000e-06,0.348867,60.333333,1.0
4,05/11/2016 14:49:15,Female,18 - 30,0 - 1,2,5,2,1,5,Sugar Wanderlust,...,15,0.539000,0.513000,0.215500,0.291500,122.460500,4.643500e-01,0.599500,3.000000,0.0
5,05/11/2016 14:50:21,Male,18 - 30,0 - 1,3,4,4,5,6,Look on Up Relient K,...,22,0.740667,0.511667,0.114500,0.416000,149.958000,3.223000e-05,0.090370,43.666667,1.0
6,05/11/2016 14:51:28,Male,18 - 30,1 - 2,4,5,2,5,4,Timmy Turner Desiigner,...,20,0.641000,0.692000,0.188867,0.393667,137.599333,9.554500e-04,0.107807,82.000000,1.0
7,05/11/2016 14:52:00,Female,18 - 30,1 - 2,4,4,4,4,5,SelfControl Frank Ocean,...,21,0.496000,0.647000,0.211500,0.422000,115.527500,0.000000e+00,0.822500,72.500000,1.0
8,05/11/2016 14:54:51,Male,18 - 30,2+,3,2,4,3,1,Never say never basement jaxx,...,13,0.549333,0.713333,0.174267,0.365333,111.394333,3.705400e-02,0.235263,58.666667,0.0
9,05/11/2016 14:58:37,Male,18 - 30,2+,2,5,3,5,5,Kids in the Dark,...,20,0.558000,0.515000,0.130433,0.509000,152.126333,7.783333e-04,0.428787,74.000000,1.0
