In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the track table from CSV (comma-separated, which is read_csv's default).
data = pd.read_csv("dupremoved_trackdata.csv")

In [25]:
def min_max(logit):
    '''
    Rescale every column of a DataFrame linearly onto the interval 0-1.

    Args:
        logit: N x D pd.DataFrame
    Return:
        score: N x D pd.DataFrame. A normalised score in strict scale 0-1 for each dimension:
            each column's minimum maps to 0 and its maximum to 1. A constant
            column (max == min) maps to 0 everywhere instead of NaN.
    '''
    col_min = logit.min(axis=0)
    col_range = logit.max(axis=0) - col_min
    # A constant column has zero range, and dividing by it would give 0/0 = NaN.
    # Divide by 1 there instead, so the column becomes all zeros.
    col_range = col_range.where(col_range != 0, 1)
    return (logit - col_min) / col_range


In [20]:
def zscore(logit):
    '''
    Standardise every column of a DataFrame to zero mean and unit sample
    standard deviation (ddof = 1).

    Args:
        logit: N x D pd.DataFrame
    Return:
        score: N x D pd.DataFrame. A standardised score for each dimension.
            A column whose standard deviation is 0 maps to 0 everywhere
            instead of NaN.
    '''
    # Unlike min-max scaling, the result is not confined to 0-1: datapoints more
    # than 1 standard deviation away from the mean will be < -1 or > 1.
    col_std = logit.std(axis=0, ddof=1)
    # A column with no spread would give 0/0 = NaN; divide by 1 there instead.
    col_std = col_std.where(col_std != 0, 1)
    return (logit - logit.mean(axis=0)) / col_std


In [21]:
def softmax(logit):  # [5pts]
    """
    Row-wise softmax.

    Args:
        logit: N x D numpy array
    Return:
        prob: N x D numpy array. Each row is a probability distribution over
            the D dimensions (non-negative entries summing to 1).
    """
    # Subtracting each row's maximum leaves the softmax value unchanged while
    # keeping every exponent <= 0, so np.exp cannot overflow.
    shifted = logit - logit.max(axis=1, keepdims=True)
    exp_shifted = np.exp(shifted)
    return exp_shifted / exp_shifted.sum(axis=1, keepdims=True)

In [27]:
# Min-max normalise the numeric columns that are not on a 0-1 scale yet
# (presumably the remaining numeric columns already are - verify against the data).
# NOTE: the scaled values are written back into `data` itself, so every cell
# that runs after this one sees the min-max scaled columns, not the raw ones.
normalisation = data.loc[:, ['popularity', 'loudness', 'tempo', 'duration_ms']]
data_minmax = min_max(normalisation)
data[list(data_minmax.columns)] = data_minmax
data.to_csv("minmax_normalised_trackdata.csv", index=False)

      popularity  loudness     tempo  duration_ms
0       0.479167  0.641508  0.624447     0.294834
1       0.312500  0.700725  0.753462     0.133043
2       0.000000  0.796883  0.370000     0.160113
3       0.541667  0.762382  0.552128     0.127511
4       0.531250  0.734815  0.533389     0.280151
...          ...       ...       ...          ...
6095    0.656250  0.973955  0.601368     0.214842
6096    0.416667  0.728920  0.757912     0.080876
6097    0.645833  0.881421  0.555611     0.201860
6098    0.614583  0.836023  0.860113     0.321677
6099    0.000000  0.522566  0.809845     0.247221

[6100 rows x 4 columns]


In [37]:
# Show the column labels of `data`.
print(data.columns)


Index(['name', 'artist', 'album', 'popularity', 'danceability', 'energy',
       'key', 'loudness', 'mode', 'speechiness', 'acousticness',
       'instrumentalness', 'liveness', 'valence', 'tempo', 'id', 'duration_ms',
       'time_signature', 'genre'],
      dtype='object')


In [41]:
# Replace every column listed in index_set with its z-score and export the frame.
# NOTE: `data` is overwritten in place, so the cells below operate on the
# z-scored values rather than on what was loaded from disk.
index_set = [
    'popularity', 'danceability', 'energy', 'loudness', 'speechiness',
    'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
    'duration_ms',
]
data[index_set] = zscore(data.loc[:, index_set])
data.to_csv("zscore_normalised_trackdata.csv", index=False)

In [42]:
# Replace the index_set columns (list defined in the z-score cell) with their
# softmax and export the frame. softmax works row-wise, so the index_set values
# of each row end up summing to 1.
# NOTE: the input is whatever `data` currently holds, and `data` is overwritten
# in place - re-running this cell applies softmax a second time.
data[index_set] = softmax(data[index_set].to_numpy())
data.to_csv("softmax_normalised_trackdata.csv", index=False)