# Generating Lyrics Features

### Imports

In [1]:
import pandas as pd
import numpy as np

In [17]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from nltk.stem import WordNetLemmatizer
import string
from git import Repo
from pathlib import Path
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA



### Loading data

In [3]:
# Read the pre-split train / test / validation tables, one CSV file per split.
data, data_test, data_val = [
    pd.read_csv(csv_path, delimiter=',')
    for csv_path in (
        'merged_cleaned_sentiment_train.csv',
        'merged_cleaned_sentiment_test.csv',
        'merged_cleaned_sentiment_validation.csv',
    )
]


### Vader Sentiment

In [4]:
# The same column selection is applied to every split: the four sentiment
# score columns are the model inputs, valence/arousal are the targets.
vader_columns = ['pos', 'neg', 'neu', 'compound']
target_columns = ['y_valence', 'y_arousal']

data_features, data_labels = data[vader_columns], data[target_columns]
data_features_test, data_labels_test = data_test[vader_columns], data_test[target_columns]
data_features_val, data_labels_val = data_val[vader_columns], data_val[target_columns]

In [5]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

# Convert each split's feature and label frames into plain NumPy arrays.
X_train_vader = np.array(data_features)
y_train = np.array(data_labels)

X_test_vader = np.array(data_features_test)
y_test = np.array(data_labels_test)

X_val_vader = np.array(data_features_val)
y_val = np.array(data_labels_val)

In [12]:
# Baseline: linear regression from the four sentiment scores to
# (valence, arousal), evaluated on the validation split.
reg = LinearRegression()
reg.fit(X_train_vader, y_train)
prediction = reg.predict(X_val_vader)

# RMSE per target, averaged over the targets. This is the value that
# `mean_squared_error(..., squared=False)` used to return, computed without the
# `squared` argument (deprecated in scikit-learn 1.4, removed in 1.6).
per_target_mse = mean_squared_error(y_val, prediction, multioutput='raw_values')
print(np.sqrt(per_target_mse).mean())
print(r2_score(y_val, prediction))

0.9816597264467664
0.022844544146133294


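This is a very high RMSE considering that our target values range from -1 to 1: at about 0.98, the RMSE is roughly half of that range. The R² of about 0.02 likewise shows that the VADER scores alone explain almost none of the variance.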

### TF-IDF Features

In [13]:
# Pull the cleaned lyrics text out of each split as a NumPy array.
data_lyrics_train, data_lyrics_test, data_lyrics_val = (
    np.array(frame['lyrics_cleaned']) for frame in (data, data_test, data_val)
)

#### Cleaning up the lyrics texts

In [14]:
def preprocessing(data):
    """Normalise every lyrics string in ``data`` and return the cleaned copies.

    Each entry is split on whitespace; every token is lower-cased, stripped of
    leading and trailing ``string.punctuation`` characters and passed through
    ``WordNetLemmatizer.lemmatize``. The processed tokens are then re-joined
    with single spaces.

    Parameters
    ----------
    data : numpy.ndarray
        Array of lyrics strings; entries are visited in iteration order.

    Returns
    -------
    numpy.ndarray
        Object array with the same shape as ``data`` holding the cleaned text.
    """
    lemmatizer = WordNetLemmatizer()
    clean_data = np.full(data.shape, None)

    for position, text in enumerate(data):
        # Tokens emptied by the punctuation strip are kept, so the joined
        # string can contain consecutive spaces.
        tokens = (token.lower().strip(string.punctuation) for token in text.split())
        clean_data[position] = ' '.join(lemmatizer.lemmatize(token) for token in tokens)

    return clean_data
        

In [15]:
# Run the same text normalisation over all three splits.
clean_train, clean_test, clean_val = map(
    preprocessing, (data_lyrics_train, data_lyrics_test, data_lyrics_val)
)

#### Generating TF-IDF Counts

In [18]:
# Fit the unigram TF-IDF vectorizer on the training lyrics only, then reuse the
# fitted vectorizer to transform the held-out splits.
vectorizer = TfidfVectorizer(ngram_range=(1, 1), analyzer='word')
X_train_tfidf = vectorizer.fit_transform(clean_train)
X_test_tfidf, X_val_tfidf = (
    vectorizer.transform(documents) for documents in (clean_test, clean_val)
)

### PCA

In [62]:
# Reduce the TF-IDF vectors to 100 principal components. The projection is
# fitted on the training split only and then applied to every split.

# Densify the training matrix once and reuse it for both fit and transform
# (it was previously densified twice).
tfidf_train_dense = X_train_tfidf.toarray().astype(float)

# A fixed random_state keeps the components reproducible between runs when
# scikit-learn picks a randomized SVD solver for an input of this size.
pca = PCA(n_components=100, random_state=0)
pca.fit(tfidf_train_dense)
pca_lyrics_train = pca.transform(tfidf_train_dense)
pca_lyrics_test = pca.transform(X_test_tfidf.toarray().astype(float))
pca_lyrics_val = pca.transform(X_val_tfidf.toarray().astype(float))
print('Explained variance:', np.sum(pca.explained_variance_ratio_))

Explained variance: 0.26271753604112447


### ANEW Count Features - lexical labels

In [20]:
#Repo.clone_from('https://github.com/JULIELab/X-ANEW.git', Path.home()/'Documents/GitHub/CogSci2-Spotify/anew')

GitCommandError: Cmd('git') failed due to: exit code(128)
  cmdline: git clone -v https://github.com/JULIELab/X-ANEW.git /Users/yananikolova/Documents/GitHub/CogSci2-Spotify/anew
  stderr: 'fatal: destination path '/Users/yananikolova/Documents/GitHub/CogSci2-Spotify/anew' already exists and is not an empty directory.
'

In [21]:
# Word-level ratings table, read from the local `anew/` directory
# (path is relative to the notebook's working directory).
xanew = pd.read_csv('anew/Ratings_Warriner_et_al.csv', sep=',')

In [22]:
# Keep only the word and its two rating columns, give them short names and
# index the table by word.
xanew = (
    xanew[['Word', 'V.Mean.Sum', 'A.Mean.Sum']]
    .rename(columns={'Word': 'word', 'V.Mean.Sum': 'valence', 'A.Mean.Sum': 'arousal'})
    .set_index('word')
)

In [24]:
# Lexicon words (as strings) define a fixed vocabulary, so the count matrices
# have one column per lexicon entry, in lexicon order.
ewords = list(map(str, xanew.index))

# Ratings transposed to one row per rating column (valence first, then
# arousal), the layout `get_anew` indexes into.
xanew_t = np.array(xanew.T)

# Note: this rebinds `vectorizer`, which previously held the TF-IDF vectorizer.
vectorizer = CountVectorizer(vocabulary=ewords)
X_train_anew, X_test_anew, X_val_anew = (
    vectorizer.transform(documents) for documents in (clean_train, clean_test, clean_val)
)


In [25]:
def get_anew(arr, ratings=None):
    """Weight per-document lexicon word counts by the words' ratings.

    Parameters
    ----------
    arr : sparse matrix
        Count matrix with one row per document and one column per lexicon
        word; only its ``toarray()`` method is used.
    ratings : array-like, optional
        Two rows (valence first, arousal second), each holding one rating per
        column of ``arr``. Defaults to the module-level ``xanew_t``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(valence, arousal)``: two dense arrays shaped like ``arr`` in which
        every count is multiplied by the corresponding word rating.
    """
    if ratings is None:
        ratings = xanew_t
    ratings = np.asarray(ratings)

    # Densify once; broadcasting multiplies every row by the rating vector,
    # replacing the per-row Python callbacks of np.apply_along_axis.
    counts = arr.toarray()
    return counts * ratings[0], counts * ratings[1]



In [26]:
# Rating-weighted count matrices (valence, arousal) for the training split.
X_train_valence, X_train_arousal = get_anew(X_train_anew)

#### PCA on ANEW vectors

In [27]:

# Separate 100-component PCAs for the valence- and arousal-weighted count
# matrices, both fitted on the training split.
# A fixed random_state keeps the components reproducible between runs when
# scikit-learn picks a randomized SVD solver for an input of this size.
pca_v = PCA(n_components=100, random_state=0)
pca_a = PCA(n_components=100, random_state=0)
pca_v.fit(X_train_valence)
pca_a.fit(X_train_arousal)
pca_v_train = pca_v.transform(X_train_valence)
pca_a_train = pca_a.transform(X_train_arousal)
# First line printed is the valence PCA, second the arousal PCA.
print('Explained variance:', np.sum(pca_v.explained_variance_ratio_))
print('Explained variance:', np.sum(pca_a.explained_variance_ratio_))



Explained variance: 0.6821856759845101
Explained variance: 0.617935732258481


In [31]:
# Training ANEW block: valence components followed by arousal components.
pca_anew = np.concatenate([pca_v_train, pca_a_train], axis=1)

In [63]:
# Apply the rating weighting and the training-fitted PCAs to the held-out splits.
X_val_valence, X_val_arousal = get_anew(X_val_anew)
X_test_valence, X_test_arousal = get_anew(X_test_anew)
pca_v_test = pca_v.transform(X_test_valence)
pca_a_test = pca_a.transform(X_test_arousal)
pca_v_val = pca_v.transform(X_val_valence)
# Fixed: the arousal vectors must be projected with the arousal PCA. This line
# previously used `pca_v`, so the validation arousal features lived in a
# different basis than the train/test ones.
pca_a_val = pca_a.transform(X_val_arousal)


In [64]:
# Held-out ANEW blocks: valence components followed by arousal components.
pca_anew_val = np.concatenate([pca_v_val, pca_a_val], axis=1)
pca_anew_test = np.concatenate([pca_v_test, pca_a_test], axis=1)

In [65]:
# Full lyrics feature matrices per split, stacked column-wise in the order
# sentiment scores, TF-IDF components, ANEW components.
X_train_all = np.concatenate((X_train_vader, pca_lyrics_train, pca_anew), axis=1)
X_test_all = np.concatenate((X_test_vader, pca_lyrics_test, pca_anew_test), axis=1)
X_val_all = np.concatenate((X_val_vader, pca_lyrics_val, pca_anew_val), axis=1)


In [59]:
# Column names for the combined feature matrix: the four sentiment scores,
# then 100 TF-IDF component names and 200 ANEW component names, numbered from 1.
features_index = (
    ['pos', 'neg', 'neu', 'compound']
    + [f'tfidf_pca_{i}' for i in range(1, 101)]
    + [f'anew_pca_{n}' for n in range(1, 201)]
)


In [66]:
# Wrap each split's feature matrix in a DataFrame with the shared column names.
train_df, test_df, val_df = (
    pd.DataFrame(matrix, columns=features_index)
    for matrix in (X_train_all, X_test_all, X_val_all)
)

In [67]:
# Persist the lyrics features, one CSV per split. The row index is written too,
# so it comes back as an extra unnamed first column when the files are re-read.
for features_frame, csv_path in (
    (train_df, 'lyrics_features_train.csv'),
    (test_df, 'lyrics_features_test.csv'),
    (val_df, 'lyrics_features_val.csv'),
):
    features_frame.to_csv(csv_path, sep=',')




In [32]:
# Reload the saved lyrics features and the merged track tables, so this cell
# can be run from the CSV files alone.
lyrics_train = pd.read_csv('lyrics_features_train.csv', delimiter=',')
lyrics_test = pd.read_csv('lyrics_features_test.csv', delimiter=',')
lyrics_val = pd.read_csv('lyrics_features_val.csv', delimiter=',')

data = pd.read_csv('merged_cleaned_sentiment_train.csv', delimiter=',')
data_val = pd.read_csv('merged_cleaned_sentiment_validation.csv', delimiter=',')
data_test = pd.read_csv('merged_cleaned_sentiment_test.csv', delimiter=',')

# One-hot encode `key` against the levels present in the training split.
# Encoding each split on its own (as before) produces different dummy columns
# whenever a split is missing one of the keys, which misaligns the feature
# matrices between splits. A key value unseen in training becomes an all-zero row.
key_dtype = pd.CategoricalDtype(categories=sorted(data['key'].dropna().unique()))


def _add_key_dummies(frame, dtype):
    """Return `frame` with `key_*` indicator columns appended (first level dropped)."""
    dummies = pd.get_dummies(frame['key'].astype(dtype), drop_first=True, prefix='key')
    return pd.concat([frame, dummies], axis=1)


data = _add_key_dummies(data, key_dtype)
data_val = _add_key_dummies(data_val, key_dtype)
data_test = _add_key_dummies(data_test, key_dtype)

# Join track data and lyrics features column-wise (rows are matched on the
# default integer index from read_csv) and discard rows with any missing value.
data_multi = pd.concat([data, lyrics_train], axis=1).dropna(axis=0)
data_multi_val = pd.concat([data_val, lyrics_val], axis=1).dropna(axis=0)
data_multi_test = pd.concat([data_test, lyrics_test], axis=1).dropna(axis=0)

# Targets are taken after dropna so they stay aligned with the feature rows.
data_labels = data_multi[['y_valence', 'y_arousal']]
data_labels_val = data_multi_val[['y_valence', 'y_arousal']]
data_labels_test = data_multi_test[['y_valence', 'y_arousal']]

# Columns excluded from the model inputs.
# NOTE(review): the concatenated frames carry duplicate labels, and `drop`
# removes every column with a listed label. The lyrics tables' copies of
# 'pos'/'neg'/'neu'/'compound' are therefore dropped too, leaving no sentiment
# scores in X -- confirm this is intended.
# NOTE(review): the raw `key` column is not listed, so it stays in X next to
# its indicator columns -- confirm this is intended.
non_feature_columns = [
    'Unnamed: 0', 'artist', 'trackname', 'id', 'lyrics', 'lyrics_cleaned',
    'neg', 'neu', 'pos', 'compound', 'y_valence', 'y_arousal',
]
data_multi = data_multi.drop(columns=non_feature_columns)
data_multi_val = data_multi_val.drop(columns=non_feature_columns)
data_multi_test = data_multi_test.drop(columns=non_feature_columns)

# Final float32 design matrices and targets.
X_train = data_multi.to_numpy().astype(np.float32)
X_test = data_multi_test.to_numpy().astype(np.float32)
X_val = data_multi_val.to_numpy().astype(np.float32)
y_train = data_labels.to_numpy().astype(np.float32)
y_test = data_labels_test.to_numpy().astype(np.float32)
y_val = data_labels_val.to_numpy().astype(np.float32)


