In [9]:
import numpy as np
import pandas as pd
from English_to_IPA import conversion
import re
import os
import pprint
import matplotlib.pyplot as plt
from num2words import num2words
import gensim
import multiprocessing

## Construct Phonetic Dictionary

In [10]:
eSPEDict = dict()
with open('eSPEPhonologicalTableV2') as openFile:
    for line in openFile.readlines():
        line = line.strip().split('\t')
        eSPEDict[line[0]] = np.array([eval(x) for x in line[1:]])

## Calculate score of energy

In [12]:
def split_lyric(lyrics):   
    splitted_lyrics = re.sub(' +', ' ', clean_lyric(lyrics)).split(' ')
    return [ word.lower() for word in splitted_lyrics if word != '']

def clean_lyric(lyric):
    punct_str = '!"#$%&\()*+,-./:;<=>/?@[\\]^_`{|}~«»\n'
    for p in punct_str:
        lyric = lyric.replace(p,' ')
    return lyric    
    
def is_number(word):
    try:
        int(word)
        return True
    except ValueError:
        pass
    return False

def change_number_to_string(number):
    return num2words(number)

In [13]:
def get_score_from_lyric(lyrics):
    word_count = 0
    score = np.zeros((8,), dtype=np.int)
    words = split_lyric(lyrics)
    for word in words:
        if word == '':
            continue
        if is_number(word):
            word = change_number_to_string(int(word))   
        cmu, ipa = conversion.convert(word)
        ipa = re.sub('[ˌˈ ]' ,'' ,ipa)
        if '*' in ipa:
            continue
        word_count += 1
        i = 0
        while i < len(ipa):
            if i == len(ipa)-1:
                sym = ipa[i]
                score += eSPEDict[sym]
                i += 1
            else:
                try:
                    sym = ipa[i] + ipa[i+1]
                    eSPEDict[sym]
                    i += 2
                except KeyError:
                    sym = ipa
                    eSPEDict[sym]
                    i += 1
    score = np.append(score, word_count)
    return score

In [14]:
class LyricsIterator(object):
    def __init__(self, dataset_name):
        self.dataset_name = dataset_name
    def __iter__(self):
        lyrics_data = pd.read_csv(self.dataset_name)
        for lyric in lyrics_data["text"]:
            yield split_lyric(lyric)
            

In [16]:
#lyrics_data = pd.read_csv("LyricsFreak.csv")
#energy_data = pd.read_csv("EnergyScores.csv")
#lyrics_test = lyrics_data.sample(n= 1000)
lyrics = LyricsIterator('LyricsFreak.csv')


In [21]:
# Dimensionality of the resulting word vectors.
#more dimensions, more computationally expensive to train
#but also more accurate
#more dimensions = more generalized
num_features = 50
# Minimum word count threshold. For pruning dictionary
min_word_count = 1
# Number of threads to run in parallel.
#more workers, faster we train
num_workers = multiprocessing.cpu_count()
# Context window length.
context_size = 5
# Seed for the RNG, to make the results reproducible.
#random number generator
#deterministic, good for debugging
seed = 1

In [22]:
sg_model = gensim.models.Word2Vec(
    lyrics,
    sg=1,
    seed=seed,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context_size
)

In [23]:
cbow_model = gensim.models.Word2Vec(
    lyrics,
    sg=0,
    seed=seed,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context_size
)

In [24]:
sg_model.save('models/sg_model.w2v')
cbow_model.save('models/cbow_model.w2v')

In [25]:
sg_model = gensim.models.Word2Vec.load('models/sg_model.w2v')
cbow_model = gensim.models.Word2Vec.load('models/cbow_model.w2v')

In [35]:

def songVector(row):
    vector_sum = 0
    words = row.lower().split()
    for word in words:
        vector_sum = vector_sum + songs2vec[word]
    vector_sum = vector_sum.reshape(1,-1)
    normalised_vector_sum = sklearn.preprocessing.normalize(vector_sum)
    return normalised_vector_sum


import time
start_time = time.time()

lyrics_data['song_vector'] = lyrics_data['text'].apply(songVector)

KeyError: "word 'face,' not in vocabulary"