In [2]:
import os
from datasets import load_dataset, concatenate_datasets
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from keras.initializers import Constant
from keras.models import Sequential
from keras.utils import plot_model
from keras.layers import Dense, Embedding, Dropout, Flatten, Conv1D, GlobalMaxPooling1D, MaxPooling1D, Embedding
from sklearn.metrics import confusion_matrix
from keras.preprocessing.sequence import pad_sequences
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from matplotlib.pyplot import figure
import seaborn as sn
import pandas as pd
import numpy as np
import string
import re

import nltk
# NLTK resources needed by the preprocessing below:
#   stopwords                   -> nltk.corpus.stopwords.words('english')
#   punkt                       -> word_tokenize
#   averaged_perceptron_tagger  -> nltk.pos_tag
#   wordnet                     -> nltk.stem.WordNetLemmatizer (was missing; without it
#                                  lemmatization raises LookupError on a fresh machine)
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

  from .autonotebook import tqdm as notebook_tqdm





[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DMD028\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DMD028\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\DMD028\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

We will try different strategies:

Strategy 1:
- Tokenization
- To lowercase
- Glove

Strategy 2:
- Tokenization
- Punctuation removal
- Stop words removal
- To lowercase
- Glove

Strategy 3:
- Tokenization
- Punctuation removal
- Stop words removal
- Lemmatization
- To lowercase
- Glove

Strategy 4:
- Tokenization
- Punctuation removal
- Stop words removal
- Stemming
- To lowercase
- Glove

Strategy 5:
- Tokenization
- Punctuation removal
- Stop words removal
- Stemming
- Lemmatization
- To lowercase
- Glove

https://www.analyticsvidhya.com/blog/2021/09/sentiment-classification-using-nlp-with-text-analytics/ -> Sentiment classification using nlp text analytics

- Text preprocessing:
    * stemming
    * lemmatization
- Models:
    * Naive Bayes
    * TF-IDF vectorizer
    * Networks:
        ...






In [3]:
# Load the lyrics dataset by its identifier with `datasets.load_dataset`.
# Later cells read the 'lyrics' column of its 'train' and 'test' splits.
dataset = load_dataset("Annanay/aml_song_lyrics_balanced")

In [4]:
# Embedding using GloVe

# Tokenization
# NLTK word_tokenize: divide sentences into words
# Punctuation and Stop words removed
# Lemmatization -> reduce words to lemmas
# Stemming -> shorten words by removing morphological affixes to retain only the word stems. ->NOT GOOD
# Converting words into feature vectors 
# TfidfTransformer to transform the count matrix into a normalized TF (Term Frequency) or TF-IDF (Term Frequency-Inverse Document Frequency) representation.

In [5]:
filename = '../datasets/Lyrics_dataset/glove.6B.100d.txt'

# Build a dictionary mapping each word to its pre-trained embedding.
# Word embeddings are representations of words in a vector space.
# The vectors used for this project are the "Global Vectors for Word Representation" (GloVe).
# The size of each vector is 100.
# Each line of the file is "<word> <v1> <v2> ... <v100>", separated by whitespace.
embed_DB = {}
# `with` guarantees the file is closed even if a line fails to parse.
with open(filename, encoding="utf-8") as file:
    for a_line in file:
        embedding = a_line.split()
        if not embedding:
            # Tolerate blank lines (e.g. a trailing newline at the end of the file).
            continue
        the_word = embedding[0]
        context_array = embedding[1:]
        # Store real floats instead of an array of strings: far less memory and
        # no implicit string-to-float conversion every time a vector is used.
        embed_DB[the_word] = np.asarray(context_array, dtype="float32")

In [29]:
def get_wordnet_pos(treebank_tag):
    """
    Map a Penn Treebank POS tag to the single-letter POS code accepted by
    the WordNet lemmatizer ('a', 'v', 'n' or 'r').

    The decision is made on the tag's prefix: J -> 'a', V -> 'v', N -> 'n',
    R -> 'r'. Any other tag falls back to 'n'.
    """
    prefix_to_pos = (
        ('J', 'a'),
        ('V', 'v'),
        ('N', 'n'),
        ('R', 'r'),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return wordnet_pos
    # Unknown tag: use the noun code as the default.
    return 'n'

In [6]:
def process_data(lyrics, stopwords, punctuation, lemmatize, stemmer, embedding_dim=100):
    """Convert raw lyrics into a padded index matrix and a matching embedding matrix.

    Steps (the optional ones are switched on by the boolean flags):
      1. Replace the literal two-character "backslash + n" marker with a space and
         drop bracketed structure tags such as "[Chorus]".
      2. Optionally strip every character in ``string.punctuation``.
      3. Tokenize with NLTK ``word_tokenize``.
      4. Optionally remove English stop words (plus '@').
      5. Optionally stem with the Lancaster stemmer.
      6. Optionally lemmatize with WordNet, using POS tags from ``nltk.pos_tag``.
      7. Fit a Keras ``Tokenizer`` on the tokens, convert them to integer sequences
         and pad all sequences to the length of the longest lyric.
      8. Build an embedding matrix whose row ``i`` is the ``embed_DB`` vector of the
         word with tokenizer index ``i``.

    Args:
        lyrics: iterable of lyric strings.
        stopwords: if truthy, remove stop words.
        punctuation: if truthy, remove punctuation characters.
        lemmatize: if truthy, lemmatize the tokens.
        stemmer: if truthy, stem the tokens.
        embedding_dim: width of the vectors stored in ``embed_DB`` (default 100).

    Returns:
        ``(embed_mat, lyrics_matrix)`` in that order: ``embed_mat`` has shape
        ``(vocabulary size + 1, embedding_dim)`` and ``lyrics_matrix`` has one
        padded row of word indices per lyric.

    Note:
        A new ``Tokenizer`` is fitted on every call, so the word indices and the
        padded length produced by two separate calls are not comparable.
    """
    strip_punctuation = str.maketrans('', '', string.punctuation)

    # Clean the raw text: line-break markers and tags like [Verse], [Chorus], etc.
    lyrics_clean = []
    for lyric in lyrics:
        # NOTE(review): this pattern matches a literal backslash followed by "n",
        # not a real newline character; presumably the dataset stores line breaks
        # in escaped form. Confirm against the data.
        cleaned_text = re.sub(r"\\n", " ", lyric)
        cleaned_text = re.sub(r"\[\w+.*?\]", "", cleaned_text)
        if punctuation:
            cleaned_text = cleaned_text.translate(strip_punctuation)
        lyrics_clean.append(cleaned_text)

    # Split every lyric into a list of tokens/words.
    lyrics_tokens = [word_tokenize(lyric) for lyric in lyrics_clean]

    # Remove stop words if requested.
    if stopwords:
        # The `stopwords` parameter shadows the module imported from nltk.corpus,
        # hence the fully qualified access. A set gives O(1) membership tests.
        stopword_set = set(nltk.corpus.stopwords.words('english')) | {'@'}
        # The NLTK list is all lowercase, so compare case-insensitively; otherwise
        # capitalised stop words ("The", "I", "And") would survive the filter.
        lyrics_tokens = [
            [word for word in tokens if word.lower() not in stopword_set]
            for tokens in lyrics_tokens
        ]

    # Reduce words to their stems if requested.
    if stemmer:
        lancaster = nltk.stem.LancasterStemmer()
        # Assign the result back: previously the stemmed tokens were computed and
        # then discarded, so stemmer=True had no effect on the output.
        lyrics_tokens = [
            [lancaster.stem(word) for word in tokens]
            for tokens in lyrics_tokens
        ]

    # Lemmatize if requested; the lemmatizer needs a WordNet POS code per token.
    if lemmatize:
        lemmatizer = nltk.stem.WordNetLemmatizer()
        lyrics_tokens = [
            [
                lemmatizer.lemmatize(word, pos=get_wordnet_pos(tag))
                for word, tag in nltk.pos_tag(tokens)
            ]
            for tokens in lyrics_tokens
        ]

    # Map words to integer indices (the vocabulary is learned from these lyrics).
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lyrics_tokens)
    lyrics_sequence = tokenizer.texts_to_sequences(lyrics_tokens)

    # Pad so every row has the length of the longest lyric.
    # `default=0` keeps an empty `lyrics` input from crashing in max().
    highest_tokens = max((len(tokens) for tokens in lyrics_tokens), default=0)
    lyrics_matrix = pad_sequences(lyrics_sequence, maxlen=highest_tokens)

    # Embedding matrix: row 0 stays zero (tokenizer indices start at 1), and so do
    # the rows of words that have no entry in embed_DB.
    word_index = tokenizer.word_index
    total_words = len(word_index) + 1
    embed_mat = np.zeros((total_words, embedding_dim))

    for word, index in word_index.items():
        embed_vec = embed_DB.get(word)
        if embed_vec is not None:
            embed_mat[index] = embed_vec

    return embed_mat, lyrics_matrix

In [None]:
# Strategy 1: tokenization only, every optional preprocessing step disabled.
train_data, train_mat = process_data(
    dataset['train']['lyrics'],
    stopwords=False,
    punctuation=False,
    lemmatize=False,
    stemmer=False,
)
test_data, test_mat = process_data(
    dataset['test']['lyrics'],
    stopwords=False,
    punctuation=False,
    lemmatize=False,
    stemmer=False,
)

In [None]:
# Strat 2: punctuation removal + stop-word removal.
# process_data returns (embedding matrix, padded index matrix), in that order.
train_data2, train_mat2 = process_data(dataset['train']['lyrics'], stopwords=True, punctuation=True, lemmatize=False, stemmer=False)
# Fixed: the test pair must be built from the 'test' split (it was reading 'train').
test_data2, test_mat2 = process_data(dataset['test']['lyrics'], stopwords=True, punctuation=True, lemmatize=False, stemmer=False)

In [None]:
# Strat 3: punctuation removal + stop-word removal + lemmatization.
# process_data returns (embedding matrix, padded index matrix), in that order.
train_data3, train_mat3 = process_data(dataset['train']['lyrics'], stopwords=True, punctuation=True, lemmatize=True, stemmer=False)
# Fixed: the test pair must be built from the 'test' split (it was reading 'train').
test_data3, test_mat3 = process_data(dataset['test']['lyrics'], stopwords=True, punctuation=True, lemmatize=True, stemmer=False)

In [None]:
# Strat 4: punctuation removal + stop-word removal + stemming.
# process_data returns (embedding matrix, padded index matrix), in that order.
train_data4, train_mat4 = process_data(dataset['train']['lyrics'], stopwords=True, punctuation=True, lemmatize=False, stemmer=True)
# Fixed: the test pair must be built from the 'test' split (it was reading 'train').
test_data4, test_mat4 = process_data(dataset['test']['lyrics'], stopwords=True, punctuation=True, lemmatize=False, stemmer=True)

In [None]:
# Strat 5: punctuation removal + stop-word removal + stemming + lemmatization.
# process_data returns (embedding matrix, padded index matrix), in that order.
train_data5, train_mat5 = process_data(dataset['train']['lyrics'], stopwords=True, punctuation=True, lemmatize=True, stemmer=True)
# Fixed: the test pair must be built from the 'test' split (it was reading 'train').
test_data5, test_mat5 = process_data(dataset['test']['lyrics'], stopwords=True, punctuation=True, lemmatize=True, stemmer=True)