In [14]:
import pandas as pd
from tensorflow.keras.layers import Dense, Embedding,GlobalMaxPooling1D
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Embedding

In [16]:
import spacy

def tokenization(text: list[spacy.tokens.token.Token]) -> list[spacy.tokens.token.Token]:
    """Use this function to tokenize text.

    :param text: Text as list
    :type text: list[spacy.tokens.token.Token]
    :return: Tokenized text as list
    :rtype: list[spacy.tokens.token.Token]
    """

    token_list = []
    for doc in text: 
        # iterate over tokens in docs
        for token in doc:
            token_list.append(token)

    return token_list


def stop_word_removal(text: list[spacy.tokens.token.Token]) -> list[spacy.tokens.token.Token]: 
    """Use this function to remove stop words. 

    :param text: Tokens to remove stop words from 
    :type text: list[spacy.tokens.token.Token]
    :return: Tokens without stop words
    :rtype: list[spacy.tokens.token.Token]
    """

    token_list_without_stop = []
    # Don't add token to list if stop word
    for token in text:
        if token.is_stop == False: 
            token_list_without_stop.append(token)

    return token_list_without_stop


def punctutation_removal(text: list[spacy.tokens.token.Token]) -> list[spacy.tokens.token.Token]: 
    """Use this function to remove punctuation.

    :param text: Tokens to remove punctuation from
    :type text: list[spacy.tokens.token.Token]
    :return: Tokens without punctuation
    :rtype: list[spacy.tokens.token.Token]
    """

    token_list_no_stop_no_punct = []
    # Don't add token to list if punctuation
    for token in text:
        if token.is_punct == False:
            token_list_no_stop_no_punct.append(token)

    return token_list_no_stop_no_punct


def lemmatization(text: list[spacy.tokens.token.Token]) -> list[str]: 
    """Use this function to lemmatize a given text.

    :param text: Tokens to lemmatize
    :type text: list[spacy.tokens.token.Token]
    :return: lemmatized tokens
    :rtype: list[str]
    """

    token_list_no_stop_no_punct_lemmatized = []
    for token in text: 
        if "\n" not in token.lemma_:
            token_list_no_stop_no_punct_lemmatized.append(token.lemma_)
    return token_list_no_stop_no_punct_lemmatized


def processing_pipeline(song_data: dict) -> dict:
    """Use this function to execute the entire processing pipeline on given song data.
    Preprocessing steps:
    - Tokenization
    - Stop word removal
    - Punctuation removal
    - Lemmatization
    - ...

    :param song_data: song data saved in a json file containing song name, artist name and lyrics
    :type song_data: dict
    :return: preprocessed song data
    :rtype: dict
    """

    nlp = spacy.load("en_core_web_sm", disable = ['ner'])
    text_nlp_pipe = list(nlp.pipe([song_data["Lyrics"]]))
    
    # Tokenization
    song_data["Lyrics"] = tokenization(text_nlp_pipe)
    # Stop word removal
    song_data["Lyrics"] = stop_word_removal(song_data["Lyrics"])
    # Punctuation removal
    song_data["Lyrics"] = punctutation_removal(song_data["Lyrics"])
    # Lemmatization
    song_data["Lyrics"] = lemmatization(song_data["Lyrics"])

    return song_data


TypeError: 'type' object is not subscriptable

In [6]:
lyrics_df = pd.read_csv('./data/lyrics-data.csv')
artists_df = pd.read_csv('./data/artists-data.csv')

In [7]:
english_songs = lyrics_df["language"] == "en"
english_songs_df = lyrics_df[english_songs]
english_songs_df.head(5)
#drop SLink Column since it has no use
english_songs_df=english_songs_df.drop('SLink', axis=1)
# rename columns to more expressive names
english_songs_df= english_songs_df.rename(columns={'ALink': 'Link', 'SName': 'Song_Name', 'Lyric': 'Song_Text', 'language': 'Language'})
# Get artist names from dataset 2
english_songs_df = pd.merge(english_songs_df, artists_df[['Link','Artist']], on='Link', how='left')
english_songs_df = english_songs_df.drop(["Link"], axis=1)

In [11]:
test = english_songs_df[:2]
test.head(2)

Unnamed: 0,Song_Name,Song_Text,Language,Artist
0,Careless Whisper,I feel so unsure\nAs I take your hand and lead...,en,Ivete Sangalo
1,Could You Be Loved / Citação Musical do Rap: S...,"Don't let them fool, ya\nOr even try to school...",en,Ivete Sangalo


In [12]:
test[1:]

Unnamed: 0,Song_Name,Song_Text,Language,Artist
1,Could You Be Loved / Citação Musical do Rap: S...,"Don't let them fool, ya\nOr even try to school...",en,Ivete Sangalo


In [15]:
model = Sequential([
    Embedding(vocab_size, 8, input_length=max_length),
   Conv1D(128, 5, activation='relu'),
    GlobalMaxPooling1D(),
  Dense(10, activation='relu'),
  Dense(1, activation='sigmoid')
])

NameError: name 'Sequential' is not defined