In [2]:
import pandas as pd
import joblib
import numpy as np
import os
import re
import gensim
from gensim.utils import simple_preprocess
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from torchnlp.encoders.text import WhitespaceEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
lyrics =pd.read_csv('https://raw.githubusercontent.com/tashapiro/predicting-song-music-genre/main/data/lyrics_processed.csv')

In [4]:
EMOJI_REGEX = re.compile(
    "(["
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F700-\U0001F77F"  # alchemical symbols
    "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
    "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
    "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
    "\U0001FA00-\U0001FA6F"  # Chess Symbols
    "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
    "\U00002702-\U000027B0"  # Dingbats
    "\U000024C2-\U0001F251"
    "])"
)

In [5]:
print(EMOJI_REGEX)

re.compile('([🌀-🗿😀-🙏🚀-\U0001f6ff🜀-\U0001f77f🞀-\U0001f7ff🠀-\U0001f8ff🤀-🧿🨀-\U0001fa6f🩰-\U0001faff✂-➰Ⓜ-🉑])')


In [6]:
# remove puntuation
lyrics['final_lyrics'] = lyrics['cleaned_lyrics'].map(lambda x : re.sub(r"""[!"#\$%&'\(\)\*\+,-\./:;\<=\>?\[\]\^_`\{\|\}~“”’]""", '', x))
#lower case
lyrics['final_lyrics'] = lyrics['final_lyrics'].map(lambda x: x.lower())
#remove emoji
lyrics['final_lyrics'] = lyrics['final_lyrics'].str.replace(
        EMOJI_REGEX, r" \1 ", regex=True
    ) 
#remove double space
lyrics['final_lyrics'] = lyrics["final_lyrics"].str.replace(r"\s+", " ", regex=True).str.strip()

In [7]:
df = lyrics.loc[:,['genre','final_lyrics']]
df.head(5)

Unnamed: 0,genre,final_lyrics
0,country,every time our eyes meet this feeling inside m...
1,country,when the sun goes down on my side of town that...
2,country,it was seven hundred fence posts from your pla...
3,country,kelsea ballerini dibs dolly parton jolene clar...
4,country,something bout the way shes wearing her dress ...


## Lemmatize

In [8]:
def lemmatize(lyrics):
    lemmatizer=WordNetLemmatizer()
    lyrics_str=word_tokenize(lyrics)
    new_words = []
    for word in lyrics_str:
        new_words.append(lemmatizer.lemmatize(word))
    return ' '.join(new_words)

df["ly"] = df["final_lyrics"].apply(lemmatize)

## using nltk to remove stop words
Stop words are common words like ‘the’, ‘and’, ‘I’, etc. that are very frequent in text, and so don’t convey insights into the specific topic of a document. We can remove these stop words from the text in a given corpus to clean up the data, and identify words that are more rare and potentially more relevant to what we’re interested in.

In [9]:
nltk.download('stopwords')
stop_words = stopwords.words('english')
stop_words.extend(['oh','ohh'])
rm_words = set(stop_words)


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
df["rm_ly"] = df["ly"].str.split().apply(lambda x: " ".join(word for word in x if word not in rm_words))

In [11]:
df.head()

Unnamed: 0,genre,final_lyrics,ly,rm_ly
0,country,every time our eyes meet this feeling inside m...,every time our eye meet this feeling inside me...,every time eye meet feeling inside almost take...
1,country,when the sun goes down on my side of town that...,when the sun go down on my side of town that l...,sun go side town lonesome feelin come door who...
2,country,it was seven hundred fence posts from your pla...,it wa seven hundred fence post from your place...,wa seven hundred fence post place neither one ...
3,country,kelsea ballerini dibs dolly parton jolene clar...,kelsea ballerini dibs dolly parton jolene clar...,kelsea ballerini dibs dolly parton jolene clar...
4,country,something bout the way shes wearing her dress ...,something bout the way shes wearing her dress ...,something bout way shes wearing dress little t...


In [12]:
#convert genre to category and rm_ly to string
df.rm_ly = df.rm_ly.astype("string")
df.genre = df.genre.astype("category")

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3420 entries, 0 to 3419
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   genre         3420 non-null   category
 1   final_lyrics  3420 non-null   object  
 2   ly            3420 non-null   object  
 3   rm_ly         3420 non-null   string  
dtypes: category(1), object(2), string(1)
memory usage: 83.8+ KB


In [18]:
xtrain, xtest, ytrain, ytest = train_test_split(
    df["rm_ly"].to_frame(), df["genre"], test_size=0.2, random_state=42
)

In [20]:
for x_ in (xtrain, xtest):
    print(x_.shape)

(2736, 1)
(684, 1)


In [16]:
def whitespace_encode(data: pd.DataFrame) -> pd.DataFrame:
    input_ = df["rm_ly"].tolist()
    encoder = WhitespaceEncoder(input_, min_occurrences=2)
    encoded_data = [encoder.encode(example) for example in input_]
    with open("../encoder.pickle", "wb") as file:
        joblib.dump(encoder, file)
    print("Saved encoder to disk.")

In [17]:
_ = whitespace_encode(xtrain)

Saved encoder to disk.
