In [1]:
import pandas as pd
import joblib
import numpy as np
import os
import re
import gensim
from gensim.utils import simple_preprocess
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from torchnlp.encoders.text import WhitespaceEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
lyrics =pd.read_csv('https://raw.githubusercontent.com/tashapiro/predicting-song-music-genre/main/data/lyrics_processed.csv')

In [None]:
EMOJI_REGEX = re.compile(
    "(["
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F700-\U0001F77F"  # alchemical symbols
    "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
    "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
    "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
    "\U0001FA00-\U0001FA6F"  # Chess Symbols
    "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
    "\U00002702-\U000027B0"  # Dingbats
    "\U000024C2-\U0001F251"
    "])"
)

In [None]:
print(EMOJI_REGEX)

re.compile('([🌀-🗿😀-🙏🚀-\U0001f6ff🜀-\U0001f77f🞀-\U0001f7ff🠀-\U0001f8ff🤀-🧿🨀-\U0001fa6f🩰-\U0001faff✂-➰Ⓜ-🉑])')


In [None]:
# remove puntuation
lyrics['final_lyrics'] = lyrics['cleaned_lyrics'].map(lambda x : re.sub(r"""[!"#\$%&'\(\)\*\+,-\./:;\<=\>?\[\]\^_`\{\|\}~“”’]""", '', x))
#lower case
lyrics['final_lyrics'] = lyrics['final_lyrics'].map(lambda x: x.lower())
#remove emoji
lyrics['final_lyrics'] = lyrics['final_lyrics'].str.replace(
        EMOJI_REGEX, r" \1 ", regex=True
    ) 
#remove double space
lyrics['final_lyrics'] = lyrics["final_lyrics"].str.replace(r"\s+", " ", regex=True).str.strip()

In [None]:
df = lyrics.loc[:,['genre','final_lyrics']]
df.head(5)

Unnamed: 0,genre,final_lyrics
0,country,every time our eyes meet this feeling inside m...
1,country,when the sun goes down on my side of town that...
2,country,it was seven hundred fence posts from your pla...
3,country,kelsea ballerini dibs dolly parton jolene clar...
4,country,something bout the way shes wearing her dress ...


## Lemmatize
 remove inflectional endings only and to return the base or dictionary form of a word,

In [None]:
def lemmatize(lyrics):
    lemmatizer=WordNetLemmatizer()
    lyrics_str=word_tokenize(lyrics)
    new_words = []
    for word in lyrics_str:
        new_words.append(lemmatizer.lemmatize(word))
    return ' '.join(new_words)

df["ly"] = df["final_lyrics"].apply(lemmatize)

## using nltk to remove stop words
Stop words are common words like ‘the’, ‘and’, ‘I’, etc. that are very frequent in text, and so don’t convey insights into the specific topic of a document. We can remove these stop words from the text in a given corpus to clean up the data, and identify words that are more rare and potentially more relevant to what we’re interested in.

In [None]:
nltk.download('stopwords')
stop_words = stopwords.words('english')
stop_words.extend(['oh','ohh'])
rm_words = set(stop_words)


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
df["rm_ly"] = df["ly"].str.split().apply(lambda x: " ".join(word for word in x if word not in rm_words))

In [None]:
df.head()

Unnamed: 0,genre,final_lyrics,ly,rm_ly
0,country,every time our eyes meet this feeling inside m...,every time our eye meet this feeling inside me...,every time eye meet feeling inside almost take...
1,country,when the sun goes down on my side of town that...,when the sun go down on my side of town that l...,sun go side town lonesome feelin come door who...
2,country,it was seven hundred fence posts from your pla...,it wa seven hundred fence post from your place...,wa seven hundred fence post place neither one ...
3,country,kelsea ballerini dibs dolly parton jolene clar...,kelsea ballerini dibs dolly parton jolene clar...,kelsea ballerini dibs dolly parton jolene clar...
4,country,something bout the way shes wearing her dress ...,something bout the way shes wearing her dress ...,something bout way shes wearing dress little t...


In [None]:
#convert genre to category and rm_ly to string
#df.rm_ly = df.rm_ly.astype("string")
#df.genre = df.genre.astype("category")

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3420 entries, 0 to 3419
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   genre         3420 non-null   object
 1   final_lyrics  3420 non-null   object
 2   ly            3420 non-null   object
 3   rm_ly         3420 non-null   object
dtypes: object(4)
memory usage: 107.0+ KB


In [None]:
xtrain, xtest, ytrain, ytest = train_test_split(
    df["rm_ly"].to_frame(), df["genre"], test_size=0.2, random_state=42
)

In [None]:
for x_ in (xtrain, xtest):
    print(x_.shape)

(2736, 1)
(684, 1)


In [None]:
xtrain

Unnamed: 0,rm_ly
1621,dont need education dont need thought control ...
2084,want hang youve got take cocaine want get grou...
2177,livin moonlight lookin hill hill dont shine ri...
871,fuck steve harvey life game wan na play counti...
2398,dirty old part city sun refused shine people t...
...,...
1095,lp come word bond people said couldnt happen c...
1130,home dont play game j cause youll see sideline...
1294,butterfly searching relax pulling jazz stack c...
860,fuck bitch get money fuck nigga get money fuck...


In [None]:
input= xtrain['rm_ly'].tolist()
for i in input:


['dont need education dont need thought control dark sarcasm classroom teacher leave kid alone hey teacher leave kid alone another brick wall youre another brick wall dont need education dont need thought control dark sarcasm classroom teacher leave kid alone hey teacher leave u kid alone youre another brick wall youre another brick wall wrong child playing wrong dont eat meat cant pudding wrong pudding dont eat meat wrong yes behind bike shed stand still laddie dont eat meat cant pudding pudding dont eat meat yes behind bike shed stand still laddie child playing phone beeping sound70embed',
 'want hang youve got take cocaine want get ground cocaine dont lie dont lie dont lie cocaine got bad news want kick blue cocaine day done wan na ride cocaine dont lie dont lie dont lie cocaine day gone want ride cocaine dont forget fact cant get back cocaine dont lie dont lie dont lie cocaine dont lie dont lie dont lie cocaine8embed',
 'livin moonlight lookin hill hill dont shine right lookin nose

In [None]:
#def whitespace_encode(data: pd.DataFrame) -> pd.DataFrame:
    #input_ = df["rm_ly"].tolist()
    #encoder = WhitespaceEncoder(input_, min_occurrences=2)
    #encoded_data = [encoder.encode(example) for example in input_]
    #with open("../encoder.pickle", "wb") as file:
        #joblib.dump(encoder, file)
    #print("Saved encoder to disk.")

In [None]:
#_ = whitespace_encode(xtrain)

## Vectorize data and fit model

In [None]:
#load encoder
with open("encoder.pickle", "rb") as f:
    encoder: WhitespaceEncoder = joblib.load(f)

FileNotFoundError: [Errno 2] No such file or directory: 'encoder.pickle'

In [None]:
input= xtrain['rm_ly'].tolist()
input

NameError: name 'xtrain' is not defined

In [None]:
cv = CountVectorizer(stop_words='english')
xtrain_matrix = cv.fit_transform(input)
lda = LatentDirichletAllocation(n_components=4, random_state=42, n_jobs=-1)
lda.fit(xtrain_matrix)

In [None]:
words = cv.get_feature_names()
words



['007',
 '01101001',
 '02',
 '03',
 '04',
 '06',
 '07',
 '070',
 '08',
 '09',
 '10',
 '100',
 '1000',
 '10000',
 '100000',
 '1000000',
 '1005',
 '100k',
 '100x',
 '101',
 '1010',
 '102',
 '104',
 '105',
 '106',
 '108',
 '108th',
 '109',
 '10k',
 '10kcaash',
 '10ll',
 '10th',
 '11',
 '110',
 '1111',
 '112',
 '1124',
 '1130',
 '1145',
 '115',
 '116',
 '117embed',
 '11embed',
 '11phenyl',
 '12',
 '120embed',
 '122embed',
 '123',
 '1230',
 '1234',
 '125',
 '125th',
 '127',
 '129',
 '12embed',
 '12gauge',
 '12k',
 '12th',
 '13',
 '131',
 '133embed',
 '1365',
 '13embed',
 '13kembed',
 '13th',
 '13x',
 '14',
 '144000',
 '149',
 '14embed',
 '14k',
 '15',
 '151',
 '155',
 '15embed',
 '15th',
 '16',
 '165',
 '16embed',
 '16k',
 '17',
 '1718',
 '175embed',
 '17embed',
 '17yearold',
 '18',
 '180',
 '1800',
 '18002738255',
 '1800jódete',
 '1800seeya',
 '1806',
 '182',
 '1830',
 '185',
 '187',
 '1870',
 '1895',
 '18embed',
 '18th',
 '18wheeler',
 '18wheelers',
 '19',
 '1900mixalot',
 '1916',
 '1922'

In [None]:
#actually print top word per genre
top_k_per_genre = lda.components_.argsort(axis=1)[:, -50:]
for idx, genre in enumerate(top_k_per_genre):
    print("=" * 20 + f"Genre #{idx}" + "=" * 20)
    print(encoder.decode(genre[::-1]))
    print()

demonic 💩 governor quantity known floridians gain jefferson visor tears closely medication bruised feb scary drs staked ing im jan lungs clots dealing ive whales hahaha mentions ho trending ecosys wannabe ⁠ portrait base minimum uplifting acquire capital beginner unboxing notifications ist merchandise celebration 주 즈 보 릴 거 짝

demonic 💩 governor quantity known floridians gain jefferson visor tears closely medication bruised feb scary drs staked ing im jan lungs clots dealing ive whales hahaha mentions ho trending ecosys wannabe ⁠ portrait base minimum uplifting acquire capital beginner unboxing notifications ist merchandise celebration 주 즈 보 릴 거 짝

demonic 💩 governor quantity known floridians gain jefferson visor tears closely medication bruised feb scary drs staked ing im jan lungs clots dealing ive whales hahaha mentions ho trending ecosys wannabe ⁠ portrait base minimum uplifting acquire capital beginner unboxing notifications ist merchandise celebration 주 즈 보 릴 거 짝

demonic 💩 govern