# NLP Modeling

Building and training an NLP model with track lyrics.

---

In [1]:
# Imports
import re
import numpy as np
import pandas as pd
import spacy
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.neighbors import NearestNeighbors
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding
from tensorflow.keras.optimizers import Adam
from spacy import cli

cli.download('en_core_web_lg')
nlp = spacy.load('en_core_web_lg')

✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_lg')


In [2]:
# Load data
pd.set_option('display.max_columns', 25)
DATA_PATH = '../data/raw/spotify_songs.csv.zip'
df = pd.read_csv(DATA_PATH)
print(df.shape)
df.head(10)

(18454, 25)


Unnamed: 0,track_id,track_name,track_artist,lyrics,track_popularity,track_album_id,track_album_name,track_album_release_date,playlist_name,playlist_id,playlist_genre,playlist_subgenre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,language
0,0017A6SJgTbfQVU2EtsPNo,Pangarap,Barbie's Cradle,Minsan pa Nang ako'y napalingon Hindi ko alam ...,41,1srJQ0njEQgd8w4XSqI4JQ,Trip,2001-01-01,Pinoy Classic Rock,37i9dQZF1DWYDQ8wBxd7xt,rock,classic rock,0.682,0.401,2,-10.068,1,0.0236,0.279,0.0117,0.0887,0.566,97.091,235440,tl
1,004s3t0ONYlzxII9PLgU6z,I Feel Alive,Steady Rollin,"The trees, are singing in the wind The sky blu...",28,3z04Lb9Dsilqw68SHt6jLB,Love & Loss,2017-11-21,Hard Rock Workout,3YouF0u7waJnolytf9JCXf,rock,hard rock,0.303,0.88,9,-4.739,1,0.0442,0.0117,0.00994,0.347,0.404,135.225,373512,en
2,00chLpzhgVjxs1zKC9UScL,Poison,Bell Biv DeVoe,"NA Yeah, Spyderman and Freeze in full effect U...",0,6oZ6brjB8x3GoeSYdwJdPc,Gold,2005-01-01,"Back in the day - R&B, New Jack Swing, Swingbe...",3a9y4eeCJRmG9p4YKfqYIx,r&b,new jack swing,0.845,0.652,6,-7.504,0,0.216,0.00432,0.00723,0.489,0.65,111.904,262467,en
3,00cqd6ZsSkLZqGMlQCR0Zo,Baby It's Cold Outside (feat. Christina Aguilera),CeeLo Green,I really can't stay Baby it's cold outside I'v...,41,3ssspRe42CXkhPxdc12xcp,CeeLo's Magic Moment,2012-10-29,Christmas Soul,6FZYc2BvF7tColxO8PBShV,r&b,neo soul,0.425,0.378,5,-5.819,0,0.0341,0.689,0.0,0.0664,0.405,118.593,243067,en
4,00emjlCv9azBN0fzuuyLqy,Dumb Litty,KARD,Get up out of my business You don't keep me fr...,65,7h5X3xhh3peIK9Y0qI5hbK,KARD 2nd Digital Single ‘Dumb Litty’,2019-09-22,K-Party Dance Mix,37i9dQZF1DX4RDXswvP6Mj,pop,dance pop,0.76,0.887,9,-1.993,1,0.0409,0.037,0.0,0.138,0.24,130.018,193160,en
5,00f9VGHfQhAHMCQ2bSjg3D,Soldier,James TW,"Hold your breath, don't look down, keep trying...",70,3GNzXsFbzdwM0WKCZtgeNP,Chapters,2019-04-26,urban contemporary,4WiB26kw0INKwbzfb5M6Tv,r&b,urban contemporary,0.496,0.639,6,-6.157,1,0.055,0.28,0.0,0.0975,0.305,147.764,224720,en
6,00FROhC5g4iJdax5US8jRr,Satisfy You,Diddy,All I want is somebody who's gonna love me for...,52,2dHr0LpUe6CNV5lNsr8x0W,Forever,1999-08-24,"Swingbeat (old skool), New Jack Swing, R&B, Hi...",3krpccUV68nBGAQbvHEZDC,r&b,new jack swing,0.764,0.594,6,-10.05,1,0.185,0.591,0.0,0.145,0.695,87.261,286441,en
7,00GfGwzlSB8DoA0cDP2Eit,Tender Lover,Babyface,Feels good Everybody Tender lover Tender love ...,36,51fAXJ5bMn7DRSunXQ6PMb,Tender Lover,1989-07-07,New Jack Swing,3ykXidKLz1eYPvuGoFlD1e,r&b,new jack swing,0.743,0.86,5,-6.346,1,0.0445,0.226,0.000422,0.0513,0.687,102.459,259267,en
8,00Gu3RMpDW2vO9PjlMVFDL,Hide Away (feat. Envy Monroe),Blasterjaxx,"Don't run away, it's getting colder Our hearts...",42,5pqG85igfoeWcCDIsSi9x7,Hide Away (feat. Envy Monroe),2019-06-21,Big Room EDM - by Spinnin' Records,7xWdFCrU5Gka6qp1ODrSdK,edm,big room,0.573,0.746,10,-4.894,1,0.0421,0.0249,0.0,0.361,0.134,130.001,188000,en
9,00GxbkrW4m1Tac5xySEJ4M,Ti volevo dedicare (feat. J-AX & Boomdabash),Rocco Hunt,Ho una cosa da dirti da tempo Ma non ho mai t...,78,57L1NgMlfxscOxHhmfLjqg,Libertà,2019-08-30,Musica Italiana 2020 - Playlist Pop & Hip-Hop ...,6kVFIQBhLT4003iw2WWEv1,r&b,hip pop,0.754,0.725,8,-6.058,1,0.0661,0.0104,0.0,0.192,0.271,120.002,208133,it


In [3]:
df = df[df.lyrics.isna() == False]
df.isna().sum()

track_id                    0
track_name                  0
track_artist                0
lyrics                      0
track_popularity            0
track_album_id              0
track_album_name            0
track_album_release_date    0
playlist_name               0
playlist_id                 0
playlist_genre              0
playlist_subgenre           0
danceability                0
energy                      0
key                         0
loudness                    0
mode                        0
speechiness                 0
acousticness                0
instrumentalness            0
liveness                    0
valence                     0
tempo                       0
duration_ms                 0
language                    0
dtype: int64

In [4]:
def clean_lyrics(lyrics):
    """
    Cleans the song lyrics for tokenization and modeling

    parameters
    ----------
    lyrics: string
    The raw lyrics of a song

    returns
    -------
    clean_lyrics: spacy doc
    The clean lyrics ready for tokenization and modeling
    """

    lyrics = re.sub('[^A-Za-z ]', '', lyrics)
    clean_lyrics = lyrics.lower()

    return clean_lyrics

In [5]:
# Clean track lyrics for modeling
df['clean_lyrics'] = df['lyrics'].apply(clean_lyrics)
df['clean_lyrics'].head(10)

0    minsan pa nang akoy napalingon hindi ko alam n...
1    the trees are singing in the wind the sky blue...
2    na yeah spyderman and freeze in full effect uh...
3    i really cant stay baby its cold outside ive g...
4    get up out of my business you dont keep me fro...
5    hold your breath dont look down keep trying da...
6    all i want is somebody whos gonna love me for ...
7    feels good everybody tender lover tender love ...
8    dont run away its getting colder our hearts un...
9    ho una cosa da dirti da tempo ma non ho mai tr...
Name: clean_lyrics, dtype: object

In [6]:
# Feature data
features = [
    'track_popularity', 'track_album_release_date', 'danceability', 'energy',
    'key', 'loudness', 'mode', 'speechiness',
    'acousticness', 'instrumentalness', 'liveness', 'valence',
    'tempo', 'duration_ms', 'clean_lyrics'
]

X = df[features]

print(X.shape)
X.head()

(18194, 15)


Unnamed: 0,track_popularity,track_album_release_date,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,clean_lyrics
0,41,2001-01-01,0.682,0.401,2,-10.068,1,0.0236,0.279,0.0117,0.0887,0.566,97.091,235440,minsan pa nang akoy napalingon hindi ko alam n...
1,28,2017-11-21,0.303,0.88,9,-4.739,1,0.0442,0.0117,0.00994,0.347,0.404,135.225,373512,the trees are singing in the wind the sky blue...
2,0,2005-01-01,0.845,0.652,6,-7.504,0,0.216,0.00432,0.00723,0.489,0.65,111.904,262467,na yeah spyderman and freeze in full effect uh...
3,41,2012-10-29,0.425,0.378,5,-5.819,0,0.0341,0.689,0.0,0.0664,0.405,118.593,243067,i really cant stay baby its cold outside ive g...
4,65,2019-09-22,0.76,0.887,9,-1.993,1,0.0409,0.037,0.0,0.138,0.24,130.018,193160,get up out of my business you dont keep me fro...


In [7]:
# Helper functions for preprocessing pipeline
def get_numeric(X):
    """
    Function for passing nummeric features through a pipeline; will later be made into a transformer via sklearn
    """

    # Convert date strings to 4-digit year integers
    X.track_album_release_date = X.track_album_release_date.apply(
        lambda date: int(date[:4])
    )

    # Return all features except for clean_lyrics
    return X.drop('clean_lyrics', axis=1)


def get_text(X):
    """
    Function for passing text feature through a pipeline; will be made into a transformer via sklearn
    """

    # Only return clean_lyrics feature
    return X.clean_lyrics


def get_dense(X):
    """
    Transforms sparse matrix into dense matrix; will be made into a transformer via sklearn
    """
    return X.todense()


# Transform helper functions into sklearn transformers
numeric_transformer = FunctionTransformer(get_numeric)
text_transformer = FunctionTransformer(get_text)
dense_transformer = FunctionTransformer(get_dense)

In [8]:
# Create feature preprocessing pipeline
feature_pipe = Pipeline([
    ('feature_union', FeatureUnion([
        ('numeric_features', Pipeline([
            ('numeric_transformer', numeric_transformer),
            ('scaler', StandardScaler())
        ])),
        ('text_features', Pipeline([
            ('text_transformer', text_transformer),
            ('vect', TfidfVectorizer(stop_words='english'))
        ]))
    ])),
    ('condenser', dense_transformer)
])

X = feature_pipe.fit_transform(X)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [9]:
%%time
# Fit NearestNeighbors model
nn = NearestNeighbors(n_neighbors=10, algorithm='kd_tree', n_jobs=-1)
neighbors = nn.fit(X)
neighbors

Wall time: 23min 24s


NearestNeighbors(algorithm='kd_tree', n_jobs=-1, n_neighbors=10)

In [10]:
df[df['track_popularity'] >= 95]

Unnamed: 0,track_id,track_name,track_artist,lyrics,track_popularity,track_album_id,track_album_name,track_album_release_date,playlist_name,playlist_id,playlist_genre,playlist_subgenre,...,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,language,clean_lyrics
1375,0nbXyq5TXYPCO7pr3N8S4I,The Box,Roddy Ricch,Pullin' out the coupe at the lot Told 'em fuck...,98,52u4anZbHd6UInnmHRFzba,Please Excuse Me For Being Antisocial,2019-12-06,RapCaviar,37i9dQZF1DX0XUsuxWHRQd,rap,hip hop,...,10,-6.687,0,0.0559,0.104,0.0,0.79,0.642,116.971,196653,en,pullin out the coupe at the lot told em fuck ...
1754,0sf12qNH5qcw8qpgymFOqD,Blinding Lights,The Weeknd,Yeah I've been tryna call I've been on my own ...,98,2ZfHkwHuoAZrlz7RMj0PDz,Blinding Lights,2019-11-29,Todo Éxitos,2ji5tRQVfnhaX1w9FhmSzk,pop,dance pop,...,1,-4.075,1,0.0629,0.00147,0.000209,0.0938,0.345,171.017,201573,en,yeah ive been tryna call ive been on my own fo...
4813,21jGcNKet2qwijlDFuPiPb,Circles,Post Malone,"Oh, oh, oh Oh, oh, oh Oh, oh, oh, oh, oh We ...",98,4g1ZRSobMefqF6nelkgibi,Hollywood's Bleeding,2019-09-06,Pop - Pop UK - 2019 - Canadian Pop - 2019 - Pop,46Cl6dmeiylK6TRGXr7hHe,pop,post-teen pop,...,0,-3.497,1,0.0395,0.192,0.00244,0.0863,0.553,120.042,215280,en,oh oh oh oh oh oh oh oh oh oh oh we couldnt tu...
5221,2b8fOow8UzyDFAE27YhOZM,Memories,Maroon 5,Here's to the ones that we got Cheers to the w...,98,3nR9B40hYLKLcR0Eph3Goc,Memories,2019-09-20,Todo Éxitos,2ji5tRQVfnhaX1w9FhmSzk,pop,dance pop,...,11,-7.209,1,0.0546,0.837,0.0,0.0822,0.575,91.019,189486,en,heres to the ones that we got cheers to the wi...
5590,2Fxmhks0bxGSBdJ92vM42m,bad guy,Billie Eilish,"White shirt now red, my bloody nose Sleepin', ...",95,0S0KGZnfBGSIssfF54WSJh,"WHEN WE ALL FALL ASLEEP, WHERE DO WE GO?",2019-03-29,Pop Warmup 130 BPM,37i9dQZF1DX3PIAZMcbo2T,pop,dance pop,...,7,-10.965,1,0.375,0.328,0.13,0.1,0.562,135.128,194088,en,white shirt now red my bloody nose sleepin you...
6925,2XU0oxnq2qxCpomAAuJY8K,Dance Monkey,Tones and I,"They say, ""Oh my God, I see the way you shine ...",100,0UywfDKYlyiu1b38DRrzYD,Dance Monkey (Stripped Back) / Dance Monkey,2019-10-17,"post-teen alternative, indie, pop (large variety)",1y42gwI5cuwjBslPyQNfqb,pop,post-teen pop,...,6,-6.4,0,0.0924,0.692,0.000104,0.149,0.513,98.027,209438,en,they say oh my god i see the way you shine tak...
9369,3ZCTVFBt2Brf31RLEnCkWJ,everything i wanted,Billie Eilish,I had a dream I got everything I wanted Not wh...,97,4i3rAwPw7Ln2YrKDusaWyT,everything i wanted,2019-11-13,Todo Éxitos,2ji5tRQVfnhaX1w9FhmSzk,pop,dance pop,...,6,-14.454,0,0.0994,0.902,0.657,0.106,0.243,120.006,245426,en,i had a dream i got everything i wanted not wh...
9490,41L3O37CECZt3N7ziG2z7l,Yummy,Justin Bieber,"Yeah, you got that yummy-yum That yummy-yum, t...",95,1SN6N3fNkZk5oXQ9X46QZ3,Yummy,2020-01-03,Todo Éxitos,2ji5tRQVfnhaX1w9FhmSzk,pop,dance pop,...,9,-6.554,0,0.106,0.404,0.0,0.121,0.495,145.841,210427,en,yeah you got that yummyyum that yummyyum that ...
11342,4TnjEaWOeW0eKTKIEvJyCa,Falling,Trevor Daniel,"Oh Ooh, ooh My last made me feel like I would ...",97,1Czfd5tEby3DbdYNdqzrCa,Falling,2018-10-05,Electropop,2Z5cPJ6Z4EVZAfF08amjvL,pop,electropop,...,10,-8.756,0,0.0364,0.123,0.0,0.0887,0.236,127.087,159382,en,oh ooh ooh my last made me feel like i would n...
14477,696DnlkuDOXcMAnKlTgXXK,ROXANNE,Arizona Zervas,"All for the 'Gram Bitches love the 'Gram Oh, w...",99,6HJDrXs0hpebaRFKA1sF90,ROXANNE,2019-10-10,Global Top 50 | 2020 Hits,1KNl4AYfgZtOVm9KHkhPTF,latin,latin hip hop,...,6,-5.616,0,0.148,0.0522,0.0,0.46,0.457,116.735,163636,en,all for the gram bitches love the gram oh wait...


In [11]:
# Test model with example track
distances, indices = neighbors.kneighbors(X[5221].reshape(1, -1))

print(f"Track suggestions for {df.loc[5221, 'track_name']} by {df.loc[5221, 'track_artist']}:")
[df.loc[ind, ['track_name', 'track_artist']] for ind in indices]

Track suggestions for Memories by Maroon 5:


[                                              track_name   track_artist
 5221                                            Memories       Maroon 5
 12393                              The Reason - Acoustic     Hoobastank
 12585  Big Boy Diamonds (feat. Kodak Black & London o...     Gucci Mane
 15835                                          Boomerang    Lalo Ebratt
 2312   If You Can't Live Without Me, Why Aren't You D...  Mayday Parade
 1224                         Superlove (feat. Oh Wonder)        Whethan
 17266                                   Break Your Heart      Taio Cruz
 15987                                            Stripes      Ben Esser
 16617                       Strangers - Jonas Blue Remix         Sigrid
 14676                                            Too Bad     Rival Sons]