# Song Genre Classifier Project

Brief description: A classifier model that identifies and predicts the genre of a song from its lyrics.

### Importing libraries

In [1]:
import numpy as np
import pandas as pd

### Importing datasets and viewing

In [2]:
# Read the two cleaned lyric datasets from CSV files under the relative data/ directory.
english_csv = 'data/english_cleaned_lyrics.csv'
original_csv = 'data/original_cleaned_lyrics.csv'
english_lyrics = pd.read_csv(english_csv)
original_lyrics = pd.read_csv(original_csv)

In [3]:
# Preview the English lyrics dataset exactly as loaded from the CSV.
english_lyrics

Unnamed: 0.1,Unnamed: 0,index,song,year,artist,genre,lyrics
0,0,0,ego-remix,2009,beyonce-knowles,Pop,Oh baby how you doing You know I'm gonna cut r...
1,1,1,then-tell-me,2009,beyonce-knowles,Pop,playin everything so easy it's like you seem s...
2,2,2,honesty,2009,beyonce-knowles,Pop,If you search For tenderness It isn't hard to ...
3,3,3,you-are-my-rock,2009,beyonce-knowles,Pop,Oh oh oh I oh oh oh I If I wrote a book about ...
4,4,4,black-culture,2009,beyonce-knowles,Pop,Party the people the people the party it's pop...
...,...,...,...,...,...,...,...
218205,362232,362232,who-am-i-drinking-tonight,2012,edens-edge,Country,I gotta say Boy after only just a couple of da...
218206,362233,362233,liar,2012,edens-edge,Country,I helped you find her diamond ring You made me...
218207,362234,362234,last-supper,2012,edens-edge,Country,Look at the couple in the corner booth Looks a...
218208,362235,362235,christ-alone-live-in-studio,2012,edens-edge,Country,When I fly off this mortal earth And I'm measu...


In [4]:
# Preview the original lyrics dataset exactly as loaded from the CSV.
original_lyrics

Unnamed: 0.1,Unnamed: 0,index,song,year,artist,genre,lyrics
0,0,0,ego-remix,2009,beyonce-knowles,Pop,Oh baby how you doing You know I'm gonna cut r...
1,1,1,then-tell-me,2009,beyonce-knowles,Pop,playin everything so easy it's like you seem s...
2,2,2,honesty,2009,beyonce-knowles,Pop,If you search For tenderness It isn't hard to ...
3,3,3,you-are-my-rock,2009,beyonce-knowles,Pop,Oh oh oh I oh oh oh I If I wrote a book about ...
4,4,4,black-culture,2009,beyonce-knowles,Pop,Party the people the people the party it's pop...
...,...,...,...,...,...,...,...
227444,362232,362232,who-am-i-drinking-tonight,2012,edens-edge,Country,I gotta say Boy after only just a couple of da...
227445,362233,362233,liar,2012,edens-edge,Country,I helped you find her diamond ring You made me...
227446,362234,362234,last-supper,2012,edens-edge,Country,Look at the couple in the corner booth Looks a...
227447,362235,362235,christ-alone-live-in-studio,2012,edens-edge,Country,When I fly off this mortal earth And I'm measu...


### Dropping irrelevant fields

Since the classifier predicts the genre of a song from its lyrics, we drop the `song`, `year` and `artist` fields (and the redundant `Unnamed: 0` column), keeping only `index`, `genre` and `lyrics`.

In [5]:
# Remove the columns that are not needed for genre prediction.
# `index` is not in the drop list, so it stays alongside `genre` and `lyrics`.
# The drop is done without inplace=True and with errors='ignore' so that
# re-running this cell (when the columns are already gone) does not raise KeyError.
english_lyrics = english_lyrics.drop(
    columns=['Unnamed: 0', 'song', 'year', 'artist'],
    errors='ignore',
)
english_lyrics

Unnamed: 0,index,genre,lyrics
0,0,Pop,Oh baby how you doing You know I'm gonna cut r...
1,1,Pop,playin everything so easy it's like you seem s...
2,2,Pop,If you search For tenderness It isn't hard to ...
3,3,Pop,Oh oh oh I oh oh oh I If I wrote a book about ...
4,4,Pop,Party the people the people the party it's pop...
...,...,...,...
218205,362232,Country,I gotta say Boy after only just a couple of da...
218206,362233,Country,I helped you find her diamond ring You made me...
218207,362234,Country,Look at the couple in the corner booth Looks a...
218208,362235,Country,When I fly off this mortal earth And I'm measu...


In [6]:
# Remove the columns that are not needed for genre prediction.
# `index` is not in the drop list, so it stays alongside `genre` and `lyrics`.
# The drop is done without inplace=True and with errors='ignore' so that
# re-running this cell (when the columns are already gone) does not raise KeyError.
original_lyrics = original_lyrics.drop(
    columns=['Unnamed: 0', 'song', 'year', 'artist'],
    errors='ignore',
)
original_lyrics

Unnamed: 0,index,genre,lyrics
0,0,Pop,Oh baby how you doing You know I'm gonna cut r...
1,1,Pop,playin everything so easy it's like you seem s...
2,2,Pop,If you search For tenderness It isn't hard to ...
3,3,Pop,Oh oh oh I oh oh oh I If I wrote a book about ...
4,4,Pop,Party the people the people the party it's pop...
...,...,...,...
227444,362232,Country,I gotta say Boy after only just a couple of da...
227445,362233,Country,I helped you find her diamond ring You made me...
227446,362234,Country,Look at the couple in the corner booth Looks a...
227447,362235,Country,When I fly off this mortal earth And I'm measu...


### Checking for missing values (none are found, so nothing needs to be dropped)

In [7]:
# Count the missing entries in every column of the English dataset.
english_lyrics.isna().sum()

index     0
genre     0
lyrics    0
dtype: int64

In [8]:
# Count the missing entries in every column of the original dataset.
original_lyrics.isna().sum()

index     0
genre     0
lyrics    0
dtype: int64

### Grouping data based on genre

In [9]:
# Bucket the English songs by genre, then show the number of non-null
# entries per remaining column for each genre.
english_lyrics_grouped = english_lyrics.groupby(by='genre')
english_lyrics_grouped.count()

Unnamed: 0_level_0,index,lyrics
genre,Unnamed: 1_level_1,Unnamed: 2_level_1
Country,14158,14158
Electronic,6942,6942
Folk,1689,1689
Hip-Hop,22654,22654
Indie,2935,2935
Jazz,7310,7310
Metal,21210,21210
Other,3786,3786
Pop,34137,34137
R&B,3336,3336


In [10]:
# Bucket the original songs by genre, then show the number of non-null
# entries per remaining column for each genre.
original_lyrics_grouped = original_lyrics.groupby(by='genre')
original_lyrics_grouped.count()

Unnamed: 0_level_0,index,lyrics
genre,Unnamed: 1_level_1,Unnamed: 2_level_1
Country,14182,14182
Electronic,7231,7231
Folk,1992,1992
Hip-Hop,23215,23215
Indie,2970,2970
Jazz,7520,7520
Metal,22420,22420
Other,3989,3989
Pop,36439,36439
R&B,3354,3354


For the rest of this analysis, we proceed with the English lyrics dataset only.
As a result, our model can only classify English songs.

### Performing tokenization on lyrics

In [11]:
import nltk
# Make sure the NLTK 'punkt' package is available locally before tokenizing.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/pranav/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [12]:
from nltk.tokenize import word_tokenize

In [13]:
# Split every lyric string into a list of word tokens with NLTK.
# The tokenizer is applied to the 'lyrics' column directly instead of using
# DataFrame.apply(..., axis=1), which builds a Series for each row only to read
# one field from it; the resulting token lists are the same.
# Note: this cell expects 'lyrics' to still hold raw strings, so it should be
# run once per load of the data.
english_lyrics['lyrics'] = english_lyrics['lyrics'].apply(nltk.word_tokenize)
english_lyrics

Unnamed: 0,index,genre,lyrics
0,0,Pop,"[Oh, baby, how, you, doing, You, know, I, 'm, ..."
1,1,Pop,"[playin, everything, so, easy, it, 's, like, y..."
2,2,Pop,"[If, you, search, For, tenderness, It, is, n't..."
3,3,Pop,"[Oh, oh, oh, I, oh, oh, oh, I, If, I, wrote, a..."
4,4,Pop,"[Party, the, people, the, people, the, party, ..."
...,...,...,...
218205,362232,Country,"[I, got, ta, say, Boy, after, only, just, a, c..."
218206,362233,Country,"[I, helped, you, find, her, diamond, ring, You..."
218207,362234,Country,"[Look, at, the, couple, in, the, corner, booth..."
218208,362235,Country,"[When, I, fly, off, this, mortal, earth, And, ..."


In [14]:
# Inspect the column dtypes. Ending the cell with the expression lets Jupyter
# display the result itself, so no print() wrapper is needed.
english_lyrics.dtypes

index      int64
genre     object
lyrics    object
dtype: object


### Formatting words (converting to lower case, and removing special characters)

In [15]:
# Lower-case every token so that differently capitalised spellings of a word
# end up as the same token.
english_lyrics['lyrics'] = english_lyrics['lyrics'].apply(
    lambda tokens: list(map(str.lower, tokens))
)
english_lyrics

Unnamed: 0,index,genre,lyrics
0,0,Pop,"[oh, baby, how, you, doing, you, know, i, 'm, ..."
1,1,Pop,"[playin, everything, so, easy, it, 's, like, y..."
2,2,Pop,"[if, you, search, for, tenderness, it, is, n't..."
3,3,Pop,"[oh, oh, oh, i, oh, oh, oh, i, if, i, wrote, a..."
4,4,Pop,"[party, the, people, the, people, the, party, ..."
...,...,...,...
218205,362232,Country,"[i, got, ta, say, boy, after, only, just, a, c..."
218206,362233,Country,"[i, helped, you, find, her, diamond, ring, you..."
218207,362234,Country,"[look, at, the, couple, in, the, corner, booth..."
218208,362235,Country,"[when, i, fly, off, this, mortal, earth, and, ..."


In [16]:
import re

In [17]:
# Remove every run of non-word characters (regex \W) from each token.
# The pattern is written as a raw string: in a normal string literal '\W' is an
# invalid escape sequence, which Python reports with a warning.
non_word_chars = re.compile(r'\W+')

# Tokens made up only of non-word characters become empty strings after the
# substitution; those carry no information, so they are dropped from the list.
english_lyrics['lyrics'] = english_lyrics['lyrics'].apply(
    lambda tokens: [
        cleaned
        for cleaned in (non_word_chars.sub('', token) for token in tokens)
        if cleaned
    ]
)
english_lyrics

Unnamed: 0,index,genre,lyrics
0,0,Pop,"[oh, baby, how, you, doing, you, know, i, m, g..."
1,1,Pop,"[playin, everything, so, easy, it, s, like, yo..."
2,2,Pop,"[if, you, search, for, tenderness, it, is, nt,..."
3,3,Pop,"[oh, oh, oh, i, oh, oh, oh, i, if, i, wrote, a..."
4,4,Pop,"[party, the, people, the, people, the, party, ..."
...,...,...,...
218205,362232,Country,"[i, got, ta, say, boy, after, only, just, a, c..."
218206,362233,Country,"[i, helped, you, find, her, diamond, ring, you..."
218207,362234,Country,"[look, at, the, couple, in, the, corner, booth..."
218208,362235,Country,"[when, i, fly, off, this, mortal, earth, and, ..."


### Removing stopwords

In [18]:
# Make sure the NLTK 'stopwords' package is available locally, then import
# the corpus reader that exposes it.
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /home/pranav/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [19]:
# English stopwords from NLTK; used to filter the lyric tokens.
stop = stopwords.words('english')

In [20]:
# Drop every token that is an English stopword.
# Membership is tested against a set built once from `stop`, so each lookup is
# constant-time instead of a scan over the whole stopword collection for every
# token; the tokens that are kept are exactly the same.
stop_lookup = set(stop)
english_lyrics['lyrics'] = english_lyrics['lyrics'].apply(
    lambda tokens: [token for token in tokens if token not in stop_lookup]
)
english_lyrics

Unnamed: 0,index,genre,lyrics
0,0,Pop,"[oh, baby, know, gon, na, cut, right, chase, w..."
1,1,Pop,"[playin, everything, easy, like, seem, sure, s..."
2,2,Pop,"[search, tenderness, nt, hard, find, love, nee..."
3,3,Pop,"[oh, oh, oh, oh, oh, oh, wrote, book, stand, t..."
4,4,Pop,"[party, people, people, party, popping, sittin..."
...,...,...,...
218205,362232,Country,"[got, ta, say, boy, couple, dates, hands, outr..."
218206,362233,Country,"[helped, find, diamond, ring, made, try, every..."
218207,362234,Country,"[look, couple, corner, booth, looks, lot, like..."
218208,362235,Country,"[fly, mortal, earth, measured, depth, girth, f..."


### Stemming

In [21]:
from nltk.stem.porter import PorterStemmer

In [22]:
# Porter stemmer instance used to stem the lyric tokens.
port = PorterStemmer()

In [23]:
# Replace every token with its Porter stem, keeping the token order.
english_lyrics['lyrics'] = english_lyrics['lyrics'].apply(
    lambda tokens: list(map(port.stem, tokens))
)
english_lyrics

Unnamed: 0,index,genre,lyrics
0,0,Pop,"[oh, babi, know, gon, na, cut, right, chase, w..."
1,1,Pop,"[playin, everyth, easi, like, seem, sure, stil..."
2,2,Pop,"[search, tender, nt, hard, find, love, need, l..."
3,3,Pop,"[oh, oh, oh, oh, oh, oh, wrote, book, stand, t..."
4,4,Pop,"[parti, peopl, peopl, parti, pop, sit, around,..."
...,...,...,...
218205,362232,Country,"[got, ta, say, boy, coupl, date, hand, outrigh..."
218206,362233,Country,"[help, find, diamond, ring, made, tri, everyth..."
218207,362234,Country,"[look, coupl, corner, booth, look, lot, like, ..."
218208,362235,Country,"[fli, mortal, earth, measur, depth, girth, fat..."
