In [1]:
# import pandas and rename as pd
import pandas as pd
#import stopwords
import nltk
from nltk.corpus import stopwords
#from stop_words import get_stop_words

In [2]:
#set the file path
path = 'evermore_lyrics.csv'

In [8]:
# convert our csv file to a pandas DataFrame
swift_lyrics = pd.read_csv(path)

In [9]:
#print the DataFrame
swift_lyrics

Unnamed: 0,album_name,track_title,track_n,lyric,line
0,evermore (deluxe version),willow,1,I'm like the water when your ship rolled in th...,1
1,evermore (deluxe version),willow,1,"Rough on the surface, but you cut through like...",2
2,evermore (deluxe version),willow,1,And if it was an open-shut case,3
3,evermore (deluxe version),willow,1,I never would've known from that look on your ...,4
4,evermore (deluxe version),willow,1,Lost in your current like a priceless wine,5
...,...,...,...,...,...
907,evermore (deluxe version),​it’s time to go,17,"You know, you know, you know, you know",50
908,evermore (deluxe version),​it’s time to go,17,When it's time to go,51
909,evermore (deluxe version),​it’s time to go,17,So then you go,52
910,evermore (deluxe version),​it’s time to go,17,Then you go,53


In [27]:
#print all the different track titles 
swift_lyrics.track_title.unique()

array(['willow', 'champagne problems', '\u200bgold rush',
       '\u200b’tis the damn season', '\u200btolerate it',
       '\u200bno body, no crime', '\u200bhappiness', '\u200bdorothea',
       '\u200bconey island', '\u200bivy', 'cowboy like me',
       '\u200bl\u200bong story short', '\u200bmarjorie', '\u200bclosure',
       '\u200bevermore', '\u200br\u200bight where you left me',
       '\u200bit’s time to go'], dtype=object)

In [11]:
lyrics = swift_lyrics.lyric

In [12]:
lyrics

0      I'm like the water when your ship rolled in th...
1      Rough on the surface, but you cut through like...
2                        And if it was an open-shut case
3      I never would've known from that look on your ...
4             Lost in your current like a priceless wine
                             ...                        
907               You know, you know, you know, you know
908                                 When it's time to go
909                                       So then you go
910                                          Then you go
911                                          You just go
Name: lyric, Length: 912, dtype: object

In [13]:
from stop_words import get_stop_words

# here's the template for how we want to structure our features
# {'first_keyword': slkdjfsl, 'second_keyword': lsdjfls, 'third_keyword': slkdjf}
def create_features(lyrics):
    # make all the words lowercase
    song_lyrics = lyrics.lower()
    # split the headline into a list of words
    words = song_lyrics.split()
    # using list comprehension to create a list of keywords
    keywords = [ x for x in words if x not in get_stop_words('english')  ]
    # make sure all keyword lists have three elements
    while len(keywords) < 3:
        keywords.append('None')
    return {'first_keyword': keywords[0], 'second_keyword': keywords[1], 'third_keyword': keywords[2]}

In [24]:
# bring together the track(label), and the lyrics(features)
zipped_feature_and_labels = zip(lyrics, swift_lyrics.track_title)

In [15]:
#check that the function works
create_features('It was sighted off of Rt 19')

{'first_keyword': 'sighted', 'second_keyword': 'rt', 'third_keyword': '19'}

In [25]:
# we are creating a list of tuples
# we want () around our headline and category to show they are tuples
feature_sets = [ (create_features(x),y) for x,y in zipped_feature_and_labels   ]
feature_sets

[({'first_keyword': 'like',
   'second_keyword': 'water',
   'third_keyword': 'ship'},
  'willow'),
 ({'first_keyword': 'rough',
   'second_keyword': 'surface,',
   'third_keyword': 'cut'},
  'willow'),
 ({'first_keyword': 'open-shut',
   'second_keyword': 'case',
   'third_keyword': 'None'},
  'willow'),
 ({'first_keyword': 'never',
   'second_keyword': "would've",
   'third_keyword': 'known'},
  'willow'),
 ({'first_keyword': 'lost',
   'second_keyword': 'current',
   'third_keyword': 'like'},
  'willow'),
 ({'first_keyword': 'say,', 'second_keyword': 'less', 'third_keyword': 'know'},
  'willow'),
 ({'first_keyword': 'wherever',
   'second_keyword': 'stray,',
   'third_keyword': 'follow'},
  'willow'),
 ({'first_keyword': 'begging',
   'second_keyword': 'take',
   'third_keyword': 'hand'},
  'willow'),
 ({'first_keyword': 'wreck',
   'second_keyword': 'plans,',
   'third_keyword': 'man'},
  'willow'),
 ({'first_keyword': 'life',
   'second_keyword': 'willow',
   'third_keyword': 'ben

In [17]:
# always shuffle your featureset before classifying!
import random
random.shuffle(feature_sets)

In [18]:
import math

split_num = math.floor(len(feature_sets) * .8)

# 80% of my data
training_set = feature_sets[:split_num]
# 20% of my data
testing_set = feature_sets[split_num:]

In [19]:
import nltk
# create our classifier
classifier = nltk.NaiveBayesClassifier.train(training_set)

In [26]:
# try out our classifier using a headline
classifier.classify(create_features("Lost in your current like a priceless wine"))

'\u200bconey island'

In [21]:
# determine the accuracy of our classifier
print(nltk.classify.accuracy(classifier, testing_set))

0.4972677595628415


In [22]:
# show the most important features for our algorithm
classifier.show_most_informative_features(12)

Most Informative Features
          second_keyword = 'None'         ​everm : ​marjo =      8.3 : 1.0
           first_keyword = 'no,'          ​no bo : ​happi =      7.9 : 1.0
          second_keyword = 'stay'         ​marjo : ​it’s  =      6.9 : 1.0
           first_keyword = 'left'         ​r​igh : ​marjo =      6.3 : 1.0
           first_keyword = 'think'        ​no bo : ​happi =      5.9 : 1.0
           first_keyword = 'like'         ​gold  : ​l​ong =      5.6 : 1.0
           first_keyword = 'know'         ​marjo : ​happi =      5.1 : 1.0
           third_keyword = 'None'         ​l​ong : willow =      4.9 : 1.0
           first_keyword = 'long'         ​l​ong : ​r​igh =      4.7 : 1.0
           first_keyword = 'everybody'    ​gold  : ​r​igh =      4.6 : 1.0
           first_keyword = 'begging'      willow : ​it’s  =      4.4 : 1.0
          second_keyword = 'like'         ​gold  : ​l​ong =      4.1 : 1.0
