# lyricsgenius provides a simple interface to the song, artist, and lyrics data stored on Genius.com.
# Using this library you can conveniently access the content on Genius.com, and much more, through the public API.
# pip install lyricsgenius
import lyricsgenius as genius
import os
import re
import string

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import spacy

from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

import config

nlp = spacy.load('en_core_web_md')  # load the medium spaCy model
# Before we start, we need Genius API credentials; get them by registering here:
# https://docs.genius.com/#/getting-started-h1
# You need your client access token.
# In terms of filtering, we ignore lyrics that aren't from official songs. It's also a good idea
# to set remove_section_headers to True, assuming we want our dataset to focus solely on the
# sung lyrics and to exclude song metadata.
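# A minimal sketch of the config module imported above; the token value is a
# placeholder for your own client access token, not a real credential:
#
#     # config.py
#     token = "YOUR_CLIENT_ACCESS_TOKEN"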
# define the list of artists
artists = ['Frank Sinatra', 'Ed Sheeran', 'Taylor Swift']
max_songs = 150
# Search Genius for songs by each artist in the list and save each song's lyrics
# to that artist's folder.
def collect_songs(artists, max_songs):
    api = genius.Genius(config.token,
                        excluded_terms=["(Remix)", "(Live)"],
                        skip_non_songs=True,
                        remove_section_headers=True)
    for artist in artists:
        songs = api.search_artist(artist, max_songs=max_songs, sort='popularity').songs
        os.makedirs(os.path.join('lyrics', artist), exist_ok=True)  # make sure the target folder exists
        for c, song in enumerate(songs):
            file_name = os.path.join('lyrics', artist, "songnumber" + str(c) + ".txt")
            with open(file_name, "w") as file:
                file.write(song.lyrics)
# ## Data cleaning
# 1 - Natural Language Toolkit (NLTK)
#
# Load the raw text.
# Split it into tokens.
# Stem words: stemming reduces each word to its root or base form.
# Convert to lowercase.
# Remove punctuation from each token.
# Filter out remaining tokens that are not alphabetic.
# Filter out tokens that are stop words.
#
# nltk.download('punkt')
# nltk.download('stopwords')
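# A rough worked example of the steps above (outputs are illustrative; the exact
# stems depend on NLTK's Porter implementation):
#   "Singing in the rain!" -> tokens ['Singing', 'in', 'the', 'rain', '!']
#   -> after stemming, lowercasing, punctuation stripping, and stop-word
#      removal -> roughly ['sing', 'rain']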
def clean_songs_text_NLTK(artists, max_songs):
    porter = PorterStemmer()
    stop_words = set(stopwords.words('english'))
    table = str.maketrans('', '', string.punctuation)
    for artist in artists:
        os.makedirs(os.path.join('lyrics_cleaned', artist), exist_ok=True)
        for i in range(max_songs):
            file_name = os.path.join('lyrics', artist, "songnumber" + str(i) + ".txt")
            with open(file_name, 'r') as file:
                text = file.read()
            # split into words
            tokens = word_tokenize(text)
            # stemming of words
            stemmed = [porter.stem(word) for word in tokens]
            # convert to lower case
            l_tokens = [w.lower() for w in stemmed]
            # remove punctuation from each word
            stripped = [w.translate(table) for w in l_tokens]
            # remove remaining tokens that are not alphabetic
            words = [word for word in stripped if word.isalpha()]
            # filter out stop words
            words = [w for w in words if w not in stop_words]
            out_name = os.path.join('lyrics_cleaned', artist, "song_cleaned_number" + str(i) + ".txt")
            with open(out_name, 'w') as f:
                f.write(" ".join(words))
# 2 - Clean text data with spaCy
def spacy_cleaner(artists, max_songs):
    for artist in artists:
        os.makedirs(os.path.join('lyrics_cleaned', artist), exist_ok=True)
        for i in range(max_songs):
            file_name = os.path.join('lyrics', artist, "songnumber" + str(i) + ".txt")
            with open(file_name, 'r') as file:
                text = file.read()
            # apply the spaCy pipeline to the text
            doc = nlp(text)
            # lemmatize and remove noise (stop words, digits, punctuation, single characters)
            tokens = [token.lemma_.strip() for token in doc if
                      not token.is_stop and not nlp.vocab[token.lemma_].is_stop  # remove stop words
                      and not token.is_punct   # remove punctuation
                      and not token.is_digit   # remove digits
                      and not token.is_space
                      and not token.is_quote
                      and not token.is_bracket
                      and not token.like_num
                      and not token.is_currency
                      ]
            # remove empty tokens and one-letter tokens
            tokens = [token for token in tokens if token != "" and len(token) > 1]
            # recreate the text
            new_text = " ".join(tokens)
            # remove non-alphabetic characters
            new_text = re.sub(r"[^a-zA-Z]", " ", new_text)
            # remove non-ASCII characters
            new_text = re.sub(r"[^\x00-\x7F]+", "", new_text)
            new_text = new_text.lower()
            out_name = os.path.join('lyrics_cleaned', artist, "song_cleaned_number" + str(i) + ".txt")
            with open(out_name, 'w') as f:
                f.write(new_text)
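# Note: both cleaners write to the same files under lyrics_cleaned/<artist>/, so
# whichever of clean_songs_text_NLTK or spacy_cleaner runs last overwrites the
# other's output; run only one of them per experiment.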
# ### Construct a text corpus
def create_corpus(artists_list):
    CORPUS = []
    for artist in artists_list:
        folder = os.path.join('lyrics_cleaned', artist)
        for fn in sorted(os.listdir(folder)):
            with open(os.path.join(folder, fn)) as f:
                CORPUS.append(f.read())
    return CORPUS
# The labels assume exactly max_songs cleaned files per artist, matching create_corpus above.
def create_labels(artists_list, max_songs):
    LABELS = []
    for artist in artists_list:
        for _ in range(max_songs):
            LABELS.append(artist)
    return LABELS
"""train model with Random Forest classifier """
"""we want to use 1500 most occurring words as features for training our classifier.
So we only include those words that occur in at least 5 documents
Here 0.7 means that we should include only those words that occur in a maximum of (70%) of all the documents"""
def train_model_RF(X_train,y_train):
# Vectorize the text input and use Random Forest classifier
pipeline = make_pipeline(
TfidfVectorizer(max_features=2000, min_df=5, max_df=0.7,ngram_range=(1,2),stop_words='english'),
RandomForestClassifier(n_estimators=1000,max_depth=7)
)
pipeline.fit(X_train,y_train)
return pipeline
"""train model with Random Forest classifier + GridSearch"""
def train_model_RF_GridSearch(X_train,y_train):
pipeline = Pipeline([
('Tfidf',TfidfVectorizer()),
('RF',RandomForestClassifier())
])
params = {
'Tfidf__max_features':[1000,2000],
'Tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
'Tfidf__min_df':[5,7,10],
'Tfidf__max_df':[0.5,0.6,0.7],
'RF__n_estimators':[1000,2000],
'RF__max_depth':[3,5,7],
}
tfidf_gs = GridSearchCV(pipeline, param_grid=params, cv = 5, verbose = 1,scoring='accuracy',n_jobs=-1)
tfidf_gs.fit(X_train,y_train)
print('Best parameters:',tfidf_gs.best_params_)
best_model=tfidf_gs.best_estimator_
return best_model
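# Rough cost estimate for the search above: 2*3*3*3*2*3 = 324 parameter
# combinations, times cv=5 folds = 1620 fits, so expect a long run even
# with n_jobs=-1.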
""" train model with Naive bias using """
def train_model_NB(X_train,y_train):
pipeline = make_pipeline(
TfidfVectorizer(max_features=2000, min_df=5, max_df=0.5,ngram_range=(1,2),stop_words='english'),
MultinomialNB(alpha=0.1)
)
pipeline.fit(X_train,y_train)
return pipeline
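# alpha is MultinomialNB's additive (Lidstone/Laplace) smoothing strength;
# alpha=0.1 is a fixed choice here, while the grid search below also tries
# larger and smaller values.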
""" train model with Naive bias using GridSearch"""
def train_model_NB_GridSearch(X_train,y_train):
pipeline = Pipeline([
('Tfidf',TfidfVectorizer(stop_words='english')),
('NB',MultinomialNB())
])
params = {
'Tfidf__max_features':[1000,2000,4000],
'Tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
'Tfidf__min_df':[5,7,10],
'Tfidf__max_df':[0.5,0.6,0.7],
'NB__alpha':[1,0.5,0.1, 0.01, 0.001, 0.0001],
}
tfidf_gs = GridSearchCV(pipeline, param_grid=params, cv = 5, verbose = 1,scoring='accuracy',n_jobs=-1)
print('start training\n')
tfidf_gs.fit(X_train,y_train)
print('Best parameters:',tfidf_gs.best_params_)
best_model=tfidf_gs.best_estimator_
return best_model
def predict(model, text):
    """
    Takes a pre-trained pipeline model and a single lyrics string, and returns
    the predicted artist together with the model's confidence.
    """
    prediction = model.predict([text])      # the vectorizer expects an iterable of documents
    probs = model.predict_proba([text])
    return prediction[0], probs.max()
# collect_songs(artists, max_songs)
# clean_songs_text_NLTK(artists, max_songs)
# spacy_cleaner(artists, max_songs)
CORPUS = create_corpus(artists)
LABELS = create_labels(artists, max_songs)
# split the data
X_train, X_test, y_train, y_test = train_test_split(CORPUS, LABELS, test_size=0.2, random_state=42)
# pipeline = train_model_RF(X_train, y_train)
# pipeline = train_model_RF_GridSearch(X_train, y_train)
pipeline = train_model_NB(X_train, y_train)
# pipeline = train_model_NB_GridSearch(X_train, y_train)
if __name__ == '__main__':
    # evaluate on the test data
    y_pred = pipeline.predict(X_test)
    print("\nModel accuracy:", round(accuracy_score(y_test, y_pred), 2))
    # evaluate the model in more detail
    print('\n\nconfusion matrix')
    print(confusion_matrix(y_test, y_pred))
    print('\n\nclassification report')
    print(classification_report(y_test, y_pred))
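    # A minimal usage sketch for predict(); the snippet below is purely
    # illustrative, not a file from the dataset.
    sample_lyrics = "fly me to the moon and let me play among the stars"
    artist, confidence = predict(pipeline, sample_lyrics)
    print('\npredicted artist:', artist, '(confidence: %.2f)' % confidence)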