In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
import time
from sklearn import svm
from sklearn.metrics import classification_report
import pandas as pd

In [3]:
# Read both CSV splits in one pass; the validation file is kept under the
# name `testData` because the rest of the notebook refers to it that way.
trainData, testData = (
    pd.read_csv(csv_path)
    for csv_path in ("train_lyrics_1000.csv", "valid_lyrics_200.csv")
)
# Show the last few training rows as a quick sanity check of the columns.
trainData.tail()

Unnamed: 0,file,artist,title,lyrics,genre,mood,year
995,TRBIGRY128F42597B3.h5,Sade,All About Our Love,Its all about our love\nSo shall it be forever...,R&B,sad,2000
996,TRBIIEU128F9307C88.h5,New Found Glory,Don't Let Her Pull You Down,It's time that I rain on your parade\nWatch as...,Rock,happy,2009
997,TRBIIJY12903CE4755.h5,Mindy McCready,Ten Thousand Angels,Speakin of the devil\nLook who just walked in\...,Country,happy,1996
998,TRBIIOT128F423C594.h5,Joy Division,Leaders Of Men,Born from some mother's womb\nJust like any ot...,Rock,sad,1978
999,TRBIJYB128F14AE326.h5,Seventh Day Slumber,Shattered Life,"This wanting more from me is tearing me, it's ...",Rock,sad,2005


In [4]:
# Build the TF-IDF featurizer that turns raw lyrics into feature vectors.
vectorizer = TfidfVectorizer(
    min_df=5,           # drop terms found in fewer than 5 documents
    max_df=0.8,         # drop terms found in more than 80% of documents
    sublinear_tf=True,  # use 1 + log(tf) instead of the raw term frequency
    use_idf=True,       # apply inverse-document-frequency reweighting
)

In [5]:
# Fit the vocabulary and IDF weights on the training lyrics only, then encode
# the held-out lyrics with that same fitted vectorizer (no refitting).
train_vectors = vectorizer.fit_transform(raw_documents=trainData["lyrics"])
test_vectors = vectorizer.transform(raw_documents=testData["lyrics"])

In [7]:
# Perform classification with SVM, kernel=linear, timing the fit and the
# prediction on the held-out vectors separately.
classifier_linear = svm.SVC(kernel='linear')

# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring short durations; time.time() follows the wall clock and can jump
# (e.g. on an NTP adjustment), which would corrupt the measured intervals.
t0 = time.perf_counter()
classifier_linear.fit(train_vectors, trainData['mood'])
t1 = time.perf_counter()
prediction_linear = classifier_linear.predict(test_vectors)
t2 = time.perf_counter()

# Elapsed seconds for each phase.
time_linear_train = t1 - t0
time_linear_predict = t2 - t1

In [21]:
# Predict on the training vectors themselves so that training-set metrics can
# be compared against the held-out metrics.
prediction_linear_train = classifier_linear.predict(train_vectors)
# Backward-compatible alias: the training-report cell reads these predictions
# under the original (misspelled) name. Note it holds predictions, not a model.
classfier = prediction_linear_train

In [24]:
# Report timings and the per-class metrics on the held-out data.
print("Results for SVC(kernel=linear)")
print(
    "Training time: %fs; Prediction time: %fs"
    % (time_linear_train, time_linear_predict)
)
report = classification_report(
    testData['mood'],
    prediction_linear,
    output_dict=True,  # dict keyed by class label instead of a text table
)
print('Testing data')
# 'happy' is presented as the positive class and 'sad' as the negative class.
for heading, mood_key in (('positive: ', 'happy'), ('negative: ', 'sad')):
    print(heading, report[mood_key])

Results for SVC(kernel=linear)
Training time: 0.785746s; Prediction time: 0.181941s
Testing data
positive:  {'precision': 0.725, 'recall': 0.5523809523809524, 'f1-score': 0.6270270270270271, 'support': 105}
negative:  {'precision': 0.6083333333333333, 'recall': 0.7684210526315789, 'f1-score': 0.6790697674418604, 'support': 95}


In [26]:
# Per-class metrics on the training data, using the predictions stored in
# `classfier` (predictions on the training vectors, despite the name).
report = classification_report(
    trainData['mood'],
    classfier,
    output_dict=True,  # dict keyed by class label instead of a text table
)
print('Training data')
# 'happy' is presented as the positive class and 'sad' as the negative class.
for heading, mood_key in (('positive: ', 'happy'), ('negative: ', 'sad')):
    print(heading, report[mood_key])

Training data
positive:  {'precision': 0.9656862745098039, 'recall': 0.8834080717488789, 'f1-score': 0.9227166276346604, 'support': 446}
negative:  {'precision': 0.9121621621621622, 'recall': 0.9747292418772563, 'f1-score': 0.9424083769633508, 'support': 554}


In [14]:
# Raw lyric text (X) and mood labels (y) for the training and validation frames.
X_train, y_train = trainData['lyrics'], trainData['mood']
X_valid, y_valid = testData['lyrics'], testData['mood']

In [10]:
from sklearn import metrics

In [11]:
# Scorer objects wrapping the standard classification metrics.
#
# The mood labels in this notebook are the strings 'happy' and 'sad', so
# pos_label must be one of those strings. The previous pos_label=0 is not a
# label present in the data and makes precision/recall/F1 raise ValueError
# when the scorer is called. 'happy' is used because it is the class the
# report cells present as "positive".
acc_scorer = metrics.make_scorer(metrics.accuracy_score, greater_is_better=True)
pre_scorer = metrics.make_scorer(metrics.precision_score, greater_is_better=True, pos_label='happy')
rec_scorer = metrics.make_scorer(metrics.recall_score, greater_is_better=True, pos_label='happy')
f1_scorer = metrics.make_scorer(metrics.f1_score, greater_is_better=True, pos_label='happy')

In [31]:
# Overall accuracy of the linear SVM on the validation labels. The value is
# kept in a named variable; print() returns None, so binding its result (the
# former `a = print(...)`) stored nothing useful.
accuracy_linear = metrics.accuracy_score(y_valid, prediction_linear)
print("Accuracy: {0:.4f}".format(accuracy_linear))

Accuracy: 0.6550
