In [1]:
import collections
import math
import random
import re
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC

%matplotlib inline

In [3]:
# Read the raw dataset; `d0` keeps every column exactly as stored in the CSV.
d0 = pd.read_csv("comp_lyrics_data.csv")

# Working copy without the first CSV column.
# NOTE(review): presumably that column is a saved row index -- confirm.
d = d0.drop(d0.columns[0], axis='columns')

# Candidate feature columns: every column of `d` except the target ('Hit')
# and the two identifier columns ('Artist', 'Title').
colname = list(d.columns)
colname.remove('Hit')
colname.remove('Artist')
colname.remove('Title')

# Displayed as the cell output: (rows, columns) of the working frame.
d.shape

(12244, 19)

In [4]:
# Feature table: the working frame without the target column.
d1 = d.drop('Hit', axis = 1)
# Target column, taken from the raw frame (row order matches `d`/`d1`,
# since `d` was derived from `d0` by dropping a column only).
label = d0['Hit']

# One {column: value} dict per row, restricted to the columns in `colname`.
# Built in a single pandas call instead of a row-by-row iterrows() loop
# with a nested per-column dict fill.
attribute = d1[colname].to_dict(orient='records')

In [5]:
# Hold out 20% of the examples for testing; the fixed seed keeps the
# partition identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    attribute,
    label,
    test_size=0.20,
    random_state=42,
)

In [19]:
def featureExtractor(d):
    """Convert one example into a sparse feature mapping.

    Parameters
    ----------
    d : dict
        Maps a column name to its value. The ``'lyrics'`` entry is treated
        as whitespace-separated text; every other entry is passed to
        ``np.isnan`` and is therefore expected to be numeric.

    Returns
    -------
    collections.defaultdict
        ``defaultdict(float)`` holding a count for each lyric token (with
        all non-alphanumeric characters stripped) and, for each remaining
        column, its value with NaN replaced by 0.

    Notes
    -----
    NOTE(review): lyric tokens and column names share one key space, so a
    lyric word spelled exactly like a column name will be merged with (or
    overwritten by) that column's value. Confirm whether this matters for
    the dataset before relying on those features.
    """
    sparse = collections.defaultdict(float)
    regex = re.compile('[^A-Za-z0-9]')
    for key in d:
        value = d[key]
        if key == 'lyrics':
            # A missing lyrics cell is not a string (typically float NaN
            # after a CSV load) and has no .split(); it contributes no
            # tokens instead of raising AttributeError.
            if not isinstance(value, str):
                continue
            for i in value.split():
                text = regex.sub('', i)
                # Tokens made only of punctuation collapse to '' and would
                # otherwise be counted as a meaningless empty-string feature.
                if text:
                    sparse[text] += 1
        else:
            if np.isnan(value):
                sparse[key] = 0
            else:
                sparse[key] = value
    return sparse

In [20]:
def generateSparse(examples, feature):
    """Apply ``feature`` to every example and collect the results.

    Parameters
    ----------
    examples : iterable
        The raw examples, consumed in iteration order.
    feature : callable
        Called once per example; its return value becomes one list entry.

    Returns
    -------
    list
        ``[feature(e) for e in examples]`` -- one entry per example, in
        the same order as the input.
    """
    return [feature(example) for example in examples]

In [21]:
# Run the feature extractor over both partitions (train first, then test),
# yielding one sparse feature dict per example.
trainsparse, testsparse = (
    generateSparse(split, featureExtractor) for split in (X_train, X_test)
)

In [22]:
# DictVectorizer is imported in the notebook's top import cell.
# The feature-name -> column mapping is learned from the training dicts only;
# the test dicts are then projected onto that same mapping, so the two
# matrices share one column layout.
vec = DictVectorizer()
trainmatrix = vec.fit_transform(trainsparse)
testmatrix = vec.transform(testsparse)

In [24]:
# Fit a support-vector classifier with an RBF kernel on the training matrix
# and score it on the held-out test matrix.
svclassifier_rbf = SVC(kernel='rbf')
svclassifier_rbf.fit(trainmatrix, y_train)
y_pred_rbf = svclassifier_rbf.predict(testmatrix)
y_true = y_test

cm_rbf = confusion_matrix(y_true, y_pred_rbf)
print('The confusion matrix is: ')
print(cm_rbf)

# The messages below previously misspelled "Gaussian" as "Guassian".
accuracy_rbf = accuracy_score(y_true, y_pred_rbf)
print('The accuracy of the Gaussian svc is {} .'.format(accuracy_rbf))

f1_rbf = f1_score(y_true, y_pred_rbf, average='binary')
print('The F1 score of the Gaussian svc is {} .'.format(f1_rbf))

The confusion matrix is: 
[[638 568]
 [422 821]]
The accuracy of the Guassian svc is 0.5957533687219273 .
The F1 score of the Guassian svc is 0.6238601823708206 .


In [23]:
# Fit a logistic-regression classifier with the liblinear solver and score
# it on the held-out test matrix.
# `multi_class='ovr'` was dropped: it is deprecated in recent scikit-learn
# releases, and with solver='liblinear' the estimator already resolves to
# the one-vs-rest scheme, so the fitted model is unchanged.
lr = LogisticRegression(solver='liblinear', max_iter=10000)
lr = lr.fit(trainmatrix, y_train)
y_pred = lr.predict(testmatrix)
y_true = y_test

accuracy = accuracy_score(y_true, y_pred)
print('The accuracy of the logistic classifier is {} .'.format(accuracy))

cm = confusion_matrix(y_true, y_pred)
print('The confusion matrix is: ')
print(cm)

f1 = f1_score(y_true, y_pred, average='binary')
print('The F1 score of the logistic regression is {} .'.format(f1))

The accuracy of the logistic classifier is 0.7264189465087791 .
The confusion matrix is: 
[[870 336]
 [334 909]]
The F1 score of the logistic regression is 0.7307073954983921 .


In [17]:
# Fit a 100-tree random forest and score it on the held-out test matrix.
# random_state is pinned (same seed as the train/test split) so the forest,
# and therefore the reported metrics, are reproducible across runs.
# NOTE(review): the name `lr` is shared with the logistic-regression cell
# and is kept here only so that any other cell referring to `lr` keeps
# working; consider a dedicated name such as `rf` once callers are checked.
lr = RandomForestClassifier(n_estimators=100, random_state=42)
lr = lr.fit(trainmatrix, y_train)
y_pred = lr.predict(testmatrix)
y_true = y_test

accuracy = accuracy_score(y_true, y_pred)
print('The accuracy of the random forest classifier is {} .'.format(accuracy))

cm = confusion_matrix(y_true, y_pred)
print('The confusion matrix is: ')
print(cm)

# The model is a classifier; the message previously said "regression".
f1 = f1_score(y_true, y_pred, average='binary')
print('The F1 score of the random forest classifier is {} .'.format(f1))

The accuracy of the random forest classifier is 0.7104940792160065 .
The confusion matrix is: 
[[786 420]
 [289 954]]
The F1 score of the random forest regression is 0.7290790982040505 .
