# Genre Classification

## Load data and preprocessing

In [11]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import LinearSVC

In [12]:
# Seed reused by every randomised step below so that reruns are repeatable
RANDOM_STATE = 123

# Load the lyrics dataset (path is relative to the notebook's working directory)
df = pd.read_csv("./data.csv")

# Preview the first five rows
df.head(5)

Unnamed: 0,Lyric,Genre
0,"Handy dandy, controversy surrounds him. He bee...",Rock
1,"Same bed, but it feels just a little bit bigge...",Rock
2,Saw ya out by the pool on the 8th of July. Did...,Pop
3,Got my shit down super tight. Got my shit down...,Hip Hop
4,[talk:]. Ay soulja boy in da buildin. Ay i wan...,Hip Hop


In [13]:
# Train on the first 50,000 rows; hold out the last 5,000 rows for testing
training = df.iloc[:50000]
testing = df.iloc[-5000:]

# Learn a TF-IDF representation from the training lyrics:
# English stop words removed, unigrams and bigrams, capped at 2,500 features
tf_idf_vectorizer_training = TfidfVectorizer(
    stop_words="english",
    ngram_range=(1, 2),
    max_features=2500,
)
tf_idf_training = tf_idf_vectorizer_training.fit_transform(training["Lyric"])

# Densify into a DataFrame with one column per learned term
tf_idf_training_df = pd.DataFrame(
    tf_idf_training.toarray(),
    columns=tf_idf_vectorizer_training.get_feature_names_out(),
)
tf_idf_training_df.head()

Unnamed: 0,10,100,11,12,15,20,2x,3x,40,4x,...,yesterday,yo,yo yo,york,young,youth,yuh,yup,zero,zone
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.089578,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.035163,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
# Project the testing lyrics onto the feature space learned from the training set.
# Reuse the *fitted* training vectorizer and only call transform(): fitting a second
# vectorizer on the test lyrics (even with the same fixed vocabulary) recomputes the
# IDF weights from the test set, so the same term would be scaled differently at
# training time and at prediction time.
tf_idf_vectorizer_testing = tf_idf_vectorizer_training  # alias kept so later cells that use this name still work
tf_idf_testing = tf_idf_vectorizer_testing.transform(testing["Lyric"])
tf_idf_testing_df = pd.DataFrame(
    tf_idf_testing.toarray(),
    columns=tf_idf_vectorizer_testing.get_feature_names_out(),
)
tf_idf_testing_df.head()

Unnamed: 0,10,100,11,12,15,20,2x,3x,40,4x,...,yesterday,yo,yo yo,york,young,youth,yuh,yup,zero,zone
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
# Sanity check: the training and testing feature columns must match element-wise
training_features = tf_idf_vectorizer_training.get_feature_names_out()
testing_features = tf_idf_vectorizer_testing.get_feature_names_out()
(training_features == testing_features).all()

True

## Generate predicted accuracy

I will use a linear support vector machine classifier (scikit-learn's `LinearSVC`). Its documentation is here:

https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html#sklearn.svm.LinearSVC

In [16]:
# Estimate out-of-sample accuracy with stratified 10-fold cross-validation.
# NOTE: the TF-IDF features in `tf_idf_training_df` were fitted on all of `training`
# before these folds are formed, so every validation fold has already contributed to
# the vocabulary and IDF weights; the estimate below may therefore be slightly optimistic.
svm_results = []

# Use stratified k-fold, using 10 splits (shuffled with a fixed seed for repeatability)
skf = StratifiedKFold(n_splits=10, random_state=RANDOM_STATE, shuffle=True)

for train_index, test_index in skf.split(training, training["Genre"]):
    # split() yields *positional* row indices, so select rows with .iloc rather than
    # label-based .loc (which is only correct when the index is exactly 0..n-1).
    X_train = tf_idf_training_df.iloc[train_index]
    X_test = tf_idf_training_df.iloc[test_index]
    y_train = training["Genre"].iloc[train_index]
    y_test = training["Genre"].iloc[test_index]

    # Train Linear Support Vector Machine Classifier
    svm = LinearSVC(random_state=RANDOM_STATE, penalty="l1", dual=False)
    svm.fit(X_train, y_train)

    # Score the held-out fold and store its accuracy
    accuracy = svm.score(X_test, y_test)
    svm_results.append(accuracy)

# The predicted accuracy is the mean accuracy over the 10 folds
predicted_accuracy = np.mean(svm_results)
print("Predicted accuracy: ", predicted_accuracy)

Predicted accuracy:  0.65716


In [21]:
# Per-fold accuracies collected by the cross-validation loop (one entry per fold)
svm_results

[0.6578, 0.6648, 0.661, 0.654, 0.647, 0.6588, 0.6534, 0.6538, 0.6564, 0.6646]

## Generate predictions

In [18]:
# Fit on the full training feature matrix; predict on the held-out testing matrix
X_train, y_train = tf_idf_training_df, training["Genre"]
X_test = tf_idf_testing_df

# L1-penalised linear SVM, seeded for repeatability
svm = LinearSVC(penalty="l1", dual=False, random_state=RANDOM_STATE)
svm.fit(X_train, y_train)

# Predict a genre label for every row of the testing set
testing_predictions = svm.predict(X_test)
testing_predictions

array(['Rock', 'Rock', 'Rock', ..., 'Rock', 'Hip Hop', 'Pop'],
      dtype=object)

In [19]:
# Write the predicted labels to pred.csv, one per line, with no index and no header row
predictions_series = pd.Series(testing_predictions)
predictions_series.to_csv('pred.csv', index=False, header=False)