In [24]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

In [31]:
# Display the kernel's current working directory.
# NOTE(review): presumably resolved through IPython automagic (`%pwd`); leftover exploration cell — confirm it is still needed.
pwd

'C:\\Users\\Karthiek Duggirala'

In [3]:
# Location of the pre-trained GloVe vectors text file that is passed to load_embeddings.
# NOTE(review): absolute, machine-specific path — other users must edit it before running.
glove_file = 'C:/Users/Karthiek Duggirala/Downloads/glove.6B/glove.6B.100d.txt'

In [4]:
# Load pre-trained word embeddings into memory
def load_embeddings(filename):
    """Read a GloVe-style text file into a ``{word: vector}`` dictionary.

    Every non-blank line is split on whitespace; the first field is taken as
    the token and the remaining fields are parsed as its float components.

    Parameters
    ----------
    filename : str or path-like
        Path to the UTF-8 encoded embeddings file.

    Returns
    -------
    dict
        Maps each token to a ``float32`` numpy array. If a token appears more
        than once, the last occurrence wins.

    Raises
    ------
    OSError
        If the file cannot be opened.
    ValueError
        If a component cannot be parsed as a float.
    """
    embeddings = {}
    with open(filename, 'r', encoding='utf8') as f:
        for line in f:
            values = line.split()
            if not values:
                # Blank or whitespace-only line (e.g. a trailing newline at
                # end of file): there is no token to index, so skip it
                # instead of failing on values[0].
                continue
            embeddings[values[0]] = np.asarray(values[1:], dtype='float32')
    return embeddings

In [5]:
# Define a custom vectorizer to use the pre-trained word embeddings
class GloveVectorizer:
    """Represent each document as the mean of its words' embedding vectors.

    Documents are split on whitespace. Words missing from the embedding
    dictionary contribute an all-zero vector, and a document with no words at
    all is represented by an all-zero vector.

    Parameters
    ----------
    glove : dict
        Maps a word to its embedding vector (all vectors are expected to have
        the same length). Must contain at least one entry, because the
        embedding dimension is read from the first value.
    """

    def __init__(self, glove):
        self.glove = glove
        # Not used by fit/transform; kept so existing code that reads this
        # attribute keeps working.
        self.word2idx = {word: idx for idx, word in enumerate(glove.keys())}
        self.dim = len(next(iter(glove.values())))

    def fit(self, X, y=None):
        """No-op fit; there is nothing to learn. Returns ``self``.

        ``y`` is optional so the object can also be fitted without labels.
        """
        return self

    def transform(self, X):
        """Convert an iterable of strings into a 2-D array of mean vectors.

        Parameters
        ----------
        X : iterable of str
            Documents to embed (e.g. a pandas Series or a list).

        Returns
        -------
        numpy.ndarray
            Array of shape ``(number of documents, self.dim)``.
        """
        unknown = np.zeros(self.dim)
        rows = []
        for doc in X:
            vectors = [self.glove.get(word, unknown) for word in doc.split()]
            # np.mean over an empty list yields a NaN scalar, which cannot be
            # stacked with real vectors; use the zero vector for empty docs.
            rows.append(np.mean(vectors, axis=0) if vectors else unknown)
        if not rows:
            # np.stack rejects an empty sequence; return an empty matrix with
            # the right number of columns instead.
            return np.empty((0, self.dim))
        return np.stack(rows)

In [6]:
# Read the preprocessed dataset from CSV into a DataFrame (the split happens in a later cell).
df = pd.read_csv(
    filepath_or_buffer='C:\\Users\\Karthiek Duggirala\\Downloads\\preprocessed_data.csv'
)

In [7]:
# Hold out 20% of the rows for testing; stratify on genre and fix the seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df['lyrics'],
    df['genre'],
    test_size=0.2,
    stratify=df['genre'],
    random_state=42,
)

In [8]:
# Parse the embeddings file into a word -> vector dict, then wrap it in the averaging vectorizer.
glove = load_embeddings(filename=glove_file)
vectorizer = GloveVectorizer(glove=glove)

In [9]:
# Pipeline: GloVe mean-vector features feeding a random forest (seeded for repeatability).
rf_pipeline = Pipeline(steps=[
    ('vect', vectorizer),
    ('clf', RandomForestClassifier(random_state=42)),
])

In [10]:
# Train the random forest pipeline on the training split (evaluation happens in later cells).
rf_pipeline.fit(X=X_train, y=y_train)

Pipeline(steps=[('vect',
                 <__main__.GloveVectorizer object at 0x0000022B5EC43EE0>),
                ('clf', RandomForestClassifier(random_state=42))])

In [11]:
# Predict genres for the test split with the fitted random forest pipeline.
rf_pred = rf_pipeline.predict(X=X_test)

In [12]:
# Fraction of test samples whose predicted genre matches the true genre.
rf_acc = accuracy_score(y_true=y_test, y_pred=rf_pred)

In [13]:
# Show the overall accuracy, then the per-genre precision / recall / F1 table.
print("Random Forest Classifier Accuracy:", rf_acc)
print(classification_report(y_true=y_test, y_pred=rf_pred))

Random Forest Classifier Accuracy: 0.67100573376724
              precision    recall  f1-score   support

      hiphop       0.88      0.79      0.83      4256
         pop       0.58      0.59      0.59      4293
        rock       0.59      0.63      0.61      4357

    accuracy                           0.67     12906
   macro avg       0.68      0.67      0.68     12906
weighted avg       0.68      0.67      0.67     12906



In [15]:
# Pipeline: GloVe mean-vector features feeding a gradient boosting classifier (seeded).
gb_pipeline = Pipeline(steps=[
    ('vect', vectorizer),
    ('clf', GradientBoostingClassifier(random_state=42)),
])

In [16]:
# Train the gradient boosting pipeline on the training split.
gb_pipeline.fit(X=X_train, y=y_train)

Pipeline(steps=[('vect',
                 <__main__.GloveVectorizer object at 0x0000022B5EC43EE0>),
                ('clf', GradientBoostingClassifier(random_state=42))])

In [17]:
# Predict genres for the test split with the fitted gradient boosting pipeline.
gb_pred = gb_pipeline.predict(X=X_test)

In [18]:
# Fraction of test samples the gradient boosting pipeline labelled correctly.
gb_acc = accuracy_score(y_true=y_test, y_pred=gb_pred)

In [19]:
# Show the overall accuracy, then the per-genre precision / recall / F1 table.
print("Gradient Boosting Classifier Accuracy:", gb_acc)
print(classification_report(y_true=y_test, y_pred=gb_pred))

Gradient Boosting Classifier Accuracy: 0.6473733147373315
              precision    recall  f1-score   support

      hiphop       0.83      0.79      0.81      4256
         pop       0.55      0.58      0.56      4293
        rock       0.58      0.58      0.58      4357

    accuracy                           0.65     12906
   macro avg       0.65      0.65      0.65     12906
weighted avg       0.65      0.65      0.65     12906



In [26]:
# Pipeline: GloVe mean-vector features feeding logistic regression.
# max_iter is set to 1000 (explicit iteration cap for the solver).
lr_pipeline = Pipeline(steps=[
    ('vect', vectorizer),
    ('clf', LogisticRegression(random_state=42, max_iter=1000)),
])

In [27]:
# Train the logistic regression pipeline on the training split.
lr_pipeline.fit(X=X_train, y=y_train)

Pipeline(steps=[('vect',
                 <__main__.GloveVectorizer object at 0x0000022B5EC43EE0>),
                ('clf', LogisticRegression(max_iter=1000, random_state=42))])

In [28]:
# Predict genres for the test split with the fitted logistic regression pipeline.
lr_pred = lr_pipeline.predict(X=X_test)

In [29]:
# Fraction of test samples the logistic regression pipeline labelled correctly.
lr_acc = accuracy_score(y_true=y_test, y_pred=lr_pred)

In [30]:
# Show the overall accuracy, then the per-genre precision / recall / F1 table.
print("Logistic Regression Classifier Accuracy:", lr_acc)
print(classification_report(y_true=y_test, y_pred=lr_pred))

Logistic Regression Classifier Accuracy: 0.6242057957539129
              precision    recall  f1-score   support

      hiphop       0.74      0.79      0.76      4256
         pop       0.55      0.54      0.55      4293
        rock       0.57      0.55      0.56      4357

    accuracy                           0.62     12906
   macro avg       0.62      0.63      0.62     12906
weighted avg       0.62      0.62      0.62     12906



In [36]:
# Pipeline: GloVe mean-vector features feeding Gaussian Naive Bayes with default settings.
nb_pipeline = Pipeline(steps=[
    ('vect', vectorizer),
    ('clf', GaussianNB()),
])

In [37]:

# Train the Naive Bayes pipeline on the training split (evaluation happens in later cells).
nb_pipeline.fit(X=X_train, y=y_train)


Pipeline(steps=[('vect',
                 <__main__.GloveVectorizer object at 0x0000022B5EC43EE0>),
                ('clf', GaussianNB())])

In [38]:
# Predict genres for the test split with the fitted Naive Bayes pipeline.
nb_pred = nb_pipeline.predict(X=X_test)

In [39]:
# Fraction of test samples the Naive Bayes pipeline labelled correctly.
nb_acc = accuracy_score(y_true=y_test, y_pred=nb_pred)

In [41]:
# Show the overall accuracy, then the per-genre precision / recall / F1 table.
print("Gaussian Naive Bayes Classifier Accuracy:", nb_acc)
print(classification_report(y_true=y_test, y_pred=nb_pred))

Gaussian Naive Bayes Classifier Accuracy: 0.6032078103207811
              precision    recall  f1-score   support

      hiphop       0.73      0.79      0.76      4256
         pop       0.52      0.54      0.53      4293
        rock       0.54      0.48      0.51      4357

    accuracy                           0.60     12906
   macro avg       0.60      0.60      0.60     12906
weighted avg       0.60      0.60      0.60     12906



In [9]:
# Pipeline: GloVe mean-vector features feeding a support vector classifier (seeded, otherwise defaults).
svm_pipeline = Pipeline(steps=[
    ('vect', vectorizer),
    ('clf', SVC(random_state=42)),
])

In [10]:
# Train the SVM pipeline on the training split (evaluation happens in later cells).
svm_pipeline.fit(X=X_train, y=y_train)

Pipeline(steps=[('vect',
                 <__main__.GloveVectorizer object at 0x00000279FD77EEE0>),
                ('clf', SVC(random_state=42))])


In [12]:
# Predict genres for the test split with the fitted SVM pipeline.
svm_pred = svm_pipeline.predict(X=X_test)

In [13]:
# Fraction of test samples the SVM pipeline labelled correctly.
svm_acc = accuracy_score(y_true=y_test, y_pred=svm_pred)

In [14]:
# Show the overall accuracy, then the per-genre precision / recall / F1 table.
print("SVM Classifier Accuracy:", svm_acc)
print(classification_report(y_true=y_test, y_pred=svm_pred))

SVM Classifier Accuracy: 0.6173674789542442
              precision    recall  f1-score   support

      hiphop       0.81      0.78      0.84      4263
         pop       0.58      0.54      0.51      4230
        rock       0.59      0.55      0.52      4431

    accuracy                           0.65     12906
   macro avg       0.71      0.68      0.65     12906
weighted avg       0.74      0.69      0.65     12906


In [16]:
# Pipeline: GloVe mean-vector features feeding a k-nearest-neighbours classifier with k = 5.
knn_pipeline = Pipeline(steps=[
    ('vect', vectorizer),
    ('clf', KNeighborsClassifier(n_neighbors=5)),
])

In [17]:
# Train the KNN pipeline on the training split (evaluation happens in later cells).
knn_pipeline.fit(X=X_train, y=y_train)

Pipeline(steps=[('vect',
                 <__main__.GloveVectorizer object at 0x00000279FD77EEE0>),
                ('clf', KNeighborsClassifier())])


In [20]:
# Predict genres for the test split with the fitted KNN pipeline.
knn_pred = knn_pipeline.predict(X=X_test)

In [21]:
# Fraction of test samples the KNN pipeline labelled correctly.
knn_acc = accuracy_score(y_true=y_test, y_pred=knn_pred)

In [22]:
# Show the overall accuracy, then the per-genre precision / recall / F1 table.
print("KNN Classifier Accuracy:", knn_acc)
print(classification_report(y_true=y_test, y_pred=knn_pred))

KNN Classifier Accuracy: 0.5458621283948262
              precision    recall  f1-score   support

      hiphop       0.54      0.85      0.65      4269
         pop       0.53      0.45      0.56      4232
        rock       0.58      0.39      0.42      4414

    accuracy                           0.57     12906
   macro avg       0.56      0.56      0.54     12906
weighted avg       0.58      0.54      0.54     12906


In [25]:
# Pipeline: GloVe mean-vector features feeding a multi-layer perceptron
# with two hidden layers (100 and 50 units), seeded for repeatability.
mlp_pipeline = Pipeline(steps=[
    ('vect', vectorizer),
    (
        'clf',
        MLPClassifier(
            hidden_layer_sizes=(100, 50),
            max_iter=1000,
            random_state=42,
        ),
    ),
])


In [27]:
# Train the MLP pipeline on the training split (evaluation happens in later cells).
mlp_pipeline.fit(X=X_train, y=y_train)

Pipeline(steps=[('vect',
                 <__main__.GloveVectorizer object at 0x00000279FD77EEE0>),
                ('clf',
                 MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=1000,
                               random_state=42))])


In [28]:
# Predict genres for the test split with the fitted MLP pipeline.
mlp_pred = mlp_pipeline.predict(X=X_test)

In [29]:
# Fraction of test samples the MLP pipeline labelled correctly.
mlp_acc = accuracy_score(y_true=y_test, y_pred=mlp_pred)

In [30]:
# Show the overall accuracy, then the per-genre precision / recall / F1 table.
print("MLP Classifier Accuracy:", mlp_acc)
print(classification_report(y_true=y_test, y_pred=mlp_pred))

MLP Classifier Accuracy: 0.6698621283948262
              precision    recall  f1-score   support

      hiphop       0.85      0.78      0.81      4261
         pop       0.56      0.57      0.54      4228
        rock       0.64      0.58      0.60      4418

    accuracy                           0.66     12906
   macro avg       0.66      0.66      0.66     12906
weighted avg       0.66      0.66      0.66     12906
