In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
from gensim.models import Word2Vec
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

In [6]:
pwd

'C:\\Users\\Karthiek Duggirala'

In [2]:
df = pd.read_csv('C:\\Users\\Karthiek Duggirala\\Downloads\\preprocessed_data.csv')

In [3]:
# Train Word2Vec model on the corpus
corpus = [doc.split() for doc in df['lyrics']]
w2v_model = Word2Vec(corpus, vector_size=100, window=5, min_count=1)

In [4]:
# Convert text data into word embeddings using the trained Word2Vec model
X = np.array([np.mean([w2v_model.wv[word] for word in doc.split() if word in w2v_model.wv], axis=0) for doc in df['lyrics']])
y = np.array(df['genre'])

In [5]:
# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
# Define pipeline with Random Forest Classifier
rf_pipeline = Pipeline([
    ('clf', RandomForestClassifier(random_state=42))
])

In [10]:
# Fit pipeline to training data
rf_pipeline.fit(X_train, y_train)

Pipeline(steps=[('clf', RandomForestClassifier(random_state=42))])

In [11]:
# Make predictions on test data
rf_pred = rf_pipeline.predict(X_test)

In [12]:
# Evaluate model performance
rf_acc = accuracy_score(y_test, rf_pred)

In [13]:
print("Random Forest Classifier Accuracy:", rf_acc)
print(classification_report(y_test, rf_pred))

Random Forest Classifier Accuracy: 0.6817759181775919
              precision    recall  f1-score   support

      hiphop       0.87      0.80      0.83      4263
         pop       0.59      0.60      0.60      4230
        rock       0.61      0.65      0.63      4413

    accuracy                           0.68     12906
   macro avg       0.69      0.68      0.69     12906
weighted avg       0.69      0.68      0.68     12906



In [19]:
# Define pipeline with Gradient Boosting Classifier
gb_pipeline = Pipeline([
    ('clf', GradientBoostingClassifier(random_state=42))
])

In [20]:
# Fit pipeline to training data
gb_pipeline.fit(X_train, y_train)

Pipeline(steps=[('clf', GradientBoostingClassifier(random_state=42))])

In [21]:
# Make predictions on test data
gb_pred = gb_pipeline.predict(X_test)

In [22]:
# Evaluate model performance
gb_acc = accuracy_score(y_test, gb_pred)

In [23]:
print("Gradient Boosting Classifier Accuracy:", gb_acc)
print(classification_report(y_test, gb_pred))

Gradient Boosting Classifier Accuracy: 0.663799783046645
              precision    recall  f1-score   support

      hiphop       0.84      0.78      0.81      4263
         pop       0.57      0.58      0.58      4230
        rock       0.60      0.62      0.61      4413

    accuracy                           0.66     12906
   macro avg       0.67      0.66      0.67     12906
weighted avg       0.67      0.66      0.67     12906



In [27]:
# Define pipeline with Logistic Regression
lr_pipeline = Pipeline([
    ('clf', LogisticRegression(random_state=42, max_iter=1000))
])

In [28]:
# Fit pipeline to training data
lr_pipeline.fit(X_train, y_train)

Pipeline(steps=[('clf', LogisticRegression(max_iter=1000, random_state=42))])

In [29]:
# Make predictions on test data
lr_pred = lr_pipeline.predict(X_test)

In [30]:
# Evaluate model performance
lr_acc = accuracy_score(y_test, lr_pred)

In [31]:
print("Logistic Regression Accuracy:", lr_acc)
print(classification_report(y_test, lr_pred))

Logistic Regression Accuracy: 0.6518673485200682
              precision    recall  f1-score   support

      hiphop       0.80      0.79      0.80      4263
         pop       0.57      0.54      0.55      4230
        rock       0.59      0.63      0.61      4413

    accuracy                           0.65     12906
   macro avg       0.65      0.65      0.65     12906
weighted avg       0.65      0.65      0.65     12906



In [34]:
# Define pipeline with Naive Bayes
nb_pipeline = Pipeline([
    ('clf', GaussianNB())
])

In [35]:
# Fit pipeline to training data
nb_pipeline.fit(X_train, y_train)

Pipeline(steps=[('clf', GaussianNB())])

In [36]:
# Make predictions on test data
nb_pred = nb_pipeline.predict(X_test)

In [37]:
# Evaluate model performance
nb_acc = accuracy_score(y_test, nb_pred)

In [38]:
print("Naive Bayes Accuracy:", nb_acc)
print(classification_report(y_test, nb_pred))

Naive Bayes Accuracy: 0.6141329614132961
              precision    recall  f1-score   support

      hiphop       0.73      0.79      0.76      4263
         pop       0.54      0.55      0.54      4230
        rock       0.56      0.51      0.53      4413

    accuracy                           0.61     12906
   macro avg       0.61      0.62      0.61     12906
weighted avg       0.61      0.61      0.61     12906



In [40]:
# Define pipeline with SVM
svm_pipeline = Pipeline([
    ('clf', SVC(kernel='linear', C=1, random_state=42))
])

In [41]:
# Fit pipeline to training data
svm_pipeline.fit(X_train, y_train)

Pipeline(steps=[('clf', SVC(C=1, kernel='linear', random_state=42))])

In [42]:
# Make predictions on test data
svm_pred = svm_pipeline.predict(X_test)

In [43]:
# Evaluate model performance
svm_acc = accuracy_score(y_test, svm_pred)


In [44]:
print("SVM Accuracy:", svm_acc)
print(classification_report(y_test, svm_pred))

SVM Accuracy: 0.6525646985898031
              precision    recall  f1-score   support

      hiphop       0.81      0.77      0.79      4263
         pop       0.57      0.53      0.55      4230
        rock       0.59      0.65      0.62      4413

    accuracy                           0.65     12906
   macro avg       0.66      0.65      0.65     12906
weighted avg       0.66      0.65      0.65     12906



In [61]:
# Define pipeline with KNN
knn_pipeline = Pipeline([
    ('clf', KNeighborsClassifier(n_neighbors=5))
])

In [62]:
# Fit pipeline to training data
knn_pipeline.fit(X_train, y_train)

Pipeline(steps=[('clf', KNeighborsClassifier())])

In [63]:
# Make predictions on test data
knn_pred = knn_pipeline.predict(X_test)

In [64]:
# Evaluate model performance
knn_acc = accuracy_score(y_test, knn_pred)

In [65]:
print("KNN Accuracy:", knn_acc)
print(classification_report(y_test, knn_pred))

KNN Accuracy: 0.568727723539439
              precision    recall  f1-score   support

      hiphop       0.56      0.90      0.69      4263
         pop       0.57      0.50      0.54      4230
        rock       0.60      0.31      0.41      4413

    accuracy                           0.57     12906
   macro avg       0.58      0.57      0.54     12906
weighted avg       0.58      0.57      0.54     12906



In [67]:
# Define pipeline with MLP
mlp_pipeline = Pipeline([
    ('clf', MLPClassifier(hidden_layer_sizes=(100,), activation='relu', solver='adam', random_state=42))
])

In [68]:
# Fit pipeline to training data
mlp_pipeline.fit(X_train, y_train)



Pipeline(steps=[('clf', MLPClassifier(random_state=42))])

In [69]:
# Make predictions on test data
mlp_pred = mlp_pipeline.predict(X_test)

In [70]:
# Evaluate model performance
mlp_acc = accuracy_score(y_test, mlp_pred)

In [71]:
print("MLP Accuracy:", mlp_acc)
print(classification_report(y_test, mlp_pred))

MLP Accuracy: 0.6770494343716101
              precision    recall  f1-score   support

      hiphop       0.87      0.80      0.83      4263
         pop       0.58      0.59      0.58      4230
        rock       0.61      0.64      0.62      4413

    accuracy                           0.68     12906
   macro avg       0.68      0.68      0.68     12906
weighted avg       0.68      0.68      0.68     12906

