In [22]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

In [28]:
pwd

'C:\\Users\\Karthiek Duggirala'

In [10]:
df = pd.read_csv('C:\\Users\\Karthiek Duggirala\\Downloads\\preprocessed_data.csv')

In [11]:
df.head()

Unnamed: 0,lyrics,genre
0,summer high school when we first met we would ...,pop
1,yeah yeah yeah i can feel phoenix inside me i ...,pop
2,told them your dream they started i guess you ...,pop
3,if i lost it today would you stay could my lov...,pop
4,nice leg daisy duke make man go that is the wa...,pop


In [12]:
df.columns

Index(['lyrics', 'genre'], dtype='object')

In [13]:
X_train, X_test, y_train, y_test = train_test_split(df['lyrics'], df['genre'], test_size=0.2, random_state=42)

In [14]:
X_train.head()

18698    sombre ceremony teary testimony hole ground fl...
64464    dreaming tell difference mission godsent sacri...
8596     mind side though remember name go get somehow ...
4303     night left city dreamt wolf came wind cold tru...
49139    shootin kill nword shootin kill nword fuckin w...
Name: lyrics, dtype: object

In [15]:
y_train.head()

18698       pop
64464    hiphop
8596        pop
4303        pop
49139    hiphop
Name: genre, dtype: object

In [16]:
X_train.shape

(51621,)

In [17]:
X_test.shape

(12906,)

In [71]:
rf_pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', RandomForestClassifier(random_state=42))
])

In [72]:
rf_pipeline.fit(X_train, y_train)

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf', RandomForestClassifier(random_state=42))])

In [73]:
rf_pred = rf_pipeline.predict(X_test)

In [74]:
rf_acc = accuracy_score(y_test, rf_pred)

In [75]:
print("Random Forest Classifier Accuracy:", rf_acc)
print(classification_report(y_test, rf_pred))

Random Forest Classifier Accuracy: 0.7194328219432822
              precision    recall  f1-score   support

      hiphop       0.91      0.84      0.87      4263
         pop       0.63      0.59      0.61      4230
        rock       0.64      0.73      0.68      4413

    accuracy                           0.72     12906
   macro avg       0.73      0.72      0.72     12906
weighted avg       0.73      0.72      0.72     12906



In [76]:
# Gradient Boosting Classifier
gb_pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', GradientBoostingClassifier(random_state=42))
])

In [77]:
gb_pipeline.fit(X_train, y_train)

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf', GradientBoostingClassifier(random_state=42))])

In [78]:
gb_pred = gb_pipeline.predict(X_test)

In [79]:
gb_acc = accuracy_score(y_test, gb_pred)

In [80]:
print("Gradient Boosting Classifier Accuracy:", gb_acc)
print(classification_report(y_test, gb_pred))

Gradient Boosting Classifier Accuracy: 0.6892918022625135
              precision    recall  f1-score   support

      hiphop       0.91      0.79      0.85      4263
         pop       0.59      0.57      0.58      4230
        rock       0.61      0.71      0.66      4413

    accuracy                           0.69     12906
   macro avg       0.70      0.69      0.69     12906
weighted avg       0.70      0.69      0.69     12906



In [81]:
# Multinomial Naive Bayes Classifier
nb_pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB())
])

In [82]:
nb_pipeline.fit(X_train, y_train)

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf', MultinomialNB())])

In [83]:
nb_pred = nb_pipeline.predict(X_test)

In [84]:
nb_acc = accuracy_score(y_test, nb_pred)

In [85]:
print("Multinomial Naive Bayes Classifier Accuracy:", nb_acc)
print(classification_report(y_test, nb_pred))

Multinomial Naive Bayes Classifier Accuracy: 0.6648070664807066
              precision    recall  f1-score   support

      hiphop       0.75      0.85      0.80      4263
         pop       0.59      0.60      0.60      4230
        rock       0.64      0.54      0.59      4413

    accuracy                           0.66     12906
   macro avg       0.66      0.67      0.66     12906
weighted avg       0.66      0.66      0.66     12906



In [7]:
mlp_pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MLPClassifier(hidden_layer_sizes=(100,50), max_iter=1000))
])

In [8]:
mlp_pipeline.fit(X_train, y_train)

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf',
                 MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=1000))])


In [98]:
mlp_pred = mlp_pipeline.predict(X_test)

In [99]:
ml_acc = accuracy_score(y_test, mlp_pred)
print('MLP Classifier accuracy:', accuracy)

MLP Classifier accuracy: 0.6841004184100419


In [100]:
print("MLP Classifier Accuracy:", ml_acc)
print(classification_report(y_test, nb_pred))

MLP Classifier Accuracy: 0.6841004184100419
              precision    recall  f1-score   support

      hiphop       0.75      0.85      0.80      4263
         pop       0.59      0.60      0.60      4230
        rock       0.64      0.54      0.59      4413

    accuracy                           0.66     12906
   macro avg       0.66      0.67      0.66     12906
weighted avg       0.66      0.66      0.66     12906



In [18]:
svm_pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SVC(kernel='linear', C=1, gamma='scale'))
])

In [19]:
svm_pipeline.fit(X_train, y_train)

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf', SVC(C=1, kernel='linear'))])


In [None]:
svm_pred = svm_pipeline.predict(X_test)

In [None]:
svm_acc = accuracy_score(y_test, svm_pred)

In [21]:
print("SVM Classifier Accuracy:", svm_acc)
print(classification_report(y_test, svm_pred))

SVM Classifier Accuracy : 0.6284328219432822
              precision    recall  f1-score   support

      hiphop       0.87      0.79      0.84      4263
         pop       0.61      0.54      0.65      4293
        rock       0.68      0.59      0.62      4457

    accuracy                           0.72     12906
   macro avg       0.73      0.72      0.72     12906
weighted avg       0.73      0.72      0.72     12906


In [16]:
knn_pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', KNeighborsClassifier(n_neighbors=5))
])

In [17]:
knn_pipeline.fit(X_train, y_train)

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf', KNeighborsClassifier())])

In [18]:
knn_pred = knn_pipeline.predict(X_test)

In [19]:
knn_acc = accuracy_score(y_test, knn_pred)

In [20]:
print("KNN Classifier Accuracy:", knn_acc)
print(classification_report(y_test, knn_pred))

KNN Classifier Accuracy : 0.5328621283948262
              precision    recall  f1-score   support

      hiphop       0.76      0.71      0.74      4298
         pop       0.61      0.63      0.54      4272
        rock       0.64      0.57      0.62      4464

    accuracy                           0.72     12906
   macro avg       0.73      0.68      0.61     12906
weighted avg       0.73      0.68      0.61     12906


In [23]:
lr_pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LogisticRegression(random_state=42, solver='liblinear', max_iter=1000))
])

In [24]:
lr_pipeline.fit(X_train, y_train)

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf',
                 LogisticRegression(max_iter=1000, random_state=42,
                                    solver='liblinear'))])

In [25]:
lr_pred = lr_pipeline.predict(X_test)

In [26]:
lr_acc = accuracy_score(y_test, lr_pred)


In [27]:
print("Logistic Regression Classifier Accuracy:", lr_acc)
print(classification_report(y_test, lr_pred))

Logistic Regression Classifier Accuracy: 0.7026189369285604
              precision    recall  f1-score   support

      hiphop       0.90      0.81      0.85      4263
         pop       0.60      0.59      0.60      4230
        rock       0.63      0.70      0.67      4413

    accuracy                           0.70     12906
   macro avg       0.71      0.70      0.71     12906
weighted avg       0.71      0.70      0.71     12906

