In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import StratifiedShuffleSplit, ShuffleSplit
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import  RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier

In [2]:
df = pd.read_csv("./Dataset/dataset.csv")
df.head()

Unnamed: 0,author,data,lyrics,style,title
0,Midian Lima,,"Amanheceu,Nada pesquei,Parecia ser apenas mais...",eletrônica,Não Pare
1,MC Doni,,"Não adianta tu se declarar,Romance, compromiss...",eletrônica,Te Amo Sem Compromisso (To Nem Aí)
2,Sebastián Yatra,,"I remember when I met you,I didn't wanna fall,...",eletrônica,My Only One (No Hay Nadie Más) (part. Isabela ...
3,Billie Eilish,,"Thought I found a way,Thought I found a way ou...",eletrônica,Lovely (feat. Khalid)
4,Ed Sheeran,,"I found a love for me,Darling, just dive right...",eletrônica,Perfect


## Reducing for 5 genres for runtime purpose

In [3]:
from functools import reduce
import operator

_id = reduce(
    operator.add,
    [
        df["style"] == "pop",
        df["style"] == "rock",
        df["style"] == "reggae",
        df["style"] == "indie",
        df["style"] == "country"
    ]
)
df = df.loc[_id]
df["style"].unique()

  .format(op=op_str, alt_op=unsupported[op_str]))


array(['pop', 'country', 'reggae', 'rock', 'indie'], dtype=object)

## Creating a bag of words

In [4]:
styles_unique = df["style"].unique()
styles = df["style"]


bag_of_words = []
for author, lyrics, title in zip(df["author"], df["lyrics"], df["title"]):
    if author and lyrics and title: #avoiding empty and NaN fields
        bag_of_words.append("\n".join([str(author),str(title),str(lyrics)]))
        
bag_of_words[0]

"Michael Jackson\nThe Way You Make Me Feel\nHee! Hee!,Ooh!,Go on girl!,Aaow!,Hey, pretty baby with the high heels on,You give me fever like I've never, ever known,You're just a product of loveliness,I like the groove of your walk,,Your talk, your dress,I feel your fever from miles around,I'll pick you up in my car,And we'll paint the town,Just kiss me baby and tell me twice,That you're the one for me,The way you make me feel,(The way you make me feel),You really turn me on,(You really turn me on),You knock me off of my feet,(You knock me off of my feet),My lonely days are gone,(My lonely days are gone),I like the feeling you're giving me,Just hold me baby and I'm in ecstasy,Oh I'll be working from nine to five,To buy you things to keep you by my side,I never felt so in love before,Just promise baby, you'll love me forever more,I swear I'm keeping you satisfied,'Cause you're the one for me,The way you make me feel,(The way you make me feel),You really turn me on,(You really turn me on),

## Runing Classifiers

In [5]:
vectorizer = CountVectorizer(
    min_df=10,
    lowercase=True,
    strip_accents="unicode",
    stop_words="english",
    ngram_range=(2,2)
)
vectors = vectorizer.fit_transform(bag_of_words)

In [6]:
# spliting data for training and for testing
sss = ShuffleSplit(n_splits=1, test_size=0.3)

In [7]:
classifiers = {
    "Naive Bayes": MultinomialNB(alpha=1),
    "SGCD" : SGDClassifier(),
    "Random Forest": RandomForestClassifier(),
    "KNeighborsClassifier" : KNeighborsClassifier(3),
    "DecisionTreeClassifier": DecisionTreeClassifier(max_depth=5),
}

In [8]:
labels = np.array(styles.tolist())
for train_index, test_index in sss.split(vectors, labels):
    X_train, X_test = vectors[train_index], vectors[test_index]
    y_train, y_test = labels[train_index], labels[test_index]

    categories = np.unique(labels[test_index]).tolist()
    
    # instanciamos o classificador
    for classifier_name, classifier in classifiers.items():
        print(f"=========== {classifier_name} =========")
        clf = classifier
        # colocamos os nossos dados de treino
        clf.fit(X_train, y_train)
        # medimos sua acurácia com os dados de teste
        predictions = clf.predict(X_test)
        print(classification_report(y_test, predictions, target_names=categories))
        print(f"SCORE: {clf.score(X_test, y_test)}")
        print('-'*100)

              precision    recall  f1-score   support

     country       0.46      0.80      0.59       304
       indie       0.46      0.35      0.40       284
         pop       0.42      0.38      0.40       255
      reggae       0.56      0.59      0.58       316
        rock       0.33      0.12      0.17       221

   micro avg       0.47      0.47      0.47      1380
   macro avg       0.45      0.45      0.43      1380
weighted avg       0.46      0.47      0.44      1380

SCORE: 0.4717391304347826
----------------------------------------------------------------------------------------------------
              precision    recall  f1-score   support

     country       0.56      0.61      0.58       304
       indie       0.45      0.44      0.45       284
         pop       0.42      0.47      0.44       255
      reggae       0.59      0.56      0.58       316
        rock       0.32      0.29      0.31       221

   micro avg       0.48      0.48      0.48      1380
   m



              precision    recall  f1-score   support

     country       0.71      0.70      0.70       304
       indie       0.57      0.54      0.56       284
         pop       0.59      0.51      0.55       255
      reggae       0.64      0.69      0.66       316
        rock       0.41      0.47      0.44       221

   micro avg       0.59      0.59      0.59      1380
   macro avg       0.59      0.58      0.58      1380
weighted avg       0.60      0.59      0.59      1380

SCORE: 0.5927536231884057
----------------------------------------------------------------------------------------------------
              precision    recall  f1-score   support

     country       0.34      0.44      0.38       304
       indie       0.23      0.49      0.32       284
         pop       0.39      0.13      0.19       255
      reggae       0.51      0.37      0.43       316
        rock       0.20      0.06      0.10       221

   micro avg       0.32      0.32      0.32      1380
   m

  'precision', 'predicted', average, warn_for)
