In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import StratifiedShuffleSplit, ShuffleSplit
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

In [2]:
# Load the lyrics dataset from the CSV file in the working directory and
# preview its first rows (the bare last expression is rendered by the notebook).
DATASET_PATH = "dataset.csv"
df = pd.read_csv(DATASET_PATH)
df.head(5)

Unnamed: 0,author,data,lyrics,style,title
0,Midian Lima,,"Amanheceu,Nada pesquei,Parecia ser apenas mais...",eletrônica,Não Pare
1,MC Doni,,"Não adianta tu se declarar,Romance, compromiss...",eletrônica,Te Amo Sem Compromisso (To Nem Aí)
2,Sebastián Yatra,,"I remember when I met you,I didn't wanna fall,...",eletrônica,My Only One (No Hay Nadie Más) (part. Isabela ...
3,Billie Eilish,,"Thought I found a way,Thought I found a way ou...",eletrônica,Lovely (feat. Khalid)
4,Ed Sheeran,,"I found a love for me,Darling, just dive right...",eletrônica,Perfect


## Creating a bag of words

In [3]:
# Build one text document per song by joining author, title and lyrics,
# keeping the class labels (`styles`) aligned row-for-row with the documents.

# Columns concatenated into each document, in the order they are joined.
text_columns = ["author", "title", "lyrics"]
text_fields = df[text_columns]

# Keep only rows where every text field is present and non-empty.
# A plain truthiness test (`if author and lyrics and title`) does NOT reject
# missing values: pandas stores them as float NaN, which is truthy, so they
# would be turned into the literal text "nan" inside the document.
has_text = (text_fields.notna() & text_fields.astype(str).ne("")).all(axis=1)

# Apply the same mask to the labels; otherwise dropping a row from the
# documents would leave the labels one row longer and shifted.
styles = df.loc[has_text, "style"]
styles_unique = styles.unique()

# One newline-joined "author / title / lyrics" string per kept row.
bag_of_words = [
    "\n".join(fields)
    for fields in text_fields[has_text].astype(str).itertuples(index=False, name=None)
]

## Pre-processing the data

In [None]:
# Planned pre-processing stages, listed in the order they appear below.
# NOTE(review): language_handler, natural_language_processing and word2vec are
# not defined or imported in the visible cells, and this cell has no execution
# count -- presumably a sketch of future work; confirm before running it, since
# evaluating the list calls all three names.
pre_processing_pipeline = [
    language_handler(), # translate every term in the bag of words into a single language
    natural_language_processing(), # remove stop words, apply lemmatization, handle anagrams, ...
    word2vec(), # use a pre-trained word2vec embedding to group synonyms
]

## Running Classifiers

In [4]:
# Turn each document into a sparse vector of token counts.
vectorizer = CountVectorizer(
    strip_accents="unicode",  # fold accented characters to their unaccented form
    lowercase=True,           # case-insensitive vocabulary
    min_df=10,                # ignore terms found in fewer than 10 documents
)
# Learn the vocabulary from the documents and build the document-term matrix.
vectors = vectorizer.fit_transform(bag_of_words)

In [5]:
# Splitting the data for training and for testing: one random split with 30%
# of the rows held out for testing.
# A fixed random_state makes the split -- and every metric reported from it --
# reproducible across kernel restarts; without it each run shuffles differently.
# NOTE(review): StratifiedShuffleSplit is imported but unused and the variable
# is named `sss`; confirm whether a stratified split was intended.
sss = ShuffleSplit(n_splits=1, test_size=0.3, random_state=42)

In [6]:
# Train and evaluate a Multinomial Naive Bayes classifier on each split
# produced by `sss`, printing a per-class report and the overall accuracy.

# Class labels as a NumPy array so they can be indexed with the split indices.
# Assumes `styles` has exactly one entry per row of `vectors` -- verify upstream.
labels = np.array(styles.tolist())
for train_index, test_index in sss.split(vectors, labels):
    X_train, X_test = vectors[train_index], vectors[test_index]
    y_train, y_test = labels[train_index], labels[test_index]

    # instantiate the classifier (alpha=1 is the additive smoothing parameter)
    clf = MultinomialNB(alpha=1)
    # fit it on the training data
    clf.fit(X_train, y_train)
    # measure its accuracy on the test data
    predictions = clf.predict(X_test)

    # classification_report builds its rows from the classes found in y_test
    # AND in the predictions. Deriving the names from y_test alone breaks
    # (ValueError: number of classes does not match size of target_names) as
    # soon as the model predicts a style that is absent from the test fold, so
    # use the union and pass it explicitly as both the labels and their names.
    categories = sorted(set(y_test.tolist()) | set(predictions.tolist()))
    print(classification_report(y_test, predictions, labels=categories, target_names=categories))
    print(f"SCORE: {clf.score(X_test, y_test)}")
    print('-'*100)

              precision    recall  f1-score   support

 alternativo       0.29      0.01      0.03       302
       blues       0.57      0.04      0.07       307
      bolero       0.70      0.06      0.12       301
       brega       0.38      0.47      0.42       339
     country       0.16      0.62      0.25       289
    cuarteto       0.27      0.56      0.36       308
      cumbia       0.35      0.33      0.34       305
       dance       0.31      0.07      0.12       294
       disco       0.48      0.05      0.08       305
  eletrônica       0.34      0.04      0.06       281
     emocore       0.38      0.28      0.32       290
        fado       0.64      0.80      0.71       277
        folk       0.21      0.08      0.11       297
        funk       0.57      0.83      0.68       310
      grunge       0.18      0.70      0.29       289
    hardcore       0.00      0.00      0.00       316
       house       0.44      0.33      0.38       269
       indie       0.27    