In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import StratifiedShuffleSplit, ShuffleSplit
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import  RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier

In [2]:
df = pd.read_csv("dataset.csv")
df.head()

Unnamed: 0,author,data,lyrics,style,title
0,Midian Lima,,"Amanheceu,Nada pesquei,Parecia ser apenas mais...",eletrônica,Não Pare
1,MC Doni,,"Não adianta tu se declarar,Romance, compromiss...",eletrônica,Te Amo Sem Compromisso (To Nem Aí)
2,Sebastián Yatra,,"I remember when I met you,I didn't wanna fall,...",eletrônica,My Only One (No Hay Nadie Más) (part. Isabela ...
3,Billie Eilish,,"Thought I found a way,Thought I found a way ou...",eletrônica,Lovely (feat. Khalid)
4,Ed Sheeran,,"I found a love for me,Darling, just dive right...",eletrônica,Perfect


## Creating a bag of words

In [3]:
styles_unique = df["style"].unique()
styles = df["style"]


bag_of_words = []
for author, lyrics, title in zip(df["author"], df["lyrics"], df["title"]):
    if author and lyrics and title: #avoiding empty and NaN fields
        bag_of_words.append("\n".join([str(author),str(title),str(lyrics)]))

## Runing Classifiers

In [4]:
vectorizer = CountVectorizer(
    min_df=10,
    lowercase=True,
    strip_accents="unicode",
    stop_words="english",
    ngram_range=(2,2)
)
vectors = vectorizer.fit_transform(bag_of_words)

In [5]:
# spliting data for training and for testing
sss = ShuffleSplit(n_splits=1, test_size=0.3)

In [6]:
classifiers = {
    "Naive Bayes": MultinomialNB(alpha=1),
    "SGCD" : SGDClassifier(),
    "Random Forest": RandomForestClassifier(),
    "KNeighborsClassifier" : KNeighborsClassifier(3),
    "DecisionTreeClassifier": DecisionTreeClassifier(max_depth=5),
}

In [7]:
labels = np.array(styles.tolist())
for train_index, test_index in sss.split(vectors, labels):
    X_train, X_test = vectors[train_index], vectors[test_index]
    y_train, y_test = labels[train_index], labels[test_index]

    categories = np.unique(labels[test_index]).tolist()
    
    # instanciamos o classificador
    for classifier_name, classifier in classifiers.items():
        print(f"=========== {classifier_name} =========")
        clf = classifier
        # colocamos os nossos dados de treino
        clf.fit(X_train, y_train)
        # medimos sua acurácia com os dados de teste
        predictions = clf.predict(X_test)
        print(classification_report(y_test, predictions, target_names=categories))
        print(f"SCORE: {clf.score(X_test, y_test)}")
        print('-'*100)

              precision    recall  f1-score   support

 alternativo       0.08      0.04      0.05       276
       blues       0.26      0.13      0.18       297
      bolero       0.63      0.10      0.17       308
       brega       0.28      0.50      0.36       279
     country       0.17      0.45      0.25       296
    cuarteto       0.31      0.47      0.37       318
      cumbia       0.26      0.30      0.28       278
       dance       0.22      0.14      0.17       277
       disco       0.24      0.14      0.17       285
  eletrônica       0.33      0.11      0.16       301
     emocore       0.28      0.25      0.26       279
        fado       0.47      0.73      0.57       273
        folk       0.25      0.14      0.18       297
        funk       0.48      0.75      0.59       314
      grunge       0.28      0.30      0.29       282
    hardcore       0.41      0.08      0.13       321
       house       0.19      0.22      0.20       287
       indie       0.19    



              precision    recall  f1-score   support

 alternativo       0.17      0.16      0.16       276
       blues       0.31      0.25      0.28       297
      bolero       0.73      0.61      0.66       308
       brega       0.43      0.39      0.41       279
     country       0.40      0.32      0.36       296
    cuarteto       0.49      0.35      0.41       318
      cumbia       0.28      0.26      0.27       278
       dance       0.17      0.14      0.16       277
       disco       0.35      0.24      0.29       285
  eletrônica       0.20      0.10      0.14       301
     emocore       0.46      0.37      0.41       279
        fado       0.51      0.74      0.60       273
        folk       0.34      0.24      0.28       297
        funk       0.54      0.59      0.56       314
      grunge       0.33      0.44      0.38       282
    hardcore       0.31      0.21      0.25       321
       house       0.31      0.19      0.24       287
       indie       0.35    



              precision    recall  f1-score   support

 alternativo       0.25      0.33      0.28       276
       blues       0.50      0.51      0.50       297
      bolero       0.67      0.74      0.71       308
       brega       0.24      0.56      0.33       279
     country       0.66      0.66      0.66       296
    cuarteto       0.44      0.52      0.48       318
      cumbia       0.26      0.44      0.32       278
       dance       0.30      0.26      0.28       277
       disco       0.53      0.38      0.44       285
  eletrônica       0.14      0.07      0.10       301
     emocore       0.71      0.58      0.64       279
        fado       0.55      0.77      0.64       273
        folk       0.54      0.52      0.53       297
        funk       0.37      0.52      0.43       314
      grunge       0.38      0.60      0.46       282
    hardcore       0.59      0.37      0.46       321
       house       0.21      0.55      0.30       287
       indie       0.51    

  'precision', 'predicted', average, warn_for)
