In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras.callbacks import EarlyStopping
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense
from sklearn.metrics import classification_report

# Many to one :  lyrics to class pop or rap

# Data exploration

In [2]:
# Load the training data
train_df = pd.read_csv('./data/train.csv')

In [3]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51054 entries, 0 to 51053
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   lyric   51054 non-null  object
 1   class   51054 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 797.8+ KB


In [4]:
train_df.head()

Unnamed: 0,lyric,class
0,Can't drink without thinkin' about you,1
1,Now Lil Pump flyin' private jet (Yuh),0
2,"No, matter fact, you ain't help me when I had ...",0
3,"And you could find me, I ain't hidin'",0
4,From the way you talk to the way you move,1


In [5]:
train_df['class'].value_counts()

class
0    28885
1    22169
Name: count, dtype: int64

# ML bag-of-words

waarom? bag-of-words kan ook in machine learning maar is eigenlijk niet zo goed

### Test-train split

In [6]:
# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(train_df['lyric'], train_df['class'], test_size=0.2, random_state=42) # we use random state because there is no time related to the data

### preprocessing text

In [7]:
#remove commas in lyrics
train_df['lyric'] = train_df['lyric'].str.replace(',', '')

dealing with censored words (swear words)

In [8]:
# Define mapping of censored words to original words
from censoredwords import censoredwords

print(censoredwords)

{'Mothaf***a': 'Motherfucker', 'Motherf***ers': 'Motherfuckers', 'a**': 'ass', 'a**cheeks': 'asscheeks', 'a**hole': 'asshole', 'b****': 'bitch', 'b****es': 'bitches', 'b****in': 'bitching', 'b****y': 'bitchy', 'd***ed': 'damned', 'd***': 'dick', 'd***ie': 'doggie', 'd***ies': 'doggies', 'd***ory': 'doggory', 'd***y': 'doddy', 'f***': 'fack', 'fack***': 'facking', 'f***boy': 'fuckboy', 'f***boys': 'fuckboys', 'f***ed': 'fucked', 'f***er': 'fucker', 'f***ery': 'fuckery', 'f***in': 'fucking', 'f***ing': 'fucking', 'f***s': 'fucks', 'godd***it': 'goddammit', 'mothaf***a': 'motherfucker', 'mothaf***as': 'motherfuckers', 'mothaf***in': 'motherfucking', 'motherf***a': 'motherfucker', 'motherf***er': 'motherfucker', 'motherf***ers': 'motherfuckers', 'motherf***in': 'motherfucking', 'motherf***ing': 'motherfucking', 'muhf***as': 'motherfuckers', 'muhf***in': 'motherfucking', 'n****': 'nigga', 'n****s': 'niggas', 's***faced': 'shitfaced', 'p****': 'pussy', 's***': 'sick', 's***s': 'shits', 's***

In [9]:
# Define mapping of censored words to original words
from censoredwords import censoredwords

# Reverse the censorship of the words
for censored_word, original_word in censoredwords.items():
    train_df['lyric'] = train_df['lyric'].str.replace(censored_word, original_word)


get rid of words like hidin' , thinkin'

In [10]:
def replace_apostrophes_with_g(word):
    # check if the word ends with an apostrophe
    if word.endswith("'"):
        # replace the apostrophe with "g"
        word = word[:-1] + "g"
    return word
  
def remove_apostrophes(text):
    words = text.split()
    words = [replace_apostrophes_with_g(word) for word in words]
    return " ".join(words)

train_df["lyric"] =  train_df["lyric"].apply(remove_apostrophes)

In [11]:
print(train_df["lyric"])

0                   Can't drink without thinking about you
1                    Now Lil Pump flying private jet (Yuh)
2        No matter fact you ain't help me when I had no...
3                     And you could find me I ain't hiding
4                From the way you talk to the way you move
                               ...                        
51049    I told her pour me some more then she went rig...
51050              Hit the ground and crawl to the dresser
51051    Just keep breathing and breathing and breathin...
51052         Down go the system long live the king (King)
51053    If your mother knew all the things we do (From...
Name: lyric, Length: 51054, dtype: object


cleaning up text

In [12]:
def clean_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove non-alphabetic characters
    text = re.sub('[^a-zA-Z]', ' ', text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = text.split()
    words = [word for word in words if word not in stop_words] #all words that are not in stop_words
    
    # Stem the words = reducing to root form, example: running -> run
    stemmer = SnowballStemmer("english") 
    stemmed_words = [stemmer.stem(word) for word in words]
    # Join the words back into a single string
    cleaned_text = ' '.join(stemmed_words)
    return cleaned_text

# De not is niet zo belangrijk
# smaller words gehouden omdat dit veel voorkomt in rap

In [13]:
# Pas de preprocessing toe op de training- en validatiedata

#The shape attribute for numpy arrays returns the dimensions of the array. If Y has n rows and m columns, then Y.shape is (n,m). So Y.shape[0] is n.
for i in range(X_train.shape[0]):
  X_train.iloc[i] = clean_text(X_train.iloc[i])

for i in range(X_val.shape[0]):
  X_val.iloc[i] = clean_text(X_val.iloc[i])

In [14]:
X_train

35972                                         man stay way
28800                                                 pass
5286                          think caus rememb first time
25925    pop xan fifti thousand japan fifti thousand japan
9889                                                  yeah
                               ...                        
11284                                               sick n
44732                          nunca vieron caer nah nunca
38158                                 everi time sit couch
860                                         loos come come
15795                                  kick call back come
Name: lyric, Length: 40843, dtype: object

In [15]:
X_train_clean = X_train.copy()
X_val_clean = X_val.copy()

### Bag of words

In [16]:
#create bag of words
from sklearn.feature_extraction.text import TfidfTransformer

# 1. Count the number of words
count_vect = CountVectorizer()
X_train_bag_of_words = count_vect.fit_transform(X_train) 
X_val_bag_of_words = count_vect.transform(X_val) # combinaties woorden en getallen zijn anders dan in training set

# Take the frequency of the words intoaccount
# wat irrelevant is krijgt mindere waarde dat andere woorden
tfidf_transformer = TfidfTransformer()
tf_transformer = TfidfTransformer(use_idf=True).fit(X_train_bag_of_words)
X_train_tf = tf_transformer.transform(X_train_bag_of_words)
X_val_tf = tf_transformer.transform(X_val_bag_of_words)

### Training classifier

fitting

In [17]:
#logistic classifier on bag-of-words
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
paramaters = [{'C' : [0.001, 0.01, 0.1, 1, 10, 100, 1000,10000, 100000]}                                       
             ] # penalisation parameter, smaller C = stronger penalisation
                            
grid_search = GridSearchCV(estimator = model, 
                           param_grid = paramaters,
                           scoring = 'accuracy', # accuracy is the metric to evaluate the model
                           cv = 4,
                           n_jobs = -1)

grid_search = grid_search.fit(X_train_tf, y_train)

best_accuracy = grid_search.best_score_ 
best_parameters = grid_search.best_params_  

print('Best accuracy : ', grid_search.best_score_)
print('Best parameters :', grid_search.best_params_  )

Best accuracy :  0.7654434262717772
Best parameters : {'C': 10}


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Performing a second grid search, zooming in on values close to the best C-value from the first grid search

In [18]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
paramaters = [{'C' : [int(x) for x in np.linspace(start = 9, stop = 11, num = 100)]}                                       
             ] # penalisation parameter, smaller C = stronger penalisation
                            
grid_search = GridSearchCV(estimator = model, 
                           param_grid = paramaters,
                           scoring = 'accuracy', # accuracy is the metric to evaluate the model
                           cv = 4,
                           n_jobs = -1)

grid_search = grid_search.fit(X_train_tf, y_train)

best_accuracy = grid_search.best_score_ 
best_parameters = grid_search.best_params_  

print('Best accuracy : ', grid_search.best_score_)
print('Best parameters :', grid_search.best_params_  )

Best accuracy :  0.7654434262717772
Best parameters : {'C': 10}


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


The best parameter is 10 and the accuracy stays the same of **76.54%**

# DL approach: LSTM

LSTM (Long Short-Term Memory)

### preprocessing text

In [19]:
# Tokenize the text
tokenizer = Tokenizer(oov_token= True, num_words=500, split=' ')
tokenizer.fit_on_texts(X_train_clean) # fit_on_texts functie bouwt een woordenboek 

X_train_tok = tokenizer.texts_to_sequences(X_train_clean)
X_val_tok = tokenizer.texts_to_sequences(X_val_clean)


In [20]:
# Pad the sequences
# pad_sequences is used to ensure that all sequences in a list have the same length. adding 0's
# De maximale lengte is ingesteld op de lengte van de langste sequentie.
X_train_tok = pad_sequences(X_train_tok)
X_val_tok = pad_sequences(X_val_tok,maxlen=X_train_tok.shape[1])

In [21]:
X_train_tok.shape

(40843, 54)

if the outcome is text, this padded sequence needs to be one-hot encoded

In [22]:
# one-hot encoding is applied to convert the integer to a binary matrix (only 0s and 1s) because the model can't work with integers 
from sklearn.preprocessing import LabelEncoder
from keras.utils.np_utils import to_categorical

encoder = LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_train_class = to_categorical(y_train)

y_val = encoder.transform(y_val)
y_val_class = to_categorical(y_val)

# 2 nodes als output layer => rap and pop

### Train LSTM model

In [23]:
# we use word embedding to represent words as vectors in a high-dimensional space so that semantically similar words are mapped to nearby points
model = Sequential()
# the neural network that performs overall task
model.add(Embedding(input_dim=2000,output_dim =128, input_length = X_train_tok.shape[1])) 
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2)) # when adding new LSTM, add return_sequences=True
model.add(Dense(2,activation='sigmoid')) # whe use sigmoid because we have 2 classes as (output layer) 

model.compile(loss = 'binary_crossentropy', optimizer='adam',metrics = ['accuracy']) # binary_crossentropy because we have 2 classes, adam because it is a binary classification problem , we kijken naar de accuracy

early_stopping =  EarlyStopping(patience=6,  restore_best_weights=True)
history = model.fit(X_train_tok, y_train_class, epochs = 100, batch_size=16, verbose = 2, validation_split= 0.2,
                   callbacks=[early_stopping])

Epoch 1/100
2043/2043 - 146s - loss: 0.5330 - accuracy: 0.7318 - val_loss: 0.5074 - val_accuracy: 0.7489 - 146s/epoch - 71ms/step
Epoch 2/100
2043/2043 - 154s - loss: 0.5016 - accuracy: 0.7519 - val_loss: 0.5052 - val_accuracy: 0.7481 - 154s/epoch - 75ms/step
Epoch 3/100
2043/2043 - 128s - loss: 0.4887 - accuracy: 0.7603 - val_loss: 0.5034 - val_accuracy: 0.7511 - 128s/epoch - 63ms/step
Epoch 4/100
2043/2043 - 135s - loss: 0.4761 - accuracy: 0.7642 - val_loss: 0.4968 - val_accuracy: 0.7524 - 135s/epoch - 66ms/step
Epoch 5/100
2043/2043 - 133s - loss: 0.4629 - accuracy: 0.7717 - val_loss: 0.4936 - val_accuracy: 0.7635 - 133s/epoch - 65ms/step
Epoch 6/100
2043/2043 - 124s - loss: 0.4478 - accuracy: 0.7831 - val_loss: 0.4919 - val_accuracy: 0.7669 - 124s/epoch - 61ms/step
Epoch 7/100
2043/2043 - 134s - loss: 0.4336 - accuracy: 0.7909 - val_loss: 0.4982 - val_accuracy: 0.7700 - 134s/epoch - 66ms/step
Epoch 8/100
2043/2043 - 117s - loss: 0.4194 - accuracy: 0.8000 - val_loss: 0.4804 - val_ac

In [24]:
best_accuracy = max(history.history['val_accuracy'])
print("Best validation accuracy: ", best_accuracy)

Best validation accuracy:  0.7885910272598267


The best validation accuracy is **78.85%**. This is higher than the validation score of the ML algorithm (76.54), so we prefer the LSTM model. 
Predict the results for the test set and take a look at the classification report, use the best model. 

# Evaluating on best model

In [25]:
predictions_test = np.argmax(model.predict(X_val_tok), axis=-1)



In [26]:
print(classification_report(y_val, predictions_test))

              precision    recall  f1-score   support

           0       0.78      0.80      0.79      5781
           1       0.73      0.71      0.72      4430

    accuracy                           0.76     10211
   macro avg       0.76      0.75      0.75     10211
weighted avg       0.76      0.76      0.76     10211



## Test data (dit pas doen als je het beste algoritme gevonden hebt)

In [52]:
# Load the test data
test_df = pd.read_csv('./data/test.csv')

In [53]:
#remove commas in lyrics
train_df['lyric'] = train_df['lyric'].str.replace(',', '')

In [54]:
# Define mapping of censored words to original words
from censoredwords import censoredwords

# Reverse the censorship of the words
for censored_word, original_word in censoredwords.items():
    train_df['lyric'] = train_df['lyric'].str.replace(censored_word, original_word)

In [55]:
train_df["lyric"] =  train_df["lyric"].apply(remove_apostrophes)

In [56]:
# Apply the clean_text function to the lyrics column

test_df['cleaned_lyrics'] = test_df['lyric'].apply(clean_text)


In [57]:
# Tokenize the text
test_seq = tokenizer.texts_to_sequences(test_df['cleaned_lyrics'])

In [58]:
# Pad the sequences to a fixed length
test_pad = pad_sequences(test_seq, maxlen=2000)

# Make predictions on the test data
y_pred = model.predict(test_pad)



In [63]:
# Convert probabilities to binary values using 0.5
binary_predictions = y_pred.round()


result_df = pd.DataFrame({'Id': test_df['id'], 'Prediction': binary_predictions[:, 1], 'Lyrics': test_df['lyric']})

# Save the DataFrame to a CSV file
result_df.to_csv('./data/results.csv', index=False)