In [1]:
import tensorflow as tf
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

2022-12-18 06:28:23.391653: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-12-18 06:28:24.808451: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: :/opt/conda/envs/tf/lib/
2022-12-18 06:28:24.808578: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: :/opt/conda/envs/tf/lib/


Num GPUs Available:  1


In [2]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from transformers import BertTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Import the csv into pandas dataframe and add the headers
df = pd.read_csv('lyrics_data_clean.csv')
# df.head()
# # Removing unwanted columns and only leaving title of news and the category which will be the target
df = df[['lyrics','genre']]

In [5]:
# # Converting the codes to appropriate categories using a dictionary
encode_dict = {}

def encode_cat(x):
    if x not in encode_dict.keys():
        encode_dict[x]=len(encode_dict)
    return encode_dict[x]

df['ENCODE_CAT'] = df['genre'].apply(lambda x: encode_cat(x))

In [6]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [7]:
token = tokenizer.encode_plus(
    df['lyrics'].iloc[0], 
    max_length=256, 
    truncation=True, 
    padding='max_length', 
    add_special_tokens=True,
    return_tensors='tf'
)

2022-12-18 03:16:34.680088: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-12-18 03:16:39.779663: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1613] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 14620 MB memory:  -> device: 0, name: Tesla V100-PCIE-16GB, pci bus id: 0001:00:00.0, compute capability: 7.0


In [8]:
X_input_ids = np.zeros((len(df), 256))
X_attn_masks = np.zeros((len(df), 256))

In [9]:
def generate_training_data(df, ids, masks, tokenizer):
    for i, text in tqdm(enumerate(df['lyrics'])):
        tokenized_text = tokenizer.encode_plus(
            text,
            max_length=256, 
            truncation=True, 
            padding='max_length', 
            add_special_tokens=True,
            return_tensors='tf'
        )
        ids[i, :] = tokenized_text.input_ids
        masks[i, :] = tokenized_text.attention_mask
    return ids, masks

In [10]:
X_input_ids, X_attn_masks = generate_training_data(df, X_input_ids, X_attn_masks, tokenizer)

28031it [00:51, 548.23it/s]


In [11]:
labels = np.zeros((len(df), 7))
labels.shape

(28031, 7)

In [12]:
labels[np.arange(len(df)), df['ENCODE_CAT'].values] = 1 # one-hot encoded target tensor

In [13]:

# creating a data pipeline using tensorflow dataset utility, creates batches of data for easy loading...
dataset = tf.data.Dataset.from_tensor_slices((X_input_ids, X_attn_masks, labels))
dataset.take(1) # one sample data

<TakeDataset element_spec=(TensorSpec(shape=(256,), dtype=tf.float64, name=None), TensorSpec(shape=(256,), dtype=tf.float64, name=None), TensorSpec(shape=(7,), dtype=tf.float64, name=None))>

In [14]:
def LyricsDatasetMapFunction(input_ids, attn_masks, labels):
    return {
        'input_ids': input_ids,
        'attention_mask': attn_masks
    }, labels

In [15]:
dataset = dataset.map(LyricsDatasetMapFunction) # converting to required format for tensorflow dataset 

In [16]:
dataset.take(1)

<TakeDataset element_spec=({'input_ids': TensorSpec(shape=(256,), dtype=tf.float64, name=None), 'attention_mask': TensorSpec(shape=(256,), dtype=tf.float64, name=None)}, TensorSpec(shape=(7,), dtype=tf.float64, name=None))>

In [17]:
dataset = dataset.shuffle(10000).batch(16, drop_remainder=True) # batch size, drop any left out tensor

In [18]:
p = 0.8
train_size = int((len(df)//16)*p) # for each 16 batch of data we will have len(df)//16 samples, take 80% of that for train.

In [19]:
train_dataset = dataset.take(train_size)
val_dataset = dataset.skip(train_size)

In [20]:
from transformers import TFBertModel

model = TFBertModel.from_pretrained('bert-base-uncased') # bert base model with pretrained weights

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [21]:
# defining 2 input layers for input_ids and attn_masks
input_ids = tf.keras.layers.Input(shape=(256,), name='input_ids', dtype='int32')
attn_masks = tf.keras.layers.Input(shape=(256,), name='attention_mask', dtype='int32')

bert_embds = model.bert(input_ids, attention_mask=attn_masks)[1] # 0 -> activation layer (3D), 1 -> pooled output layer (2D)
intermediate_layer = tf.keras.layers.Dense(512, activation='relu', name='intermediate_layer')(bert_embds)
output_layer = tf.keras.layers.Dense(7, activation='softmax', name='output_layer')(intermediate_layer) # softmax -> calcs probs of classes
lyrics_model = tf.keras.Model(inputs=[input_ids, attn_masks], outputs=output_layer)
lyrics_model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 256)]        0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 256)]        0           []                               
                                                                                                  
 bert (TFBertMainLayer)         TFBaseModelOutputWi  109482240   ['input_ids[0][0]',              
                                thPoolingAndCrossAt               'attention_mask[0][0]']         
                                tentions(last_hidde                                               
                                n_state=(None, 256,                                           

In [22]:
optim = tf.keras.optimizers.legacy.Adam(learning_rate=1e-5, decay=1e-6)
loss_func = tf.keras.losses.CategoricalCrossentropy()
acc = tf.keras.metrics.CategoricalAccuracy('accuracy')

In [23]:
lyrics_model.compile(optimizer=optim, loss=loss_func, metrics=[acc])

In [24]:
hist = lyrics_model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=5
)
     

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [25]:
lyrics_model.save('genreclassification_model')



INFO:tensorflow:Assets written to: genreclassification_model/assets


INFO:tensorflow:Assets written to: genreclassification_model/assets


## Predicting using synthetic data

In [10]:
sync_data = pd.read_csv('synthetic_data_l.csv')
sync_data = sync_data[['text','genre']]
sync_data['ENCODE_CAT'] = sync_data['genre'].apply(lambda x: encode_cat(x))

In [7]:
sync_data['ENCODE_CAT'].unique()

array([3, 0, 1, 5, 6, 4, 2])

In [8]:
df['genre'].unique()

array(['pop', 'country', 'blues', 'jazz', 'reggae', 'rock', 'hip hop'],
      dtype=object)

In [9]:
lyrics_model = tf.keras.models.load_model('genreclassification_model')

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

def prepare_data(input_text, tokenizer):
    token = tokenizer.encode_plus(
        input_text,
        max_length=256, 
        truncation=True, 
        padding='max_length', 
        add_special_tokens=True,
        return_tensors='tf'
    )
    return {
        'input_ids': tf.cast(token.input_ids, tf.float64),
        'attention_mask': tf.cast(token.attention_mask, tf.float64)
    }

def make_prediction(model, processed_data, classes=df['genre'].unique()):
    probs = model.predict(processed_data)[0]
    return classes[np.argmax(probs)]
     

2022-12-18 06:28:51.780690: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-12-18 06:28:52.583707: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1613] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 14620 MB memory:  -> device: 0, name: Tesla V100-PCIE-16GB, pci bus id: 0001:00:00.0, compute capability: 7.0


In [13]:
pred = []
for i in range(len(sync_data)):
    input_text = sync_data['text'][i]
    processed_data = prepare_data(input_text, tokenizer)
    result = make_prediction(lyrics_model, processed_data=processed_data)
    pred.append(result)
sync_data['pred'] = pred



In [14]:
sync_data['match'] = sync_data['pred'] == sync_data['genre']

In [15]:
print("Overall accuracy: ", sync_data['match'].sum()/len(sync_data))
print("Accuracy per class: ", sync_data.groupby('genre')['match'].sum()/sync_data.groupby('genre')['match'].count())

Overall accuracy:  0.16642857142857143
Accuracy per class:  genre
blues      0.002
country    0.000
hip hop    0.000
jazz       0.447
pop        0.006
reggae     0.626
rock       0.084
Name: match, dtype: float64
