<a href="https://colab.research.google.com/github/Shakil97/bangla-songs-lyrics-sentiment-analysis/blob/main/FinalMusicSentiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install tez

Collecting tez
  Downloading https://files.pythonhosted.org/packages/56/37/9b99ae05da3fa2b05a4cbb0cf78c50dfdcf805b55f29029c3cb6c5a0fee1/tez-0.1.2-py3-none-any.whl
Installing collected packages: tez
Successfully installed tez-0.1.2


In [2]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/98/87/ef312eef26f5cecd8b17ae9654cdd8d1fae1eb6dbd87257d6d73c128a4d0/transformers-4.3.2-py3-none-any.whl (1.8MB)
[K     |▏                               | 10kB 22.0MB/s eta 0:00:01[K     |▍                               | 20kB 30.3MB/s eta 0:00:01[K     |▌                               | 30kB 26.6MB/s eta 0:00:01[K     |▊                               | 40kB 30.3MB/s eta 0:00:01[K     |█                               | 51kB 27.2MB/s eta 0:00:01[K     |█                               | 61kB 29.6MB/s eta 0:00:01[K     |█▎                              | 71kB 19.4MB/s eta 0:00:01[K     |█▌                              | 81kB 20.1MB/s eta 0:00:01[K     |█▋                              | 92kB 18.9MB/s eta 0:00:01[K     |█▉                              | 102kB 19.1MB/s eta 0:00:01[K     |██                              | 112kB 19.1MB/s eta 0:00:01[K     |██▏                             | 

In [3]:
import pandas as pd
import tez
import torch
import torch.nn as nn
import transformers
from sklearn import metrics, model_selection
from transformers import AdamW, get_linear_schedule_with_warmup


class BERTDataset:
    def __init__(self, lyrics, target):
        self.lyrics = lyrics
        self.target = target
        self.tokenizer = transformers.BertTokenizer.from_pretrained(
            "sagorsarker/bangla-bert-base", do_lower_case=True
        )
        self.max_len = 500

    def __len__(self):
        return len(self.lyrics)

    def __getitem__(self, item):
        lyrics = str(self.lyrics[item])
        lyrics = " ".join(lyrics.split())

        inputs = self.tokenizer.encode_plus(
            lyrics,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
        )

        ids = inputs["input_ids"]
        mask = inputs["attention_mask"]
        token_type_ids = inputs["token_type_ids"]

        return {
            "ids": torch.tensor(ids, dtype=torch.long),
            "mask": torch.tensor(mask, dtype=torch.long),
            "token_type_ids": torch.tensor(token_type_ids, dtype=torch.long),
            "targets": torch.tensor(self.target[item], dtype=torch.float),
        }


class BERTBaseUncased(tez.Model):
    def __init__(self, num_train_steps):
        super().__init__()
        self.tokenizer = transformers.BertTokenizer.from_pretrained(
            "sagorsarker/bangla-bert-base", do_lower_case=True
        )
        self.bert = transformers.BertModel.from_pretrained(
            "sagorsarker/bangla-bert-base", return_dict=False
        )
        self.bert_drop = nn.Dropout(0.3)
        self.out = nn.Linear(768, 1)
        self.num_train_steps = num_train_steps
        self.step_scheduler_after = "batch"

    def fetch_optimizer(self):
        param_optimizer = list(self.named_parameters())
        no_decay = ["bias", "LayerNorm.bias"]
        optimizer_parameters = [
            {
                "params": [
                    p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
                ],
                "weight_decay": 0.001,
            },
            {
                "params": [
                    p for n, p in param_optimizer if any(nd in n for nd in no_decay)
                ],
                "weight_decay": 0.0,
            },
        ]
        opt = AdamW(optimizer_parameters, lr=3e-5)
        return opt

    def fetch_scheduler(self):
        sch = get_linear_schedule_with_warmup(
            self.optimizer, num_warmup_steps=0, num_training_steps=self.num_train_steps
        )
        return sch

    def loss(self, outputs, targets):
        if targets is None:
            return None
        return nn.BCEWithLogitsLoss()(outputs, targets.view(-1, 1))

    def monitor_metrics(self, outputs, targets):
        if targets is None:
            return {}
        outputs = torch.sigmoid(outputs).cpu().detach().numpy() >= 0.4

        targets = targets.cpu().detach().numpy()
        accuracy = metrics.accuracy_score(targets, outputs)
        return {"accuracy": accuracy}

    def forward(self, ids, mask, token_type_ids, targets=None):
        _, o_2 = self.bert(ids, attention_mask=mask, token_type_ids=token_type_ids)
        b_o = self.bert_drop(o_2)
        output = self.out(b_o)
        loss = self.loss(output, targets)
        acc = self.monitor_metrics(output, targets)
        return output, loss, acc


if __name__ == "__main__":
    dfx = pd.read_csv("BanglaSongs.csv").fillna("none")
    dfx.Mood = dfx.Mood.apply(lambda x: 1 if x == "বেদনা" else 0)

    df_train, df_valid = model_selection.train_test_split(
        dfx, test_size=0.2, random_state=42, stratify=dfx.Mood.values
    )

    df_train = df_train.reset_index(drop=True)
    df_valid = df_valid.reset_index(drop=True)

    train_dataset = BERTDataset(
        lyrics=df_train.lyrics.values, target=df_train.Mood.values
    )

    valid_dataset = BERTDataset(
        lyrics=df_valid.lyrics.values, target=df_valid.Mood.values
    )

    n_train_steps = int(len(df_train) / 32 * 10)
    model = BERTBaseUncased(num_train_steps=n_train_steps)

    # model.load("model.bin")
    tb_logger = tez.callbacks.TensorBoardLogger(log_dir=".logs/")
    es = tez.callbacks.EarlyStopping(monitor="valid_loss", model_path="model.bin")
    model.fit(
        train_dataset,
        valid_dataset=valid_dataset,
        train_bs=8,
        device="cuda",
        epochs=100,
        #callbacks=[tb_logger, es],
        fp16=True,
    )
    model.save("model.bin")

    preds = model.predict(valid_dataset, batch_size=8, n_jobs=-1)
    for p in preds:
        print(p)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2237676.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=491.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=660417638.0, style=ProgressStyle(descri…




100%|██████████| 27/27 [00:11<00:00,  2.28it/s, accuracy=0.569, loss=0.636, stage=train]
100%|██████████| 4/4 [00:01<00:00,  2.86it/s, accuracy=0.636, loss=0.619, stage=valid]
100%|██████████| 27/27 [00:11<00:00,  2.30it/s, accuracy=0.736, loss=0.548, stage=train]
100%|██████████| 4/4 [00:01<00:00,  2.80it/s, accuracy=0.502, loss=0.642, stage=valid]
100%|██████████| 27/27 [00:11<00:00,  2.27it/s, accuracy=0.796, loss=0.471, stage=train]
100%|██████████| 4/4 [00:01<00:00,  2.86it/s, accuracy=0.621, loss=0.594, stage=valid]
100%|██████████| 27/27 [00:11<00:00,  2.28it/s, accuracy=0.829, loss=0.451, stage=train]
100%|██████████| 4/4 [00:01<00:00,  2.72it/s, accuracy=0.621, loss=0.594, stage=valid]
100%|██████████| 27/27 [00:11<00:00,  2.26it/s, accuracy=0.861, loss=0.443, stage=train]
100%|██████████| 4/4 [00:01<00:00,  2.73it/s, accuracy=0.621, loss=0.594, stage=valid]
100%|██████████| 27/27 [00:11<00:00,  2.26it/s, accuracy=0.843, loss=0.455, stage=train]
100%|██████████| 4/4 [00:01<00:

[[-0.132 ]
 [-0.9756]
 [-0.778 ]
 [-0.9644]
 [-0.676 ]
 [-0.3567]
 [-0.5024]
 [-1.188 ]]
[[ 0.2556 ]
 [-0.562  ]
 [-0.1056 ]
 [-0.02776]
 [-0.496  ]
 [ 0.681  ]
 [-1.2295 ]
 [-1.103  ]]


 57%|█████▋    | 4/7 [00:02<00:02,  1.10it/s, stage=test]

[[-0.297 ]
 [-0.3032]
 [-0.8247]
 [ 0.1936]
 [ 0.3992]
 [ 0.1708]
 [-0.6064]
 [-0.2152]]
[[-0.3118]
 [-0.1404]
 [-3.479 ]
 [-1.044 ]
 [-0.0394]
 [-0.1423]
 [-0.517 ]
 [ 0.457 ]]


 86%|████████▌ | 6/7 [00:03<00:00,  1.94it/s, stage=test]

[[-0.0874]
 [ 0.114 ]
 [-0.8003]
 [-0.0691]
 [-0.507 ]
 [-0.943 ]
 [-0.522 ]
 [-0.4019]]
[[-1.272 ]
 [-0.312 ]
 [-0.4858]
 [-0.1244]
 [-0.3687]
 [-1.363 ]
 [-0.36  ]
 [-0.3298]]


100%|██████████| 7/7 [00:03<00:00,  2.14it/s, stage=test]

[[ 0.5054]
 [-1.818 ]
 [ 0.2192]
 [-0.648 ]
 [-1.115 ]
 [-0.6924]
 [-1.096 ]]



