### MobileBert

In [2]:
import transformers
from transformers import MobileBertTokenizer, MobileBertModel, AdamW, get_linear_schedule_with_warmup
import torch
import numpy as np
import pandas as pd
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
df = pd.read_csv("/kaggle/input/glassdoor/cleaning_glassdoor.csv")

In [4]:
df.head()

Unnamed: 0,review,overall_rating
0,Analyst Work life balance learning opportuniti...,1
1,Sort of Slave Labour Lots of meal drink perks ...,0
2,Really bad wouldn t recommend Managers are har...,0
3,The best retail store on the planet Period It...,2
4,Tough love Great Culture Opportunity to grow ...,2


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120544 entries, 0 to 120543
Data columns (total 2 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   review          120544 non-null  object
 1   overall_rating  120544 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 1.8+ MB


In [6]:
df.overall_rating.value_counts()

overall_rating
2    73458
1    27809
0    19277
Name: count, dtype: int64

In [7]:
class_len = len(df.overall_rating.value_counts().index)

In [8]:
pre_trained = "google/mobilebert-uncased"

In [9]:
tokenizer = MobileBertTokenizer.from_pretrained(pre_trained)

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/847 [00:00<?, ?B/s]

In [10]:
class GLASSDOOR_DATASET(Dataset):


    def __init__(self, reviews, targets, tokenizer, max_len):

        self.reviews = reviews
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):

        return len(self.reviews)

    def __getitem__(self, item):

        review = str(self.reviews[item])
        target = self.targets[item]
        encoding = self.tokenizer.encode_plus(
            review,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors="pt",
            truncation=True
        )
        return {
            "review_text": review,
            "input_ids": encoding["input_ids"].flatten(),
            "attention_mask": encoding["attention_mask"].flatten(),
            "targets": torch.tensor(target, dtype=torch.long)
        }


In [11]:
def data_loader(df, tokenizer, max_len, batch_size):
  ds = GLASSDOOR_DATASET(
    reviews=df.review.to_numpy(),
    targets=df.overall_rating.to_numpy(),
    tokenizer=tokenizer,
    max_len=max_len
  )
  return DataLoader(
    ds,
    batch_size=batch_size,
    num_workers=4
  )


In [12]:
df_train, df_test = train_test_split(df, train_size = 0.9, random_state = 42, stratify = df.overall_rating)
df_val, df_test = train_test_split(df_test,train_size=0.5,random_state=42, stratify = df_test.overall_rating)

In [13]:
class SentimentClassifier(nn.Module):
    def __init__(self, n_classes):
        super(SentimentClassifier, self).__init__()
        self.MobileBert = MobileBertModel.from_pretrained(pre_trained)
        self.drop = nn.Dropout(p=0.3)
        self.out = nn.Linear(self.MobileBert.config.hidden_size, n_classes)
        
    def forward(self, input_ids, attention_mask):
        outputs = self.MobileBert(input_ids=input_ids, attention_mask=attention_mask)
        pooler_output = outputs.pooler_output  
        dropped_output = self.drop(pooler_output)
        logits = self.out(dropped_output)

        return logits

In [14]:
model = SentimentClassifier(class_len)
model = model.to(device)

pytorch_model.bin:   0%|          | 0.00/147M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


In [15]:
from tqdm import tqdm

def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
    model.train()
    losses = []
    correct_predictions = 0

    with tqdm(total=len(data_loader), desc="Training") as pbar:
        for d in data_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            targets = d["targets"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            outputs = outputs.to(device)
            _, preds = torch.max(outputs, dim=1)

            loss = loss_fn(outputs, targets)
            correct_predictions += torch.sum(preds == targets)
            losses.append(loss.item())

            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

            pbar.update(1)
            pbar.set_postfix({'loss': loss.item()})

    return correct_predictions.double() / n_examples, np.mean(losses)

def eval_model(model, data_loader, loss_fn, device, n_examples):
    model.eval()
    losses = []
    correct_predictions = 0

    with torch.no_grad():
        with tqdm(total=len(data_loader), desc="Validation") as pbar:
            for d in data_loader:
                input_ids = d["input_ids"].to(device)
                attention_mask = d["attention_mask"].to(device)
                targets = d["targets"].to(device)

                outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                _, preds = torch.max(outputs, dim=1)

                loss = loss_fn(outputs, targets)
                correct_predictions += torch.sum(preds == targets)
                losses.append(loss.item())

                pbar.update(1)
                pbar.set_postfix({'loss': loss.item()})

    return correct_predictions.double() / n_examples, np.mean(losses)


In [16]:
weights = torch.Tensor((len(df) / df.overall_rating.value_counts()).sort_index().to_list())
# for crossentropy

In [17]:
weights

tensor([6.2533, 4.3347, 1.6410])

In [18]:
from collections import defaultdict
import os

batch_size_list = [16, 32]
lr_list = [5e-5, 2e-5]
EPOCHS = 4
history = defaultdict(list)
MAX_LEN = 160
best_acc = 0
for lr in lr_list:
    for batch_size in batch_size_list:

        train_data_loader = data_loader(df_train, tokenizer, MAX_LEN, batch_size)
        val_data_loader = data_loader(df_val, tokenizer, MAX_LEN, batch_size)
        test_data_loader = data_loader(df_test, tokenizer, MAX_LEN, batch_size)
        optimizer = AdamW(model.parameters(), lr=lr, correct_bias=False)
        total_steps = len(train_data_loader) * EPOCHS
        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
        loss_fn = nn.CrossEntropyLoss(weight = weights).to(device)

        for epoch_num in range(EPOCHS):
            print(f'Epoch {epoch_num + 1}/{EPOCHS}')
            print('-' * 10)
            train_acc, train_loss = train_epoch(
                model,
                train_data_loader,
                loss_fn,
                optimizer,
                device,
                scheduler,
                len(df_train)
            )
            print(f'Train loss {train_loss} accuracy {train_acc}')
            val_acc, val_loss = eval_model(
                model,
                val_data_loader,
                loss_fn,
                device,
                len(df_val)
            )
            print(f'Val   loss {val_loss} accuracy {val_acc}')


            history['lr'].append(lr)
            history['batch_size'].append(batch_size)
            history['epoch_num'].append(epoch_num + 1)
            history['epoch'].append(EPOCHS)
            history['train_acc'].append(train_acc.cpu())
            history['train_loss'].append(train_loss)
            history['val_acc'].append(val_acc.cpu())
            history['val_loss'].append(val_loss)

            if val_acc > best_acc:
                model_name = f'best_model_lr_{lr}_batch_{batch_size}_{epoch_num+1}_{EPOCHS}.bin'
                torch.save(model.state_dict(), model_name)
                best_acc = val_acc




Epoch 1/4
----------


Training: 100%|██████████| 6781/6781 [22:20<00:00,  5.06it/s, loss=0.539]  


Train loss 3771.613870582643 accuracy 0.6710910783581746


Validation: 100%|██████████| 377/377 [00:16<00:00, 22.77it/s, loss=0.437]


Val   loss 0.724749393227562 accuracy 0.6746308279409324
Epoch 2/4
----------


Training: 100%|██████████| 6781/6781 [22:45<00:00,  4.96it/s, loss=0.45] 


Train loss 0.830314129364647 accuracy 0.711039828922748


Validation: 100%|██████████| 377/377 [00:16<00:00, 22.58it/s, loss=0.472]


Val   loss 0.7348217061011798 accuracy 0.7001825120292019
Epoch 3/4
----------


Training: 100%|██████████| 6781/6781 [23:14<00:00,  4.86it/s, loss=0.319]  


Train loss 1.3057905001404573 accuracy 0.7329130142226401


Validation: 100%|██████████| 377/377 [00:17<00:00, 21.84it/s, loss=0.436]


Val   loss 0.7651865673634355 accuracy 0.7086444333831093
Epoch 4/4
----------


Training: 100%|██████████| 6781/6781 [23:39<00:00,  4.78it/s, loss=0.361]  


Train loss 0.952348403477718 accuracy 0.7470803491598227


Validation: 100%|██████████| 377/377 [00:17<00:00, 21.87it/s, loss=0.429]


Val   loss 0.7726220960484259 accuracy 0.7039986726397877
Epoch 1/4
----------


Training: 100%|██████████| 3391/3391 [16:20<00:00,  3.46it/s, loss=0.303]


Train loss 0.6980721738109262 accuracy 0.7332448450995032


Validation: 100%|██████████| 189/189 [00:13<00:00, 14.33it/s, loss=0.474]


Val   loss 0.74907123411774 accuracy 0.7068193130910901
Epoch 2/4
----------


Training: 100%|██████████| 3391/3391 [16:21<00:00,  3.46it/s, loss=0.243]


Train loss 0.819601398303037 accuracy 0.7600678409792698


Validation: 100%|██████████| 189/189 [00:13<00:00, 14.34it/s, loss=0.484]


Val   loss 0.8064742242848432 accuracy 0.7073170731707317
Epoch 3/4
----------


Training: 100%|██████████| 3391/3391 [16:19<00:00,  3.46it/s, loss=0.345]


Train loss 0.6886971059915274 accuracy 0.779129681350183


Validation: 100%|██████████| 189/189 [00:13<00:00, 14.28it/s, loss=0.475]


Val   loss 0.8648264277233648 accuracy 0.703500912560146
Epoch 4/4
----------


Training: 100%|██████████| 3391/3391 [16:21<00:00,  3.46it/s, loss=0.204]


Train loss 0.49114854583053663 accuracy 0.792504309192637


Validation: 100%|██████████| 189/189 [00:13<00:00, 14.31it/s, loss=0.421]

Val   loss 0.8864395385066037 accuracy 0.6986892317902771





In [19]:
history_df = pd.DataFrame(history)
history_df.to_csv("history_df.csv")

### Push the HuggingFace

In [26]:
from huggingface_hub import PyTorchModelHubMixin, HfFolder

In [21]:
class SentimentClassifier(nn.Module, PyTorchModelHubMixin):
    def __init__(self, n_classes):
        super(SentimentClassifier, self).__init__()
        self.MobileBert = MobileBertModel.from_pretrained(pre_trained)
        self.drop = nn.Dropout(p=0.3)
        self.out = nn.Linear(self.MobileBert.config.hidden_size, n_classes)
        
    def forward(self, input_ids, attention_mask):
        outputs = self.MobileBert(input_ids=input_ids, attention_mask=attention_mask)
        pooler_output = outputs.pooler_output  
        dropped_output = self.drop(pooler_output)
        logits = self.out(dropped_output)

        return logits

In [23]:
model = SentimentClassifier(3)

In [24]:
model.save_pretrained("SentimentClassifier")

In [28]:
model.push_to_hub("MobileBertSentimentClassifier")

model.safetensors:   0%|          | 0.00/98.5M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/sergeantson/MobileBertSentimentClassifier/commit/0b7a1198124d74ccc0cc3aa15f52677c1a9d9f76', commit_message='Push model using huggingface_hub.', commit_description='', oid='0b7a1198124d74ccc0cc3aa15f52677c1a9d9f76', pr_url=None, pr_revision=None, pr_num=None)