In [4]:
! pip install scikit-learn transformers datasets
! pip install jupyter ipywidgets




[notice] A new release of pip is available: 23.2.1 -> 24.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip

[notice] A new release of pip is available: 23.2.1 -> 24.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting jupyter
  Obtaining dependency information for jupyter from https://files.pythonhosted.org/packages/83/df/0f5dd132200728a86190397e1ea87cd76244e42d39ec5e88efd25b2abd7e/jupyter-1.0.0-py2.py3-none-any.whl.metadata
  Downloading jupyter-1.0.0-py2.py3-none-any.whl.metadata (995 bytes)
Collecting ipywidgets
  Obtaining dependency information for ipywidgets from https://files.pythonhosted.org/packages/d4/17/8b2ce5765dd423433d2e0727712629c46152fb0bc706b0977f847480f262/ipywidgets-8.1.3-py3-none-any.whl.metadata
  Downloading ipywidgets-8.1.3-py3-none-any.whl.metadata (2.4 kB)
Collecting notebook (from jupyter)
  Obtaining dependency information for notebook from https://files.pythonhosted.org/packages/32/b4/b0cdaf52c35a3a40633136bee5152d6670acb555c698d23a3458dca65781/notebook-7.2.1-py3-none-any.whl.metadata
  Downloading notebook-7.2.1-py3-none-any.whl.metadata (10 kB)
Collecting qtconsole (from jupyter)
  Obtaining dependency information for qtconsole from https://files.pythonhosted

In [6]:
import torch

# Choose the compute device first (GPU when CUDA is usable, otherwise CPU),
# then report whether the CUDA device was the one selected.
device = torch.device('cpu' if not torch.cuda.is_available() else 'cuda')

print(device.type == 'cuda')

False


In [6]:
import os
from transformers import BertTokenizer, BertModel


# Tokenizer and encoder are both loaded from the same multilingual BERT
# checkpoint; the encoder is then moved onto the selected device.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
model = BertModel.from_pretrained('bert-base-multilingual-cased')
model = model.to(device)

In [10]:
from datasets import load_dataset, load_metric

# Load the 'train' split of the sms_spam dataset and pull out the message
# column and the label column.
dataset = load_dataset('sms_spam', split='train', trust_remote_code=True)
texts, labels = dataset['sms'], dataset['label']

In [11]:
from sklearn.model_selection import train_test_split

# Hold out 60% of the messages; only the remaining 40% (X_train / y_train) is
# embedded by the cells below. The fixed random_state keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    texts,
    labels,
    test_size=0.6,
    random_state=42,
)
print(*(len(part) for part in (X_train, y_train)))

2229 2229


In [5]:
import os
# Set CUDA_LAUNCH_BLOCKING=1 in this process's environment.
os.environ.update(CUDA_LAUNCH_BLOCKING='1')

In [13]:
import tqdm
import numpy as np

# Tokenize the whole training split once. Every row is padded to the longest
# message (truncated at 512 tokens) so the resulting tensors can simply be
# sliced into mini-batches below. Calling the tokenizer directly is the
# supported replacement for the deprecated `batch_encode_plus`.
encoded_datasets = tokenizer(X_train, return_tensors="pt", padding=True, truncation=True, max_length=512)

batch_size = 4
cls_batches = []  # one (<= batch_size, hidden_size) array per forward pass

with torch.no_grad():  # inference only: no autograd graph is needed
    for i in tqdm.tqdm(range(0, len(X_train), batch_size)):
        input_ids = encoded_datasets["input_ids"][i:i+batch_size].to(device)
        attention_mask = encoded_datasets["attention_mask"][i:i+batch_size].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        # Keep only the hidden state of the first token of each sequence as the
        # document embedding. np.array() makes a copy, so the full
        # last_hidden_state tensor of this batch is not kept alive.
        cls_batches.append(np.array(outputs.last_hidden_state[:, 0, :].cpu()))

# Concatenate once at the end instead of re-copying the growing array on every
# batch (the previous in-loop np.concatenate was quadratic in the batch count).
document_vector = np.concatenate(cls_batches, axis=0)


100%|██████████| 558/558 [32:25<00:00,  3.49s/it]


In [17]:
# Cache the embeddings and their labels on disk so the slow embedding step
# does not have to be repeated.
np.save("document_vectors.npy", document_vector)
np.save("labels.npy", y_train)
    
# TODO: release the GPU memory (no code currently does this)


In [2]:
import numpy as np
# Read the cached embeddings and their labels back from disk.
document_vector = np.load("document_vectors.npy")
y = np.load("labels.npy")
    

In [3]:
# Feature matrix for the classifier: the embeddings loaded from disk.
X = document_vector
# `y` is already bound by the loading cell; the former `y = y` self-assignment
# was a no-op and has been removed. Fail early if the two arrays disagree.
assert len(X) == len(y), "features and labels must have the same number of rows"


In [4]:
import torch.nn as nn

class TopicClassifier(nn.Module):
    """Small feed-forward classifier: Linear -> ReLU -> Linear.

    Args:
        input_dim: size of each input feature vector.
        hidden_dim: width of the single hidden layer.
        output_dim: number of output scores produced per sample.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        # The attribute names become the state_dict keys, so they are kept
        # as fc1 / relu / fc2.
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return the raw output scores (no softmax applied) for batch ``x``."""
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)
    


In [9]:
import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import torch
import os 
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

# Seed torch so the classifier's initial weights -- and therefore the reported
# accuracies -- are reproducible. The data split below is already fixed by
# random_state.
torch.manual_seed(42)

# 80/20 split of the cached embeddings into training and evaluation sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
X_train = torch.tensor(X_train, dtype=torch.float32)
X_test = torch.tensor(X_test, dtype=torch.float32)
# CrossEntropyLoss expects integer class indices, hence dtype long.
y_train = torch.tensor(y_train, dtype=torch.long)
y_test = torch.tensor(y_test, dtype=torch.long)

model = TopicClassifier(input_dim=X_train.shape[1], hidden_dim=16, output_dim=2).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

batch_size = 32
n_epochs = 3
# Strength of the manual L2 penalty; hoisted out of the loop because it never changes.
l2_lambda = 0.005

for epoch in range(n_epochs):
    model.train()
    # Mini-batches are taken in the same fixed order every epoch (no shuffling).
    for i in tqdm.tqdm(range(0, len(X_train), batch_size)):
        optimizer.zero_grad()
        output = model(X_train[i:i+batch_size].to(device))
        loss = criterion(output, y_train[i:i+batch_size].to(device))
        # l2 regularization: sum of squares of every parameter (weights and biases)
        l2_norm = sum(p.pow(2.0).sum() for p in model.parameters())
        loss = loss + l2_lambda * l2_norm
        loss.backward()
        optimizer.step()

    # Evaluate on the held-out 20% after each epoch.
    model.eval()
    with torch.no_grad():
        output = model(X_test.to(device))
        pred = torch.argmax(output, dim=1)
        acc = accuracy_score(y_test.numpy(), pred.cpu().numpy())
        print(f"epoch: {epoch+1}, accuracy: {acc}")

(1783, 768) (446, 768) (1783,) (446,)


  0%|          | 0/56 [00:00<?, ?it/s]

100%|██████████| 56/56 [00:00<00:00, 280.44it/s]


epoch: 1, accuracy: 0.9372197309417041


100%|██████████| 56/56 [00:00<00:00, 373.47it/s]


epoch: 2, accuracy: 0.9663677130044843


100%|██████████| 56/56 [00:00<00:00, 301.51it/s]

epoch: 3, accuracy: 0.9798206278026906





In [10]:
# Persist the classifier's parameters (its state_dict) to disk.
torch.save(obj=model.state_dict(), f="topic_classifier2.pth")

sms
label
