In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m69.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m20.8 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m71.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.29.2


In [None]:
import pandas as pd
import torch
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

In [None]:
# Testing: run the saved classifier over the held-out test split and collect its
# predicted class indices in `predictions` (same order as the rows of `test_df`).

# NOTE(review): the tokenizer is the stock 'bert-base-uncased' one while the model weights
# are loaded from the current directory -- confirm the checkpoint was trained with this
# same tokenizer/vocabulary.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
df = pd.read_csv('clean_df_top_clusters_10id.csv')

# If a GPU is available (and Pytorch can use it)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
batch_size = 32

# 80/10/10 train/validation/test split; the fixed random_state keeps it reproducible.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(test_df, test_size=0.5, random_state=42)

# Creating a dictionary for reinterpreting cluster_id values
cluster_mapping = {2: 0, 3: 1, 8: 2, 9: 3, 16: 4, 22: 5, 60: 6, 64: 7, 78: 8, 80: 9}


def _remap_cluster_ids(frame, mapping, split_name):
    """Return a new frame whose `cluster_id` column is translated through `mapping`.

    Args:
        frame: DataFrame with a `cluster_id` column.
        mapping: dict from original cluster id to contiguous class index.
        split_name: label used in the error message ('train', 'val', 'test').

    Returns:
        A new DataFrame (the input is not modified) with integer class indices
        in `cluster_id`.

    Raises:
        ValueError: if any `cluster_id` has no entry in `mapping`. `Series.map` would
            otherwise turn those rows into NaN and silently make the labels floats.
    """
    remapped = frame['cluster_id'].map(mapping)
    unmapped = remapped.isna()
    if unmapped.any():
        unknown = frame.loc[unmapped, 'cluster_id'].unique().tolist()
        raise ValueError(
            f"{split_name} split has cluster_id values missing from cluster_mapping: {unknown}"
        )
    # `.assign` builds a new frame instead of writing into the object returned by
    # train_test_split, which avoids pandas' SettingWithCopyWarning.
    return frame.assign(cluster_id=remapped.astype('int64'))


train_df = _remap_cluster_ids(train_df, cluster_mapping, 'train')
val_df = _remap_cluster_ids(val_df, cluster_mapping, 'val')
test_df = _remap_cluster_ids(test_df, cluster_mapping, 'test')

# `padding=True` pads every text to the longest tokenized sequence among all texts passed
# in this single call (i.e. the whole test split), and `truncation=True` cuts at the
# model's maximum length.
# NOTE(review): padding the full split to one length makes every batch as expensive as
# the longest example; per-batch padding would be faster if runtime matters.
test_encodings = tokenizer(test_df['text'].tolist(), truncation=True, padding=True)

test_dataset = torch.utils.data.TensorDataset(
    torch.tensor(test_encodings['input_ids']),
    torch.tensor(test_encodings['attention_mask']),
    torch.tensor(test_df['cluster_id'].tolist())
)

# shuffle=False keeps the batches in test_df row order, so predictions line up with labels.
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

model = BertForSequenceClassification.from_pretrained(".")
model.to(device)
model.eval()

predictions = []
# Inference only: disable gradient tracking once for the whole loop.
with torch.no_grad():
    for batch in tqdm(test_loader):
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        # Predicted class = index of the largest logit for each example.
        predictions.extend(logits.argmax(dim=1).tolist())

assert len(predictions) == len(test_df), "expected one prediction per test row"

# Show a short summary instead of dumping every prediction into the cell output.
print(f"Generated {len(predictions)} predictions; first 20: {predictions[:20]}")

100%|██████████| 296/296 [39:40<00:00,  8.04s/it]

[1, 2, 0, 2, 2, 0, 8, 2, 0, 3, 5, 0, 0, 3, 3, 1, 0, 2, 2, 2, 1, 7, 5, 5, 0, 0, 7, 2, 0, 3, 2, 5, 3, 0, 0, 7, 0, 0, 3, 3, 6, 2, 0, 3, 2, 9, 0, 5, 9, 2, 2, 5, 2, 2, 3, 0, 1, 2, 2, 2, 8, 8, 3, 2, 3, 0, 0, 0, 3, 1, 2, 2, 0, 8, 2, 0, 3, 0, 0, 2, 0, 0, 2, 0, 2, 6, 4, 5, 0, 2, 2, 2, 3, 0, 1, 1, 0, 0, 2, 1, 5, 2, 0, 2, 3, 2, 0, 3, 0, 3, 0, 2, 3, 2, 2, 8, 0, 0, 3, 1, 2, 2, 0, 3, 2, 2, 0, 8, 9, 5, 2, 1, 2, 0, 3, 2, 2, 2, 2, 0, 5, 0, 0, 0, 8, 8, 2, 0, 2, 5, 3, 5, 8, 0, 2, 0, 0, 2, 0, 2, 0, 7, 2, 1, 3, 2, 8, 2, 0, 5, 2, 0, 7, 0, 8, 0, 1, 0, 7, 8, 0, 9, 3, 3, 8, 0, 8, 0, 8, 3, 0, 2, 1, 3, 3, 3, 0, 0, 3, 0, 0, 1, 2, 0, 0, 8, 8, 3, 2, 2, 3, 2, 3, 2, 4, 2, 8, 5, 3, 2, 8, 5, 0, 0, 0, 2, 2, 2, 0, 2, 2, 7, 8, 2, 3, 8, 8, 0, 2, 3, 0, 0, 3, 2, 0, 0, 0, 3, 3, 0, 0, 1, 0, 2, 2, 0, 0, 0, 0, 8, 0, 0, 0, 2, 2, 2, 0, 0, 2, 2, 1, 5, 2, 2, 3, 2, 3, 2, 8, 2, 0, 0, 0, 3, 2, 0, 0, 0, 2, 5, 2, 2, 1, 0, 0, 2, 3, 2, 1, 3, 0, 5, 7, 0, 2, 8, 2, 0, 7, 1, 0, 3, 2, 0, 2, 0, 8, 3, 3, 2, 2, 0, 0, 3, 0, 2, 2, 6, 0, 2, 2, 3, 2, 




In [None]:
# Score the test-set predictions against the remapped ground-truth labels.
# `classification_report` and `confusion_matrix` are imported with the other imports at
# the top of the notebook.
true_labels = test_df['cluster_id'].tolist()

# Fail with a clear message if `predictions` is stale (e.g. left over from a different
# split in this kernel) rather than scoring misaligned data.
if len(true_labels) != len(predictions):
    raise ValueError(
        f"{len(predictions)} predictions for {len(true_labels)} test rows; "
        "re-run the inference cell before computing metrics"
    )

# Per-class precision/recall/F1, then the confusion matrix
# (rows = true class, columns = predicted class).
print(classification_report(true_labels, predictions))
print(confusion_matrix(true_labels, predictions))

              precision    recall  f1-score   support

           0       0.25      0.44      0.32      1869
           1       0.16      0.07      0.10       818
           2       0.25      0.44      0.32      1633
           3       0.23      0.23      0.23      1129
           4       0.29      0.06      0.10       719
           5       0.20      0.11      0.14       930
           6       0.43      0.07      0.12       723
           7       0.23      0.12      0.15       546
           8       0.17      0.18      0.17       554
           9       0.23      0.06      0.10       526

    accuracy                           0.24      9447
   macro avg       0.24      0.18      0.18      9447
weighted avg       0.24      0.24      0.21      9447

[[815  72 501 219  19  90   8  43  92  10]
 [277  61 249  80  12  31   7  28  64   9]
 [477  60 715 134  17  70  13  49  75  23]
 [360  47 274 263  14  80  14  17  49  11]
 [256  24 212  71  45  39   4  11  40  17]
 [338  32 206 142  20 104 