In [1]:
!pip install transformers[sentencepiece]
!pip install datasets
!pip install transformers[torch]
!pip install accelerate -U
!pip install -U kaleido

Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xxhash, dill, multiprocess, datasets
Successfully installed datasets-2

In [1]:
import pandas as pd
import numpy as np
import tqdm
pd.set_option('display.max_rows', 1000)
from bs4 import BeautifulSoup

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [108]:
path = '/content/drive/MyDrive/Ashik/Thesis/'
raw_data = pd.read_csv(f'{path}data/dataset-2.csv')
raw_data.shape

(2111, 3)

In [121]:
import re
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from datasets import Dataset
from sklearn.model_selection import train_test_split
from datasets import DatasetDict
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import nltk

nltk.download('stopwords')
nltk.download('punkt')

CheckPoint = "bert-base-uncased"
ds = DatasetDict()

stemmer = PorterStemmer()

def clean_text(text):
    text = BeautifulSoup(text, "html.parser").get_text()
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    text = text.lower()
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    stemmed_tokens = [stemmer.stem(word) for word in tokens]
    cleaned_text = ' '.join(stemmed_tokens)
    return cleaned_text

raw_data['description'] = raw_data['description'].apply(clean_text)
df = raw_data

df['labels'] = df['category']

train_df, test_df = train_test_split(df, test_size=0.2, shuffle=True, random_state=1234 )
train_df.shape, test_df.shape


hf_train = Dataset.from_pandas(train_df, preserve_index=False)
hf_val = Dataset.from_pandas(test_df, preserve_index=False)

hf_train = hf_train.class_encode_column("labels")
hf_val = hf_val.class_encode_column("labels")

print(hf_train.features)


ds['train'] = hf_train
ds['validation'] = hf_val



tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def tokenize_function(example):
  return tokenizer(example['description'], padding=True, truncation=True)


tokenized_datasets = ds.map(tokenize_function, batched=True)
print(tokenized_datasets)


model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=9).to('cuda')

training_args = TrainingArguments(
    "test-trainer",
    per_device_train_batch_size= 16,
    per_device_eval_batch_size= 16,
    num_train_epochs = 5,
    learning_rate = 2e-5,
    weight_decay = 0.01)

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    #data_collator=data_collator,
    tokenizer=tokenizer,
)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Casting to class labels:   0%|          | 0/1688 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/423 [00:00<?, ? examples/s]

{'collection': Value(dtype='string', id=None), 'description': Value(dtype='string', id=None), 'category': Value(dtype='string', id=None), 'labels': ClassLabel(names=['arts', 'avatar', 'collectibles', 'games', 'memberships'], id=None)}


Map:   0%|          | 0/1688 [00:00<?, ? examples/s]

Map:   0%|          | 0/423 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['collection', 'description', 'category', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1688
    })
    validation: Dataset({
        features: ['collection', 'description', 'category', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 423
    })
})


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [122]:
from sklearn import preprocessing
from sklearn import metrics

trainer.train()
trainer.save_model(f"path/model_checkpoint/{CheckPoint}")

Step,Training Loss
500,0.4907


Checkpoint destination directory test-trainer/checkpoint-500 already exists and is non-empty. Saving will proceed but saved results may be invalid.


In [123]:
predictions = trainer.predict(tokenized_datasets['validation'])

test_df['prediction_labels'] = predictions.label_ids
test_df['prediction_label_argmax'] = [np.argmax(prediction) for prediction in predictions.predictions]

print(test_df.shape)

le = preprocessing.LabelEncoder()
test_df['actual_label'] = le.fit_transform(test_df.labels.values)


(423, 6)


In [170]:
num_correct_labels = (test_df['prediction_labels'] == test_df['prediction_label_argmax']).sum()

acc = metrics.accuracy_score(test_df.actual_label, test_df.prediction_label_argmax)
f1 = metrics.f1_score(test_df.actual_label, test_df.prediction_label_argmax, average='macro')
precision = metrics.precision_score(test_df.actual_label, test_df.prediction_label_argmax, average='macro')
recall =  metrics.recall_score(test_df.actual_label, test_df.prediction_label_argmax, average='macro')

print(f"tot samples                : {num_correct_labels}")
print(f"num_correct_labels         : {num_correct_labels}")
print(f"accuracy                   : {acc}")
print(f"f1_score                   : {f1}")
print(f"recall_score               : {recall}")
print(f"precision_score            : {precision}")


tot samples                : 362
num_correct_labels         : 362
accuracy                   : 0.8557919621749409
f1_score                   : 0.8515586044298582
recall_score               : 0.8637969819572607
precision_score            : 0.8442615452639701


In [64]:
data = {
      'tot_samples': len(test_df),
        'is_correct': num_correct_labels,
        'accuracy': acc,
        'f1_score': f1,
        'recall_score': recall,
        'precision_score': precision,
        'actual_label': test_df.actual_label,
        'prediction_label_argmax': test_df.prediction_label_argmax,
}
df = pd.DataFrame(data)
df

Unnamed: 0,tot_samples,is_correct,accuracy,f1_score,recall_score,precision_score,actual_label,prediction_label_argmax
836,423,363,0.858156,0.848637,0.857043,0.84279,4,4
1189,423,363,0.858156,0.848637,0.857043,0.84279,3,3
1290,423,363,0.858156,0.848637,0.857043,0.84279,3,3
1286,423,363,0.858156,0.848637,0.857043,0.84279,3,3
910,423,363,0.858156,0.848637,0.857043,0.84279,4,4
602,423,363,0.858156,0.848637,0.857043,0.84279,4,4
1325,423,363,0.858156,0.848637,0.857043,0.84279,3,3
1494,423,363,0.858156,0.848637,0.857043,0.84279,3,3
809,423,363,0.858156,0.848637,0.857043,0.84279,4,4
1714,423,363,0.858156,0.848637,0.857043,0.84279,2,2


In [65]:
df.to_csv(f'{path}results/approach-2-bert-base.csv', index = False)

In [2]:
import pandas as pd
path = '/content/drive/MyDrive/Ashik/Thesis/'

df = pd.read_csv(f'{path}results/approach-2-bert-base.csv')
actual_labels = df['actual_label']
predicted_labels = df['prediction_label_argmax']
classes = ['arts', 'avatar', 'collectibles', 'games', 'memberships']

In [23]:
import plotly.graph_objs as go
import plotly.io as pio
import pandas as pd

def plot_confusion_matrix(true, pred, labels):
    from sklearn.metrics import confusion_matrix
    confusion_matrix = confusion_matrix(true, pred, normalize='true')
    cm_df = pd.DataFrame(confusion_matrix,
                         index=labels,
                         columns=labels)

    # Reverse the order of the labels for the y-axis
    reversed_labels = labels[::-1]

    columns = cm_df.columns
    columns = [i.capitalize() for i in columns]

    reversed_labels = [i.capitalize() for i in reversed_labels]

    fig = go.Figure(data=go.Heatmap(z=cm_df.values[::-1],  # Reverse the rows
                                     x=columns,
                                     y=reversed_labels,  # Use reversed labels for y-axis
                                     text=cm_df.values[::-1],  # Reverse the rows for text as well
                                     texttemplate="%{text:.1%}",
                                     textfont=dict(size=12),  # Set the font size
                                     colorbar=dict(title='Percentage', tickformat='.1%'),
                                     hoverinfo='skip',
                                     colorscale='Greens'))  # Change the colorscale here

    fig.update_layout(
                      xaxis_title=dict(text='Predicted Label', font=dict(size=15, family='Palatino')),  # Capitalize first letter
                      yaxis_title=dict(text='True Label', font=dict(size=15, family='Palatino')),  # Capitalize first letter
                      font=dict(family='Palatino'),
                      margin=dict(l=0, r=0, t=50, b=50),
                      xaxis=dict(title_standoff=30),
                      yaxis=dict(title_standoff=30))  # Adjust the position of the y-axis title

    # Increase font size for x-axis labels (columns) and y-axis labels (reversed_labels)
    fig.update_xaxes(tickfont=dict(size=12, family='Palatino'))
    fig.update_yaxes(tickfont=dict(size=12, family='Palatino'))

    pio.write_image(fig, 'approach-1-bert-base-cm.jpeg', format='jpeg', scale=10)

    fig.show()


In [27]:
plot_confusion_matrix(actual_labels,predicted_labels,['arts', 'avatar', 'collectibles', 'games', 'memberships'])

In [25]:
# !pip install -U kaleido

In [None]:
le.classes_

array(['Collectibles', 'arts', 'avatar', 'games', 'metaverse', 'others'],
      dtype=object)