In [1]:
import numpy as np
import pandas as pd
import torch
from sklearn import preprocessing
from datasets import DatasetDict, Dataset
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset as TorchDataset
from transformers import DataCollatorWithPadding
from sklearn.metrics import accuracy_score


In [32]:
# pip install ipywidgets

In [2]:
df = pd.read_csv('D:\\Rayka_Company\\Azar\\Work\\ML-NFstream\\Model-Bert\\website_classification.csv').dropna().reset_index(drop=True)
df.columns
# df.rename('Category : Tag')

Index(['website_url', 'cleaned_website_text', 'Category'], dtype='object')

In [3]:
df.rename(columns={'Category': 'Tag'}, inplace=True)
df.columns
print(df)

                                            website_url  \
0                      https://travelsites.com/expedia/   
1                  https://travelsites.com/tripadvisor/   
2                 https://www.momondo.in/?ispredir=true   
3     https://www.ebookers.com/?AFFCID=EBOOKERS-UK.n...   
4     https://book.priceline.com/?refid=8431&refclic...   
...                                                 ...   
1337                           http://www.oldwomen.org/   
1338                         http://www.webcamslave.com   
1339                        http://www.buyeuroporn.com/   
1340  http://www.analdreamhouse.com/30/03/agecheck/i...   
1341                     http://www.world-sex-news.com/   

                                   cleaned_website_text     Tag  
0     official site good hotel accommodation big sav...  Travel  
1     expedia hotel book sites like use vacation wor...  Travel  
2     tripadvisor hotel book sites like previously d...  Travel  
3     cheap flights search 

In [4]:
Tag = list(set(df['Tag']))
Tag

['Business/Corporate',
 'News',
 'Photography',
 'Food',
 'Sports',
 'Forums',
 'Streaming Services',
 'E-Commerce',
 'Games',
 'Computers and Technology',
 'Health and Fitness',
 'Adult',
 'Social Networking and Messaging',
 'Education',
 'Law and Government',
 'Travel']

In [5]:
label2id = {Tag[i]: i for i in range(len(Tag))}

In [6]:
id2label = {i: Tag[i] for i in range(len(Tag))}

In [7]:
df["Tags"] = [label2id[df["Tag"][i]] for i in range(len(df))]

In [8]:
np.unique(df["Tags"])

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15],
      dtype=int64)

In [9]:
X_train, X_test, y_train, y_test = train_test_split(df["cleaned_website_text"],df['Tags'], test_size=0.3, random_state=42)
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.3, random_state=42)

In [10]:
dataset = DatasetDict(
    {
        "train": Dataset.from_dict({"text": X_train, "labels": y_train}),
        "test": Dataset.from_dict({"text": X_test, "labels": y_test}),
        "val": Dataset.from_dict({"text": X_val, "labels": y_val})
    }
)

In [11]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 939
    })
    test: Dataset({
        features: ['text', 'labels'],
        num_rows: 282
    })
    val: Dataset({
        features: ['text', 'labels'],
        num_rows: 121
    })
})

In [12]:
from transformers import AutoTokenizer
import os
# Model id to load the tokenizer
model_id = "bert-base-uncased"
save_dataset_path = "lm_dataset"
# Load Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Tokenize helper function
def tokenize(batch):
    return tokenizer(batch['text'], padding='max_length', truncation=True,return_tensors="pt")

# Tokenize dataset
# dataset = dataset.rename_column("label", "labels") # to match Trainer
tokenized_dataset = dataset.map(tokenize, batched=True, remove_columns=["text"])
# tokenized_dataset = tokenized_dataset.with_format("torch")

# save dataset to disk
tokenized_dataset["train"].save_to_disk(os.path.join(save_dataset_path,"train"))
tokenized_dataset["test"].save_to_disk(os.path.join(save_dataset_path,"test"))
tokenized_dataset["val"].save_to_disk(os.path.join(save_dataset_path,"val"))

Map:   0%|          | 0/939 [00:00<?, ? examples/s]

Map:   0%|          | 0/282 [00:00<?, ? examples/s]

Map:   0%|          | 0/121 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/939 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/282 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/121 [00:00<?, ? examples/s]

In [13]:
import torch.distributed as dist
from transformers import TrainingArguments, Trainer
# from optimum.neuron import NeuronTrainer as Trainer
from datasets import load_from_disk
from transformers import AutoModelForSequenceClassification

In [14]:
def parse_args():
    ...
def training_function(args):
    # load dataset from disk and tokenizer
    train_dataset = load_from_disk(os.path.join(args.dataset_path, "train"))
    # Download the model from huggingface.co/models
    model = AutoModelForSequenceClassification.from_pretrained(
        args.model_id, num_labels=num_labels, label2id=label2id, id2label=id2label
    )
    training_args = TrainingArguments(
        ...
    )

    # Create Trainer instance
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        val_dataset=val_dataset,
        compute_metrics=compute_metrics,
    )
    # Start training
    trainer.train()
    return model

In [15]:
import numpy as np
from datasets import load_metric

metric = load_metric("accuracy")

def compute_metrics(val_pred):
    logits, labels = val_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)


  metric = load_metric("accuracy")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [16]:
train_dataset = load_from_disk(os.path.join(save_dataset_path, "train"))
val_dataset = load_from_disk(os.path.join(save_dataset_path, "val"))

# Download the model from huggingface.co/models
model = AutoModelForSequenceClassification.from_pretrained(
    model_id, num_labels=16
)

# Corrected training arguments
training_args = TrainingArguments(
    output_dir="test_dir3",  # Specify the directory where the output files will be saved
    evaluation_strategy="steps",  # Corrected from 'valuation_strategy'
    eval_steps=500,  # Corrected from 'val_steps' to 'eval_steps'
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,  # Corrected from 'per_device_val_batch_size' to 'per_device_eval_batch_size'
    num_train_epochs=3,
    load_best_model_at_end=True,
    # Optional arguments (uncomment and use as needed)
    # seed=0,
    # learning_rate=0.000001,
)

# Create Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,  # Corrected from 'val_dataset' to 'eval_dataset'
    compute_metrics=compute_metrics,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
trainer.train()  

In [17]:
import torch
print(torch.cuda.is_available())

True


In [18]:
model_save_path = "D:\\Rayka_Company\\Azar\\Work\\ML-NFstream\\Model-Bert\\First-version-model"
# Save the model
model.save_pretrained(model_save_path)

In [19]:
tokenizer_save_path = "D:\\Rayka_Company\\Azar\\Work\\ML-NFstream\\Model-Bert\\First-version-model"
# Save the tokenizer
tokenizer.save_pretrained(tokenizer_save_path)


('D:\\Rayka_Company\\Azar\\Work\\ML-NFstream\\Model-Bert\\First-version-model\\tokenizer_config.json',
 'D:\\Rayka_Company\\Azar\\Work\\ML-NFstream\\Model-Bert\\First-version-model\\special_tokens_map.json',
 'D:\\Rayka_Company\\Azar\\Work\\ML-NFstream\\Model-Bert\\First-version-model\\vocab.txt',
 'D:\\Rayka_Company\\Azar\\Work\\ML-NFstream\\Model-Bert\\First-version-model\\added_tokens.json',
 'D:\\Rayka_Company\\Azar\\Work\\ML-NFstream\\Model-Bert\\First-version-model\\tokenizer.json')

In [20]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
# Load the model
model = AutoModelForSequenceClassification.from_pretrained(model_save_path)
# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(tokenizer_save_path)

In [21]:
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [24]:
class CustomDataset(Dataset):
    def __init__(self, texts, labels, tokenizer):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]  # Ensure idx is used directly as an integer
        label = self.labels[idx]
        encoding = self.tokenizer(text, truncation=True, padding=True, return_tensors="pt")
        encoding = {key: val.squeeze() for key, val in encoding.items()}  # Squeeze the batch dimension
        item = {"input_ids": encoding["input_ids"], "attention_mask": encoding["attention_mask"], "labels": torch.tensor(label)}
        return item


In [25]:
def evaluate(model, data_loader):
    model.eval()
    predictions, labels = [], []

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids']
            attention_mask = batch['attention_mask']
            labels.extend(batch['labels'].numpy())
            
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1).numpy()
            predictions.extend(preds)

    accuracy = accuracy_score(labels, predictions)
    return accuracy

accuracy = evaluate(model, test_loader)
print(f"Model accuracy on the test set: {accuracy}")


TypeError: list indices must be integers or slices, not list