In [1]:
import pandas as pd 
from tqdm import tqdm 
import transformers
from transformers import DistilBertTokenizerFast
from transformers import DistilBertForSequenceClassification

In [2]:
df = pd.read_csv('onestop_split.csv')
df

Unnamed: 0,text,label
0,"﻿When you see the word Amazon, what’s the firs...",0
1,"Amazon has applied for many new domains, inclu...",0
2,"﻿To tourists, Amsterdam still seems very liber...",0
3,One Dutch newspaper wrote that in the 19th cen...,0
4,"﻿Anitta, a music star from Brazil, has million...",0
...,...,...
1129,"So far, the tournament has avoided the worst D...",2
1130,﻿It is not just the world’s biggest burger cha...,2
1131,"Yet, just as McDonald’s has been losing the cu...",2
1132,﻿More than one million British workers might b...,2


In [3]:
texts = df['text']
labels = df['label']

In [4]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2, random_state=42)

In [5]:
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased', model_max_len=512)

In [6]:
train_encodings = tokenizer(list(X_train), truncation=True, padding=True)
test_encodings = tokenizer(list(X_test), truncation=True, padding=True)

In [7]:
train_labels,test_labels = list(y_train),list(y_test)

In [8]:
import torch

class TextDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = TextDataset(train_encodings, train_labels)
test_dataset = TextDataset(test_encodings, test_labels)

In [9]:
# from sklearn.metrics import precision_recall_fscore_support
# from sklearn.metrics import accuracy_score

# def compute_metrics(pred):
#     labels = pred.label_ids
#     preds = pred.predictions.argmax(-1)
#     precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
#     acc = accuracy_score(labels, preds)
#     return {
#         'accuracy': acc,
#         'f1': f1,
#         'precision': precision,
#         'recall': recall
#     }

In [10]:
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='./results_2',          # output directory
    num_train_epochs=1,              # total number of training epochs
    per_device_train_batch_size=8,  # batch size per device during training
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
)

model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased",num_labels = 3,seq_classif_dropout=0.3)

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
)

trainer.train()


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.bias', 'pre_classi

{'loss': 1.1322, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.09}


 18%|█▊        | 20/114 [03:43<17:34, 11.22s/it]

{'loss': 1.0999, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.18}


 26%|██▋       | 30/114 [05:36<15:56, 11.38s/it]

{'loss': 1.0967, 'learning_rate': 3e-06, 'epoch': 0.26}


 35%|███▌      | 40/114 [07:30<14:04, 11.41s/it]

{'loss': 1.1145, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.35}


 44%|████▍     | 50/114 [09:22<11:52, 11.14s/it]

{'loss': 1.0869, 'learning_rate': 5e-06, 'epoch': 0.44}


 53%|█████▎    | 60/114 [11:13<09:57, 11.06s/it]

{'loss': 1.1127, 'learning_rate': 6e-06, 'epoch': 0.53}


 61%|██████▏   | 70/114 [13:04<08:09, 11.12s/it]

{'loss': 1.0838, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.61}


 70%|███████   | 80/114 [14:55<06:23, 11.27s/it]

{'loss': 1.089, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.7}


 79%|███████▉  | 90/114 [16:48<04:32, 11.35s/it]

{'loss': 1.0646, 'learning_rate': 9e-06, 'epoch': 0.79}


 88%|████████▊ | 100/114 [18:41<02:37, 11.25s/it]

{'loss': 1.0551, 'learning_rate': 1e-05, 'epoch': 0.88}


 96%|█████████▋| 110/114 [20:34<00:45, 11.32s/it]

{'loss': 0.9401, 'learning_rate': 1.1000000000000001e-05, 'epoch': 0.96}


100%|██████████| 114/114 [21:12<00:00,  9.27s/it]

Training completed. Do not forget to share your model on huggingface.co/models =)


100%|██████████| 114/114 [21:12<00:00, 11.16s/it]

{'train_runtime': 1272.8262, 'train_samples_per_second': 0.713, 'train_steps_per_second': 0.09, 'train_loss': 1.0700274747714662, 'epoch': 1.0}





TrainOutput(global_step=114, training_loss=1.0700274747714662, metrics={'train_runtime': 1272.8262, 'train_samples_per_second': 0.713, 'train_steps_per_second': 0.09, 'train_loss': 1.0700274747714662, 'epoch': 1.0})

In [11]:
trainer.save_model()

Saving model checkpoint to ./results_2
Configuration saved in ./results_2\config.json
Model weights saved in ./results_2\pytorch_model.bin


In [12]:
# model.eval()

In [13]:
# import torch
# with torch.no_grad():
#         model = DistilBertForSequenceClassification.from_pretrained('results',num_labels=3)
#         model.eval()
# # Load pre-trained model tokenizer (vocabulary)
# tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

In [14]:
# device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [15]:
# model = DistilBertForSequenceClassification.from_pretrained('results',num_labels=3)
# model.to(device)

In [16]:
# def score(sentence):
#     tokenize_input = tokenizer.encode(sentence, truncation=True, padding=True)
#     tensor_input = torch.tensor([tokenize_input])
#     loss=model(tensor_input, labels=tensor_input)[0]
#     return np.exp(loss.detach().numpy())


In [17]:
# tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased', model_max_len=512)

In [18]:
# from transformers import pipeline

# pipe = pipeline('sentiment-analysis',model='results', tokenizer=DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased',config={'max_length':512, 'truncation':True, 'padding':True}))

In [19]:
# def score(sentence):
#     tokenize_input = tokenizer.encode(sentence, truncation=True, padding=True)
#     tensor_input = torch.tensor([tokenize_input])
#     loss=model(tensor_input, labels=tensor_input)[0]
#     return np.exp(loss.detach().numpy())


In [20]:
# dev_res = pipe(list(X_test), max_length=512,truncation=True)
# dev_res_labels = [int(x['label'][-1]) for x in dev_res]

In [21]:
# from sklearn.metrics import accuracy_score 
# accuracy_score(dev_res_labels,list(y_test))

In [22]:
# untuned_pipe = pipeline('sentiment-analysis',model=DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased",num_labels = 3), tokenizer=DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased',config={'max_length':512, 'truncation':True, 'padding':True}))
# dev_res_unt = untuned_pipe(list(X_test), max_length=512,truncation=True)
# dev_res_labels_unt = [int(x['label'][-1]) for x in dev_res_unt]
# accuracy_score(dev_res_labels_unt,list(y_test))