In [1]:
import evaluate
import numpy as np
import pandas as pd
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from transformers import AutoTokenizer
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict




In [2]:
data = pd.read_csv('https://raw.githubusercontent.com/mohitgupta-omg/Kaggle-SMS-Spam-Collection-Dataset-/master/spam.csv', encoding='latin-1')
data.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1, inplace=True)

data.columns = ['label', 'text']

data.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
data['label'] = data['label'].map(lambda x: 1 if x == 'spam' else 0)
data

Unnamed: 0,label,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will Ì_ b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


In [4]:
train, test = train_test_split(data, test_size=0.3)

In [5]:
train

Unnamed: 0,label,text
5149,0,Pls call me da. What happen.
3897,0,Otherwise had part time job na-tuition..
3097,0,This is all just creepy and crazy to me.
2765,0,and picking them up from various points
1274,0,Let me know how to contact you. I've you settl...
...,...,...
5062,0,Ok i also wan 2 watch e 9 pm show...
5538,0,I can't believe how attached I am to seeing yo...
3778,1,"Claim a 200 shopping spree, just call 08717895..."
4868,0,1. Tension face 2. Smiling face 3. Waste face ...


In [6]:
train_dataset = Dataset.from_pandas(train, preserve_index = False)
test_dataset = Dataset.from_pandas(test, preserve_index = False)

ds = DatasetDict()

ds['train'] = train_dataset
ds['test'] = test_dataset

In [7]:
ds

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 3900
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 1672
    })
})

In [8]:
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)



In [9]:
tokenized_datasets = ds.map(tokenize_function, batched=False)
tokenized_datasets

Map:   0%|          | 0/3900 [00:00<?, ? examples/s]

Map:   0%|          | 0/1672 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3900
    })
    test: Dataset({
        features: ['label', 'text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1672
    })
})

In [10]:
train_dataset = tokenized_datasets["train"].shuffle(seed=42)
test_dataset = tokenized_datasets["test"].shuffle(seed=42)
train_dataset

Dataset({
    features: ['label', 'text', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 3900
})

In [11]:
model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-cased", num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
metric = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [13]:
training_args = TrainingArguments(
    output_dir="/test_trainer",
    eval_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    fp16=True,
    dataloader_num_workers=6,
    gradient_accumulation_steps=2
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

trainer.train()


Epoch,Training Loss,Validation Loss,F1
1,No log,0.094441,0.920308
2,No log,0.057401,0.952381
3,0.058300,0.042546,0.970874


TrainOutput(global_step=732, training_loss=0.041856164977850156, metrics={'train_runtime': 713.1854, 'train_samples_per_second': 16.405, 'train_steps_per_second': 1.026, 'total_flos': 3078399347712000.0, 'train_loss': 0.041856164977850156, 'epoch': 3.0})

Значение метрики `f1` улучшилось, по сравнению с `tf_idf`