Для начала устанавливаем pandas и scikit-learn и импортируем всё необходимое

In [2]:
#Импорты
!pip install pandas scikit-learn
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score

Загружаем файл с датасетом SMS Spam Collection на Google Диск и читаем его в Colab (диск должен быть заранее смонтирован в /content/drive):

In [12]:
# Load the SMS Spam Collection: a tab-separated file without a header row,
# so the two columns are named explicitly.
# NOTE(review): the raw file may contain unbalanced double quotes, which the
# default CSV quoting can merge into neighbouring rows — consider
# quoting=csv.QUOTE_NONE and verify the resulting row count.
data = pd.read_csv(
    '/content/drive/MyDrive/Spam/SMSSpamCollection',
    sep='\t',
    header=None,
    names=['label', 'message'],
)
data

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


Заменяем для удобства spam и ham на 1 и 0



In [13]:
# Preprocessing: encode the text labels as integers (ham -> 0, spam -> 1).
# The lookup falls back to the value itself, so re-running this cell on
# already-encoded labels is a no-op. A plain dict-based .map() would turn
# every label into NaN on the second run, because 0 and 1 are not dict keys.
label_codes = {'ham': 0, 'spam': 1}
data['label'] = data['label'].map(lambda value: label_codes.get(value, value))

# Fail loudly if the file contained a label other than ham/spam.
assert data['label'].isin([0, 1]).all(), "unexpected values in the 'label' column"
data

Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will ü b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


Разделяем данные стандартным способом: 80% — обучающая выборка, 20% — тестовая

In [5]:
# Hold out 20% of the messages for testing; the fixed random_state makes the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    data['message'],
    data['label'],
    test_size=0.2,
    random_state=42,
)

Преобразуем текст в числовые признаки (TF-IDF), чтобы модель могла с ними работать

In [6]:
# Vectorization: TF-IDF with English stop words removed.
# The vectorizer is fitted on the training messages only; the test messages
# are transformed with that already-fitted vectorizer.
vectorizer = TfidfVectorizer(stop_words='english')
X_train_tfidf, X_test_tfidf = (
    vectorizer.fit_transform(X_train),
    vectorizer.transform(X_test),
)

Обучаем модель. Здесь используется MultinomialNB — реализация наивного байесовского классификатора для мультиномиально распределённых признаков. Этот алгоритм очень часто применяется в задачах классификации текстов, поэтому его и используем

In [7]:
# Train a multinomial Naive Bayes classifier on the TF-IDF features.
model = MultinomialNB()
model.fit(X=X_train_tfidf, y=y_train)

In [10]:
# Model quality: predict on the held-out messages, then print the overall
# accuracy followed by the per-class precision / recall / F1 report.
y_pred = model.predict(X_test_tfidf)
test_accuracy = accuracy_score(y_test, y_pred)
test_report = classification_report(y_test, y_pred)
print("Accuracy:", test_accuracy)
print("Metrics:\n", test_report)

Accuracy: 0.97847533632287
Metrics:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99       966
           1       1.00      0.84      0.91       149

    accuracy                           0.98      1115
   macro avg       0.99      0.92      0.95      1115
weighted avg       0.98      0.98      0.98      1115



Попробуем другой способ — BERT. Это модель, разработанная компанией Google для понимания естественного языка: она использует архитектуру трансформеров и предобучена на больших объёмах текстовых данных, что позволяет ей учитывать контекст и семантику текста

In [25]:
!pip install transformers datasets scikit-learn
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import InputExample, InputFeatures
import tensorflow as tf



In [27]:
# Re-read the dataset and encode the labels (ham -> 0, spam -> 1) so this
# section starts from a freshly loaded frame.
data = pd.read_csv(
    '/content/drive/MyDrive/Spam/SMSSpamCollection',
    sep='\t',
    header=None,
    names=['label', 'message'],
)
data['label'] = data['label'].map({'ham': 0, 'spam': 1})

# Same split parameters (test_size=0.2, random_state=42) as the Naive Bayes
# section above.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    data['message'],
    data['label'],
    test_size=0.2,
    random_state=42,
)

# Pretrained tokenizer and sequence-classification model.
# NOTE: `model` is rebound here, so the Naive Bayes classifier trained
# earlier is no longer reachable under this name.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased')
data

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will ü b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


In [29]:
def convert_data_to_examples(train, test, train_y=None, test_y=None):
    """Wrap texts and their labels into ``InputExample`` objects.

    Args:
        train: Sequence of training texts (e.g. a pandas Series of strings).
        test: Sequence of validation texts.
        train_y: Labels aligned position-by-position with ``train``. When
            omitted, the notebook-level ``train_labels`` is used, which keeps
            the original two-argument call working.
        test_y: Labels aligned position-by-position with ``test``. When
            omitted, the notebook-level ``val_labels`` is used.

    Returns:
        A ``(train_input, validation_input)`` tuple of lists of
        ``InputExample``.

    Raises:
        ValueError: If a text sequence and its labels differ in length.
    """
    # Fall back to the notebook globals only when labels are not passed in.
    if train_y is None:
        train_y = train_labels
    if test_y is None:
        test_y = val_labels

    def build(texts, labels, split_name):
        # A length mismatch would otherwise pair texts with the wrong labels
        # or silently drop the tail of the longer sequence.
        if len(texts) != len(labels):
            raise ValueError(
                f"{split_name}: got {len(texts)} texts but {len(labels)} labels"
            )
        return [
            InputExample(guid=None, text_a=text, text_b=None, label=label)
            for text, label in zip(texts, labels)
        ]

    return build(train, train_y, "train"), build(test, test_y, "validation")

def convert_examples_to_tf_dataset(examples, tokenizer, max_length=128):
    """Tokenize examples and expose them as a ``tf.data.Dataset``.

    Args:
        examples: Iterable of objects with ``text_a`` and ``label`` attributes.
        tokenizer: Tokenizer providing ``encode_plus``.
        max_length: Length every sequence is padded or truncated to.

    Returns:
        A dataset yielding ``({'input_ids', 'attention_mask'}, label)`` pairs,
        with int32 token tensors and an int64 scalar label.
    """
    encoded = []
    for e in examples:
        input_dict = tokenizer.encode_plus(
            e.text_a,
            add_special_tokens=True,
            max_length=max_length,
            # `padding='max_length'` replaces the deprecated
            # `pad_to_max_length=True` flag.
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
        )
        encoded.append(
            (input_dict['input_ids'], input_dict['attention_mask'], e.label)
        )

    def gen():
        for input_ids, attention_mask, label in encoded:
            yield ({"input_ids": input_ids, "attention_mask": attention_mask}, label)

    # `output_signature` supersedes the deprecated output_types/output_shapes
    # arguments; dtypes and shapes are the same as before.
    return tf.data.Dataset.from_generator(
        gen,
        output_signature=(
            {
                'input_ids': tf.TensorSpec(shape=(None,), dtype=tf.int32),
                'attention_mask': tf.TensorSpec(shape=(None,), dtype=tf.int32),
            },
            tf.TensorSpec(shape=(), dtype=tf.int64),
        ),
    )

# Turn both splits into InputExample lists, then into batched tf.data
# pipelines.
train_input, validation_input = convert_data_to_examples(train_texts, val_texts)

# NOTE(review): .repeat(2) makes one pass over `train_data` cover the training
# set twice; combined with epochs=2 in fit() that is four passes in total —
# confirm this is intended.
train_data = (
    convert_examples_to_tf_dataset(train_input, tokenizer)
    .shuffle(100)
    .batch(32)
    .repeat(2)
)

# The validation data is only batched: no shuffling, no repetition.
validation_data = convert_examples_to_tf_dataset(validation_input, tokenizer).batch(32)





In [30]:
# Fine-tune the model.
# The loss is built with from_logits=True, i.e. it expects raw logits rather
# than probabilities from the model.
optimizer = tf.keras.optimizers.Adam(
    learning_rate=3e-5,
    epsilon=1e-08,
)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=[metric],
)
model.fit(
    x=train_data,
    validation_data=validation_data,
    epochs=2,
)



Epoch 1/2
Epoch 2/2


<tf_keras.src.callbacks.History at 0x7ca382b90040>

In [32]:
#Оценка качества модели
import numpy as np
def evaluate(validation_data):
    """Score the notebook-level ``model`` on a batched dataset.

    Args:
        validation_data: Iterable of ``(features, labels)`` batches.

    Returns:
        A ``(accuracy, report)`` tuple: the result of ``accuracy_score`` and
        the result of ``classification_report`` over all batches.
    """
    batch_truth, batch_preds = [], []
    for features, labels in validation_data:
        outputs = model(features, training=False)
        # The first element of the model output holds the logits; the
        # predicted class is the argmax along the class axis.
        batch_preds.append(tf.argmax(outputs[0], axis=1).numpy())
        batch_truth.append(labels.numpy())

    truth = np.concatenate(batch_truth)
    preds = np.concatenate(batch_preds)

    overall_accuracy = accuracy_score(truth, preds)
    per_class_report = classification_report(truth, preds)
    return overall_accuracy, per_class_report

# Score the model on the validation batches and print both results.
accuracy, report = evaluate(validation_data)
print(f"Accuracy: {accuracy}", f"Classification Report:\n{report}", sep="\n")

Accuracy: 0.9713004484304932
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.97      0.98       966
           1       0.83      0.99      0.90       149

    accuracy                           0.97      1115
   macro avg       0.91      0.98      0.94      1115
weighted avg       0.98      0.97      0.97      1115

