In [1]:
import pandas as pd
# Загружаем данные из файла
file_path = 'SMSSpamCollection.csv'
data = pd.read_csv(file_path, sep='\t', header=None, names=['Label', 'Message'])

# Просмотр первых строк набора данных
data.head(), data.info(), data['Label'].value_counts()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Label    5572 non-null   object
 1   Message  5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


(  Label                                            Message
 0   ham  Go until jurong point, crazy.. Available only ...
 1   ham                      Ok lar... Joking wif u oni...
 2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
 3   ham  U dun say so early hor... U c already then say...
 4   ham  Nah I don't think he goes to usf, he lives aro...,
 None,
 Label
 ham     4825
 spam     747
 Name: count, dtype: int64)

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder

# Преобразование меток классов в числовой вид (ham -> 0, spam -> 1)
label_encoder = LabelEncoder()
data['Label'] = label_encoder.fit_transform(data['Label'])

# Разделение на признаки и целевую переменную
X = data['Message']
y = data['Label']

# Разделение данных на обучающую и тестовую выборки (80/20)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Преобразование текстов в числовые представления с помощью CountVectorizer
vectorizer = CountVectorizer(stop_words='english')
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

X_train_vec.shape, X_test_vec.shape


((4457, 7403), (1115, 7403))

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Инициализация и обучение модели
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train_vec, y_train)

# Предсказания на тестовых данных
y_pred = model.predict(X_test_vec)

# Оценка  качества модели 
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=label_encoder.classes_)

accuracy, report


(0.9757847533632287,
 '              precision    recall  f1-score   support\n\n         ham       0.97      1.00      0.99       966\n        spam       0.99      0.83      0.90       149\n\n    accuracy                           0.98      1115\n   macro avg       0.98      0.91      0.94      1115\nweighted avg       0.98      0.98      0.97      1115\n')