In [1]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.5-cp310-cp310-manylinux2014_x86_64.whl (98.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 MB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.5


In [3]:
import pandas as pd
import re
import numpy as np
import matplotlib.pyplot as plt

train_data = pd.read_csv('train_spam.csv')

print(train_data.head())
print(train_data.info())
print(train_data['text_type'].value_counts())

  text_type                                               text
0       ham  make sure alex knows his birthday is over in f...
1       ham  a resume for john lavorato thanks vince i will...
2      spam  plzz visit my website moviesgodml to get all m...
3      spam  urgent your mobile number has been awarded wit...
4       ham  overview of hr associates analyst project per ...
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16278 entries, 0 to 16277
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   text_type  16278 non-null  object
 1   text       16278 non-null  object
dtypes: object(2)
memory usage: 254.5+ KB
None
text_type
ham     11469
spam     4809
Name: count, dtype: int64


In [4]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = text.strip()
    return text

train_data['text'] = train_data['text'].apply(preprocess_text)
vectorizer = TfidfVectorizer(max_features=1000, min_df=5, max_df=0.7)

X = vectorizer.fit_transform(train_data['text']).toarray()

y = train_data['text_type'].values



In [5]:
import numpy as np
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

label_encoder.fit(y)
encoded_y = label_encoder.transform(y)

encoded_y

array([0, 0, 1, ..., 0, 0, 0])

In [6]:
print(np.mean(encoded_y))

0.2954294139329156


In [7]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X, encoded_y, test_size=0.3, random_state=42, shuffle =True)

## Логистическая регрессия

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV

logistic = LogisticRegression(random_state=42, class_weight={0: 0.705, 1: 0.295}) # если не указывать веса скор немного лучше
#но после оптимизации скор выше при указании весов

logistic.fit(X_train, y_train)

preds = logistic.predict(X_val)

print(roc_auc_score(preds, y_val))

0.9037157184758662


In [9]:
logreg_params = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}

logreg = LogisticRegression(random_state=42)
grid_search = GridSearchCV(logistic, logreg_params, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best score: {grid_search.best_score_}")

Best parameters: {'C': 100}
Best score: 0.9081090045731901


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [27]:
logistic = LogisticRegression(random_state=42, C= 10,class_weight={0: 0.705, 1: 0.295}, n_jobs=-1)

logistic.fit(X_train, y_train)

preds = logistic.predict(X_val)

print(roc_auc_score(preds, y_val))

0.9251340420256617


## Дерево

В деревьях в данной задаче мало смысла (как, скорее всего и в лесе, и, вероятнее всего, в бустинге над деревьями)

In [11]:
from sklearn.tree import DecisionTreeClassifier

tree = DecisionTreeClassifier(random_state=42, class_weight={0: 0.705, 1: 0.295})
tree.fit(X_train, y_train)

preds = tree.predict(X_val)

print(roc_auc_score(preds, y_val))

0.8509339548274325


In [12]:
tree_params = {'max_depth': [25, 50, 100, 150, 200],
               'max_features': [None, 'sqrt', 'log2']}

tree = DecisionTreeClassifier(random_state=42, class_weight={0: 0.705, 1: 0.295})
grid_search = GridSearchCV(tree, tree_params, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best score: {grid_search.best_score_}")

Best parameters: {'max_depth': 50, 'max_features': None}
Best score: 0.8738809244693602


In [13]:
tree = DecisionTreeClassifier(random_state=42,
                              class_weight={0: 0.705, 1: 0.295},
                              max_depth=50,
                              max_features=None)

tree.fit(X_train, y_train)

preds = tree.predict(X_val)

print(roc_auc_score(preds, y_val))

0.8770942763668573


## Лес

In [14]:
from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier(random_state=42,
                                class_weight={0: 0.705, 1: 0.295},
                                n_jobs=-1, n_estimators=300)
forest.fit(X_train, y_train)

preds = forest.predict(X_val)

print(roc_auc_score(preds, y_val))


0.9196082403509229


In [16]:
forest_params = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, 30]}


# ,
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2]
forest = RandomForestClassifier(random_state=42, class_weight={0: 0.705, 1: 0.295}, n_jobs=-1)
grid_search = GridSearchCV(forest, forest_params, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best score: {grid_search.best_score_}")

KeyboardInterrupt: 

In [None]:
forest = RandomForestClassifier(random_state=42,
                              class_weight={0: 0.705, 1: 0.295},
                              max_depth=50,
                              n_jobs=-1)

forest.fit(X_train, y_train)

preds = forest.predict(X_val)

print(roc_auc_score(preds, y_val))

## Бустинг

In [15]:
from catboost import CatBoostClassifier

catboost = CatBoostClassifier(random_state=42, verbose=0)

catboost.fit(X_train, y_train)

preds = catboost.predict(X_val)

print(roc_auc_score(preds, y_val))

0.9276389828763092


In [None]:
catboost = CatBoostClassifier(random_state=42, verbose=0, early_stopping_rounds=10)

params = {
    'iterations': [100, 300, 500],
    'learning_rate': [0.01, 0.1],
    'depth': [4, 10, 30],
    'l2_leaf_reg': [1, 3, 5]
}

grid_search = GridSearchCV(catboost, params, cv=5, scoring='roc_auc')
grid_search.fit(X_train, y_train)

print("Лучшие параметры:", grid_search.best_params_)

In [None]:
catboost = CatBoostClassifier(random_state=42, verbose=0)


forest.fit(X_train, y_train)

preds = forest.predict(X_val)

print(roc_auc_score(preds, y_val))

## Нейронка (обычная полносвязная)

In [17]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


class SpamClassifier(nn.Module):
    def __init__(self):
        super(SpamClassifier, self).__init__()
        self.fc1 = nn.Linear(1000, 512)
        self.relu1 = nn.ReLU()
        self.dropout1 = nn.Dropout(0.3)
        self.fc2 = nn.Linear(512, 128)
        self.relu2 = nn.ReLU()
        self.dropout2 = nn.Dropout(0.3)
        self.fc3 = nn.Linear(128, 32)
        self.relu3 = nn.ReLU()
        self.dropout3 = nn.Dropout(0.3)
        self.fc4 = nn.Linear(32, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu1(x)
        x = self.dropout1(x)
        x = self.fc2(x)
        x = self.relu2(x)
        x = self.dropout2(x)
        x = self.fc3(x)
        x = self.relu3(x)
        x = self.dropout3(x)
        x = self.fc4(x)
        x = self.sigmoid(x)
        return x.squeeze()

model = SpamClassifier()
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

train_data = TensorDataset(torch.tensor(X_train).float(), torch.tensor(y_train).float())
train_loader = DataLoader(train_data, batch_size=64, shuffle=True)

val_data = TensorDataset(torch.tensor(X_val).float(), torch.tensor(y_val).float())
val_loader = DataLoader(val_data, batch_size=64, shuffle=False)

def validate_model(model, data_loader):
    model.eval()
    all_outputs = []
    all_targets = []
    with torch.no_grad():
        for inputs, targets in data_loader:
            outputs = model(inputs)
            all_outputs.extend(outputs.cpu().numpy())
            all_targets.extend(targets.cpu().numpy())
    model.train()
    roc_score = roc_auc_score(all_targets, all_outputs)
    return roc_score

def train_model(num_epochs):
    best_roc_auc = 0
    for epoch in range(num_epochs):
        all_train_outputs = []
        all_train_targets = []
        for inputs, targets in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            all_train_outputs.extend(outputs.detach().cpu().numpy())
            all_train_targets.extend(targets.detach().cpu().numpy())

        train_roc_auc = roc_auc_score(all_train_targets, all_train_outputs)
        val_roc_auc = validate_model(model, val_loader)
        print(f'Epoch {epoch+1}, Training Loss: {loss.item()}, Training ROC AUC: {train_roc_auc}, Validation ROC AUC: {val_roc_auc}')

        if val_roc_auc > best_roc_auc:
            best_roc_auc = val_roc_auc
            print(f'New best ROC AUC: {best_roc_auc}. Saving model...')
            torch.save(model.state_dict(), 'best_model.pth')

train_model(20)

Epoch 1, Training Loss: 0.10642208158969879, Training ROC AUC: 0.8806090787691429, Validation ROC AUC: 0.9576821375635614
New best ROC AUC: 0.9576821375635614. Saving model...
Epoch 2, Training Loss: 0.020895473659038544, Training ROC AUC: 0.9724971836123004, Validation ROC AUC: 0.9662120153559896
New best ROC AUC: 0.9662120153559896. Saving model...
Epoch 3, Training Loss: 0.06769150495529175, Training ROC AUC: 0.9846758727492798, Validation ROC AUC: 0.968591927774067
New best ROC AUC: 0.968591927774067. Saving model...
Epoch 4, Training Loss: 0.0005925592267885804, Training ROC AUC: 0.9916253093982049, Validation ROC AUC: 0.9677565136625548
Epoch 5, Training Loss: 0.005129648372530937, Training ROC AUC: 0.9958439530023575, Validation ROC AUC: 0.969117910308828
New best ROC AUC: 0.969117910308828. Saving model...
Epoch 6, Training Loss: 1.653617982810829e-05, Training ROC AUC: 0.9975269468739568, Validation ROC AUC: 0.9676093162943088
Epoch 7, Training Loss: 0.0030664820224046707, Tra

## Обучим на всей выборке

Я решил, что в данном случае имеет место обучение на всех данных по двум причинам:
1. состоятельность прогнозов при небольшом количестве эпох
2. несмещенность при сравнении логситической и nn, обученных только на тестовых данных (сравненение предиктов дает почти всех метрик точности давало ~90%)

но это касается только линейных моделей, деревья и все, что с ними связано трудно сказать нечто подобное (метрики типа accuracy, roc_auc дают 58-65%)
поэтому выбор в пользу нейросети

In [19]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


class SpamClassifier(nn.Module):
    def __init__(self):
        super(SpamClassifier, self).__init__()
        self.fc1 = nn.Linear(1000, 512)
        self.relu1 = nn.ReLU()
        self.dropout1 = nn.Dropout(0.3)  # Dropout после первого слоя ReLU
        self.fc2 = nn.Linear(512, 128)
        self.relu2 = nn.ReLU()
        self.dropout2 = nn.Dropout(0.3)  # Dropout после второго слоя ReLU
        self.fc3 = nn.Linear(128, 32)
        self.relu3 = nn.ReLU()
        self.dropout3 = nn.Dropout(0.3)  # Dropout после третьего слоя ReLU
        self.fc4 = nn.Linear(32, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu1(x)
        x = self.dropout1(x)
        x = self.fc2(x)
        x = self.relu2(x)
        x = self.dropout2(x)
        x = self.fc3(x)
        x = self.relu3(x)
        x = self.dropout3(x)
        x = self.fc4(x)
        x = self.sigmoid(x)
        return x.squeeze()

model = SpamClassifier()
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

train_data = TensorDataset(torch.tensor(X).float(), torch.tensor(encoded_y).float())
train_loader = DataLoader(train_data, batch_size=64, shuffle=True)

def train_model(num_epochs):
    best_roc_auc = 0
    for epoch in range(num_epochs):
        all_train_outputs = []
        all_train_targets = []
        for inputs, targets in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            all_train_outputs.extend(outputs.detach().cpu().numpy())
            all_train_targets.extend(targets.detach().cpu().numpy())

        train_roc_auc = roc_auc_score(all_train_targets, all_train_outputs)
        print(f'Epoch {epoch+1}, Training Loss: {loss.item()}, Training ROC AUC: {train_roc_auc}')


train_model(8)

Epoch 1, Training Loss: 0.15256589651107788, Training ROC AUC: 0.9199367445086588
Epoch 2, Training Loss: 0.15868474543094635, Training ROC AUC: 0.9753473252851298
Epoch 3, Training Loss: 0.15745721757411957, Training ROC AUC: 0.9857674872518379
Epoch 4, Training Loss: 0.1520531177520752, Training ROC AUC: 0.9921021471696713
Epoch 5, Training Loss: 0.1011425331234932, Training ROC AUC: 0.9962422413971131
Epoch 6, Training Loss: 0.06797458231449127, Training ROC AUC: 0.997617579921653
Epoch 7, Training Loss: 0.020524239167571068, Training ROC AUC: 0.9982887319223241
Epoch 8, Training Loss: 0.005465001333504915, Training ROC AUC: 0.9985932043416792
Epoch 9, Training Loss: 0.002809339901432395, Training ROC AUC: 0.998776779834204
Epoch 10, Training Loss: 8.697345037944615e-05, Training ROC AUC: 0.998826286291719


In [34]:
test_data = pd.read_csv('test_spam.csv')

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = text.strip()
    return text

test_data['text'] = test_data['text'].apply(preprocess_text)

vectorizer = TfidfVectorizer(max_features=1000, min_df=5, max_df=0.7)
X_test = vectorizer.fit_transform(test_data['text']).toarray()


X_test_tensor = torch.tensor(X_test).float().to(device)

model = SpamClassifier()
model.load_state_dict(torch.load('best_model.pth'))
model = model.to(device)
model.eval()

with torch.no_grad():
    predictions = model(X_test_tensor)
    predictions = predictions.cpu().numpy()

output = pd.DataFrame(data={"prediction": predictions.flatten()})
output.to_csv("predictions.csv", index=False)