Скачиваем данные, импортируем библиотеки

In [None]:
!pip install catboost
!pip install optuna
!pip install gdown

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from catboost import CatBoostClassifier, Pool
import re
from sklearn.linear_model import LogisticRegression
from nltk.stem import WordNetLemmatizer
import nltk
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import roc_auc_score, confusion_matrix, f1_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import torch
import unicodedata
import optuna
import csv
import warnings

warnings.filterwarnings("ignore")
nltk.download('stopwords')
nltk.download('wordnet')

In [None]:
!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/

In [7]:
!gdown 1u6sHF1qnF8ljKZmxlMz4VDx6yMnD6_ai

Downloading...
From: https://drive.google.com/uc?id=1u6sHF1qnF8ljKZmxlMz4VDx6yMnD6_ai
To: /Users/raregod/Downloads/test_spam.csv
100%|██████████████████████████████████████| 1.34M/1.34M [00:00<00:00, 2.38MB/s]


In [8]:
!gdown 1vrWEPLEElpzWwB0_zXsjUGv55q2oAyLL

Downloading...
From: https://drive.google.com/uc?id=1vrWEPLEElpzWwB0_zXsjUGv55q2oAyLL
To: /Users/raregod/Downloads/train_spam.csv
100%|██████████████████████████████████████| 5.40M/5.40M [00:01<00:00, 2.84MB/s]


In [74]:
df_train = pd.read_csv("train_spam.csv")
df_test = pd.read_csv("test_spam.csv")

In [75]:
df_train.head()

Unnamed: 0,text_type,text
0,ham,make sure alex knows his birthday is over in f...
1,ham,a resume for john lavorato thanks vince i will...
2,spam,plzz visit my website moviesgodml to get all m...
3,spam,urgent your mobile number has been awarded wit...
4,ham,overview of hr associates analyst project per ...


In [76]:
df_test.head()

Unnamed: 0,text
0,j jim whitehead ejw cse ucsc edu writes j you ...
1,original message from bitbitch magnesium net p...
2,java for managers vince durasoft who just taug...
3,there is a youtuber name saiman says
4,underpriced issue with high return on equity t...


In [77]:
df_train.isnull().sum() # проверка на наны

text_type    0
text         0
dtype: int64

In [78]:
df_train.duplicated().sum() # проверка на дубликаты в данных

11

In [79]:
# удаляем дубликаты
df_train = df_train.drop_duplicates(keep = 'first')

In [80]:
df_train.duplicated().sum()

0

Преобразуем таргет в бинарный

In [81]:
df_train["text_type"] = pd.get_dummies(df_train["text_type"], 
                                       drop_first=True).astype(np.int8)

In [82]:
df_train["text_type"].value_counts() # присутствует дисбаланс классов

text_type
0    11458
1     4809
Name: count, dtype: int64

Сделаем бейзлайн, от которого уже будем отталкиваться. В качестве бейзлайна возьмем логистическую регрессию

In [18]:
def baseline_log_reg(texts, targets, use_class_weights=False):
    X_train, X_val, y_train, y_val = train_test_split(texts,
                                                      targets,
                                                      test_size=0.2,
                                                      stratify=targets,
                                                      random_state=228)
    tfidf = TfidfVectorizer()
    X_train_idf = tfidf.fit_transform(X_train)
    X_val_idf = tfidf.transform(X_val)
    best_C = 0
    best_auc_score = 0
    best_f1_score = 0
    for C in np.logspace(start=-2, stop=2):
        if use_class_weights:
            classes = np.unique(y_train)
            weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
            class_weights = dict(zip(classes, weights))
            lr = LogisticRegression(C=C, class_weight=class_weights)
        else:
            lr = LogisticRegression(C=C)
        lr.fit(X_train_idf, y_train)
        y_pred = lr.predict_proba(X_val_idf)[:, 1]
        auc_score = roc_auc_score(y_val, y_pred)
        f_score = f1_score(y_val, lr.predict(X_val_idf))
        if auc_score > best_auc_score:
            best_C = C
            best_auc_score = auc_score
            best_f1_score = f_score
    return best_C, best_auc_score, best_f1_score

In [17]:
# обучение без учета дисбаланса классов
C, auc_score, f_score = baseline_log_reg(df_train["text"].values, 
                                         df_train["text_type"].values)
print(f"C={C}, roc-auc={auc_score}, f1={f_score}")

C=18.420699693267142, roc-auc=0.9831634393152716, f1=0.9118572927597063


In [18]:
# обучение с учетом дисбаланса классов
C, auc_score, f_score = baseline_log_reg(df_train["text"].values, 
                                         df_train["text_type"].values, 
                                         True)
print(f"C={C}, roc-auc={auc_score}, f1={f_score}")
# разница не особо значима

C=18.420699693267142, roc-auc=0.9832600421605657, f1=0.9054675523760859


Очистим тексты от смайликов, символов другого языка, знаков препинания и тд. Так же приведем слова к базовой форме. Возможно, мы избавимся от лишнего шума, что увеличит скор, а возможно лишимся важной информации

In [17]:
# было:
df_train["text"][2208]

'ᴡᴏᴡ ᴀɴᴛʜᴇʀ ᴡɪᴛʜᴅʀᴀᴡ ғʀᴏᴍ @julianfxtrade ɪᴍ sᴏ ʜᴀᴘᴘʏ ғᴏʀ ᴍʏ ᴘᴀʏᴍᴇɴᴛ ᴛᴏᴅᴀʏ ɪ ᴡᴀs sᴄᴀᴍᴍᴇᴅ ʙʏ sᴏ ᴍᴀɴʏ ᴍᴀɴᴀɢᴇʀ ᴏɴ ᴛɪʟʟ ɪ ᴍᴇᴇᴛ ᴛʜɪs ɢᴏᴏᴅ ᴍᴀɴᴀɢᴇʀ sɪʀ ɪᴍ sᴏ ᴘʀᴏᴜᴅ ᴏғ ʏᴏᴜ ᴛʜʀᴏᴜɢʜ ᴏᴜᴛ ʟᴀsᴛ ᴡᴇᴇᴋ ɪ ʜᴀᴠᴇ ʙᴇᴇɴ ɢᴇᴛᴛɪɴɢ ᴍʏ ᴡɪᴛʜᴅʀᴀᴡᴀʟ ғʀᴏᴍ ᴍʏ ɪɴᴠᴇsᴛᴍᴇɴᴛ ɪ sᴛᴀʀᴛᴇᴅ ᴡɪᴛʜ $4000 ʙᴜᴛ ɴᴏᴡ ɪᴍ ᴀ ʀɪᴄʜ ᴍᴀɴ ɴᴏᴡ ɪ ᴄᴀɴ sᴛᴀɴᴅ ғᴏʀ ᴍʏ ᴏᴡɴ ʜᴏᴜsᴇ ᴀɴᴅ ᴍʏ ᴏᴡɴ ᴄᴀʀ ᴛʜᴀɴᴋ ʏᴏᴜ sɪʀ ᴀɴᴅ ɢᴏᴅ ᴋᴇᴇᴘ ʙʟᴇssɪɴɢ ʏᴏᴜ ʏᴏᴜ ᴄᴀɴ ᴄᴏɴᴛᴀᴄᴛ 👇👇👇👇👇👇👇👇👇'

In [83]:
stop_words = set(nltk.corpus.stopwords.words('english')) # шумовые слова

def clean_text(text):
    en_text = re.sub("[^a-zA-Z]"," ", text) # оставляем только англ буквы
    
    # приводим символы разных шрифтов к одному
    en_text = [[unicodedata.name(char)[-1] for char in word 
                if unicodedata.category(char) not in  ('Cc', 'Cn', 'Co', 'Cs')] 
               for word in text.split()]
    en_text = ["".join(word) for word in en_text]
    le = WordNetLemmatizer() # к единой форме слова

    en_text = [le.lemmatize(w) for w in en_text if (not w in stop_words)]
    cleaned_text = " ".join(en_text).lower()
    return cleaned_text

In [84]:
cleaned_texts = df_train["text"].apply(clean_text)

In [85]:
# считаем частоту слов
word_thr = 5
word_freq = FreqDist(word for sentence in df_train["text"] for word in sentence.split())

In [86]:
def remove_rare_words(text):
    return " ".join([word for word in text.split() if word_freq[word] >= word_thr])

In [87]:
cleaned_texts = cleaned_texts.apply(remove_rare_words)

In [17]:
# стало
cleaned_texts[2208]

'wow withdraw im so happy my payment today i was scammed by so many manager on till i meet this good manager sir im so proud you through out last week i have been getting my withdrawal my investment i started with but now im a rich man now i can stand my own house and my own car thank you sir and god keep you you can contact'

In [90]:
C, auc_score, f_score = baseline_log_reg(cleaned_texts.values, 
                                         df_train["text_type"].values, 
                                         True)
print(f"C={C}, roc-auc={auc_score}, f1={f_score}")

C=5.963623316594643, roc-auc=0.9815014621951795, f1=0.90020366598778


По метрикам стало чуть хуже, но я оставлю очищенный датасет. В дальнейшем я буду обучать эмбедденги для слов, и разные формы слова могут сильно навредить качеству

Теперь попробуем catboost, гиперпараметры будем перебирать с помощью optuna

In [91]:
def objective(trial):

    device = "GPU" if torch.cuda.is_available() else "CPU"
    params = {
        "loss_function": "Logloss",
        "gpu_ram_part": 0.7,
        "eval_metric": "AUC",
        'iterations': trial.suggest_int("iterations", 10, 1800),
        "depth": trial.suggest_int("depth", 1, 10),
        "boosting_type": "Plain",
        "bootstrap_type": trial.suggest_categorical("bootstrap_type", ["Bayesian",
                                                                       "Bernoulli"]),
        "task_type": device,
        "logging_level": "Silent"
    }
    if params["bootstrap_type"] == "Bayesian":
        params["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
    elif params["bootstrap_type"] == "Bernoulli":
        params["subsample"] = trial.suggest_float("subsample", 0.1, 0.8)

    classes = np.unique(y_train)
    weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
    class_weights = dict(zip(classes, weights))
    clf = CatBoostClassifier(**params, class_weights=class_weights)

    clf.fit(train, eval_set=val)

    return clf.get_evals_result()['validation']['AUC'][-1]

In [18]:
X_train, X_val, y_train, y_val = train_test_split(cleaned_texts.values,
                                                  df_train["text_type"].values,
                                                  test_size=0.2,
                                                  stratify=df_train["text_type"].values,
                                                  random_state=228)
tfidf = TfidfVectorizer()
X_train_idf = tfidf.fit_transform(X_train)
X_val_idf = tfidf.transform(X_val)

train = Pool(
    data=X_train_idf,
    label=y_train
)

val = Pool(
    data=X_val_idf,
    label=y_val,
)

In [94]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=20)

[I 2024-05-03 05:35:32,843] A new study created in memory with name: no-name-b67952c1-a8b7-408d-9fb1-e1c07cc2e408
Default metric period is 5 because AUC is/are not implemented for GPU
[I 2024-05-03 05:35:48,429] Trial 0 finished with value: 0.9760785400867462 and parameters: {'iterations': 1771, 'depth': 3, 'bootstrap_type': 'Bayesian', 'bagging_temperature': 0.27662091616375895}. Best is trial 0 with value: 0.9760785400867462.
Default metric period is 5 because AUC is/are not implemented for GPU
[I 2024-05-03 05:35:55,640] Trial 1 finished with value: 0.9553695023059845 and parameters: {'iterations': 1201, 'depth': 1, 'bootstrap_type': 'Bayesian', 'bagging_temperature': 2.807835031506327}. Best is trial 0 with value: 0.9760785400867462.
Default metric period is 5 because AUC is/are not implemented for GPU
[I 2024-05-03 05:36:09,315] Trial 2 finished with value: 0.9735335409641266 and parameters: {'iterations': 918, 'depth': 5, 'bootstrap_type': 'Bayesian', 'bagging_temperature': 2.500

In [104]:
device = "GPU" if torch.cuda.is_available() else "CPU"

# оптимальные гиперпараметры
params = {
    "loss_function": "Logloss",
    "gpu_ram_part": 0.3,
    "eval_metric": "AUC",
    'iterations': 1784,
    "depth": 10,
    "boosting_type": "Plain",
    "bootstrap_type": 'Bernoulli',
    "task_type": device,
    'subsample': 0.7910520000257795,
    "logging_level": "Silent"
}
classes = np.unique(y_train)
weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
class_weights = dict(zip(classes, weights))

In [105]:
clf = CatBoostClassifier(**params, class_weights=class_weights)
clf.fit(X_train_idf, y_train)

Default metric period is 5 because AUC is/are not implemented for GPU


<catboost.core.CatBoostClassifier at 0x7ebbf4c9e650>

In [106]:
y_pred = clf.predict_proba(X_val_idf)[:, 1]
roc_auc_score(y_val, y_pred)

0.9795260927459879

В целом, разницы между катбустом и лог регрессией особо нет. Возможно катбуст можно было сильнее дотюнить, чтобы он перегнал логистическую регрессию

Теперь попробуем обучить эмбеддинги слов с помощью fasttext, и на их основе производить классификацию

In [None]:
!wget https://github.com/facebookresearch/fastText/archive/v0.9.2.zip
!unzip v0.9.2.zip

In [None]:
!pip3 install ./fastText-0.9.2

In [27]:
import fasttext

In [98]:
with open("sentences_train.txt", "w") as file:
    for i in range(len(X_train)):
        string = "__label__" + f"{y_train[i]}" + " " + f"{X_train[i]}"
        print(string, file=file, sep="\n")
        

with open("sentences_val.txt", "w") as file:
    for i in range(len(X_val)):
        string = "__label__" + f"{y_val[i]}" + " " + f"{X_val[i]}"
        print(string, file=file, sep="\n")

In [41]:
model = fasttext.train_supervised("sentences_train.txt",
                                  autotuneValidationFile='sentences_val.txt',
                                  autotuneMetric="precisionAtRecall:92",
                                  autotuneDuration=30)

Progress: 100.0% Trials:   76 Best score:  0.967367 ETA:   0h 0m 0s
Training again with best arguments
Read 0M words
Number of words:  10125
Number of labels: 2
Progress: 100.0% words/sec/thread: 3654762 lr:  0.000000 avg.loss:  0.089090 ETA:   0h 0m 0s


In [42]:
ans = []
for i in range(len(X_val)):
    res = model.predict(f"{X_val[i]}")
    if res[0][0][-1] == "0":
        ans.append(1 - res[1][0])
    else:
        ans.append(res[1][0])

In [43]:
roc_auc_score(y_val, ans)

0.9765840145421297

Скор меньше, чем у лог регресии и катбуста

Сделаем предикты для тестовой выборки с помошью логистической регрессии, обученной на всей выборке

In [88]:
lr = LogisticRegression(C=5.963623316594643)
tfidf = TfidfVectorizer()
X_train = tfidf.fit_transform(cleaned_texts.values)
lr.fit(X_train, df_train["text_type"].values)

In [89]:
cleaned_texts_test = df_test["text"].apply(clean_text)
cleaned_texts_test = cleaned_texts_test.apply(remove_rare_words)

In [90]:
X_test = tfidf.transform(cleaned_texts_test)
y_pred = lr.predict(X_test)

In [91]:
res = pd.DataFrame({"score": y_pred,
                    "text": df_test["text"].values})

In [94]:
res.to_csv("answer.csv")