In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/spamvkinternship2024/test_spam.csv
/kaggle/input/spamvkinternship2024/train_spam.csv


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score


In [None]:
%% capture
!pip install spacy
!python -m spacy download en_core_web_sm

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
import spacy

import en_core_web_sm
nlp = en_core_web_sm.load()

nltk.download('stopwords')
nltk.download('punkt')

In [3]:
train_df = pd.read_csv('/kaggle/input/spamvkinternship2024/train_spam.csv')

## Предобработка данных

In [96]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16278 entries, 0 to 16277
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   text_type  16278 non-null  object
 1   text       16278 non-null  object
 2   label      16278 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 381.6+ KB


In [5]:
le = LabelEncoder()
train_df['label'] = le.fit_transform(train_df['text_type'])

In [None]:
stop_words = set(stopwords.words('english'))
punctuation = set(string.punctuation)

def preprocess_text(text):
    words = word_tokenize(text.lower()) 
    filtered_words = [word for word in words if word not in stop_words and word not in punctuation]
    return " ".join(filtered_words)


train_df['processed_text'] = train_df['text'].apply(preprocess_text)

In [None]:
def preprocess_and_lemmatize(text):
    doc = nlp(text.lower()) 
    lemmatized_words = [token.lemma_ for token in doc if token.text not in punctuation and token.text not in stop_words]
    return " ".join(lemmatized_words)
train_df['processed_text'] = train_df['processed_text'].apply(preprocess_and_lemmatize)

In [None]:
train_df['processed_text']

In [86]:
X = train_df['text'] #или train_df[['processed_text']]
y = train_df['label']

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=42)

In [89]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

## Бейзлайн

In [93]:
clf=Pipeline([
    ('vectorizer',CountVectorizer()),
    ('nb',MultinomialNB())
])

In [94]:
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
roc_auc_score(y_test, y_pred)

0.9097599918899055

## LogReg

In [91]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
logreg.fit(X_train_tfidf, y_train)
y_pred = logreg.predict(X_test_tfidf)

roc_auc_score(y_test, y_pred)

0.8953037483843173

## CatBoost

In [92]:
from catboost import CatBoostClassifier

model = CatBoostClassifier(num_trees=3000,
                           thread_count= 4,
                           learning_rate=0.035,
                           bootstrap_type='Bernoulli',
                           max_depth=5,
                           verbose=500,
                           l2_leaf_reg=0.01,
                           task_type='GPU',
                           devices='0')

model.fit(X_train_tfidf, y_train)
y_pred = model.predict(X_test_tfidf)

roc_auc_score(y_test, y_pred)

0:	learn: 0.6637770	total: 273ms	remaining: 13m 39s
500:	learn: 0.2095062	total: 1m 5s	remaining: 5m 27s
1000:	learn: 0.1597457	total: 2m 6s	remaining: 4m 11s
1500:	learn: 0.1374552	total: 3m 4s	remaining: 3m 4s
2000:	learn: 0.1262099	total: 4m 2s	remaining: 2m
2500:	learn: 0.1121152	total: 5m	remaining: 1m
2999:	learn: 0.1091850	total: 5m 57s	remaining: 0us


0.9051676508604304

## LightAutoML

In [None]:
import os
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch

import warnings
warnings.simplefilter(action='ignore')

In [7]:
%%capture
!pip3 install git+https://github.com/sb-ai-lab/LightAutoML.git
!pip install optuna

In [None]:
!pip install torch==2.0.1

In [8]:
from lightautoml.automl.presets.text_presets import TabularNLPAutoML
from lightautoml.dataset.roles import DatetimeRole
from lightautoml.tasks import Task

In [9]:
import os
import torch

N_THREADS = os.cpu_count() # Количество vCPUs для LightAutoM
GPU_IDS = '0' if torch.cuda.is_available() else None
TEST_SIZE = 0.2
RANDOM_STATE = 42
TIMEOUT = 200
TARGET_NAME = 'label'

In [10]:
automl = TabularNLPAutoML(
    task=Task('binary', metric=roc_auc_score),
    timeout=TIMEOUT,
    cpu_limit=N_THREADS,
    gpu_ids = GPU_IDS,
    memory_limit = 20,
    text_params = {'lang': 'en'}
)

In [12]:
train_data=pd.concat([X_train, y_train], axis=1)
train_data.head()

Unnamed: 0,text,label
7793,or safety always sms your taxi/auto/bus/lift t...,1
11570,hd musical eventcan you guess the name of harl...,1
6706,miscellaneous items vince here are several ite...,0
3364,damn can you make it tonight or do you want to...,0
1069,you have been specially selected to receive a ...,1


In [13]:
%%time

roles = {'target': [TARGET_NAME], 'text' : ['text']}
automl.fit_predict(train_data, roles = roles, verbose=100)

[19:24:40] Stdout logging level is DEBUG.
[19:24:40] Model language mode: en
[19:24:40] Task: binary

[19:24:40] Start automl preset with listed constraints:
[19:24:40] - time: 200.00 seconds
[19:24:40] - CPU: 4 cores
[19:24:40] - memory: 20 GB

[19:24:40] [1mTrain data shape: (13022, 2)[0m

[19:24:40] Layer [1m1[0m train process start. Time left 199.98 secs
[19:24:43] Start fitting [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m ...
[19:24:43] Training params: {'tol': 1e-06, 'max_iter': 100, 'cs': [1e-05, 5e-05, 0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 50, 100, 500, 1000, 5000, 10000, 50000, 100000], 'early_stopping': 2, 'categorical_idx': [], 'embed_sizes': (), 'data_size': 100}
[19:24:43] ===== Start working with [1mfold 0[0m for [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m =====
[19:24:43] Linear model: C = 1e-05 score = 0.8872522750201468
[19:24:43] Linear model: C = 5e-05 score = 0.9049666536752512
[19:24:43] Linear model: C = 0.0001 score = 0.9057054135634438
[19:24:43] Li

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

[19:24:48] Feature concated__text fitted


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

100%|██████████| 41/41 [03:02<00:00,  4.44s/it]


[19:27:51] Feature concated__text transformed
[19:27:52] Start fitting [1mLvl_0_Pipe_1_Mod_0_CatBoost[0m ...
[19:27:52] Training params: {'task_type': 'GPU', 'thread_count': 4, 'random_seed': 42, 'num_trees': 5000, 'learning_rate': 0.035, 'l2_leaf_reg': 0.01, 'bootstrap_type': 'Bernoulli', 'grow_policy': 'SymmetricTree', 'max_depth': 5, 'min_data_in_leaf': 1, 'one_hot_max_size': 10, 'fold_permutation_block': 1, 'boosting_type': 'Plain', 'boost_from_average': True, 'od_type': 'Iter', 'od_wait': 100, 'max_bin': 32, 'feature_border_type': 'GreedyLogSum', 'nan_mode': 'Min', 'verbose': 100, 'allow_writing_files': False, 'devices': '0'}
[19:27:52] ===== Start working with [1mfold 0[0m for [1mLvl_0_Pipe_1_Mod_0_CatBoost[0m =====
[19:27:53] 0:	learn: 0.6648683	test: 0.6647633	best: 0.6647633 (0)	total: 54.1ms	remaining: 4m 30s
[19:27:54] 100:	learn: 0.2097221	test: 0.2398331	best: 0.2398331 (100)	total: 741ms	remaining: 36s
[19:27:54] 200:	learn: 0.1525630	test: 0.2002421	best: 0.2002421

array([[0.984239  ],
       [0.93434125],
       [0.01064443],
       ...,
       [0.01431459],
       [0.96004236],
       [0.10103517]], dtype=float32)

In [45]:
predictions = automl.predict(X_test)
print(f"ROC AUC score: {roc_auc_score(y_test, predictions.data)}")

100%|██████████| 11/11 [00:46<00:00,  4.27s/it]


[20:23:04] Feature concated__text transformed
ROC AUC score: 0.9889385071977367


In [46]:
roc_auc_score(y_test, predictions.data)

0.9889385071977367

In [85]:
binary_predictions = np.where(predictions.data > 0.2337, 1, 0)
print(f"ROC AUC score: {roc_auc_score(y_test, binary_predictions)}")

ROC AUC score: 0.9548488798897918


In [None]:
def objective(trial):
    nn_params = {
        'batch_size': trial.suggest_categorical('batch_size', [32, 64, 128]),
        'epochs': trial.suggest_int('epochs', 5, 15),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 1e-1),
        'layers': [
            trial.suggest_int('layer1_units', 50, 100),
            trial.suggest_int('layer2_units', 20, 50)
        ],
        'dropout': trial.suggest_float('dropout', 0.1, 0.5),
    }

    automl = TabularNLPAutoML(
         task=Task('binary', metric=roc_auc_score),
         timeout=200,
         cpu_limit=N_THREADS,
         gpu_ids = GPU_IDS,
         memory_limit=45,
         text_params={'lang': 'en',
                    'bert_model': 'prajjwal1/bert-tiny'},
         reader_params={'n_jobs': N_THREADS, 'cv': 3, 'random_state': RANDOM_STATE},
         general_params={'use_algos': [['nn']]},
         nn_params=nn_params
    )

    automl.fit_predict(train_data, roles = roles, verbose=100)
    predictions = automl.predict(x_test)
    metric = roc_auc_score(y_test, predictions.data)

    return metric

In [None]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=3)  # Для экономии времени возьмем n_trials, равное 3

print(study.best_params)

In [None]:
NN_PARAMS = study.best_params
automl = TabularNLPAutoML(
      task=Task('multiclass', metric=roc_auc_score),
      timeout=TIMEOUT,
      cpu_limit=N_THREADS,
      gpu_ids = GPU_IDS,
      memory_limit=45,
      text_params={'lang': 'en',
                'bert_model': 'prajjwal1/bert-tiny'},
      general_params={'use_algos': [['nn']]},
      nn_params=NN_PARAMS
)

In [None]:
roles = {'target': [TARGET_NAME], 'text' : ['text']}

automl.fit_predict(train_data, roles = roles, verbose=100)

In [None]:
predictions = automl.predict(x_test)
roc_auc_score(y_test, predictions.data)

## Predict лучшей моделью 

In [16]:
test_df = pd.read_csv('/kaggle/input/spamvkinternship2024/test_spam.csv')

In [17]:
test_df.head()

Unnamed: 0,text
0,j jim whitehead ejw cse ucsc edu writes j you ...
1,original message from bitbitch magnesium net p...
2,java for managers vince durasoft who just taug...
3,there is a youtuber name saiman says
4,underpriced issue with high return on equity t...


In [21]:
predictions = automl.predict(test_df)
test_df['score'] = predictions.data

100%|██████████| 13/13 [00:58<00:00,  4.53s/it]


[19:36:30] Feature concated__text transformed


In [22]:
test_df.head()

Unnamed: 0,text,score
0,j jim whitehead ejw cse ucsc edu writes j you ...,0.10166
1,original message from bitbitch magnesium net p...,0.084287
2,java for managers vince durasoft who just taug...,0.030516
3,there is a youtuber name saiman says,0.013266
4,underpriced issue with high return on equity t...,0.517788


In [23]:
test_df.to_csv('/kaggle/working/result.csv', index=False)