In [1]:
import gc

import polars as pl
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from tokenizers import Tokenizer, models, normalizers, pre_tokenizers, trainers
from tqdm.auto import tqdm
from transformers import PreTrainedTokenizerFast

  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [2]:
df_pl = pl.read_parquet("//home/rijkaa/leraa/train_essays.parquet")#-----------------------------------------------------------------------------------------------

df_pl = df_pl.unique(subset=['text'], maintain_order=True)

train, test = train_test_split(df_pl, train_size=0.9, random_state=42)

In [3]:
VOCAB_SIZE = 30522

In [4]:
raw_tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))
raw_tokenizer.normalizer = normalizers.Sequence([normalizers.NFC()] + [normalizers.Lowercase()])
raw_tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
trainer = trainers.BpeTrainer(vocab_size=VOCAB_SIZE, special_tokens=special_tokens)

In [5]:
dataset = pl.DataFrame({"text": test['text']})

In [6]:
def train_corp_iter():
  for i in range(0, len(dataset), 1000):
    yield dataset[i: i + 1000]["text"]

In [7]:
raw_tokenizer.train_from_iterator(train_corp_iter(), trainer=trainer)






In [8]:
tokenizer = PreTrainedTokenizerFast(
        tokenizer_object=raw_tokenizer,
        unk_token="[UNK]",
        pad_token="[PAD]",
        cls_token="[CLS]",
        sep_token="[SEP]",
        mask_token="[MASK]",
    )

In [9]:
tokenized_texts_test = []

for text in tqdm(test['text'].to_list()):
    tokenized_texts_test.append(tokenizer.tokenize(text))

  0%|          | 0/15830 [00:00<?, ?it/s]

100%|██████████| 15830/15830 [00:15<00:00, 1036.54it/s]


In [11]:
tokenized_texts_train = []

for text in tqdm(train['text'].to_list()):
    tokenized_texts_train.append(tokenizer.tokenize(text))

100%|██████████| 142464/142464 [03:20<00:00, 709.67it/s]


In [None]:
def dummy(text):
        return text

In [None]:
del tokenizer, raw_tokenizer, df_pl, trainer, test, train, dataset 

In [None]:
vectorizer = TfidfVectorizer(ngram_range=(3, 5), lowercase=False, sublinear_tf=True, analyzer='word',
                             tokenizer=dummy, preprocessor=dummy,
                             token_pattern=None, strip_accents='unicode')

vectorizer.fit(tokenized_texts_test)

In [None]:
vocab = vectorizer.vocabulary_

vectorizer = TfidfVectorizer(ngram_range=(3, 5), lowercase=False, sublinear_tf=True, vocabulary=vocab,
                             analyzer='word', tokenizer=dummy, preprocessor=dummy,
                             token_pattern=None, strip_accents='unicode')

tf_train = vectorizer.fit_transform(tokenized_texts_train)
tf_test = vectorizer.transform(tokenized_texts_test)

del vocab, vectorizer, tokenized_texts_train, tokenized_texts_test
gc.collect()

0

In [None]:
df_pl = pl.read_parquet("//home/rijkaa/leraa/train_essays.parquet")#-----------------------------------------------------------------------------------------------

df_pl = df_pl.unique(subset=['text'], maintain_order=True)

train, test = train_test_split(df_pl, train_size=0.9, random_state=42)

In [None]:
y_train = train.get_column('generated').to_numpy()

In [None]:
clf = MultinomialNB(alpha=0.02)

sgd_model = SGDClassifier(max_iter=8000, tol=1e-4, loss="modified_huber", class_weight='balanced')


"""p6 = {
    'n_iter': 2500,
    'num_leaves': 51,
    'objective': 'cross_entropy',
    'metric': 'auc',
    'learning_rate': 0.05,
    'colsample_bytree': 0.726,
    'colsample_bynode': 0.5803,
    'lambda_l1': 8.5629,
    'num_threads': 4,
    'lambda_l2': 4.8932,
    'min_data_in_leaf': 115,
    'max_depth': 23,
    'max_bin': 898
}

lgb = LGBMClassifier(**p6)"""

"p6 = {\n    'n_iter': 2500,\n    'num_leaves': 51,\n    'objective': 'cross_entropy',\n    'metric': 'auc',\n    'learning_rate': 0.05,\n    'colsample_bytree': 0.726,\n    'colsample_bynode': 0.5803,\n    'lambda_l1': 8.5629,\n    'num_threads': 4,\n    'lambda_l2': 4.8932,\n    'min_data_in_leaf': 115,\n    'max_depth': 23,\n    'max_bin': 898\n}\n\nlgb = LGBMClassifier(**p6)"

In [None]:
clf.fit(tf_train, y_train)
p1 = clf.predict_proba(tf_test)[:, 1]
print("NB Done!")

sgd_model.fit(tf_train, y_train)
p2 = sgd_model.predict_proba(tf_test)[:, 1]
print("SGD Done!")

"""lgb.fit(tf_train, y_train)
p3 = lgb.predict_proba(tf_test)[:, 1]
print("LGBM Done!")
"""
del clf, sgd_model
gc.collect()

NB Done!
SGD Done!


0

In [None]:
final_preds = p1*0.2 + p2*0.8     # p1*0.1 + p2*0.45 + p3*0.45
final_preds

array([0.56361424, 0.18744312, 0.82009372, ..., 0.49635708, 0.30793329,
       0.40100115])

In [None]:
import numpy as np
final_preds = np.array(final_preds)
print(np.min(final_preds), np.max(final_preds))

6.433127207829572e-09 1.0


In [None]:
threshold = 0.43  # 0.4 (0.8798106606297592 & 0.7875204657085683)    0.5 (0.9054815026017535 & 0.8391850100054575)     0.6 (0.9278292388240033 & 0.8913346673943363)
class_preds = []
class_preds = [1 if i > threshold else 0 for i in final_preds]

In [None]:
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, balanced_accuracy_score, fbeta_score

 # учитывают дисбаланс классов 
print(f1_score(test.get_column('generated').to_numpy(), class_preds, average='weighted')) 
print(fbeta_score(test.get_column('generated').to_numpy(), class_preds, average='weighted', beta=2))
print(balanced_accuracy_score(test.get_column('generated').to_numpy(), class_preds))
print(roc_auc_score(test.get_column('generated').to_numpy(), class_preds, average = 'weighted'))

# не учитывают
print(accuracy_score(test.get_column('generated').to_numpy(), class_preds))
print(roc_auc_score(test.get_column('generated').to_numpy(), class_preds))



0.8965454595718775
0.8956458482249197
0.8673779550366525
0.8673779550366525
0.895198989260897
0.8673779550366525
