Tf-idf + SVD + features

In [None]:
def tfidf_svd_encode(texts, max_features=20000, n_components=256, random_state=42):
    """Encode texts as dense vectors: TF-IDF followed by truncated SVD.

    Args:
        texts: Iterable of raw text documents.
        max_features: Vocabulary size cap passed to ``TfidfVectorizer``.
        n_components: Requested output dimensionality. When the fitted
            vocabulary is smaller than this, the vocabulary size is used
            instead and a warning is emitted.
        random_state: Seed passed to ``TruncatedSVD``.

    Returns:
        The array returned by ``TruncatedSVD.fit_transform``: one row per
        document, ``min(n_components, vocabulary size)`` columns.
    """
    vect = TfidfVectorizer(max_features=max_features, ngram_range=(1, 2), min_df=3)
    X = vect.fit_transform(texts)

    # TruncatedSVD cannot extract more components than there are features;
    # with min_df=3 a small corpus can easily end up below n_components.
    n_features = X.shape[1]
    if n_components > n_features:
        import warnings

        warnings.warn(
            f"n_components={n_components} exceeds the TF-IDF vocabulary size "
            f"({n_features}); using {n_features} components instead",
            stacklevel=2,
        )
        n_components = n_features

    svd = TruncatedSVD(n_components=n_components, random_state=random_state)
    return svd.fit_transform(X)

In [None]:
def _coerce_embedding(value):
    """Return one stored embedding as a 1-D float32 array.

    Strings are split on commas/whitespace after stripping surrounding
    brackets, so "[0.1, 0.2]" and "0.1 0.2" both parse; any other value is
    handed to ``np.asarray`` and flattened.
    """
    if isinstance(value, str):
        tokens = value.strip().strip("[]()").replace(",", " ").split()
        return np.array([float(tok) for tok in tokens], dtype=np.float32)
    return np.asarray(value, dtype=np.float32).ravel()


def compute_single_embedding_features(df,id_col="id",text_col="text",emb_col="embedding",take_dim_in_csv=32, kmeans_clusters=32,pca_components=16,topk_cluster_dists=3,tfidf_topk=3,random_state=42):
    """Build a flat per-row feature table from one embedding column and one text column.

    Args:
        df: Input dataframe; it is copied, never modified.
        id_col: Column copied through as the row identifier.
        text_col: Column with raw text (missing values are treated as "").
        emb_col: Column with one embedding per row (array-like or a
            delimited string such as "[0.1, 0.2]").
        take_dim_in_csv: Number of leading embedding components exported as
            ``emb_comp_{j}`` columns (capped at the embedding dimension).
        kmeans_clusters: Requested number of KMeans clusters (capped at the
            number of rows, since KMeans needs at least one sample per cluster).
        pca_components: Number of TruncatedSVD components exported as
            ``pca_{j}``; ``None`` or ``<= 0`` disables them.
        topk_cluster_dists: Number of smallest centroid distances exported as
            ``centroid_dist_{k}``; positions beyond the number of clusters
            are filled with NaN so the column set stays stable.
        tfidf_topk: Number of highest-weighted TF-IDF terms kept per row;
            ``None`` or ``<= 0`` disables them.
        random_state: Seed for TruncatedSVD and KMeans.

    Returns:
        (features_df, meta): ``features_df`` has one row per input row with
        the id, text-length stats, per-vector scalar stats, cluster id and
        distances, cosine to the global mean vector, leading embedding
        components, SVD components and a ";"-joined ``tfidf_top_tokens``
        string. ``meta`` records the effective sizes that were used.

    Raises:
        ValueError: If ``df`` is empty, or the embeddings are empty or have
            differing dimensions.
    """
    df = df.copy().reset_index(drop=True)
    if len(df) == 0:
        raise ValueError("Cannot compute embedding features for an empty dataframe")

    # The original parsing line was commented out, leaving `emb_list` undefined.
    emb_list = [_coerce_embedding(x) for x in df[emb_col].values]
    dims = [e.size for e in emb_list]
    if len(set(dims)) != 1:
        raise ValueError(f"Embeddings have different dimensions: unique dims = {set(dims)}")
    emb_dim = dims[0]
    if emb_dim == 0:
        raise ValueError("Embeddings are empty (dimension 0)")
    emb_all = np.vstack(emb_list).astype(np.float32)
    n = emb_all.shape[0]

    # --- text length features -------------------------------------------
    texts = df[text_col].fillna("").astype(str).values
    text_len_words = np.array([len(t.split()) for t in texts], dtype=np.int64)
    text_len_chars = np.array([len(t) for t in texts], dtype=np.int64)
    has_text = (text_len_words > 0).astype(np.int64)

    # --- leading raw components -----------------------------------------
    take_D = min(take_dim_in_csv, emb_dim)
    emb_firstD = emb_all[:, :max(take_D, 0)]

    # --- low-rank projection --------------------------------------------
    if pca_components is not None and pca_components > 0:
        svd = TruncatedSVD(n_components=min(pca_components, emb_dim), random_state=random_state)
        pca_feats = svd.fit_transform(emb_all)
    else:
        pca_feats = np.zeros((n, 0))

    # --- clustering -----------------------------------------------------
    n_clusters = max(1, min(kmeans_clusters, n))
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state, n_init=10)
    kmeans.fit(emb_all)
    clusters = kmeans.predict(emb_all)
    # transform() yields the (n, n_clusters) distance matrix directly; the
    # manual broadcast materialised an (n, n_clusters, emb_dim) temporary.
    dists = np.asarray(kmeans.transform(emb_all), dtype=np.float64)
    dist_to_centroid = dists[np.arange(n), clusters]
    n_topk = max(0, topk_cluster_dists)
    available = min(n_topk, n_clusters)
    topk_dists = np.full((n, n_topk), np.nan, dtype=np.float64)
    topk_dists[:, :available] = np.sort(dists, axis=1)[:, :available]

    # --- cosine similarity to the corpus mean vector --------------------
    global_mean = emb_all.mean(axis=0)
    emb_normed = normalize(emb_all)
    global_normed = global_mean / (np.linalg.norm(global_mean) + 1e-12)
    cos_to_global = (emb_normed @ global_normed).reshape(-1)

    # --- top TF-IDF terms per row ---------------------------------------
    tfidf_top_tokens = [[] for _ in range(n)]
    if tfidf_topk is not None and tfidf_topk > 0:
        vect = TfidfVectorizer(max_features=20000, ngram_range=(1, 2), min_df=1)
        try:
            X_tfidf = vect.fit_transform(texts)
        except ValueError:
            # The vectorizer refuses to fit when no document yields a single
            # term (e.g. every text is empty); rows then keep no tokens.
            X_tfidf = None
        if X_tfidf is not None:
            feature_names = np.array(vect.get_feature_names_out())
            for i in range(n):
                row = X_tfidf.getrow(i)
                if row.nnz:
                    order = np.argsort(-row.data)
                    top_idx = row.indices[order][:tfidf_topk]
                    tfidf_top_tokens[i] = feature_names[top_idx].tolist()

    # --- assemble the table column by column ----------------------------
    def _f64(values):
        return np.asarray(values, dtype=np.float64)

    columns = {
        id_col: df[id_col].tolist(),
        "text_len_words": text_len_words,
        "text_len_chars": text_len_chars,
        "has_text": has_text,
        "emb_l2": _f64(np.linalg.norm(emb_all, axis=1)),
        "emb_mean_scalar": _f64(emb_all.mean(axis=1)),
        "emb_std_scalar": _f64(emb_all.std(axis=1)),
        "emb_max_scalar": _f64(emb_all.max(axis=1)),
        "emb_min_scalar": _f64(emb_all.min(axis=1)),
        "emb_median_scalar": _f64(np.median(emb_all, axis=1)),
        "emb_q25": _f64(np.percentile(emb_all, 25, axis=1)),
        "emb_q75": _f64(np.percentile(emb_all, 75, axis=1)),
        "cluster": np.asarray(clusters, dtype=np.int64),
        "dist_to_centroid": _f64(dist_to_centroid),
        "cos_to_global_mean": _f64(cos_to_global),
    }
    for k in range(n_topk):
        columns[f"centroid_dist_{k}"] = topk_dists[:, k]
    for j in range(emb_firstD.shape[1]):
        columns[f"emb_comp_{j}"] = _f64(emb_firstD[:, j])
    for j in range(pca_feats.shape[1]):
        columns[f"pca_{j}"] = _f64(pca_feats[:, j])
    columns["tfidf_top_tokens"] = [";".join(tokens) for tokens in tfidf_top_tokens]

    features_df = pd.DataFrame(columns)

    meta = {
        "n_rows": n,
        "emb_dim": emb_dim,
        "take_dim_in_csv": take_D,
        # Effective cluster count (equals the argument unless it was capped).
        "kmeans_clusters": n_clusters,
        "pca_components": pca_feats.shape[1],
        "tfidf_topk": tfidf_topk
    }

    return features_df, meta

Sentiment

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import pipeline

# Reviews table, tab-separated.
reviews = pd.read_csv('reviews.tsv', sep='\t')

# Device passed to the pipeline: 0 when CUDA is available, otherwise -1.
if torch.cuda.is_available():
    device = 0
else:
    device = -1

# Sentiment classifier; the same checkpoint provides model and tokenizer.
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    tokenizer="slovers/sentiment-roberta-large-ru",
    model="slovers/sentiment-roberta-large-ru",
    return_all_scores=False,
    device=device,
)

# Ordinal encoding of the (lower-cased) labels: 0 < 1 < 2.
label_to_score = dict(negative=0, neutral=1, positive=2)

def get_sentiment_label(text):
    """Map one review text to an ordinal sentiment score.

    Runs the module-level ``sentiment_pipeline`` on the text and converts
    the predicted label through ``label_to_score``.

    Args:
        text: Review text. Non-string values (``None``, the float NaN that
            pandas uses for missing cells, numbers) are treated as missing.

    Returns:
        int: The score for the predicted label, or 1 (the neutral score)
        when the text is missing or the label is not in ``label_to_score``.
    """
    if not isinstance(text, str):
        # Slicing a NaN/None cell would raise TypeError and abort the whole apply.
        return 1
    # The 512-character slice does not bound the number of tokens, so the
    # tokenizer is also asked to truncate to the 512-token limit.
    result = sentiment_pipeline(text[:512], truncation=True, max_length=512)
    label = result[0]['label'].lower()
    return label_to_score.get(label, 1)

# Register tqdm's `progress_apply` on pandas objects, labelled "Processing Sentiment".
tqdm.pandas(desc="Processing Sentiment")
# Score every review row by row; the integer returned by get_sentiment_label
# is stored in a new `sentiment` column.
reviews['sentiment'] = reviews['text'].progress_apply(get_sentiment_label)

BERT for text classification

In [None]:
import os
import torch
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, get_linear_schedule_with_warmup
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score
from torch.nn.functional import softmax

# --- Fine-tuning configuration ---------------------------------------------
MODEL_NAME = "distilbert-base-uncased"  # checkpoint used for both tokenizer and model
MAX_LENGTH = 256                        # token limit handed to the dataset's tokenizer call
BATCH_SIZE = 16
EPOCHS = 3
LR = 2e-5                               # AdamW learning rate
OUTPUT_DIR = "fine_tuned_model"         # where the model and tokenizer are saved

# Training and test tables; the dataset class reads `text` and `label`
# columns from them by default.
df = pd.read_csv("data.csv")
df_test = pd.read_csv("test.csv")

# Binary classifier (num_labels=2) on top of the pretrained checkpoint,
# moved to the GPU when one is available.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

class TextDataset(Dataset):
    """Dataset over a dataframe that tokenizes one text per item, on access.

    Texts are read from ``text_col`` (missing values become "") and labels
    from ``label_col`` (cast to int). Items are left unpadded so a collator
    can pad each batch separately.
    """

    def __init__(self, dataframe, tokenizer, text_col="text", label_col="label", max_length=256):
        text_series = dataframe[text_col].fillna("").astype(str)
        label_series = dataframe[label_col].astype(int)
        self.texts = text_series.tolist()
        self.labels = label_series.tolist()
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenizer output for example ``idx`` with a ``labels`` entry added."""
        tokenizer_kwargs = {
            "truncation": True,
            "max_length": self.max_length,
            "padding": False,
        }
        encoded = self.tokenizer(self.texts[idx], **tokenizer_kwargs)
        encoded["labels"] = int(self.labels[idx])
        return encoded

# --- Data pipeline ----------------------------------------------------------
train_dataset = TextDataset(df, tokenizer, max_length=MAX_LENGTH)
test_dataset = TextDataset(df_test, tokenizer, max_length=MAX_LENGTH)

# Items come out of the dataset unpadded; the collator pads each batch to its
# longest sequence and returns PyTorch tensors.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="longest", return_tensors="pt")
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=data_collator)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=data_collator)

# --- Optimisation -----------------------------------------------------------
optimizer = torch.optim.AdamW(model.parameters(), lr=LR)
# One scheduler step is taken per batch, so the schedule spans every batch of
# every epoch; the first 6% of those steps are warmup.
total_steps = len(train_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=int(0.06*total_steps), num_training_steps=total_steps)

# --- Training loop ----------------------------------------------------------
model.train()
for epoch in range(EPOCHS):
    for batch in train_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        # The batch includes `labels`, so the model output carries a loss.
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

# --- Evaluation on the test set --------------------------------------------
model.eval()
all_labels = []
all_preds = []
all_probs = []
with torch.no_grad():
    for batch in test_loader:
        # Labels are removed before the forward pass and kept on the CPU for scoring.
        labels = batch.pop("labels")
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        logits = outputs.logits
        # Softmax probability of class index 1, used as the score for ROC AUC.
        probs = softmax(logits, dim=1)[:, 1].cpu().numpy()
        preds = np.argmax(logits.cpu().numpy(), axis=1)
        all_labels.extend(labels.numpy().tolist())
        all_preds.extend(preds.tolist())
        all_probs.extend(probs.tolist())

labels_arr = np.array(all_labels)
preds_arr = np.array(all_preds)
probs_arr = np.array(all_probs)

# Accuracy, precision/recall/F1 with average="binary", and ROC AUC on the
# class-1 probabilities.
acc = accuracy_score(labels_arr, preds_arr)
precision, recall, f1, _ = precision_recall_fscore_support(labels_arr, preds_arr, average="binary", zero_division=0)
auc = roc_auc_score(labels_arr, probs_arr)

print(f"accuracy: {acc:.6f}")
print(f"precision: {precision:.6f}")
print(f"recall: {recall:.6f}")
print(f"f1: {f1:.6f}")
print(f"roc_auc: {auc:.6f}")

# --- Persist the fine-tuned model and its tokenizer ------------------------
os.makedirs(OUTPUT_DIR, exist_ok=True)
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)


In [None]:
# Checkpoint fine-tuned in this cell. It lives in its own variable because
# `model` is bound to the loaded network below; the original code loaded from
# a `MODEL_NAME` left over from another cell instead of this checkpoint.
model_name = 'DeepPavlov/rubert-base-cased-sentence'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)
epochs = 3
# AdamW step size for fine-tuning a pretrained transformer. The previous
# value of 0.01 is several orders of magnitude above the usual 1e-5..5e-5
# range and makes the pretrained weights diverge.
lr = 2e-5

df = pd.read_csv('train.csv')
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Binary classification head (num_labels=2) on top of the pretrained encoder.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
model.to(device)

class Data(Dataset):
    """Dataset over a dataframe with ``text`` and ``label`` columns.

    Each item is tokenized on access and returned unpadded, with the integer
    class stored under the ``label`` key.

    Args:
        df: Dataframe with ``text`` and ``label`` columns; missing texts
            become "" and labels are cast to int.
        tokenizer: Callable tokenizer applied to one text at a time.
        max_length: Token limit per example. Texts are truncated to it so a
            long document cannot exceed the model's maximum sequence length.
    """

    def __init__(self, df, tokenizer, max_length=512):
        self.text = df['text'].fillna('').astype(str).to_list()
        self.label = df['label'].astype(int).to_list()
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of examples."""
        return len(self.text)

    def __getitem__(self, idx):
        """Return the tokenizer output for example ``idx`` plus its ``label``."""
        # Without truncation an over-long text produces more tokens than the
        # encoder has positions for and the forward pass fails.
        enc = self.tokenizer(self.text[idx], truncation=True, max_length=self.max_length)
        enc['label'] = self.label[idx]
        return enc

# Imported here because nothing earlier in the file brings this name into scope.
from sklearn.model_selection import train_test_split

# 80/20 train/validation split with a fixed seed.
train, val = train_test_split(df, test_size=0.2, random_state=42)

train_set = Data(train, tokenizer)
val_set = Data(val, tokenizer)

# Pads every batch to its longest sequence and returns PyTorch tensors.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="longest", return_tensors="pt")
# DataLoader's keyword is `batch_size` (not `batch`), and `num_workers` must
# be non-negative; 0 loads batches in the main process.
train_loader = DataLoader(train_set, batch_size=16, num_workers=0, shuffle=True, collate_fn=data_collator)
val_loader = DataLoader(val_set, batch_size=16, num_workers=0, shuffle=False, collate_fn=data_collator)

optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

# `epochs` is an int, so it has to be wrapped in range() to be iterable.
for i in tqdm(range(epochs)):
    model.train()
    for batch in train_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        out = model(**batch)
        loss = out.loss
        print(loss.item())
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    # Validation: accumulate loss and predictions over the whole split so the
    # epoch ends with a usable summary instead of discarded per-batch values.
    model.eval()
    val_loss_sum = 0.0
    val_labels = []
    val_preds = []
    val_probs = []
    with torch.no_grad():
        for batch in val_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            out = model(**batch)
            loss = out.loss
            preds = out.logits
            # Probability of class index 1 and the arg-max class per example.
            probs = softmax(preds, dim=1)[:, 1].cpu().numpy()
            preds = np.argmax(preds.cpu().numpy(), axis=1)

            batch_labels = batch["labels"].cpu().numpy()
            # Weight the batch-mean loss by batch size so the last, smaller
            # batch does not skew the epoch average.
            val_loss_sum += loss.item() * len(batch_labels)
            val_labels.extend(batch_labels.tolist())
            val_preds.extend(preds.tolist())
            val_probs.extend(probs.tolist())

    n_val = max(len(val_labels), 1)
    val_loss = val_loss_sum / n_val
    val_acc = float(np.mean(np.array(val_preds) == np.array(val_labels))) if val_labels else float("nan")
    print(f"epoch {i}: val_loss={val_loss:.6f} val_accuracy={val_acc:.6f}")


https://www.kaggle.com/code/debarshichanda/bert-multi-label-text-classification

https://curiousily.com/posts/multi-label-text-classification-with-bert-and-pytorch-lightning/

In [None]:
import os
import torch
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from tqdm.auto import tqdm

# Checkpoint name; `model` is rebound to the loaded network two lines below.
model = 'DeepPavlov/rubert-base-cased-sentence'

tokenizer = AutoTokenizer.from_pretrained(model)
# NOTE(review): only the hidden states are used below, so the classification
# head this class adds is never needed; a headless model class would avoid
# creating it. Left unchanged here to keep the cell's imports as they are.
model = AutoModelForSequenceClassification.from_pretrained(model)

data = pd.read_csv('/kaggle/input/vseros-nlp-qual/train.tsv', sep = '\t')

model.eval()
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

# Missing descriptions are NaN floats in pandas; the tokenizer only accepts
# strings, so they are replaced with "" before encoding.
descriptions = data['shortDescription'].fillna('').astype(str).tolist()

embeddings = []
with torch.no_grad():
    for text in tqdm(descriptions, 'embeddings'):
        # `max_length` only takes effect together with `truncation=True`;
        # without it long texts are passed through at full length.
        token = tokenizer(text, padding = False, truncation = True, max_length = 512, return_tensors="pt")
        token = {k: v.to(device) for k, v in token.items()}
        outputs = model(**token, output_hidden_states=True)
        last_layer = outputs.hidden_states[-1]
        # Mean over the token axis of the last hidden layer gives one vector
        # per text (one text per forward pass, so no padding is involved).
        emb = last_layer.mean(dim=1).squeeze(0).cpu().numpy()
        embeddings.append(emb)

# One row per text, one column per embedding dimension.
df = pd.DataFrame(embeddings)
df.head(3)

https://huggingface.co/learn/llm-course/chapter3/4

https://huggingface.co/collections/ai-forever/sentenceembedders

Аугментация парафразом

In [None]:
import os
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from tqdm.auto import tqdm

# --- Paraphrase generation settings ----------------------------------------
INPUT_CSV = "input_texts.csv"
TEXT_COL = "text"
MODEL_NAME = "cointegrated/rut5-base-paraphraser"
OUT_CSV = "paraphrases.csv"
BATCH_SIZE = 8
NUM_RETURN = 3            # paraphrases kept per input text
MAX_INPUT_LENGTH = 256    # token limit for the source text
MAX_GEN_LENGTH = 128      # `max_length` passed to generate()
NUM_BEAMS = 8
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Source texts; missing values become empty strings.
df = pd.read_csv(INPUT_CSV)
texts = df[TEXT_COL].fillna("").astype(str).tolist()

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME).to(DEVICE)
model.eval()

# Output column names: paraphrase_1 .. paraphrase_NUM_RETURN.
paraphrases_cols = [f"paraphrase_{idx}" for idx in range(1, NUM_RETURN + 1)]
rows = []

with torch.no_grad():
    batch_starts = range(0, len(texts), BATCH_SIZE)
    for i in tqdm(batch_starts, desc="paraphrase"):
        batch = texts[i:i + BATCH_SIZE]
        enc = tokenizer(
            batch,
            max_length=MAX_INPUT_LENGTH,
            truncation=True,
            padding=True,
            return_tensors="pt",
        )
        enc = {name: tensor.to(DEVICE) for name, tensor in enc.items()}
        gen = model.generate(
            **enc,
            max_length=MAX_GEN_LENGTH,
            num_beams=NUM_BEAMS,
            num_return_sequences=NUM_RETURN,
            early_stopping=True,
        ).cpu().numpy()
        decoded = tokenizer.batch_decode(
            gen,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=True,
        )
        # `decoded` is flat: the NUM_RETURN candidates for input j occupy
        # positions j*NUM_RETURN .. (j+1)*NUM_RETURN - 1.
        for j, source_text in enumerate(batch):
            candidates = decoded[j * NUM_RETURN:(j + 1) * NUM_RETURN]
            row = {"text": source_text}
            row.update(zip(paraphrases_cols, candidates))
            rows.append(row)

# One row per source text, written without the index.
out_df = pd.DataFrame(rows)
out_df.to_csv(OUT_CSV, index=False)


Суммаризация с помощью Encoder-Decoder

In [None]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
import numpy as np

# --- Summarization fine-tuning configuration -------------------------------
MODEL_NAME = "t5-small"
MAX_INPUT_LENGTH = 512     # token limit for the source text
MAX_TARGET_LENGTH = 128    # token limit for the target summary
BATCH_SIZE = 8
EPOCHS = 3
OUTPUT_DIR = "sft_sum_model"

df = pd.read_csv("data.csv")  # columns: 'query', 'summary'
df = df.dropna(subset=["summary"])  # a target summary is required
# `Dataset` here is the one imported from `datasets` at the top of this cell.
ds = Dataset.from_pandas(df)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

def preprocess_batch(examples):
    """Tokenize a batch of query/summary pairs for seq2seq training.

    Uses the module-level ``tokenizer``, ``MAX_INPUT_LENGTH`` and
    ``MAX_TARGET_LENGTH``.

    Args:
        examples: Mapping with ``"query"`` and ``"summary"`` entries, each a
            list of values for the batch.

    Returns:
        The tokenizer output for the queries with an added ``"labels"``
        entry: the target token ids, with any pad token replaced by -100.
    """
    # A missing query would otherwise be fed to the model as the literal
    # string "None".
    inputs = ["" if x is None else str(x) for x in examples["query"]]
    targets = [str(x) for x in examples["summary"]]

    model_inputs = tokenizer(inputs, max_length=MAX_INPUT_LENGTH, truncation=True)
    # `text_target` is the supported way to tokenize targets; the
    # `as_target_tokenizer()` context manager it replaces is deprecated.
    labels = tokenizer(text_target=targets, max_length=MAX_TARGET_LENGTH, truncation=True)

    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [tok if tok != pad_id else -100 for tok in label_ids]
        for label_ids in labels["input_ids"]
    ]
    return model_inputs

# Tokenize the whole dataset in batches and drop the original columns so only
# the tokenizer outputs and labels remain.
ds_tok = ds.map(preprocess_batch, batched=True, remove_columns=ds.column_names)
# Discard examples whose labels contain nothing but -100.
ds_tok = ds_tok.filter(lambda x: len([t for t in x["labels"] if t != -100]) > 0)

# Pads inputs and labels per batch; label padding uses -100.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, label_pad_token_id=-100)

# Evaluate and save once per epoch; evaluation predictions are produced with
# generate() (predict_with_generate=True).
training_args = Seq2SeqTrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    predict_with_generate=True,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=EPOCHS,
    logging_steps=50,
    fp16=False
)

# 90/10 train/eval split: the first 90% of the shuffled dataset is used for
# training and the remainder for evaluation.
# NOTE(review): both shuffle calls use seed=42, so presumably they yield the
# same order and the two ranges do not overlap — confirm.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=ds_tok.shuffle(seed=42).select(range(int(0.9*len(ds_tok)))),
    eval_dataset=ds_tok.shuffle(seed=42).select(range(int(0.9*len(ds_tok)), len(ds_tok))),
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Train, then write the model and tokenizer to OUTPUT_DIR.
trainer.train()
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)