# Advanced Embeddings-based Classifier

## Goals
- Stronger embeddings (`all-mpnet-base-v2`) + baseline (`all-MiniLM-L6-v2`)
- Tune LogReg / Linear SVM (C)
- Add simple MLP head over embeddings
- Baseline TF-IDF + LR and soft ensemble with embeddings
- Predict validation and save CSV

Run top-to-bottom.


In [1]:
# Setup
import sys
!{sys.executable} -m pip install -q sentence-transformers scikit-learn

import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.neural_network import MLPClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

print('Setup complete')


  from .autonotebook import tqdm as notebook_tqdm


Setup complete


In [2]:
# Load data and build texts
train_df = pd.read_csv('dataset/data.csv')
val_df = pd.read_csv('dataset/validation_data.csv')

train_texts = (train_df['title'].fillna('') + ' ' + train_df['text'].fillna('')).tolist()
val_texts = (val_df['title'].fillna('') + ' ' + val_df['text'].fillna('')).tolist()
y = train_df['label'].values

idx = np.arange(len(train_texts))
train_idx, test_idx = train_test_split(idx, test_size=0.2, random_state=42, stratify=y)

y_train, y_test = y[train_idx], y[test_idx]
texts_train = [train_texts[i] for i in train_idx]
texts_test = [train_texts[i] for i in test_idx]

len(texts_train), len(texts_test), len(val_texts)


(31953, 7989, 4956)

In [3]:
# Encode embeddings (MiniLM + MPNet)
mini = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
mpnet = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

X_train_mini = mini.encode(texts_train, batch_size=256, convert_to_numpy=True, show_progress_bar=True)
X_test_mini = mini.encode(texts_test, batch_size=256, convert_to_numpy=True, show_progress_bar=True)
X_val_mini = mini.encode(val_texts, batch_size=256, convert_to_numpy=True, show_progress_bar=True)

X_train_mp = mpnet.encode(texts_train, batch_size=64, convert_to_numpy=True, show_progress_bar=True)
X_test_mp = mpnet.encode(texts_test, batch_size=64, convert_to_numpy=True, show_progress_bar=True)
X_val_mp = mpnet.encode(val_texts, batch_size=64, convert_to_numpy=True, show_progress_bar=True)

X_train_mini.shape, X_train_mp.shape


Batches: 100%|██████████| 125/125 [04:25<00:00,  2.12s/it]
Batches: 100%|██████████| 32/32 [01:18<00:00,  2.44s/it]
Batches: 100%|██████████| 20/20 [00:47<00:00,  2.39s/it]
Batches: 100%|██████████| 500/500 [51:53<00:00,  6.23s/it] 
Batches: 100%|██████████| 125/125 [14:08<00:00,  6.79s/it]
Batches: 100%|██████████| 78/78 [12:07<00:00,  9.33s/it]


((31953, 384), (31953, 768))

In [4]:
# Tune LogReg and Linear SVM on embeddings
from sklearn.pipeline import Pipeline

results = {}

def fit_and_eval(X_train, X_test, name):
    # Logistic Regression
    lr = GridSearchCV(LogisticRegression(max_iter=3000),
                      param_grid={'C':[0.1, 0.5, 1, 2, 5]},
                      cv=5, n_jobs=-1)
    lr.fit(X_train, y_train)
    pred_lr = lr.predict(X_test)
    acc_lr = accuracy_score(y_test, pred_lr)

    # Linear SVM (probability=True для совместимости ансамбля)
    svm = GridSearchCV(SVC(kernel='linear', probability=True),
                       param_grid={'C':[0.1, 0.5, 1, 2, 5]},
                       cv=5, n_jobs=-1)
    svm.fit(X_train, y_train)
    pred_svm = svm.predict(X_test)
    acc_svm = accuracy_score(y_test, pred_svm)

    results[f'{name}_LR'] = (acc_lr, lr.best_estimator_)
    results[f'{name}_SVM'] = (acc_svm, svm.best_estimator_)
    print(f'{name} | LR acc={acc_lr:.4f} (C={lr.best_params_["C"]}), SVM acc={acc_svm:.4f} (C={svm.best_params_["C"]})')

fit_and_eval(X_train_mini, X_test_mini, 'MiniLM')
fit_and_eval(X_train_mp, X_test_mp, 'MPNet')

results


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

MiniLM | LR acc=0.9569 (C=5), SVM acc=0.9597 (C=5)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if p

MPNet | LR acc=0.9740 (C=5), SVM acc=0.9795 (C=5)


{'MiniLM_LR': (0.9569407935911879, LogisticRegression(C=5, max_iter=3000)),
 'MiniLM_SVM': (0.9596945800475654,
  SVC(C=5, kernel='linear', probability=True)),
 'MPNet_LR': (0.973964200776067, LogisticRegression(C=5, max_iter=3000)),
 'MPNet_SVM': (0.9794717736888221,
  SVC(C=5, kernel='linear', probability=True))}

In [5]:
# MLP head over embeddings (MPNet)
mlp = MLPClassifier(hidden_layer_sizes=(256,), activation='relu', solver='adam',
                    alpha=1e-4, batch_size=256, max_iter=30, random_state=42)
mlp.fit(X_train_mp, y_train)

pred_mlp = mlp.predict(X_test_mp)
acc_mlp = accuracy_score(y_test, pred_mlp)
print('MLP (MPNet) acc:', acc_mlp)

results['MPNet_MLP'] = (acc_mlp, mlp)
results


MLP (MPNet) acc: 0.9833521091500813




{'MiniLM_LR': (0.9569407935911879, LogisticRegression(C=5, max_iter=3000)),
 'MiniLM_SVM': (0.9596945800475654,
  SVC(C=5, kernel='linear', probability=True)),
 'MPNet_LR': (0.973964200776067, LogisticRegression(C=5, max_iter=3000)),
 'MPNet_SVM': (0.9794717736888221,
  SVC(C=5, kernel='linear', probability=True)),
 'MPNet_MLP': (0.9833521091500813,
  MLPClassifier(batch_size=256, hidden_layer_sizes=(256,), max_iter=30,
                random_state=42))}

In [7]:
# TF-IDF + LR baseline and soft ensemble with best embedding model
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# TF-IDF baseline (на train/test)
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1,2), min_df=3, max_df=0.8)
X_train_tf = vectorizer.fit_transform(texts_train)
X_test_tf = vectorizer.transform(texts_test)

lr_tf = LogisticRegression(max_iter=3000)
lr_tf.fit(X_train_tf, y_train)

pred_tf = lr_tf.predict(X_test_tf)
acc_tf = accuracy_score(y_test, pred_tf)
print('TF-IDF+LR acc:', acc_tf)

results['TFIDF_LR'] = (acc_tf, (vectorizer, lr_tf))

# Выбор лучшей embeddings-модели
best_name = max(results, key=lambda k: results[k][0])
best_acc, best_est = results[best_name]
print(f'Best embeddings model: {best_name} acc={best_acc:.4f}')

# Подготовим вероятности для soft-ensemble (если модель умеет probas)
def get_proba(est, X):
    if isinstance(est, LogisticRegression) or isinstance(est, SVC):
        return est.predict_proba(X)
    if isinstance(est, MLPClassifier):
        return est.predict_proba(X)
    # (vectorizer, lr) пара
    if isinstance(est, tuple) and isinstance(est[1], LogisticRegression):
        vec, lrm = est
        return lrm.predict_proba(vec.transform(texts_test))
    return None

'''
# Вероятности
# Для embeddings лучшую модель применим на соответствующем X_test
if 'MPNet' in best_name:
    proba_emb = best_est.predict_proba(X_test_mp)
else:
    proba_emb = best_est.predict_proba(X_test_mini)
'''

if 'MPNet' in best_name:
    proba_emb = get_proba(best_est, X_test_mp)
else:
    proba_emb = get_proba(best_est, X_test_mini)



proba_tf = lr_tf.predict_proba(X_test_tf)

# Soft-average ансамбль
proba_ens = 0.5 * proba_emb + 0.5 * proba_tf
pred_ens = np.argmax(proba_ens, axis=1)
acc_ens = accuracy_score(y_test, pred_ens)
print('Soft ensemble acc:', acc_ens)

results['SOFT_ENSEMBLE'] = (acc_ens, ('ensemble', best_name))
results


TF-IDF+LR acc: 0.9888596820628364
Best embeddings model: TFIDF_LR acc=0.9889
Soft ensemble acc: 0.9888596820628364


{'MiniLM_LR': (0.9569407935911879, LogisticRegression(C=5, max_iter=3000)),
 'MiniLM_SVM': (0.9596945800475654,
  SVC(C=5, kernel='linear', probability=True)),
 'MPNet_LR': (0.973964200776067, LogisticRegression(C=5, max_iter=3000)),
 'MPNet_SVM': (0.9794717736888221,
  SVC(C=5, kernel='linear', probability=True)),
 'MPNet_MLP': (0.9833521091500813,
  MLPClassifier(batch_size=256, hidden_layer_sizes=(256,), max_iter=30,
                random_state=42)),
 'TFIDF_LR': (0.9888596820628364,
  (TfidfVectorizer(max_df=0.8, max_features=5000, min_df=3, ngram_range=(1, 2)),
   LogisticRegression(max_iter=3000))),
 'SOFT_ENSEMBLE': (0.9888596820628364, ('ensemble', 'TFIDF_LR'))}

In [8]:
# Pick best overall and predict validation
best_overall = max(results, key=lambda k: results[k][0])
best_overall_acc, best_overall_est = results[best_overall]
print('Best overall:', best_overall, best_overall_acc)

# Build validation predictions
if best_overall.startswith('MiniLM_'):
    proba_val = best_overall_est.predict_proba(X_val_mini)
elif best_overall.startswith('MPNet_'):
    proba_val = best_overall_est.predict_proba(X_val_mp)
elif best_overall == 'MPNet_MLP':
    proba_val = best_overall_est.predict_proba(X_val_mp)
elif best_overall == 'TFIDF_LR':
    proba_val = best_overall_est[1].predict_proba(TfidfVectorizer(max_features=5000, ngram_range=(1,2), min_df=3, max_df=0.8).fit(train_texts).transform(val_texts))
elif best_overall == 'SOFT_ENSEMBLE':
    # ensemble = 0.5 * best embedding + 0.5 * TF-IDF LR
    if 'MPNet' in best_name:
        proba_emb_val = best_est.predict_proba(X_val_mp)
    else:
        proba_emb_val = best_est.predict_proba(X_val_mini)
    proba_tf_val = lr_tf.predict_proba(TfidfVectorizer(max_features=5000, ngram_range=(1,2), min_df=3, max_df=0.8).fit(train_texts).transform(val_texts))
    proba_val = 0.5 * proba_emb_val + 0.5 * proba_tf_val
else:
    raise ValueError('Unknown best model key')

pred_val = np.argmax(proba_val, axis=1)
sub = pd.DataFrame({
    'id': range(len(val_df)),
    'label': pred_val,
    'probability_fake': proba_val[:,0],
    'probability_real': proba_val[:,1]
})

out_path = f'advanced_embeddings_predictions_{best_overall}.csv'
sub.to_csv(out_path, index=False)
print('Saved to', out_path)
sub.head()


Best overall: TFIDF_LR 0.9888596820628364
Saved to advanced_embeddings_predictions_TFIDF_LR.csv


Unnamed: 0,id,label,probability_fake,probability_real
0,0,0,0.815472,0.184528
1,1,0,0.753969,0.246031
2,2,1,0.495647,0.504353
3,3,0,0.536933,0.463067
4,4,0,0.707862,0.292138


In [11]:
# Save validation with predicted labels into dataset
import os

# Try to use in-memory validation data if present; otherwise read from disk
try:
    original_val = val_df.copy()
except NameError:
    original_val = pd.read_csv('dataset/validation_data.csv')

# Ensure lengths match
assert len(original_val) == len(sub), 'Length mismatch between validation and predictions'

labeled_val = original_val.copy()
labeled_val['label'] = sub['label'].values

out_path = os.path.join('dataset', 'validation_data_labeled_embeddings_advanced.csv')
labeled_val.to_csv(out_path, index=False)
print(f"Saved labeled validation to: {out_path}")


Saved labeled validation to: dataset/validation_data_labeled_embeddings_advanced.csv
