In [1]:
import numpy as np
import pandas as pd
import torch
import joblib
from scipy import sparse
from transformers import DistilBertForSequenceClassification, DistilBertTokenizerFast, Trainer, TrainingArguments
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix, precision_score, recall_score
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from torch.utils.data import DataLoader, TensorDataset
from datasets import Dataset

In [2]:
# Load dataframes
train_df = pd.read_csv(r"../datasets/cleaned_train.csv")
texts = train_df['text'].fillna("").tolist()
texts_clean = train_df['text_clean'].fillna("").tolist()
labels = train_df['target'].values
test_df = pd.read_csv(r"../datasets/cleaned_test.csv")
test_texts = test_df['text'].fillna("").tolist()
test_texts_clean = test_df['text_clean'].fillna("").tolist()


In [3]:
# Load Sklearn models
lr_model = joblib.load(r"../base-models/logistic_regression_model.pkl")
rf_model = joblib.load(r"../base-models/random_forest_model.pkl")
xgb_model = joblib.load(r"../base-models/gradient_boosted_model.pkl")
svm_model = joblib.load(r"../base-models/support_vector_machine_model.pkl")
lrbert_model = joblib.load(r"../base-models/lr_bert_model.pkl")
print("Sklearn models loaded.")

# Load fine-tuned BERT model and tokenizer
bert_model = DistilBertForSequenceClassification.from_pretrained(r"../base-models/finetuned_bert")
bert_tokenizer = DistilBertTokenizerFast.from_pretrained(r"../base-models/finetuned_bert")
print("BERT model loaded.")

# Set device (GPU if available)
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = "cpu"
bert_model = bert_model.to(device)
bert_model.eval()

Sklearn models loaded.
BERT model loaded.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [4]:
# Load tfidf vectorizer and w2v model
tfidf_vectorizer = joblib.load(r"../datasets/tfidf_vectorizer.pkl")  
w2v_model = Word2Vec.load(r"../datasets/word2vec_model.model")
idf_weights = dict(zip(tfidf_vectorizer.get_feature_names_out(), tfidf_vectorizer.idf_))

# Load features used with xgboost
tfidf_train = sparse.load_npz(r"../datasets/tfidf_train_matrix.npz")
tfidf_test = sparse.load_npz(r"../datasets/tfidf_test_matrix.npz")

# Create weighted Word2Vec features
def get_weighted_w2v(text, model, idf_dict):
    tokens = word_tokenize(text)
    word_vecs = []
    weight_sum = 0
    for word in tokens:
        if word in model.wv and word in idf_dict:
            vec = model.wv[word] * idf_dict[word]
            word_vecs.append(vec)
            weight_sum += idf_dict[word]
    if word_vecs:
        return np.sum(word_vecs, axis=0) / weight_sum
    else:
        return np.zeros(model.vector_size)

w2v_train = np.array([get_weighted_w2v(t, w2v_model, idf_weights) for t in texts_clean])
w2v_test = np.array([get_weighted_w2v(t, w2v_model, idf_weights) for t in test_texts_clean])

# Stack features for xgboost
full_train_features = np.hstack([tfidf_train.toarray(), w2v_train])
full_test_features = np.hstack([tfidf_test.toarray(), w2v_test])

In [5]:
# Prepare StratifiedKFold
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
meta_features = np.zeros((len(train_df), 6))

for fold, (train_idx, val_idx) in enumerate(skf.split(texts, labels)):
    print(f"Fold {fold+1}/{n_splits}")
    
    X_val_texts = [texts[i] for i in val_idx]
    X_val_tfidf = tfidf_vectorizer.transform(X_val_texts)
    val_full_features = full_train_features[val_idx]

    meta_features[val_idx, 0] = lr_model.predict_proba(X_val_tfidf)[:, 1]
    meta_features[val_idx, 1] = rf_model.predict_proba(X_val_tfidf)[:, 1]
    meta_features[val_idx, 2] = xgb_model.predict_proba(val_full_features)[:, 1]
    meta_features[val_idx, 3] = svm_model.decision_function(X_val_tfidf)

    inputs = bert_tokenizer(X_val_texts, padding=True, truncation=True, return_tensors="pt", max_length=128)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = bert_model(**inputs, output_hidden_states=True)
        logits = outputs.logits
        last_hidden_state = outputs.hidden_states[-1]
        attention_mask = inputs['attention_mask']

        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        mean_embeddings = (sum_embeddings / sum_mask).cpu().numpy()

        bert_probs = torch.softmax(logits, dim=1)[:, 1].cpu().numpy()

    meta_features[val_idx, 4] = lrbert_model.predict_proba(mean_embeddings)[:, 1]
    meta_features[val_idx, 5] = bert_probs

Fold 1/5
Fold 2/5
Fold 3/5
Fold 4/5
Fold 5/5


In [6]:
# Train meta-model
meta_model = LogisticRegression(max_iter=1000, random_state=42)
meta_model.fit(meta_features, labels)

In [7]:
# Evaluate meta-model
oof_preds = meta_model.predict(meta_features)

print("\n== Meta-Model Evaluation ==")
print("Accuracy:", accuracy_score(labels, oof_preds))
print("F1 Score:", f1_score(labels, oof_preds))
print("\nClassification Report:\n", classification_report(labels, oof_preds))
print("\nConfusion Matrix:\n", confusion_matrix(labels, oof_preds))


== Meta-Model Evaluation ==
Accuracy: 0.9402338105871536
F1 Score: 0.9294245385450597

Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.96      0.95      4342
           1       0.94      0.92      0.93      3271

    accuracy                           0.94      7613
   macro avg       0.94      0.94      0.94      7613
weighted avg       0.94      0.94      0.94      7613


Confusion Matrix:
 [[4162  180]
 [ 275 2996]]


In [8]:
# Save evaluations for comparison
model_name = "Modular Ensemble"

metrics = {
    "model": model_name,
    "accuracy": accuracy_score(labels, oof_preds),
    "precision": precision_score(labels, oof_preds, average="binary"),
    "recall": recall_score(labels, oof_preds, average="binary"),
    "f1score": f1_score(labels, oof_preds, average="binary")
}

scores = pd.read_csv("../evaluation/scores.csv")

match = scores["model"] == metrics["model"]
if match.any():
    # Update existing row
    scores.loc[match, ["accuracy", "precision", "recall", "f1score"]] = metrics["accuracy"], metrics["precision"], metrics["recall"], metrics["f1score"]
else:
    # Insert new row
    scores = pd.concat([scores, pd.DataFrame([metrics])], ignore_index=True)

scores.to_csv("../evaluation/scores.csv", index=False)

print("Model scores saved to evaluation/scores.csv")

Model scores saved to evaluation/scores.csv


In [9]:
# Retrain Logistic Regression
lr_model_full = LogisticRegression(max_iter=1000, random_state=42)
lr_model_full.fit(tfidf_train, labels)

# Retrain Random Forest
from sklearn.ensemble import RandomForestClassifier
rf_model_full = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model_full.fit(tfidf_train, labels)

# Retrain XGBoost
from xgboost import XGBClassifier
xgb_model_full = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb_model_full.fit(full_train_features, labels)

# Retrain SVM
from sklearn.svm import SVC
svm_model_full = SVC(probability=True, random_state=42)
svm_model_full.fit(tfidf_train, labels)


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [10]:
# Tokenize the full dataset
encodings = bert_tokenizer(texts, padding=True, truncation=True, return_tensors="pt", max_length=128)

# Create DataLoader
dataset = TensorDataset(encodings['input_ids'], encodings['attention_mask'])
loader = DataLoader(dataset, batch_size=32, shuffle=False)  # Batch size small enough to fit GPU

# Collect outputs
all_embeddings = []

bert_model.eval()
with torch.no_grad():
    for batch in loader:
        input_ids, attention_mask = [b.to(device) for b in batch]
        outputs = bert_model(input_ids=input_ids, attention_mask=attention_mask, output_hidden_states=True)
        last_hidden_state = outputs.hidden_states[-1]
        
        # Mean Pooling
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        mean_embeddings = (sum_embeddings / sum_mask).cpu()
        
        all_embeddings.append(mean_embeddings)

# Stack everything back into a full tensor
full_train_bert_embeddings = torch.cat(all_embeddings).numpy()

lrbert_model_full = LogisticRegression(max_iter=1000, random_state=42)
lrbert_model_full.fit(full_train_bert_embeddings, labels)

In [11]:
# Load tokenizer and model
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)
model.to(device)

# Prepare Dataset
dataset = Dataset.from_dict({
    'text': texts,
    'labels': labels
})

# Tokenize
def tokenize_function(batch):
    return tokenizer(batch['text'], padding="max_length", truncation=True, max_length=128)

tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# Training arguments
training_args = TrainingArguments(
    output_dir="./finetuned_bert_full",
    eval_strategy="no",           
    save_strategy="no",                  
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    fp16=torch.cuda.is_available(),
    report_to="none"
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
)

# Train
trainer.train()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/7613 [00:00<?, ? examples/s]

Step,Training Loss
10,0.6872
20,0.637
30,0.5796
40,0.5633
50,0.4845
60,0.4778
70,0.4792
80,0.5121
90,0.4343
100,0.392


TrainOutput(global_step=1428, training_loss=0.3364546899201155, metrics={'train_runtime': 109.1465, 'train_samples_per_second': 209.251, 'train_steps_per_second': 13.083, 'total_flos': 756355729476096.0, 'train_loss': 0.3364546899201155, 'epoch': 3.0})

In [12]:
# Predict on Test Set
batch_size = 64

test_encodings = bert_tokenizer(test_texts, padding=True, truncation=True, return_tensors="pt", max_length=128)
test_dataset = TensorDataset(test_encodings['input_ids'], test_encodings['attention_mask'])
test_loader = DataLoader(test_dataset, batch_size=batch_size, pin_memory=True)

all_logits = []
all_cls_embeddings = []

with torch.no_grad():
    for batch in test_loader:
        input_ids, attention_mask = [b.to(device, non_blocking=True) for b in batch]
        outputs = bert_model(input_ids=input_ids, attention_mask=attention_mask, output_hidden_states=True)
        logits = outputs.logits
        all_logits.append(logits.cpu())
        last_hidden_state = outputs.hidden_states[-1]
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        mean_embeddings = (sum_embeddings / sum_mask).cpu()
        all_cls_embeddings.append(mean_embeddings)

# Stack all batches together
test_logits = torch.cat(all_logits)
test_cls_embeddings = torch.cat(all_cls_embeddings)

# Prepare test_meta_features
test_meta_features = np.zeros((len(test_df), 6))
X_test_tfidf = tfidf_vectorizer.transform(test_texts)

# Use the retrained models for test set predictions
test_meta_features[:, 0] = lr_model_full.predict_proba(X_test_tfidf)[:, 1]
test_meta_features[:, 1] = rf_model_full.predict_proba(X_test_tfidf)[:, 1]
test_meta_features[:, 2] = xgb_model_full.predict_proba(full_test_features)[:, 1]
test_meta_features[:, 3] = svm_model_full.decision_function(X_test_tfidf)
test_meta_features[:, 4] = lrbert_model_full.predict_proba(test_cls_embeddings.numpy())[:, 1]
test_meta_features[:, 5] = torch.softmax(test_logits, dim=1)[:, 1].numpy()

# Meta-model predictions on test set
test_preds = meta_model.predict(test_meta_features)

submission = pd.DataFrame({
    'id': test_df['id'],
    'target': test_preds
})

submission.to_csv("modular_stacking_with_retrain.csv", index=False)
print("Submission file saved.")


Submission file saved.


In [13]:
joblib.dump(meta_model, "meta_model.pkl")

['meta_model.pkl']