In [4]:
# Imports
import pandas as pd
import torch
from transformers import BertTokenizer, BertModel
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import joblib
import numpy as np
from tqdm import tqdm


In [5]:
# Load cleaned dataset
df = pd.read_csv("../data/processed/liar_cleaned.csv")
X = df["cleaned_text"].tolist()
y = df["label"].tolist()

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [6]:
# Load pre-trained BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
model.eval()  # Disable dropout etc.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [7]:
# Function to get CLS embeddings
def get_bert_embeddings(texts, tokenizer, model):
    embeddings = []
    for text in tqdm(texts):
        inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128)
        with torch.no_grad():
            outputs = model(**inputs)
        cls_embedding = outputs.last_hidden_state[0][0]  # CLS token
        embeddings.append(cls_embedding.numpy())
    return np.array(embeddings)

# Get BERT embeddings
X_train_bert = get_bert_embeddings(X_train, tokenizer, model)
X_test_bert = get_bert_embeddings(X_test, tokenizer, model)


100%|██████████| 10232/10232 [11:23<00:00, 14.98it/s] 
100%|██████████| 2559/2559 [02:41<00:00, 15.81it/s]


In [8]:
# Train logistic regression
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train_bert, y_train)

# Predict and evaluate
y_pred = clf.predict(X_test_bert)
print("📊 Classification Report:\n")
print(classification_report(y_test, y_pred))


📊 Classification Report:

              precision    recall  f1-score   support

           0       0.54      0.48      0.51      1136
           1       0.62      0.68      0.65      1423

    accuracy                           0.59      2559
   macro avg       0.58      0.58      0.58      2559
weighted avg       0.59      0.59      0.59      2559



In [9]:
joblib.dump(clf, "../models/logreg_bert_model.pkl")
np.save("../data/processed/X_train_bert.npy", X_train_bert)
np.save("../data/processed/X_test_bert.npy", X_test_bert)
joblib.dump(tokenizer, "../models/bert_tokenizer.pkl")
joblib.dump(model, "../models/bert_model.pt")

print("✅ Model, tokenizer, and embeddings saved!")


✅ Model, tokenizer, and embeddings saved!
