In [1]:
import pandas as pd
import numpy as np
import torch
from transformers import BertTokenizer, BertModel
import matplotlib.pyplot as plt
from tqdm import tqdm
import pickle
import os
from pathlib import Path

# Report the runtime environment so readers can tell which backend produced the outputs below.
for caption, value in (
    ("PyTorch version:", torch.__version__),
    ("CUDA available:", torch.cuda.is_available()),
):
    print(caption, value)

PyTorch version: 2.8.0+cpu
CUDA available: False


In [2]:
# Choose the compute device: the GPU when CUDA is usable, the CPU otherwise.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

# ===== STEP 1: Load and Explore LIAR Dataset =====

# Column names for the header-less LIAR TSV files, in the order the fields appear on disk.
# Kept at module level because the loading cell below passes it to pd.read_csv directly.
columns = ['id', 'label', 'statement', 'subject', 'speaker', 'job_title', 'state_info',
           'party_affiliation', 'barely_true_counts', 'false_counts', 'half_true_counts',
           'mostly_true_counts', 'pants_fire_counts', 'context']


def load_liar_dataset(data_dir='../data/raw'):
    """Load the LIAR train/valid/test splits with proper column names.

    Args:
        data_dir: Directory that contains ``train.tsv``, ``valid.tsv`` and ``test.tsv``.

    Returns:
        A tuple ``(train_df, valid_df, test_df)`` of DataFrames whose columns are
        the names listed in ``columns``.

    Raises:
        FileNotFoundError: Propagated from pandas when a split file is missing.
    """
    data_dir = Path(data_dir)
    # The files have no header row, so the names must be supplied explicitly.
    return tuple(
        pd.read_csv(data_dir / f'{split}.tsv', sep='\t', header=None, names=columns)
        for split in ('train', 'valid', 'test')
    )

Using device: cpu


In [3]:
# Read the three LIAR splits. The TSVs carry no header row, so names come from `columns`.
tsv_options = dict(sep='\t', header=None, names=columns)
train_df = pd.read_csv('../data/raw/train.tsv', **tsv_options)
valid_df = pd.read_csv('../data/raw/valid.tsv', **tsv_options)
test_df = pd.read_csv('../data/raw/test.tsv', **tsv_options)

# A single print call emits the same three size lines, one per split.
print(
    f"Train set: {train_df.shape}",
    f"Valid set: {valid_df.shape}",
    f"Test set: {test_df.shape}",
    sep="\n",
)

# Class balance of the training labels.
label_counts = train_df['label'].value_counts()
print("\nLabel distribution in training set:")
print(label_counts)


Train set: (10240, 14)
Valid set: (1284, 14)
Test set: (1267, 14)

Label distribution in training set:
label
half-true      2114
false          1995
mostly-true    1962
true           1676
barely-true    1654
pants-fire      839
Name: count, dtype: int64


In [4]:
# ===== STEP 2: BERT Feature Extractor Class =====

class BERTFeatureExtractor:
    """Extract fixed-size BERT sentence features ([CLS] embeddings) from text statements."""

    def __init__(self, model_name='bert-base-uncased', max_length=128, device=None):
        """Load the tokenizer and encoder and place the encoder on the target device.

        Args:
            model_name: Checkpoint name or path passed to ``from_pretrained``.
            max_length: Maximum number of tokens per text; longer inputs are truncated.
            device: ``torch.device`` or device string to run on. When omitted, CUDA is
                used if it is available and the CPU otherwise.
        """
        print(f"Loading BERT model: {model_name}")
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.model = BertModel.from_pretrained(model_name)
        self.max_length = max_length

        # Resolve the device here instead of reading a notebook-level global, so the
        # class does not depend on which cells happened to run before it.
        if device is None:
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.device = torch.device(device)

        # Move model to device and set to evaluation mode
        self.model.to(self.device)
        self.model.eval()

        print(f"BERT model loaded successfully on {self.device}")

    def extract_features(self, texts, batch_size=16):
        """Extract BERT features from a collection of texts.

        Args:
            texts: Iterable of strings (list, tuple, pandas Series, ...). A single
                string is treated as one text.
            batch_size: Number of texts encoded per forward pass; must be >= 1.

        Returns:
            numpy array of shape (n_texts, hidden_size) holding the [CLS] embedding of
            each text (hidden_size is 768 for bert-base-uncased). An empty input
            yields an array of shape (0, hidden_size).

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")

        # Materialize the input: the tokenizer needs real lists, and slicing a Series,
        # generator or bare string would otherwise fail or split on characters.
        texts = [texts] if isinstance(texts, str) else list(texts)

        print(f"Extracting BERT features for {len(texts)} texts...")

        if not texts:
            # np.vstack([]) raises, so return an empty matrix with the right width.
            all_features = np.empty((0, self.model.config.hidden_size), dtype=np.float32)
            print(f"BERT features extracted: {all_features.shape}")
            return all_features

        features = []
        for start in tqdm(range(0, len(texts), batch_size)):
            batch_texts = texts[start:start + batch_size]

            # Tokenize batch (pad to the longest text in the batch, cap at max_length)
            encoded = self.tokenizer(
                batch_texts,
                padding=True,
                truncation=True,
                max_length=self.max_length,
                return_tensors='pt'
            )

            # Move to device
            input_ids = encoded['input_ids'].to(self.device)
            attention_mask = encoded['attention_mask'].to(self.device)

            # Extract features without building an autograd graph
            with torch.no_grad():
                outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)

            # Use [CLS] token representation (first token of every sequence)
            cls_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
            features.append(cls_embeddings)

        # Combine all batches
        all_features = np.vstack(features)
        print(f"BERT features extracted: {all_features.shape}")

        return all_features

    def save_features(self, features, labels, texts, filename, output_dir='data/processed'):
        """Pickle extracted features together with their labels and source texts.

        Args:
            features: 2-D array of shape (n_texts, feature_dim).
            labels: Labels aligned with the rows of ``features``.
            texts: Source texts aligned with the rows of ``features``.
            filename: File name (relative to ``output_dir``) to write.
            output_dir: Directory to write into; created when missing.

        Returns:
            The ``Path`` of the written file.
        """
        data = {
            'features': features,
            'labels': labels,
            'texts': texts,
            'feature_dim': features.shape[1]
        }

        filepath = Path(output_dir) / filename
        filepath.parent.mkdir(parents=True, exist_ok=True)

        with open(filepath, 'wb') as f:
            pickle.dump(data, f)

        print(f"Features saved to {filepath}")
        return filepath

In [5]:
def save_features(self, features, labels, texts, filename):
    """Pickle features with their labels and texts to ``data/processed/<filename>``.

    ``self`` is accepted as the first argument but never read; this definition lives
    at module level, so callers must pass a value for it explicitly.

    Args:
        self: Unused placeholder.
        features: Array-like exposing ``.shape``; ``shape[1]`` is stored as the
            feature dimension.
        labels: Labels stored alongside the features.
        texts: Source texts stored alongside the features.
        filename: File name created under ``data/processed`` (relative to the
            current working directory).
    """
    payload = {
        'features': features,
        'labels': labels,
        'texts': texts,
        'feature_dim': features.shape[1]
    }

    # Make sure the output directory exists before writing.
    target = Path('data/processed') / filename
    target.parent.mkdir(parents=True, exist_ok=True)

    with target.open('wb') as sink:
        pickle.dump(payload, sink)

    print(f"Features saved to {target}")

In [6]:
# ===== STEP 3: Extract BERT Features =====

# Build the extractor; the checkpoint and token limit are spelled out so they are visible here.
bert_extractor = BERTFeatureExtractor(model_name='bert-base-uncased', max_length=128)

Loading BERT model: bert-base-uncased
BERT model loaded successfully on cpu


In [7]:
# Pull the statement column of each split out as a plain list; missing statements become ''.
train_texts, valid_texts, test_texts = (
    split_df['statement'].fillna('').tolist()
    for split_df in (train_df, valid_df, test_df)
)

In [8]:
# Work on a small slice of the training split first (for testing).
print("Testing on small sample...")
sample_size = 100
train_sample_texts = train_texts[:sample_size]
# Positional slice of the label column, matching the text slice above.
train_sample_labels = train_df['label'].iloc[:sample_size].tolist()

Testing on small sample...


In [24]:
# --- FIRST: COMPUTE FEATURES FOR THE LIAR SAMPLE ---
# Use the `train_sample_texts` / `train_sample_labels` prepared from the training split and
# the already-loaded `bert_extractor`. Redefining them with placeholder strings would save
# features that have nothing to do with the dataset, and loading BERT a second time only
# costs time and memory.
train_bert_features_sample = bert_extractor.extract_features(train_sample_texts)

# Keep these names bound for any later cell that refers to them.
tokenizer = bert_extractor.tokenizer
model = bert_extractor.model

# Every text must have exactly one feature row and one label before anything is persisted.
assert len(train_bert_features_sample) == len(train_sample_texts) == len(train_sample_labels)

# --- SECOND: CREATE THE DICTIONARY ---
features_data = {
    'features': train_bert_features_sample,
    'labels': train_sample_labels,
    'texts': train_sample_texts
}

# --- THIRD: SAVE TO FILE ---
with open('bert_features_sample.pkl', 'wb') as f:
    pickle.dump(features_data, f)

# --- FOURTH: VERIFY LOADING ---
# NOTE: pickle.load can execute arbitrary code; only load files this notebook wrote itself.
with open('bert_features_sample.pkl', 'rb') as f:
    loaded_data = pickle.load(f)

# Access components (use correct keys!)
features = loaded_data['features']  # NOT 'train_bert_features_sample'
labels = loaded_data['labels']      # NOT 'train_sample_labels'
texts = loaded_data['texts']        # NOT 'train_sample_texts'

print(f"Loaded features shape: {features.shape}")
print(f"Loaded labels: {labels}")
print(f"First text: {texts[0]}")

# --- FIFTH: FUNCTION DEFINITION ---
def save_features(features, labels, texts, filename):
    """Write features, labels and texts to ``filename`` as a single pickled dict.

    The stored dict has exactly the keys ``'features'``, ``'labels'`` and ``'texts'``.
    """
    with open(filename, 'wb') as sink:
        pickle.dump(
            {
                'features': features,
                'labels': labels,
                'texts': texts,
            },
            sink,
        )

# --- SIXTH: TEST LOADING ---
try:
    with open('bert_features_sample.pkl', 'rb') as f:
        data = pickle.load(f)
except Exception as e:
    # Report the failure instead of raising so the cell still runs to completion.
    print(f"Error loading file: {e}")
else:
    # Reached only when the file was opened and unpickled without an exception.
    print("Success! File loaded correctly.")

NameError: name 'train_bert_features_sample' is not defined

In [None]:
# Read the pickled sample back from disk.
with open('bert_features_sample.pkl', 'rb') as f:
    loaded_data = pickle.load(f)

# Unpack the three stored components in one step.
features, labels, texts = (
    loaded_data[key] for key in ('features', 'labels', 'texts')
)

In [None]:
# Restore the sample features from disk.
# One read is enough; re-opening the same file again yields the same object contents.
# NOTE: pickle.load can execute arbitrary code; only load files this notebook wrote itself.
with open('bert_features_sample.pkl', 'rb') as f:
    loaded_data = pickle.load(f)

# Extract components
features = loaded_data['features']
labels = loaded_data['labels']
texts = loaded_data['texts']

# Rebind the working variables to the data that was actually saved. Assigning placeholder
# strings here would silently discard the real features, labels and texts.
train_bert_features_sample = features
train_sample_labels = labels
train_sample_texts = texts

# Rebuild the dictionary from the restored variables.
features_data = {
    'features': train_bert_features_sample,
    'labels': train_sample_labels,
    'texts': train_sample_texts
}

In [None]:
# Load the saved sample once more and expose its parts under short names.
with open('bert_features_sample.pkl', 'rb') as f:
    loaded_data = pickle.load(f)

features, labels, texts = (
    loaded_data['features'],
    loaded_data['labels'],
    loaded_data['texts'],
)