### Notebook for the training of the Fake News detection model using the LIAR dataset

In [1]:
import pandas as pd

# Locations of the three LIAR splits (tab-separated files read without a header row)
train_path = '../liar_dataset/train.tsv'
test_path = '../liar_dataset/test.tsv'
valid_path = '../liar_dataset/valid.tsv'


def load_liar_split(path):
    """Read one LIAR split and keep only its label and statement columns.

    Args:
        path: Location of a tab-separated file with no header row.

    Returns:
        pandas.DataFrame with exactly two columns, 'label' and 'statement',
        taken from positional columns 1 and 2 of the file.
    """
    # usecols restricts parsing to the two columns that are actually kept,
    # instead of loading every column and discarding the rest afterwards.
    split_df = pd.read_csv(path, sep='\t', header=None, usecols=[1, 2])
    split_df.columns = ['label', 'statement']
    return split_df


# One call per split replaces three copies of the read/select/rename sequence
train_df = load_liar_split(train_path)
test_df = load_liar_split(test_path)
valid_df = load_liar_split(valid_path)


In [2]:
from transformers import BertTokenizer, BertModel
import torch

# The tokenizer and the encoder are both restored from the same checkpoint,
# so the checkpoint name is spelled out once and shared by the two loaders.
PRETRAINED_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_CHECKPOINT)
model = BertModel.from_pretrained(PRETRAINED_CHECKPOINT)

# Function to get BERT embeddings for a given text
def get_bert_embedding(text):
    """Return the mean-pooled last-hidden-state BERT embedding of ``text``.

    The input is tokenized with truncation at 512 tokens, run through the
    module-level ``model`` without gradient tracking, and the token vectors
    of the last hidden state are averaged along the token dimension.

    Args:
        text: Input passed straight to the module-level ``tokenizer``. A
            single string yields a 1-D tensor (``torch.Size([768])`` for the
            checkpoint used in this notebook).
            NOTE(review): a list of strings should also work and yield one
            row per string - confirm against the tokenizer in use.

    Returns:
        torch.Tensor: The pooled embedding with size-1 dimensions squeezed out.
    """
    inputs = tokenizer(text, return_tensors='pt', max_length=512, truncation=True, padding=True)
    with torch.no_grad():  # Avoid computation graph creation (important for CPU performance)
        outputs = model(**inputs)
    hidden = outputs.last_hidden_state
    # Weight every token by its attention mask so that padding positions
    # (present whenever several texts of different length are tokenized
    # together) do not dilute the average. For a single text the mask is all
    # ones, so this is the plain mean over tokens.
    mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
    token_sum = (hidden * mask).sum(dim=1)
    token_count = mask.sum(dim=1).clamp(min=1)  # guard against division by zero
    return (token_sum / token_count).squeeze()

# Smoke test: embed one hand-written statement and show the resulting tensor size
sample_statement = "The economy is doing great!"
sample_embedding = get_bert_embedding(sample_statement)
print(sample_embedding.shape)


  from .autonotebook import tqdm as notebook_tqdm


torch.Size([768])


In [3]:
from joblib import Parallel, delayed

# Parallelize the embedding extraction with joblib
def vectorize_statements_parallel(dataframe, n_jobs=-1):
    """Compute one embedding per entry of ``dataframe['statement']``.

    Args:
        dataframe: Object whose 'statement' column is iterated; every item is
            passed to ``get_bert_embedding``.
        n_jobs: Number of joblib workers. The default of -1 keeps the
            original "use every core" setting.

    Returns:
        list: One NumPy array per statement, in the same order as the input.
    """
    # require="sharedmem" forces a thread-based backend: every worker uses the
    # single model already loaded in this process. With joblib's default
    # process-based backend, the notebook-defined embedding function and the
    # BERT model it references would have to be serialized into each worker.
    # NOTE(review): torch may also run its own intra-op threads; if the CPU
    # ends up oversubscribed, pass a smaller n_jobs - confirm by timing.
    embeddings = Parallel(n_jobs=n_jobs, require="sharedmem")(
        delayed(get_bert_embedding)(text) for text in dataframe['statement']
    )
    return [embedding.numpy() for embedding in embeddings]

# Embed the three splits one after another; each result is bound to its own
# name as soon as it is ready, so finished splits survive a later interruption.
train_embeddings = vectorize_statements_parallel(dataframe=train_df)
test_embeddings = vectorize_statements_parallel(dataframe=test_df)
valid_embeddings = vectorize_statements_parallel(dataframe=valid_df)

# Sanity check: number of training statements and the shape of a single embedding
print(f"Train embeddings shape: {len(train_embeddings)} statements, {train_embeddings[0].shape} per embedding")


KeyboardInterrupt: 

NameError: name 'train_encodings' is not defined