# Truthseeker Project
Trent Everard

CS 497

11/22/24

In [None]:
import pandas as pd
from sklearn.model_selection import GroupShuffleSplit, train_test_split
from sklearn.metrics import accuracy_score
import re
import torch
from torch import cuda
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel
from tqdm import tqdm_notebook as tqdm
import numpy as np 

# Pick the compute device once; later cells pass it to `.to(device)`.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

'cuda'

In [None]:
# Load the dataset and keep only the rows whose five-way majority answer is
# an actual answer (rows marked 'NO MAJORITY' are dropped).
df = pd.read_csv("data/Truth_Seeker_Model_Dataset.csv")
has_majority = df['5_label_majority_answer'] != 'NO MAJORITY'
df = df[has_majority]

# One input string per row: the `target` value, the statement and the tweet.
combined_text = (
    df['target'].astype(str)
    + ' Statement: '
    + df['statement']
    + '| Tweet: '
    + df['tweet']
)
sentences = combined_text.to_frame(name='statement-and-tweet')
sentences

Unnamed: 0,statement-and-tweet
0,True Statement: End of eviction moratorium mea...
2,True Statement: End of eviction moratorium mea...
3,True Statement: End of eviction moratorium mea...
4,True Statement: End of eviction moratorium mea...
5,True Statement: End of eviction moratorium mea...
...,...
134192,False Statement: Joe Bidens great-grandfather ...
134193,False Statement: Joe Bidens great-grandfather ...
134194,False Statement: Joe Bidens great-grandfather ...
134195,False Statement: Joe Bidens great-grandfather ...


### Grouped Train/Test Split

In [None]:
# Integer code for each majority answer, ordered from 'Disagree' (0) to 'Agree' (3).
label_mapping = dict(
    zip(('Disagree', 'Mostly Disagree', 'Mostly Agree', 'Agree'), range(4))
)

# Answers that are not keys of `label_mapping` come out of `.map` as NaN.
majority_answer = df['5_label_majority_answer']
df['label'] = majority_answer.map(label_mapping)

def map_labels(labels, num_classes):
    """Return `labels` in either the 4-way or the collapsed 2-way scheme.

    Args:
        labels: pandas Series of numeric label codes.
        num_classes: 4 to return `labels` untouched, 2 to collapse them so
            that codes <= 1 become 0 and every other value becomes 1.

    Returns:
        The same Series object for 4 classes, a new Series for 2 classes.

    Raises:
        ValueError: if `num_classes` is neither 2 nor 4.
    """
    if num_classes not in (2, 4):
        raise ValueError("num_classes must be either 2 or 4.")

    if num_classes == 4:
        return labels

    def _to_binary(code):
        # Anything that is not <= 1 (including NaN) falls into class 1.
        if code <= 1:
            return 0
        return 1

    return labels.apply(_to_binary)

# Modelling frame: the 4-way integer label next to the combined input text.
# (`map_labels(df['label'], 2)` would produce the 2-way variant; it is not used.)
df2 = pd.DataFrame({'4-way-label': df['label']})
df2['statement-and-tweet'] = sentences['statement-and-tweet']

In [None]:
# Hold out 20% of the *statements* rather than 20% of the rows. Many tweets
# share one statement, so a row-level random split puts tweets about the same
# statement on both sides and leaks information from train into test.
splitter = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=32)
statement_groups = df.loc[df2.index, 'statement']
train_rows, test_rows = next(splitter.split(df2, groups=statement_groups))

train_dataset = df2.iloc[train_rows]
test_dataset = df2.iloc[test_rows]

# Invariants of a grouped split: no shared statement, no lost rows.
assert set(statement_groups.iloc[train_rows]).isdisjoint(statement_groups.iloc[test_rows])
assert len(train_dataset) + len(test_dataset) == len(df2)

# Text as a 1-D array; labels keep a trailing axis of size 1, i.e. shape (N, 1).
train_data = np.array(train_dataset['statement-and-tweet'])
train_labels = np.array(train_dataset[['4-way-label']])

test_data = np.array(test_dataset['statement-and-tweet'])
test_labels = np.array(test_dataset[['4-way-label']])

(train_data.shape, train_labels.shape), (test_data.shape, test_labels.shape)

((22319,), (22319, 1))

### Datasets

In [None]:
class Truthseeker(Dataset):
    """Map-style dataset that tokenizes one text per item on demand.

    Args:
        text: indexable collection of input strings.
        labels: indexable collection of labels, parallel to `text`
            (a tensor or array; each entry is converted to a long tensor).
        tokenizer: object exposing `encode_plus` with the Hugging Face
            tokenizer keyword arguments.
        max_len: every item is padded or truncated to this many tokens.
    """

    def __init__(self, text, labels, tokenizer, max_len):
        self.text = text
        self.targets = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Number of examples, taken from the text collection."""
        return len(self.text)

    def __getitem__(self, index):
        """Tokenize example `index` and return it as a dict of long tensors.

        Returns:
            dict with keys 'ids', 'mask', 'token_type_ids' (each of length
            `max_len`) and 'targets' (the label entry at `index`).
        """
        text = self.text[index]

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            # `pad_to_max_length=True` is deprecated; this is its replacement.
            padding='max_length',
            truncation=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            # as_tensor accepts tensors and arrays alike and avoids the
            # "copy construct from a tensor" UserWarning of torch.tensor(tensor).
            'targets': torch.as_tensor(self.targets[index], dtype=torch.long)
        }


### BERT Class

In [None]:
class BERTClass(torch.nn.Module):
    """`bert-base-uncased` encoder followed by a single linear layer.

    Args:
        NUM_OUT: number of output units of the linear classifier.
    """

    def __init__(self, NUM_OUT):
        super().__init__()
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        # 768 is the input width given to the linear head; no dropout is applied.
        self.classifier = torch.nn.Linear(768, NUM_OUT)
        # Defined but not used by forward(), which returns raw classifier outputs.
        self.softmax = torch.nn.Softmax(dim=1)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return the classifier output for the first token of each sequence.

        The encoder's first output is indexed at position 0 along the second
        axis and passed through the linear layer; no softmax is applied.
        """
        encoder_outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        first_token_state = encoder_outputs[0][:, 0]
        return self.classifier(first_token_state)

### Data Setup

In [None]:
# Hyperparameters shared by the model, datasets and training loop.
NUM_OUT = 4
MAX_LEN = 128
BATCH_SIZE = 64
EPOCHS = 3
LEARNING_RATE = 2e-05

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Smoke-test the tokenizer on one training example with the same length,
# padding and truncation settings the datasets rely on.
sample_encoding = tokenizer.encode_plus(
    train_data[5],
    None,
    add_special_tokens=True,
    max_length=MAX_LEN,
    padding='max_length',
    truncation=True,
    return_token_type_ids=True
)
assert len(sample_encoding['input_ids']) == MAX_LEN

# The Dataset objects get their own names so that `train_data` / `test_data`
# stay plain arrays and this cell can be re-run without wrapping a Dataset
# inside another Dataset.
training_data = Truthseeker(train_data, torch.from_numpy(train_labels), tokenizer, MAX_LEN)
testing_data = Truthseeker(test_data, torch.from_numpy(test_labels), tokenizer, MAX_LEN)

train_params = {'batch_size': BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

# Evaluation does not need shuffling; a fixed order keeps runs comparable.
test_params = {'batch_size': BATCH_SIZE,
               'shuffle': False,
               'num_workers': 0
               }

training_loader = DataLoader(training_data, **train_params)
testing_loader = DataLoader(testing_data, **test_params)

Keyword arguments {'return_txoken_type_ids': True} not recognized.


### Helpful Functions

In [None]:
def loss_fn(outputs, targets):
    """Cross-entropy loss for single-label multi-class classification.

    Args:
        outputs: raw (un-normalised) class scores of shape (batch, num_classes).
        targets: integer class indices of shape (batch,) or (batch, 1).

    Returns:
        Scalar loss tensor averaged over the batch.
    """
    # CrossEntropyLoss wants one class index per example, so the trailing
    # size-1 axis (if any) is flattened away. BCELoss would instead require
    # probabilities and float targets of the same shape as `outputs`.
    return torch.nn.CrossEntropyLoss()(outputs, targets.reshape(-1).long())

def train(model, training_loader, optimizer):
    """Run one pass over `training_loader`, updating `model` after every batch.

    Each batch is moved to `device` as long tensors, scored by the model,
    and the resulting `loss_fn` value is back-propagated before one
    optimizer step.

    Returns:
        The loss tensor of the final batch only (not an average over batches).
    """
    model.train()

    for batch in tqdm(training_loader):
        input_ids, attention_mask, segment_ids, labels = (
            batch[key].to(device, dtype=torch.long)
            for key in ('ids', 'mask', 'token_type_ids', 'targets')
        )

        # Drop gradients accumulated by the previous step before backprop.
        optimizer.zero_grad()

        loss = loss_fn(model(input_ids, attention_mask, segment_ids), labels)
        loss.backward()
        optimizer.step()

    return loss
    
def validation(model, testing_loader):
    """Score every batch of `testing_loader` without tracking gradients.

    Returns:
        A pair `(scores, targets)`: the element-wise sigmoid of the model
        outputs (moved to the CPU) and the batch targets as delivered by the
        loader, each joined along the first axis in loader order.
    """
    model.eval()
    score_batches = []
    target_batches = []

    with torch.no_grad():
        for batch in tqdm(testing_loader):
            logits = model(
                batch['ids'].to(device, dtype=torch.long),
                batch['mask'].to(device, dtype=torch.long),
                batch['token_type_ids'].to(device, dtype=torch.long),
            )
            score_batches.append(torch.sigmoid(logits).cpu().detach())
            # Targets are not moved to `device`; they are only collected.
            target_batches.append(batch['targets'])

    return torch.cat(score_batches), torch.cat(target_batches)

### Training Setup

In [None]:
model = BERTClass(NUM_OUT)
model.to(device)

optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

for epoch in range(EPOCHS):
    loss = train(model, training_loader, optimizer)
    print(f'Epoch: {epoch}, Loss:  {loss.item()}')

    guess, targs = validation(model, testing_loader)
    # Predicted class = column with the highest score.
    guesses = guess.argmax(dim=1)
    # Targets already hold class indices, stored with a trailing size-1 axis.
    # Taking an argmax over that axis would always give 0, so flatten instead.
    targets = targs.reshape(-1)
    print('Accuracy on test set {}'.format(accuracy_score(targets.numpy(), guesses.numpy())))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for data in tqdm(training_loader):


  0%|          | 0/1395 [00:00<?, ?it/s]

  'targets': torch.tensor(self.targets[index], dtype=torch.long)


ValueError: Using a target size (torch.Size([64, 1])) that is different to the input size (torch.Size([64, 4])) is deprecated. Please ensure they have the same size.