# Truthseeker Project
Trent Everard

CS 497

11/22/24

In [112]:
import pandas as pd
from sklearn.model_selection import GroupShuffleSplit
from sklearn.metrics import accuracy_score
import re
import torch
from torch import cuda
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel
from tqdm import tqdm_notebook as tqdm

device = 'cuda' if cuda.is_available() else 'cpu'
device

'cpu'

In [113]:
df = pd.read_csv("Truth_Seeker_Model_Dataset.csv")

print('Number of training sentences: {:,}\n'.format(df.shape[0]))

sentences = 'Statement: ' + df['statement'] + '| Tweet: ' + df['tweet']
sentences[10]

Number of training sentences: 134,198



'Statement: End of eviction moratorium means millions of Americans could lose their housing in the middle of a pandemic.| Tweet: BREAKING NEWS: Mitch McConnell accuses President Biden of pushing socialism by implementing the eviction moratorium that will stop millions of Americans from being thrown out on the street this month. RT if you think that Mitch is a heartless idiot!'

In [114]:
print("Unique values in '5_label_majority_answer':", df['5_label_majority_answer'].unique())

# Filter out rows with 'NO MAJORITY' in '5_label_majority_answer'
df = df[df['5_label_majority_answer'] != 'NO MAJORITY']
label_mapping = {
    'Disagree': 0,
    'Mostly Disagree': 1,
    'Mostly Agree': 2,
    'Agree': 3
}

df['label_numeric'] = df['5_label_majority_answer'].map(label_mapping)

def map_labels(labels_numeric, num_classes):
    if num_classes == 2:
        # Map to binary classes
        return labels_numeric.apply(lambda x: 0 if x <= 1 else 1)
    elif num_classes == 4:
        return labels_numeric
    else:
        raise ValueError("num_classes must be either 2 or 4.")

Unique values in '5_label_majority_answer': ['Mostly Agree' 'NO MAJORITY' 'Agree' 'Mostly Disagree' 'Disagree']


### Grouped Train/Test Split

In [115]:
NUM_OUT = 2

# Define the splitter
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, train_size=0.8, random_state=42)

df['label'] = map_labels(df['label_numeric'], NUM_OUT)

# Split the data
train_idx, test_idx = next(gss.split(df, groups=df['statement']))

# Create train and test sets
train_df = df.iloc[train_idx]
test_df = df.iloc[test_idx]


### Text Preprocessing

In [116]:
def clean_text(text):
    text = re.sub(r"@[A-Za-z0-9]+", ' ', text)
    text = re.sub(r"https?://[A-Za-z0-9./]+", ' ', text)
    text = re.sub(r"[^a-zA-z.!?'0-9]", ' ', text)
    text = re.sub('\t', ' ',  text)
    text = re.sub(r" +", ' ', text)
    return text

train_df['clean_statement'] = train_df['statement'].apply(clean_text)
test_df['clean_statement'] = test_df['statement'].apply(clean_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['clean_statement'] = train_df['statement'].apply(clean_text)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['clean_statement'] = test_df['statement'].apply(clean_text)


### Datasets

In [117]:
class Truthseeker(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
        
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, index):
        text = self.texts.iloc[index]
        label = self.labels.iloc[index]

        inputs = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_token_type_ids=True,
            return_attention_mask=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(label, dtype=torch.long)
        }


### BERT Class

In [118]:
class BERTClass(torch.nn.Module):
    def __init__(self, NUM_OUT):
        super(BERTClass, self).__init__()   
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        #self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(768, NUM_OUT)

    def forward(self, input_ids, attention_mask, token_type_ids):
        output_1 = self.bert(
            input_ids=input_ids, 
            attention_mask=attention_mask,
            token_type_ids=token_type_ids
        )
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        output = self.classifier(pooler)
        return output

### Data Setup

In [119]:
MAX_LEN = 128
BATCH_SIZE = 64
EPOCHS = 3
LEARNING_RATE = 2e-05

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Assuming 'clean_text' is the column with your text data in train_df
text_list = train_df['clean_statement'].tolist()

# Tokenize the text data
encoded_inputs = tokenizer(
    text_list,
    add_special_tokens=True,
    max_length=128,
    padding='max_length',   # Use 'max_length' for padding
    truncation=True,
    return_token_type_ids=True,
    return_attention_mask=True,
    return_tensors='pt'     # Return PyTorch tensors
)

training_data = Truthseeker(train_df['clean_statement'], train_df['label'], tokenizer, MAX_LEN)
test_data = Truthseeker(test_df['clean_statement'], test_df['label'], tokenizer, MAX_LEN)

train_params = {'batch_size': BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }    

training_loader = DataLoader(training_data, **train_params)
testing_loader = DataLoader(test_data, **test_params)

### Helpful Functions

In [120]:
def loss_fn(outputs, targets):
    loss_function = torch.nn.CrossEntropyLoss()
    return loss_function(outputs, targets)

def train(model, training_loader, optimizer):
    model.train()

    for data in tqdm(training_loader):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.long)

        optimizer.zero_grad()
        outputs = model(ids, mask, token_type_ids)
        loss = loss_fn(outputs, targets)
        loss.backward()
        optimizer.step()
        
    return loss
    
def validation(model, testing_loader):
    model.eval()
    fin_targets=[]
    fin_outputs=[]
    with torch.no_grad():
        for data in tqdm(testing_loader):
            targets = data['targets'].to(device, dtype=torch.long)
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            outputs = model(ids, mask, token_type_ids)
            outputs = torch.sigmoid(outputs).cpu().detach()
            fin_outputs.extend(outputs)
            fin_targets.extend(targets)
    return torch.stack(fin_outputs), torch.stack(fin_targets)

### Training Setup

In [122]:
model = BERTClass(NUM_OUT)
model.to(device)

optimizer = torch.optim.Adam(params = model.parameters(), lr=LEARNING_RATE)
# optimizer = torch.optim.AdamW(params = model.parameters(), lr=LEARNING_RATE)

for epoch in range(EPOCHS):
    loss = train(model, training_loader, optimizer)
    print(f'Epoch: {epoch}, Loss:  {loss.item()}')  
    guess, targs = validation(model, testing_loader)
    guesses = torch.max(guess, dim=1)
    targets = torch.max(targs, dim=1)
    print('Accuracy on test set {}'.format(accuracy_score(guesses.indices, targets.indices)))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for data in tqdm(training_loader):


  0%|          | 0/1390 [00:00<?, ?it/s]

KeyboardInterrupt: 