# Truthseeker Project
Trent Everard

CS 497

11/22/24

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GroupShuffleSplit
from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from tqdm import tqdm_notebook as tqdm
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import RandomOverSampler

## Load the Data

In [2]:
df = pd.read_csv("data/Truth_Seeker_Model_Dataset.csv")
df.shape
# Filter out 'NO MAJORITY'
df = df[~df['5_label_majority_answer'].isin(['NO MAJORITY'])]

# Reset index after filtering
df = df.reset_index(drop=True)

## Preprocess the Data

In [None]:
statements = df['statement']
groups = df['statement']

gss = GroupShuffleSplit(test_size=0.2, n_splits=1, random_state=42)
train_idx, test_idx = next(gss.split(df, groups=groups))

train_df = df.iloc[train_idx]
test_df = df.iloc[test_idx]

In [4]:
# Training set class distribution
train_class_counts = train_df['5_label_majority_answer'].value_counts()
print("Training Set Class Distribution:")
print(train_class_counts)

# Test set class distribution
test_class_counts = test_df['5_label_majority_answer'].value_counts()
print("\nTest Set Class Distribution:")
print(test_class_counts)

Training Set Class Distribution:
5_label_majority_answer
Agree              43463
Mostly Agree       42720
Mostly Disagree     2308
Disagree             432
Name: count, dtype: int64

Test Set Class Distribution:
5_label_majority_answer
Mostly Agree       11086
Agree              10872
Mostly Disagree      583
Disagree             129
Name: count, dtype: int64


## Label Encoding

In [14]:
# Get the labels
label_encoder = LabelEncoder()
label_encoder.fit(train_df['5_label_majority_answer'])

train_labels = label_encoder.transform(train_df['5_label_majority_answer'])
test_labels = label_encoder.transform(test_df['5_label_majority_answer'])

# Initialize the LabelBinarizer for one-hot encoding
lb = LabelBinarizer()
lb.fit(train_labels)

# Transform labels
train_labels_encoded = lb.transform(train_labels)
test_labels_encoded = lb.transform(test_labels)

## Label Mapping

In [15]:
# Map labels to two classes
label_mapping = {
    'Agree': 'Agree',
    'Mostly Agree': 'Agree',
    'Disagree': 'Disagree',
    'Mostly Disagree': 'Disagree'
}

train_df['2class_label'] = train_df['5_label_majority_answer'].map(label_mapping)
test_df['2class_label'] = test_df['5_label_majority_answer'].map(label_mapping)

label_encoder_2class = LabelEncoder()
label_encoder_2class.fit(train_df['2class_label'])

train_labels_2class = label_encoder_2class.transform(train_df['2class_label'])
test_labels_2class = label_encoder_2class.transform(test_df['2class_label'])

# Encode labels
lb_2class = LabelBinarizer()
lb_2class.fit(train_labels_2class)

train_labels_2class_encoded = lb_2class.transform(train_labels_2class)
test_labels_2class_encoded = lb_2class.transform(test_labels_2class)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['2class_label'] = train_df['5_label_majority_answer'].map(label_mapping)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['2class_label'] = test_df['5_label_majority_answer'].map(label_mapping)


## BERT Model

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

class TextDataset(Dataset):
    def __init__(self, texts, labels):
        self.encodings = tokenizer(
            texts.tolist(),
            truncation=True,
            padding=True,
            max_length=512
        )
        self.labels = labels
        
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # For multi-class classification, labels can be integers
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item
    
    def __len__(self):
        return len(self.labels)

# For 4-class problem
train_texts = train_df['statement']
test_texts = test_df['statement']

train_dataset = TextDataset(train_texts, train_labels)
test_dataset = TextDataset(test_texts, test_labels)

# For 2-class problem
train_dataset_2class = TextDataset(train_texts, train_labels_2class)
test_dataset_2class = TextDataset(test_texts, test_labels_2class)

print("Label Encoder Classes:", label_encoder.classes_)
print("\nLabel Mapping:")
for idx, label in enumerate(label_encoder.classes_):
    print(f"{idx}: {label}")
print("Sample train labels (4-class):", train_labels[:5])
print("Sample test labels (4-class):", test_labels[:5])
print("Sample train labels (2-class):", train_labels_2class[:5])
print("Sample test labels (2-class):", test_labels_2class[:5])

Label Encoder Classes: ['Agree' 'Disagree' 'Mostly Agree' 'Mostly Disagree']

Label Mapping:
0: Agree
1: Disagree
2: Mostly Agree
3: Mostly Disagree
Sample train labels (4-class): [2 0 2 0 0]
Sample test labels (4-class): [2 2 0 2 2]
Sample train labels (2-class): [0 0 0 0 0]
Sample test labels (2-class): [0 0 0 0 0]


## Oversampling

In [8]:
# Combine texts and labels into a DataFrame for resampling
train_data = pd.DataFrame({
    'statement': train_df['statement'],
    'label': train_labels
})

ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(
    train_data[['statement']], train_data['label']
)

train_texts_resampled = X_resampled['statement']
train_labels_resampled = y_resampled

## Train the Model for 4-class

In [None]:
# Use GPU if available
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=4)
model.to(device)

# Create data loaders
train_dataset = TextDataset(train_texts_resampled, train_labels_resampled)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)

# Compute class weights
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(train_labels),
    y=train_labels
)
class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)

# Loss Function
loss_fn = torch.nn.CrossEntropyLoss(weight=class_weights)

# # Optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)

# Training loop
epochs = 4
for epoch in range(epochs):
    model.train()
    total_train_loss = 0
    for batch in tqdm(train_loader):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        loss = loss_fn(logits, labels)
        total_train_loss += loss.item()
        loss.backward()
        optimizer.step()
    avg_train_loss = total_train_loss / len(train_loader)
    print(f'Epoch {epoch+1}, Training Loss: {avg_train_loss:.4f}')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch in tqdm(train_loader):


  0%|          | 0/10866 [00:00<?, ?it/s]

Epoch 1, Training Loss: 0.4778


  0%|          | 0/10866 [00:00<?, ?it/s]

Epoch 2, Training Loss: 0.4538


  0%|          | 0/10866 [00:00<?, ?it/s]

Epoch 3, Training Loss: 0.4507


  0%|          | 0/10866 [00:00<?, ?it/s]

Epoch 4, Training Loss: 0.4478


## Train the Model for 2-class

In [None]:
# Initialize the model
model_2class = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
model_2class.to(device)

# Create data loaders
train_loader_2class = DataLoader(train_dataset_2class, batch_size=32, shuffle=True)
test_loader_2class = DataLoader(test_dataset_2class, batch_size=32)

# Optimizer
optimizer_2class = AdamW(model_2class.parameters(), lr=2e-5)

# Training loop
for epoch in range(epochs):
    model.train()
    total_train_loss = 0
    for batch in tqdm(train_loader):
        optimizer_2class.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        loss = loss_fn(logits, labels)
        total_train_loss += loss.item()
        loss.backward()
        optimizer_2class.step()
    avg_train_loss = total_train_loss / len(train_loader)
    print(f'Epoch {epoch+1}, Training Loss: {avg_train_loss:.4f}')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch in tqdm(train_loader):


  0%|          | 0/10866 [00:00<?, ?it/s]

Epoch 1, Training Loss: 0.4440


## Evaluate the Model for 4-class

In [12]:
model.eval()
predictions = []
true_labels = []

with torch.no_grad():
    for batch in tqdm(test_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        preds = torch.argmax(logits, axis=1)
        
        predictions.extend(preds.cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

# Calculate accuracy
accuracy = accuracy_score(true_labels, predictions)
print(f'4-Class Classification Accuracy: {accuracy:.4f}')

target_names = [label_encoder.classes_[idx] for idx in range(len(label_encoder.classes_))]

# Classification report
print(classification_report(true_labels, predictions, target_names=target_names))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch in tqdm(test_loader):


  0%|          | 0/1417 [00:00<?, ?it/s]

4-Class Classification Accuracy: 0.1368
                 precision    recall  f1-score   support

          Agree       0.49      0.24      0.32     10872
       Disagree       0.01      0.40      0.01       129
   Mostly Agree       0.46      0.02      0.03     11086
Mostly Disagree       0.03      0.43      0.05       583

       accuracy                           0.14     22670
      macro avg       0.25      0.27      0.10     22670
   weighted avg       0.46      0.14      0.17     22670



## Evaluate the Model for 2-class

In [22]:
model_2class.eval()
predictions_2class = []
true_labels_2class = []

with torch.no_grad():
    for batch in tqdm(test_loader_2class):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        outputs = model_2class(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        preds = torch.argmax(logits, axis=1)
        
        predictions_2class.extend(preds.cpu().numpy())
        true_labels_2class.extend(labels.cpu().numpy())

# Calculate accuracy
accuracy_2class = accuracy_score(true_labels_2class, predictions_2class)
print(f'2-Class Classification Accuracy: {accuracy_2class:.4f}')

# If classes are not strings, convert them
target_names = [str(cls) for cls in label_encoder_2class.classes_]

# Classification report
print(classification_report(true_labels_2class, predictions_2class, target_names=target_names))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch in tqdm(test_loader_2class):


  0%|          | 0/709 [00:00<?, ?it/s]

2-Class Classification Accuracy: 0.1354
              precision    recall  f1-score   support

       Agree       0.98      0.11      0.20     21958
    Disagree       0.03      0.91      0.06       712

    accuracy                           0.14     22670
   macro avg       0.50      0.51      0.13     22670
weighted avg       0.95      0.14      0.19     22670

