In [None]:
from google.colab import drive
# Mount Google Drive; every data path used below lives under this mount point.
MOUNT_POINT = '/content/drive'
drive.mount(MOUNT_POINT)

Mounted at /content/drive


In [None]:
# Get Training labels
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import numpy as np
# Read the training split with header=None, so column names are assigned by hand below.
df = pd.read_csv('/content/drive/MyDrive/CSC413FinalProject/liar_dataset/train.tsv', delimiter='\t', header=None)
df.columns = ["ID", "Label", "Statement", "Subject", "Speaker", "Speaker_Job", "Speaker_State", "Party", "barely_true_counts", "false_counts", "half_true_counts", "mostly_true_counts", "pants_on_fire_counts", "context"]

# Label strings as a single-column 2-D array, the layout OneHotEncoder expects.
categories = np.array(df['Label'].tolist()).reshape(-1, 1)

# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# keyword, so try the current spelling first and fall back for older installs.
try:
    encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(sparse=False)

# Dense one-hot matrix with one row per statement and one column per distinct label.
labels = encoder.fit_transform(categories)

print(labels.shape)

(10240, 6)




In [None]:
# Get embeddings and metadata
import torch
# Use the GPU when one is attached, otherwise fall back to CPU so this cell
# does not fail on a CPU-only runtime.
DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
PROJECT_DIR = '/content/drive/MyDrive/CSC413FinalProject'

# map_location places each loaded tensor directly on DEVICE, which also lets
# tensors that were saved from a GPU session load on a machine without one.
train_emb = torch.load(f'{PROJECT_DIR}/train_embeddings.pt', map_location=DEVICE)
val_emb = torch.load(f'{PROJECT_DIR}/val_embeddings.pt', map_location=DEVICE)

train_meta = torch.load(f'{PROJECT_DIR}/train_meta.pt', map_location=DEVICE)
val_meta = torch.load(f'{PROJECT_DIR}/valid_meta.pt', map_location=DEVICE)

# Flat feature matrices: embedding columns followed by metadata columns.
train_input = torch.cat((train_emb, train_meta), dim=1)
val_input = torch.cat((val_emb, val_meta), dim=1)

# Insert a length-1 axis at position 1 so each embedding becomes a one-step
# sequence for the batch_first RNN defined further down.
train_emb = train_emb.unsqueeze(1)

print(train_input.shape)
print(val_input.shape)

torch.Size([10240, 938])
torch.Size([1284, 938])


In [None]:
# Get validation Labels
# Read the validation split with header=None, so column names are assigned by hand below.
df_val = pd.read_csv('/content/drive/MyDrive/CSC413FinalProject/liar_dataset/valid.tsv', delimiter='\t', header=None)
df_val.columns = ["ID", "Label", "Statement", "Subject", "Speaker", "Speaker_Job", "Speaker_State", "Party", "barely_true_counts", "false_counts", "half_true_counts", "mostly_true_counts", "pants_on_fire_counts", "context"]

# Label strings as a single-column 2-D array, the layout OneHotEncoder expects.
categories = np.array(df_val['Label'].tolist()).reshape(-1, 1)

# Reuse the encoder that was fitted on the training labels instead of fitting a
# fresh one here: that guarantees the validation columns have the same order
# and width as the training one-hot matrix, and a label never seen in training
# raises an error rather than silently shifting columns.
val_labels = encoder.transform(categories)

print(val_labels.shape)

(1284, 6)




In [None]:
# Zero out NaN entries of both concatenated input tensors, in place.
nan_mask = train_input.isnan()
train_input.masked_fill_(nan_mask, 0)

nan_mask_2 = val_input.isnan()
val_input.masked_fill_(nan_mask_2, 0)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim



# Define RNN model
class RNNModel(nn.Module):
    """A single `nn.RNN` layer followed by a linear read-out of the last time step.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of the recurrent hidden state.
        output_size: Number of values produced per sequence.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Map a batch-first input `x` (batch on axis 0) to one output row per sequence."""
        # Explicit all-zero starting state, created on the same device as the input.
        initial_state = torch.zeros(1, x.size(0), self.hidden_size, device=x.device)

        per_step_states, _ = self.rnn(x, initial_state)

        # Only the hidden state at the final time step feeds the linear layer.
        final_step = per_step_states[:, -1, :]
        return self.fc(final_step)

# Hyper-parameters
input_size = 768  # Size of BERT embeddings
hidden_size = 128  # Hidden size of RNN
output_size = 6  # Size of output labels
learning_rate = 0.001
num_epochs = 10
batch_size = 64

# Model, loss and optimizer
rnnmodel = RNNModel(input_size, hidden_size, output_size)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(rnnmodel.parameters(), lr=learning_rate)

# Turn the one-hot label rows into class indices once, up front. For one-hot
# rows this gives the same cross-entropy value as passing the rows themselves,
# without rebuilding a float64 tensor from numpy on every mini-batch.
rnn_targets = torch.as_tensor(labels).argmax(dim=1)

# Training loop: sequential (unshuffled) mini-batches over the training embeddings.
for epoch in range(num_epochs):
    for i in range(0, len(train_emb), batch_size):
        # Get mini-batch data; the model lives on the CPU, so move the slice there.
        inputs = train_emb[i:i+batch_size].to('cpu')
        targets = rnn_targets[i:i+batch_size]

        # Forward pass
        outputs = rnnmodel(inputs)

        # Compute loss
        loss = criterion(outputs, targets)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Log only the first mini-batch of each epoch to keep the output short.
        if (i == 0):
            print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'
                  .format(epoch+1, num_epochs, i+1, len(train_emb)//batch_size, loss.item()))


In [None]:
# BELOW IS TRAINING WITH RNN OUTPUTS + METADATA:

# Run the trained RNN (`rnnmodel`) over every training embedding and append the
# metadata columns to its outputs. no_grad() keeps autograd from recording the
# forward pass, so the result is a plain tensor with no graph attached.
with torch.no_grad():
    rnn_scores = rnnmodel(train_emb.to('cpu'))
train_full_input = torch.cat((rnn_scores, train_meta.to('cpu')), dim=1)
print(train_full_input.shape)

torch.Size([10240, 176])


In [None]:
# Zero out NaN entries of the combined RNN-output + metadata features, in place.
nan_mask = train_full_input.isnan()
train_full_input.masked_fill_(nan_mask, 0)

In [None]:
# Define model
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

class LogisticRegression(nn.Module):
    """Classifier built as Linear -> ReLU -> Linear -> softmax.

    Despite the name, the network has one hidden layer and returns
    per-class probabilities (each row sums to 1), not raw logits.

    Args:
        input_dim: Number of input features per example.
        num_classes: Number of output classes.
        hidden: Width of the hidden layer.
    """

    def __init__(self, input_dim, num_classes, hidden = 500):
        super().__init__()
        self.linear = nn.Linear(input_dim, hidden)
        self.hidden_linear = nn.Linear(hidden, num_classes)

    def forward(self, x):
        """Return class probabilities for a 2-D batch `x` (softmax is taken over dim 1)."""
        hidden_activations = torch.relu(self.linear(x))
        class_scores = self.hidden_linear(hidden_activations)
        return torch.softmax(class_scores, dim=1)

In [None]:
def training(train_embeddings, labels, model, epochs=10):
    """Fit `model` with mini-batch SGD, using binary cross-entropy against `labels`.

    Args:
        train_embeddings: Tensor of input features, one row per example.
        labels: Per-example target rows (array-like or tensor); converted to a
            float tensor. BCELoss requires the model's outputs to lie in [0, 1].
        model: The module to optimise; it is updated in place.
        epochs: Number of full passes over the data.

    Returns:
        The same `model` object that was passed in, after training.
    """
    dataset = TensorDataset(train_embeddings, torch.Tensor(labels).float())
    data_loader = DataLoader(dataset, batch_size=10, shuffle=True)

    # Loss and optimizer
    criterion = nn.BCELoss()
    optimizer = optim.SGD(model.parameters(), lr=0.001)

    model.train()
    for epoch in range(epochs):
        running_loss = 0.0
        examples_seen = 0
        for inputs, targets in data_loader:
            # Forward pass
            outputs = model(inputs.to(device='cpu'))
            loss = criterion(outputs, targets)

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Weight each batch by its size so a short final batch does not skew the mean.
            running_loss += loss.item() * inputs.size(0)
            examples_seen += inputs.size(0)

        # Report the mean loss over the epoch rather than whichever mini-batch
        # happened to come last; NaN marks an epoch that saw no data at all.
        epoch_loss = running_loss / examples_seen if examples_seen else float('nan')
        print(f'Epoch [{epoch+1}/{epochs}], Loss: {epoch_loss:.4f}')
    return model

In [None]:
# Cast the combined features to float32, then build the classifier sized to
# their width and train it against the one-hot training labels.
train_full_input = train_full_input.float()
training_model = training(
    train_full_input,
    labels,
    LogisticRegression(input_dim=train_full_input.shape[1], num_classes=6),
)

Epoch [1/10], Loss: 0.4579
Epoch [2/10], Loss: 0.4363
Epoch [3/10], Loss: 0.4983
Epoch [4/10], Loss: 0.4626
Epoch [5/10], Loss: 0.4170
Epoch [6/10], Loss: 0.4471
Epoch [7/10], Loss: 0.4560
Epoch [8/10], Loss: 0.4277
Epoch [9/10], Loss: 0.4565
Epoch [10/10], Loss: 0.4196


In [None]:
#Train Accuracy
# Inference only, so skip autograd bookkeeping.
with torch.no_grad():
    model_outputs = torch.argmax(training_model(train_full_input.to('cpu')), dim=1)
argmax_labels = torch.argmax(torch.as_tensor(labels), dim=1)
# Vectorised mean of the match mask; Python's built-in sum() would iterate the
# tensor one element at a time.
print((model_outputs == argmax_labels).float().mean())

tensor(0.3757)


In [None]:
# Accuracy of just the RNN
# Inference only, so skip autograd bookkeeping.
with torch.no_grad():
    model_outputs = torch.argmax(rnnmodel(train_emb.to('cpu')), dim=1)
argmax_labels = torch.argmax(torch.as_tensor(labels), dim=1)
# Vectorised mean of the match mask; Python's built-in sum() would iterate the
# tensor one element at a time.
print((model_outputs == argmax_labels).float().mean())

tensor(0.3890)
