In [None]:
import torch

In [None]:
from transformers import BertModel, BertTokenizer

# Load the pretrained checkpoint named below together with its tokenizer,
# then move the encoder onto the GPU.
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)
model = model.to(device='cuda')


In [None]:
def tokenize(sentences, tokenizer, max_length=128, device=None):
    """Tokenize sentences and return their id and mask tensors on one device.

    Args:
        sentences: Text input forwarded unchanged to ``tokenizer``.
        tokenizer: Callable invoked with ``padding=True``, ``truncation=True``,
            ``return_tensors='pt'`` and ``max_length``; its result must be
            indexable by ``'input_ids'`` and ``'attention_mask'``.
        max_length: Length cap forwarded to the tokenizer.
        device: Target device for both tensors. ``None`` (the default) picks
            ``'cuda'`` when a GPU is available and ``'cpu'`` otherwise, so the
            helper no longer crashes on CPU-only machines.

    Returns:
        A ``(input_ids, attention_mask)`` tuple, both moved to ``device``.
    """
    if device is None:
        # Same result as the former hard-coded 'cuda' whenever a GPU exists.
        device = 'cuda' if torch.cuda.is_available() else 'cpu'

    encoded_dict = tokenizer(
        sentences,
        padding=True,
        truncation=True,
        return_tensors='pt',
        max_length=max_length,
    )
    input_ids = encoded_dict['input_ids'].to(device=device)
    attention_mask = encoded_dict['attention_mask'].to(device=device)
    return input_ids, attention_mask


In [None]:
def get_bert_embeddings(sentences, tokenizer, model, batch_size=None):
    """Return the position-0 ([CLS]) hidden state for every sentence.

    Args:
        sentences: A single string or a sequence of strings.
        tokenizer: Tokenizer forwarded to ``tokenize``.
        model: Encoder whose output exposes ``last_hidden_state``; it is
            switched to eval mode as a side effect.
        batch_size: Number of sentences per forward pass. ``None`` (the
            default) sends everything through in one pass, exactly as
            before; pass a positive integer to bound memory use on large
            inputs instead of slicing the input by hand.

    Returns:
        A tensor with one row per sentence, taken from
        ``last_hidden_state[:, 0, :]``, without gradient tracking.

    Raises:
        ValueError: If ``sentences`` is empty or ``batch_size`` is below 1.
    """
    # A bare string is one sentence, not a sequence of characters to slice.
    if isinstance(sentences, str):
        sentences = [sentences]
    else:
        sentences = list(sentences)
    if not sentences:
        raise ValueError('sentences must contain at least one item')

    step = len(sentences) if batch_size is None else batch_size
    if step < 1:
        raise ValueError('batch_size must be a positive integer or None')

    # Feed the model on whatever device its weights live on.
    model_device = next(model.parameters()).device

    model.eval()
    cls_chunks = []
    with torch.no_grad():
        for start in range(0, len(sentences), step):
            input_ids, attention_mask = tokenize(sentences[start:start + step], tokenizer)
            outputs = model(
                input_ids.to(model_device),
                attention_mask=attention_mask.to(model_device),
            )
            # No .detach() needed: nothing is tracked inside no_grad.
            cls_chunks.append(outputs.last_hidden_state[:, 0, :])
    return torch.cat(cls_chunks, dim=0)


In [None]:
# Mount Google Drive at /content/drive; later cells read the dataset from,
# and save embeddings to, paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
# The TSV is read with header=None, so the column names are assigned
# explicitly right after loading.
train_tsv_path = '/content/drive/MyDrive/CSC413FinalProject/liar_dataset/train.tsv'
train_column_names = [
    "ID", "Label", "Statement", "Subject", "Speaker", "Speaker_Job",
    "Speaker_State", "Party", "barely_true_counts", "false_counts",
    "half_true_counts", "mostly_true_counts", "pants_on_fire_counts", "context",
]
df = pd.read_csv(train_tsv_path, sep='\t', header=None)
df.columns = train_column_names


In [None]:
# Same layout as the training split: no header row in the file, so the
# column names are assigned explicitly after loading.
valid_tsv_path = '/content/drive/MyDrive/CSC413FinalProject/liar_dataset/valid.tsv'
valid_column_names = [
    "ID", "Label", "Statement", "Subject", "Speaker", "Speaker_Job",
    "Speaker_State", "Party", "barely_true_counts", "false_counts",
    "half_true_counts", "mostly_true_counts", "pants_on_fire_counts", "context",
]
df_val = pd.read_csv(valid_tsv_path, sep='\t', header=None)
df_val.columns = valid_column_names


In [None]:
# Display the training DataFrame to check the parsed columns.
df

Unnamed: 0,ID,Label,Statement,Subject,Speaker,Speaker_Job,Speaker_State,Party,barely_true_counts,false_counts,half_true_counts,mostly_true_counts,pants_on_fire_counts,context
0,2635.json,false,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,republican,0.0,1.0,0.0,0.0,0.0,a mailer
1,10540.json,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0.0,0.0,1.0,1.0,0.0,a floor speech.
2,324.json,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,70.0,71.0,160.0,163.0,9.0,Denver
3,1123.json,false,Health care reform legislation is likely to ma...,health-care,blog-posting,,,none,7.0,19.0,3.0,5.0,44.0,a news release
4,9028.json,half-true,The economic turnaround started at the end of ...,"economy,jobs",charlie-crist,,Florida,democrat,15.0,9.0,20.0,19.0,2.0,an interview on CNN
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10235,5473.json,mostly-true,There are a larger number of shark attacks in ...,"animals,elections",aclu-florida,,Florida,none,0.0,1.0,1.0,1.0,0.0,"interview on ""The Colbert Report"""
10236,3408.json,mostly-true,Democrats have now become the party of the [At...,elections,alan-powell,,Georgia,republican,0.0,0.0,0.0,1.0,0.0,an interview
10237,3959.json,half-true,Says an alternative to Social Security that op...,"retirement,social-security",herman-cain,,Georgia,republican,4.0,11.0,5.0,3.0,3.0,a Republican presidential debate
10238,2253.json,false,On lifting the U.S. Cuban embargo and allowing...,"florida,foreign-policy",jeff-greene,,Florida,democrat,3.0,1.0,3.0,0.0,0.0,a televised debate on Miami's WPLG-10 against ...


In [None]:
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# OneHotEncoder expects a 2-D (n_samples, 1) column, so reshape the labels.
categories = np.array(df['Label'].tolist()).reshape(-1, 1)

# The dense-output flag is spelled `sparse_output` in current scikit-learn and
# `sparse` in older releases; passing the wrong one raises TypeError, so try
# the current spelling first and fall back to the old one.
try:
    encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(sparse=False)

# Fit on the training labels; `labels` becomes the dense one-hot matrix.
labels = encoder.fit_transform(categories)

print(labels.shape)

(10240, 6)




In [None]:
# OneHotEncoder expects a 2-D (n_samples, 1) column, so reshape the labels.
categories = np.array(df_val['Label'].tolist()).reshape(-1, 1)

# Reuse the encoder already fitted on the training labels rather than fitting
# a new one here: that guarantees the one-hot column order is identical for
# both splits, and an unseen validation label raises instead of silently
# shifting the columns.
val_labels = encoder.transform(categories)

print(val_labels.shape)

(1284, 6)




In [None]:
# Training statements as a plain Python list for the tokenizer.
inputs = df['Statement'].tolist()

In [None]:
# Validation statements as a plain Python list for the tokenizer.
inputs_val = df_val['Statement'].tolist()

In [None]:
# Produce every cached chunk that the loading cell expects
# ('embeddings1000.pt' ... 'embeddings9000.pt', 'embeddingsfinal.pt') instead
# of only the last one, so the notebook works from a fresh kernel.
# NOTE(review): assumes the numbered files hold consecutive 1000-statement
# slices, with the suffix being the slice's end index — confirm.
chunk_size = 1000
final_start = 9000
chunk_specs = [
    (start, start + chunk_size, str(start + chunk_size))
    for start in range(0, final_start, chunk_size)
]
# The last chunk takes everything from index 9000 to the end.
chunk_specs.append((final_start, None, 'final'))

for start, stop, suffix in chunk_specs:
    embeddings = get_bert_embeddings(inputs[start:stop], tokenizer, model)
    torch.save(embeddings, 'embeddings' + suffix + '.pt')

In [None]:
# Suffixes of the cached chunk files, in dataset order.
names = ['1000', '2000', '3000', '4000', '5000', '6000', '7000', '8000', '9000', 'final']

# Load every chunk and concatenate once along the row dimension. This avoids
# comparing a tensor to None with `==` and re-copying the growing result on
# each iteration.
train_embeddings = torch.cat(
    [torch.load('embeddings' + name + '.pt') for name in names],
    dim=0,
)

In [None]:
# Embed the validation statements with the same tokenizer and encoder.
embeddings = get_bert_embeddings(sentences=inputs_val, tokenizer=tokenizer, model=model)


In [None]:
# Persist the current `embeddings` tensor to Drive as the validation embeddings.
torch.save(
    obj=embeddings,
    f='/content/drive/MyDrive/CSC413FinalProject/val_embeddings.pt',
)

In [None]:
# Check the dimensions of the concatenated training embeddings.
train_embeddings.size()

torch.Size([10240, 768])

In [None]:
# Bind whatever `embeddings` currently holds to the name used when evaluating
# on the validation split.
# NOTE(review): `embeddings` is also assigned by the training-chunk cell, so
# this presumes the validation embedding cell was the last one run — confirm.
val_embeddings = embeddings

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

class LogisticRegression(nn.Module):
    """One-hidden-layer classifier that outputs class probabilities.

    Despite the name, the network is linear -> ReLU -> linear -> softmax.

    Args:
        input_dim: Size of each input feature vector.
        num_classes: Number of output classes.
        hidden: Width of the hidden layer (default 400).
    """

    def __init__(self, input_dim, num_classes, hidden = 400):
        super().__init__()
        # Layers are created in this order so seeded initialisation is unchanged.
        self.linear = nn.Linear(input_dim, hidden)
        self.hidden_linear = nn.Linear(hidden, num_classes)

    def forward(self, x):
        """Map a (batch, input_dim) tensor to per-row class probabilities."""
        hidden_activations = torch.relu(self.linear(x))
        class_scores = self.hidden_linear(hidden_activations)
        # Normalise over dim 1 so each row sums to 1.
        return torch.softmax(class_scores, dim=1)


In [None]:
def training(train_embeddings, labels, model, epochs=10, batch_size=10, lr=0.001, criterion=None):
    """Train ``model`` on (embedding, target) pairs with mini-batch SGD.

    Args:
        train_embeddings: Tensor of input features, one row per sample.
        labels: Array-like of targets with one row per sample (a one-hot
            matrix in this notebook); converted to a float32 tensor.
        model: ``nn.Module`` to optimise in place. It is switched to train
            mode before the first epoch.
        epochs: Number of passes over the data.
        batch_size: Mini-batch size (default 10, the former fixed value).
        lr: SGD learning rate (default 0.001, the former fixed value).
        criterion: Loss callable taking ``(outputs, targets)``. ``None``
            (the default) keeps the original ``nn.BCELoss()``.
            NOTE(review): BCELoss scores each output column independently;
            for single-label multi-class targets a cross-entropy style loss
            is the usual choice — confirm which is intended.

    Returns:
        The same ``model`` object, after training.
    """
    # Run every batch on the device that holds the model's weights, so inputs
    # and targets always agree with the model (the targets were never moved
    # before, and the inputs were forced to the CPU).
    device = next(model.parameters()).device

    targets_all = torch.as_tensor(labels, dtype=torch.float32)
    dataset = TensorDataset(train_embeddings, targets_all)
    data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    if criterion is None:
        criterion = nn.BCELoss()
    optimizer = optim.SGD(model.parameters(), lr=lr)

    model.train()
    for epoch in range(epochs):
        running_loss = 0.0
        samples_seen = 0
        for inputs, targets in data_loader:
            inputs = inputs.to(device)
            targets = targets.to(device)

            # Forward pass
            outputs = model(inputs)
            loss = criterion(outputs, targets)

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Weight each batch by its size so a short last batch does not
            # skew the epoch average.
            running_loss += loss.item() * inputs.size(0)
            samples_seen += inputs.size(0)

        # Report the mean loss over the whole epoch rather than the loss of
        # whichever batch happened to come last; nan when there were no batches.
        epoch_loss = running_loss / samples_seen if samples_seen else float('nan')
        print(f'Epoch [{epoch+1}/{epochs}], Loss: {epoch_loss:.4f}')
    return model


In [None]:
# Size the classifier from the data instead of hard-coding the class count:
# one input per embedding feature, one output per one-hot label column.
num_features = train_embeddings.shape[1]
num_classes = labels.shape[1]
training_model = LogisticRegression(num_features, num_classes)
training_model = training(train_embeddings, labels, training_model)

Epoch [1/10], Loss: 0.4277
Epoch [2/10], Loss: 0.4339
Epoch [3/10], Loss: 0.4482
Epoch [4/10], Loss: 0.4300
Epoch [5/10], Loss: 0.4673
Epoch [6/10], Loss: 0.4532
Epoch [7/10], Loss: 0.4446
Epoch [8/10], Loss: 0.4439
Epoch [9/10], Loss: 0.4294
Epoch [10/10], Loss: 0.4237


In [None]:
# Predicted class index per training sample. Inference only, so skip autograd
# bookkeeping for the forward pass.
with torch.no_grad():
    model_outputs = torch.argmax(training_model(train_embeddings.to('cpu')), dim=1)

In [None]:
# Collapse the one-hot training labels back to class indices.
argmax_labels = torch.Tensor(labels).argmax(dim=1)

In [None]:
# Training accuracy: fraction of predictions that match the labels. Computed
# as a vectorised mean instead of Python's builtin sum, which walks the tensor
# one element at a time.
print((model_outputs == argmax_labels).float().mean())

tensor(0.2409)


In [None]:
# Predicted class index per validation sample. Inference only, so skip
# autograd bookkeeping for the forward pass.
with torch.no_grad():
    val_model_outputs = torch.argmax(training_model(val_embeddings.to('cpu')), dim=1)

In [None]:
# Collapse the one-hot validation labels back to class indices.
argmax_val_labels = torch.Tensor(val_labels).argmax(dim=1)

In [None]:
# Validation accuracy: fraction of predictions that match the labels. Computed
# as a vectorised mean instead of Python's builtin sum, which walks the tensor
# one element at a time.
print((val_model_outputs == argmax_val_labels).float().mean())

tensor(0.2399)


In [None]:
# Persist the concatenated training embeddings to Drive.
torch.save(
    obj=train_embeddings,
    f='/content/drive/MyDrive/CSC413FinalProject/train_embeddings.pt',
)