In [1]:
from google.colab import drive
import pandas as pd
import torch
# Mount Google Drive into the Colab filesystem; every input/output path used
# by this notebook lives under /content/drive.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Column names assigned, in order, to the header-less TSV files after loading.
COLUMN_HEADERS = [
    "ID", "Label", "Statement", "Subject",
    "Speaker", "Speaker_Job", "Speaker_State", "Party",
    "barely_true_counts", "false_counts", "half_true_counts",
    "mostly_true_counts", "pants_on_fire_counts",
    "context",
]

In [3]:
def extract_labels(lines):
  """ Split every comma-separated label string into its individual labels.

  Args:
    lines: iterable of raw field values, one per data item.

  Returns:
    A list holding one inner list of label strings per input item, in the
    same order as the input. Items that are not strings produce an empty
    inner list.
  """
  nested_labels = []

  for raw_field in lines:
    # Empty fields show up as NaN (a float) rather than as "", so any
    # non-string value is treated as "this item has no labels".
    if not isinstance(raw_field, str):
      nested_labels.append([])
      continue

    nested_labels.append(raw_field.split(","))

  return nested_labels

In [4]:
def build_label_dict(labels):
  """ Build a label -> id lookup from a nested list of labels.

  Ids are consecutive integers starting at 0, handed out in the order in
  which each distinct label is first encountered (items scanned in order,
  labels within an item scanned left to right).

  Args:
    labels: nested list; labels[i] is the list of labels of item i.

  Returns:
    A dictionary mapping every distinct label to its unique integer id.
  """
  ids_by_label = {}

  for item_labels in labels:
    for name in item_labels:
      # The current dictionary size is always the next unused id, and
      # setdefault leaves labels that were already seen untouched.
      ids_by_label.setdefault(name, len(ids_by_label))

  return ids_by_label

In [5]:
def get_occurrence_matrix(labels, label_dict):
  """ Encode per-item labels as a multi-hot matrix.

  Args:
    labels: nested list; labels[i] is the list of labels of item i.
    label_dict: dictionary mapping each known label to its column id.

  Returns:
    A tensor of shape N * M (N = number of items, M = number of entries in
    label_dict) created with torch.zeros, where entry (i, j) is 1 if item i
    carries the label whose id is j and 0 otherwise. Labels that are missing
    from label_dict are skipped.
  """
  num_items, num_labels = len(labels), len(label_dict)
  matrix = torch.zeros(num_items, num_labels)

  for row, item_labels in enumerate(labels):
    for name in item_labels:
      # A label absent from the dictionary has no column to mark.
      if name not in label_dict:
        continue
      matrix[row, label_dict[name]] = 1

  return matrix

In [6]:
def get_counts_matrix(dataframe):
  """ Extract the truth counts from the given dataframe and return as
  a tensor.

  Args:
    dataframe: pandas DataFrame containing the columns
      "barely_true_counts", "false_counts", "half_true_counts",
      "mostly_true_counts" and "pants_on_fire_counts". Any other columns
      are ignored.

  Returns:
    A tensor of shape N * 5 (N = number of rows), whose columns follow the
    order listed above and whose rows follow the positional row order of
    the dataframe. The dtype is whatever numpy dtype pandas produces for
    the selected columns.

  Raises:
    KeyError: if one of the five count columns is missing.
  """
  count_columns = [
      "barely_true_counts",
      "false_counts",
      "half_true_counts",
      "mostly_true_counts",
      "pants_on_fire_counts",
  ]

  # Convert through numpy instead of passing each Series to torch.tensor.
  # torch.tensor reads a Series element by element via series[i], which
  # pandas resolves by index *label*; on a frame whose index is not exactly
  # 0..N-1 (e.g. after filtering or shuffling) that either raises KeyError
  # or returns the rows in label order. to_numpy() is purely positional.
  return torch.tensor(dataframe[count_columns].to_numpy())

In [7]:
def gen_metadata(in_filepath, out_filepath, subj_dict=None, party_dict=None,
                 quoting=None):
  """ Reads a dataset, generates the full metadata tensor, and saves
  into a file.

  The metadata tensor has one row per data item and is the column-wise
  concatenation of: the Subject occurrence matrix, the Party occurrence
  matrix, and the five truth-count columns.

  Args:
    in_filepath: path of a tab-separated file without a header row; its
      columns are named according to COLUMN_HEADERS after loading.
    out_filepath: path the metadata tensor is written to with torch.save.
    subj_dict: optional label -> id dictionary for the "Subject" column.
      When None, it is built from this dataset.
    party_dict: optional label -> id dictionary for the "Party" column.
      When None, it is built from this dataset.
    quoting: optional quoting mode forwarded to pandas.read_csv (one of the
      csv.QUOTE_* constants). None (the default) keeps pandas' default
      quote handling, i.e. the behaviour this function always had.

  Returns:
    The tuple (subj_dict, party_dict) that was used, whether passed in or
    freshly built, so it can be reused for other splits.

  NOTE(review): with the default quote handling, a '"' inside a field makes
  the parser treat following tabs/newlines as part of that field, which can
  silently merge rows. The row counts printed for the train/test splits look
  lower than the published sizes of this dataset -- confirm, and if so pass
  quoting=csv.QUOTE_NONE here AND in every other loader whose rows must stay
  aligned with this tensor.
  """
  # Only forward `quoting` when the caller asked for it, so the default call
  # is exactly the original read_csv invocation.
  read_kwargs = {} if quoting is None else {"quoting": quoting}
  df = pd.read_csv(in_filepath, delimiter='\t', header=None, **read_kwargs)
  df.columns = COLUMN_HEADERS

  subj_labels = extract_labels(list(df["Subject"]))
  party_labels = extract_labels(list(df["Party"]))

  # Dictionaries are built only when not supplied, so a vocabulary built on
  # one split can be reused to give every split the same column layout.
  if subj_dict is None:
    subj_dict = build_label_dict(subj_labels)

  if party_dict is None:
    party_dict = build_label_dict(party_labels)

  # Labels that are absent from a supplied dictionary are skipped by
  # get_occurrence_matrix, so the width depends only on the dictionaries.
  subj_occ = get_occurrence_matrix(subj_labels, subj_dict)
  party_occ = get_occurrence_matrix(party_labels, party_dict)

  metadata = torch.cat((subj_occ, party_occ, get_counts_matrix(df)), dim=1)
  torch.save(metadata, out_filepath)

  print(f"---\nDataset: {in_filepath}")
  print(f"Metadata shape: {metadata.shape}")
  print(f"Saved to: {out_filepath}")

  return subj_dict, party_dict

In [8]:
# Every input and output lives under one project folder on the mounted Drive:
# the raw TSV splits in a dataset subfolder, the generated tensors next to it.
_PROJECT_DIR = '/content/drive/MyDrive/CSC413FinalProject'
_DATASET_DIR = _PROJECT_DIR + '/liar_dataset'

IN_FILEPATH_TRAIN = _DATASET_DIR + '/train.tsv'
IN_FILEPATH_VALID = _DATASET_DIR + '/valid.tsv'
IN_FILEPATH_TEST = _DATASET_DIR + '/test.tsv'

OUT_FILEPATH_TRAIN = _PROJECT_DIR + '/train_meta.pt'
OUT_FILEPATH_VALID = _PROJECT_DIR + '/valid_meta.pt'
OUT_FILEPATH_TEST = _PROJECT_DIR + '/test_meta.pt'

In [9]:
# Generate for all three datasets.
# The training split goes first and defines the label dictionaries; the other
# splits reuse them so that all three tensors share the same column layout.
subj_dict, party_dict = gen_metadata(IN_FILEPATH_TRAIN, OUT_FILEPATH_TRAIN)

remaining_splits = (
    (IN_FILEPATH_VALID, OUT_FILEPATH_VALID),
    (IN_FILEPATH_TEST, OUT_FILEPATH_TEST),
)
for split_in, split_out in remaining_splits:
  # Assign return value so that colab doesn't print it
  _ = gen_metadata(split_in, split_out, subj_dict, party_dict)

---
Dataset: /content/drive/MyDrive/CSC413FinalProject/liar_dataset/train.tsv
Metadata shape: torch.Size([10240, 170])
Saved to: /content/drive/MyDrive/CSC413FinalProject/train_meta.pt
---
Dataset: /content/drive/MyDrive/CSC413FinalProject/liar_dataset/valid.tsv
Metadata shape: torch.Size([1284, 170])
Saved to: /content/drive/MyDrive/CSC413FinalProject/valid_meta.pt
---
Dataset: /content/drive/MyDrive/CSC413FinalProject/liar_dataset/test.tsv
Metadata shape: torch.Size([1267, 170])
Saved to: /content/drive/MyDrive/CSC413FinalProject/test_meta.pt
