In [1]:
import pandas as pd
from features.preprocessing_utils import (
    Datapoint,
    normalize_and_clean,
    normalize_labels,
    normalize_and_clean_counts,
    normalize_and_clean_state_info,
    normalize_and_clean_party_affiliations,
    normalize_and_clean_speaker_title,
)
import csv
from typing import List, Tuple, Dict
import logging

In [2]:
# Locations of the raw train / validation / test TSV splits, relative to the
# directory this notebook is run from.
TRAIN_PATH, VAL_PATH, TEST_PATH = (
    "../../data/raw/train.tsv",
    "../../data/raw/val.tsv",
    "../../data/raw/test.tsv",
)

In [3]:
# Column names applied to every raw split. They are passed explicitly via
# `names=`, so pandas treats every line of the file (including the first) as data.
# Keeping the schema in one place prevents the three splits from drifting apart.
RAW_COLUMN_NAMES = [
    "id",
    "statement_json",
    "label",
    "statement",
    "subject",
    "speaker",
    "speaker_title",
    "state_info",
    "party_affiliation",
    "barely_true_count",
    "false_count",
    "half_true_count",
    "mostly_true_count",
    "pants_fire_count",
    "context",
    "justification",
]


def load_raw_split(path: str) -> pd.DataFrame:
    """Load one tab-separated split into a DataFrame with the shared column names.

    Args:
        path: Path to the TSV file for the split.

    Returns:
        A DataFrame whose columns are ``RAW_COLUMN_NAMES``, in that order.
    """
    return pd.read_csv(path, sep="\t", names=RAW_COLUMN_NAMES)


train_df = load_raw_split(TRAIN_PATH)
val_df = load_raw_split(VAL_PATH)
test_df = load_raw_split(TEST_PATH)

In [6]:
# Configure root logging for the notebook: records at DEBUG level and above,
# rendered as "<level> - <timestamp> - <file> - <message>".
logging.basicConfig(
    level=logging.DEBUG,
    format="%(levelname)s - %(asctime)s - %(filename)s - %(message)s",
)
# Logger shared by the cells that follow.
LOGGER = logging.getLogger(__name__)


def read_datapoints(datapath: str) -> List[Dict]:
    """Read a tab-separated file and return only its fully populated rows.

    Every line is parsed into a dict keyed by the fixed column names listed
    below; because the names are supplied explicitly, the first line of the
    file is treated as data. A row is dropped when any of those columns is
    missing or is an empty string.

    Args:
        datapath: Path to the TSV file to read.

    Returns:
        One dict per retained row, in file order, with the raw string values
        from the file.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file content is not valid UTF-8.
    """
    fieldnames = [
        "id",
        "statement_json",
        "label",
        "statement",
        "subject",
        "speaker",
        "speaker_title",
        "state_info",
        "party_affiliation",
        "barely_true_count",
        "false_count",
        "half_true_count",
        "mostly_true_count",
        "pants_fire_count",
        "context",
        "justification",
    ]
    LOGGER.info("Reading data from %s", datapath)
    # encoding="utf-8" makes decoding independent of the platform locale (and
    # matches the pandas default used for the same files); newline="" lets the
    # csv module handle line endings itself, as its documentation requires.
    with open(datapath, encoding="utf-8", newline="") as f:
        reader = csv.DictReader(f, delimiter="\t", fieldnames=fieldnames)
        # DictReader fills columns absent from a short row with None, so the
        # truthiness check rejects both missing and empty-string values.
        return [row for row in reader if all(row[name] for name in fieldnames)]

In [7]:
# Load the training split as a list of row dicts; read_datapoints keeps only
# rows in which every expected column is non-empty, so the count logged below
# is the number of complete rows, not the number of lines in the file.
train_datapoints = read_datapoints(TRAIN_PATH)
LOGGER.info(f"Read {len(train_datapoints)} training datapoints")

INFO - 2024-10-26 15:32:26,804 - 1187241512.py - Reading data from ../../data/raw/train.tsv
INFO - 2024-10-26 15:32:26,880 - 3927029435.py - Read 6669 training datapoints


In [8]:
LOGGER.info("Normalizing and cleaning data...")
# Drop any datapoints containing None or NaN
# NOTE(review): normalize_and_clean is imported from features.preprocessing_utils
# and its body is not visible here, so the line above is the original author's
# description of it — confirm against the helper.
# NOTE(review): this rebinds train_datapoints to the cleaned result, so
# re-running this cell feeds already-cleaned data back into normalize_and_clean;
# re-run the loading cell first unless the helper is known to be idempotent.

train_datapoints = normalize_and_clean(train_datapoints)

INFO - 2024-10-26 15:32:28,925 - 1542445790.py - Normalizing and cleaning data...
