# Split merged clean label files

In [1]:
import os

import pandas as pd

%matplotlib inline
%config InlineBackend.figure_format='retina'

import math
from collections import defaultdict
from textwrap import wrap
import numpy as np

import pandas as pd
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc

import nltk.data
from nltk.tokenize import sent_tokenize
from nltk.corpus import alpino

# Load NLTK's pre-trained Punkt sentence splitter for Dutch. The resource is
# resolved from the local NLTK data directories, so the 'punkt' data package
# must already be installed (presumably via nltk.download('punkt') -- confirm
# for the NLTK version in use).
sent_detector = nltk.data.load('tokenizers/punkt/dutch.pickle')

In [2]:
# Global plot styling: seaborn whitegrid theme with enlarged fonts.
sns.set(style='whitegrid', palette='muted', font_scale=1.2)
HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#ADFF02", "#8F00FF"]
# Replaces the 'muted' palette requested in sns.set above with the custom colours.
sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))
# Default figure size as (width, height).
rcParams['figure.figsize'] = 12, 8
# Seed NumPy's global random generator so any random operations are repeatable.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

In [3]:
# Input and output locations.
# The defaults point at the original author's local checkout; set the
# HIST_AWARE_DATA_DIR / HIST_AWARE_SAVE_DIR environment variables to run the
# notebook on another machine without editing this cell.
DATA_DIR = os.environ.get(
    "HIST_AWARE_DATA_DIR",
    "/home/leonardovida/dev/hist-aware/notebooks/data/labeled-full/split_labeled/merged",
)
SAVE_DIR = os.environ.get(
    "HIST_AWARE_SAVE_DIR",
    "/home/leonardovida/dev/hist-aware/notebooks/data/labeled-full/split_labeled/merged_split",
)
# Presumably a pre-trained Dutch BERT model identifier; it is not referenced
# by any other cell visible in this notebook.
PRE_TRAINED_MODEL_NAME = 'wietsedv/bert-base-dutch-cased'

### Create Tokenizer

In [4]:
import unicodedata

class BasicTokenizer(object):
    """Whitespace/punctuation tokenizer with optional lower-casing.

    Text is cleaned of invalid characters, split on whitespace, and every
    punctuation character is emitted as its own token.
    """

    def __init__(self, do_lower_case=False):
        """Creates the tokenizer.

        Args:
          do_lower_case: When true, tokens are lower-cased and stripped of
            accents before punctuation splitting.
        """
        self.do_lower_case = do_lower_case

    def tokenize(self, text, return_str=False):
        """Returns the list of tokens found in `text`.

        Args:
          text: String to tokenize.
          return_str: Accepted for interface compatibility; the current
            implementation ignores it and always returns a list.
        """
        cleaned = self._clean_text(text)

        tokens = []
        for word in cleaned.strip().split():
            if self.do_lower_case:
                # Lower-case first, then remove combining accent marks.
                word = self._run_strip_accents(word.lower())
            tokens += self._run_split_on_punc(word)

        return tokens

    def _run_strip_accents(self, text):
        """Removes combining marks (category "Mn") after NFD decomposition."""
        decomposed = unicodedata.normalize("NFD", text)
        return "".join(
            ch for ch in decomposed if unicodedata.category(ch) != "Mn"
        )

    def _run_split_on_punc(self, text):
        """Breaks `text` into runs of non-punctuation and single punctuation marks."""
        pieces = []
        inside_word = False
        for ch in text:
            if _is_punctuation(ch):
                # Each punctuation character becomes its own piece.
                pieces.append(ch)
                inside_word = False
            elif inside_word:
                pieces[-1] += ch
            else:
                pieces.append(ch)
                inside_word = True
        return pieces

    def _clean_text(self, text):
        """Drops NUL, U+FFFD and control characters; maps whitespace to a space."""
        kept = []
        for ch in text:
            if ord(ch) in (0, 0xfffd) or _is_control(ch):
                continue
            kept.append(" " if _is_whitespace(ch) else ch)
        return "".join(kept)


def _is_whitespace(char):
    """Checks whether `chars` is a whitespace character."""
    # \t, \n, and \r are technically contorl characters but we treat them
    # as whitespace since they are generally considered as such.
    if char == " " or char == "\t" or char == "\n" or char == "\r":
        return True
    cat = unicodedata.category(char)
    if cat == "Zs":
        return True
    return False


def _is_control(char):
    """Checks whether `chars` is a control character."""
    # These are technically control characters but we count them as whitespace
    # characters.
    if char == "\t" or char == "\n" or char == "\r":
        return False
    cat = unicodedata.category(char)
    if cat in ("Cc", "Cf"):
        return True
    return False


def _is_punctuation(char):
    """Checks whether `chars` is a punctuation character."""
    cp = ord(char)
    # We treat all non-letter/number ASCII as punctuation.
    # Characters such as "^", "$", and "`" are not in the Unicode
    # Punctuation class but we treat them as punctuation anyways, for
    # consistency.
    if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
            (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
        return True
    cat = unicodedata.category(char)
    return cat.startswith("P")


In [5]:
# Shared tokenizer instance using the default do_lower_case=False, so tokens
# keep their original case and accents.
tokenizer = BasicTokenizer()

### Split text into sentences and explode into one row per sentence

In [7]:
def split_text(r, min_chars=80):
    """Splits a document into tokenized sentences, keeping only long ones.

    Args:
      r: Raw document text. Non-string values (for example NaN coming from a
        missing CSV cell) and empty strings yield an empty list instead of
        raising inside the sentence splitter.
      min_chars: A sentence is kept only when its raw length is strictly
        greater than this number of characters.

    Returns:
      A list of strings, one per kept sentence. Each sentence is tokenized
      and its tokens are re-joined with single spaces.
    """
    if not isinstance(r, str) or not r:
        return []
    return [
        ' '.join(tokenizer.tokenize(sentence))
        for sentence in sent_detector.tokenize(r)
        # The length filter applies to the raw sentence, before tokenization.
        if len(sentence) > min_chars
    ]

def apply_split_text(df):
    """Adds a ``text_split`` column and removes rows that yield no sentences.

    Args:
      df: DataFrame with a ``text`` column. It is modified in place: the
        ``text_split`` column is added and rows are dropped.

    Returns:
      The same DataFrame object, where ``text_split`` holds the list returned
      by ``split_text`` for each row and rows with an empty list are removed.
    """
    df["text_split"] = df["text"].apply(split_text)
    # Filter on the list length directly. The earlier replace([], np.nan) +
    # dropna pair was a no-op (an empty to_replace list matches nothing), so
    # this length check was already the only step that removed rows.
    has_no_sentences = df["text_split"].map(len) == 0
    df.drop(df[has_no_sentences].index, inplace=True)
    return df

In [8]:
DECADES = ["1970s", "1980s", "1990s"]
TYPES = ["coal", "gas", "oil"]

# to_csv does not create missing parent directories, so make sure the output
# directory exists before the first write.
os.makedirs(SAVE_DIR, exist_ok=True)

# Process every decade/type combination: read the merged file, split each text
# into sentences, expand to one row per sentence, and save the result.
for DECADE in DECADES:
    for TYPE in TYPES:
        name = f"{DECADE}_{TYPE}_merged.csv"
        df = pd.read_csv(os.path.join(DATA_DIR, name))
        print(df.shape)  # rows/columns as loaded
        df = apply_split_text(df)
        print(df.shape)  # after adding text_split and dropping empty rows
        # One output row per sentence; the other columns are repeated.
        df = df.explode('text_split')
        print(df.shape)  # after exploding
        df.to_csv(os.path.join(SAVE_DIR, f"{DECADE}_{TYPE}_merged_split.csv"))

(344, 4)
(344, 5)
(2980, 5)
(179, 4)
(179, 5)
(1728, 5)
(500, 4)
(500, 5)
(6008, 5)
(325, 4)
(325, 5)
(2844, 5)
(511, 4)
(511, 5)
(5409, 5)
(484, 4)
(484, 5)
(5905, 5)
(337, 4)
(332, 5)
(1916, 5)
(222, 4)
(221, 5)
(1335, 5)
(193, 4)
(192, 5)
(1218, 5)
