In [1]:
import sys
from pathlib import Path

In [2]:
# Put the directory two levels above the working directory on the import path
# (presumably so that `src.loader`, imported below, resolves — confirm repo layout).
module_path = Path.cwd().parent.parent
# `sys.path` holds strings, so the membership test must compare the string form;
# comparing the Path object never matches and would insert a duplicate on every re-run.
if str(module_path) not in sys.path:
    sys.path.insert(0, str(module_path))

In [None]:
# Show the resolved path so it can be checked by eye.
module_path

In [None]:
import pandas as pd
from flair.data import Corpus
from flair.datasets import ColumnCorpus
from sklearn.model_selection import train_test_split

In [None]:
from src.loader import TextLoader

In [None]:
# Create the output directory relative to the working directory; a no-op when it
# already exists. Pure Python instead of a shell call so it works on any platform.
Path("data").mkdir(parents=True, exist_ok=True)

In [None]:
# Build the project's text loader with its default arguments; every later cell reads
# texts and discourses through this object.
loader = TextLoader()

In [None]:
# Display the loader's `df` attribute.
loader.df

In [None]:
# Re-align every discourse with the text it belongs to and collect one record per discourse:
# (text id, discourse id, start index, end index, discourse text, discourse type).
# When a discourse boundary falls inside a word (no space before it / no space or dot after
# it), the boundary is widened to the whole word and the change is printed for inspection.
# NOTE(review): `text` is a project object; this loop assumes it supports `index`, `len` and
# integer indexing the way `str` does — confirm against `src.loader`.
formatted_output = []
for text in loader.iterate(verbose=True, purify_text=True, purify_discourses=True):
    if text.id in ("F91D7BB4277C", "354946A1CA46", "EB3D0704BCF0", "B689C28463CB"):  # Broken, to delete
        continue

    # Search position: each discourse is looked up at or after the end of the previous one.
    curr_start = 0
    # NOTE(review): `is_count_set` is only ever assigned in this cell, never read.
    is_count_set = False
    for discourse in text.discourses:
        # Raises if the discourse text cannot be found from `curr_start` on
        # (assuming `text.index` behaves like `str.index` — TODO confirm).
        new_start_ind = text.index(discourse.text, curr_start)

        # Read forward from the match start up to the next space (or end of text):
        # the first word of the discourse as it was matched.
        first_word = ""
        curr_ind = new_start_ind
        while curr_ind < len(text) and (char := text[curr_ind]) != " ":
            first_word += char
            curr_ind += 1

        # Verify if char before discourse is a space
        no_chars_added = 0
        if new_start_ind > 0 and text[new_start_ind - 1] != " ":
            # The match starts mid-word: walk backwards to the previous space (or the
            # beginning of the text) and prepend those characters to the first word.
            output = first_word
            curr_ind = new_start_ind - 1
            while curr_ind >= 0 and (char := text[curr_ind]) != " ":
                output = char + output
                curr_ind -= 1

            print()
            print(text.id)
            print(f"First word is: `{first_word}` but I think it should be: `{output}`")

            first_word = output
            if not is_count_set:
                is_count_set = True

            # `curr_ind` now sits on the space before the word (or at -1), so this is the
            # number of characters that were prepended; the start moves back by that much.
            no_chars_added = new_start_ind - curr_ind - 1
            new_start_ind = curr_ind + 1

        # Exclusive end of the matched discourse: start + length, with `no_chars_added`
        # compensating for a start index that was moved back above.
        curr_start = new_start_ind + len(discourse.text) + no_chars_added

        # Read backwards from the last matched character to the previous space:
        # the last word of the discourse as it was matched.
        last_word = ""
        curr_ind = curr_start - 1
        while curr_ind >= 0 and (char := text[curr_ind]) != " ":
            last_word = char + last_word
            curr_ind -= 1

        # Verify if char after discourse is a space or dot
        # NOTE(review): this reset is not read again before the next iteration resets it.
        no_chars_added = 0
        if curr_start < len(text) and text[curr_start] not in (" ", "."):
            # The match ends mid-word: extend forward until a space, a dot or end of text.
            output = last_word
            curr_ind = curr_start
            while curr_ind < len(text) and (char := text[curr_ind]) not in (" ", "."):
                output += char
                curr_ind += 1

            print()
            print(text.id)
            print(f"Last word is: `{last_word}` but I think it should be: `{output}`")

            last_word = output
            if not is_count_set:
                is_count_set = True

            # The end index (and the next search position) moves to the end of the widened word.
            curr_start = curr_ind

        # Replace the first and last entries of the discourse's word list with the widened
        # words. Assumes `discourse.words` is a non-empty mutable list — TODO confirm; for a
        # one-element list both indices address the same entry, so `last_word` wins.
        discourse_words = discourse.words
        if discourse_words[0] != first_word:
            discourse_words[0] = first_word
        if discourse_words[-1] != last_word:
            discourse_words[-1] = last_word

        formatted_output.append(
            (
                text.id,  # Text ID
                discourse.id,  # Discourse ID
                new_start_ind,  # Discourse start index
                curr_start,  # Discourse end index
                " ".join(discourse_words),  # Discourse text
                discourse.type.value,  # Discourse type
            )
        )


In [None]:
# Turn the collected per-discourse tuples into a table, one row per discourse.
df = pd.DataFrame(
    formatted_output,
    columns=[
        "text_id",
        "disc_id",
        "disc_start",
        "disc_end",
        "disc_text",
        "disc_type",
    ],
)

In [None]:
# Display the freshly assembled table.
df

In [None]:
# Continue from the on-disk copy of the table. The file is written only when it does not
# exist yet, so a fresh run no longer fails on the read and an existing file is never
# overwritten; the table is then always loaded back from disk, as before.
no_predictionstring_path = module_path / "data" / "train_v1_no_predictionstring.xz"
if not no_predictionstring_path.exists():
    no_predictionstring_path.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(no_predictionstring_path, index=False, compression="xz")
df = pd.read_csv(no_predictionstring_path, compression="xz")

In [None]:
# Display the table as read back from disk.
df

In [None]:
def check_idx_validity(row: pd.Series):
    """Report a row whose character span does not reproduce its discourse text.

    Prints the row label as an in-place progress counter, loads the row's text through
    the global `loader`, and compares the slice `[disc_start:disc_end]` of that text
    with `disc_text`. A mismatch is reported on stdout; nothing is returned.
    """
    print(f"\r{int(row.name):>6}", end="")
    text = loader.load_text_with_id(row["text_id"], purify_text=True, purify_discourses=True)
    span_from_text = text[row["disc_start"] : row["disc_end"]]
    if span_from_text == row["disc_text"]:
        return
    print(f"Text with id: {row['text_id']} is invalid for discourse with id: {row['disc_id']}")
    print()

In [None]:
# Run the check on every row; the result is bound to `_` because only the printed
# messages are of interest.
_ = df.apply(check_idx_validity, axis=1)

In [None]:
# Check if predictionstring will be valid: for every discourse, the words addressed by its
# would-be word indices must equal the discourse text (ignoring STRIP_CHARS at both ends).
# NOTE(review): STRIP_CHARS is neither defined nor imported in the visible notebook — it
# has to exist in the kernel before this cell runs; confirm where it should come from.
unique_text_ids = pd.unique(df["text_id"])  # computed once instead of on every iteration
no_texts = len(unique_text_ids)
for ind, text_id in enumerate(unique_text_ids):
    print(f"\r{ind:>6} / {no_texts}", end="")

    text = loader.load_text_with_id(text_id, purify_text=True, purify_discourses=True)
    for disc_row in df[df["text_id"] == text_id].itertuples():
        disc_words = disc_row.disc_text.split()
        # Number of whitespace-separated words before the discourse = index of its first word.
        no_words_before = len(text[: disc_row.disc_start].split())

        start = no_words_before
        end = start + len(disc_words)

        if " ".join(text.words[start:end]).strip(STRIP_CHARS) != disc_row.disc_text.strip(STRIP_CHARS):
            print(f"\nText with id: {text_id} is invalid for discourse with id: {disc_row.disc_id}")


In [None]:
def create_predictionstring(row: pd.Series):
    """Return the space-separated word indices covered by the row's discourse.

    The first index is the number of whitespace-separated words that precede
    `disc_start` in the text loaded through the global `loader`; one consecutive
    index follows for each whitespace-separated word of `disc_text`.
    The row label is printed as an in-place progress counter.
    """
    print(f"\r{int(row.name):>6}", end="")
    text = loader.load_text_with_id(row.text_id, purify_text=True, purify_discourses=True)

    word_count = len(row.disc_text.split())
    first_word_idx = len(text[: row.disc_start].split())
    word_indices = range(first_word_idx, first_word_idx + word_count)

    return " ".join(str(idx) for idx in word_indices)

In [None]:
# Add a `predictionstring` column, computed row by row.
df["predictionstring"] = df.apply(create_predictionstring, axis=1)

In [None]:
# Display the table with the new `predictionstring` column.
df

In [None]:
# Save the table, including `predictionstring`, as xz-compressed CSV without the index.
df.to_csv(module_path / "data" / "train_v1_with_predictionstring.xz", index=False, compression="xz")


In [None]:
# Read the saved file back under a separate name (`dff`), leaving `df` untouched.
dff = pd.read_csv(module_path / "data" / "train_v1_with_predictionstring.xz", compression="xz")

In [None]:
# Load one text by id for manual inspection in the following cells.
text = loader.load_text_with_id("423A1CA112E2", purify_text=True, purify_discourses=True)


In [None]:
# Print each discourse together with whether its text occurs verbatim in the full text.
for disc in text.discourses:
    print(disc, disc.text in text.text)

In [None]:
# Print the text object's string form.
print(text)

In [None]:
# Print the repr of the raw text so whitespace and escape sequences become visible.
print(repr(text.text))

In [None]:
# Slice the raw text with the second discourse's character offsets (`ind_start`, `ind_end`).
st, en = text.discourses[1].ind_start, text.discourses[1].ind_end
print(text.text[st:en])

In [None]:
# Slice the word list with the first and last entries of the second discourse's
# predictionstring; the last index is inclusive, hence the `+ 1`.
p_st, p_en = text.discourses[1].predictionstring[0], text.discourses[1].predictionstring[-1]
print(text.words[p_st : p_en + 1])


In [None]:
# text = loader.load_text_with_id("E881FAAEC690")
# text = loader.load_text_with_id("6FD9A4641AD7")
# Run through every text without doing anything per item; afterwards `text` is bound to
# the last text yielded, which the following cells inspect.
for text in loader.iterate(verbose=True):
    ...

In [None]:
# Show the id of the text currently bound to `text`.
text.id

In [None]:
# Print every discourse of the current text.
for disc in text.discourses:
    print(disc)


In [None]:
# Print the current text's string form.
print(text)

In [None]:
def create_doc(
    doc_type: str, offset: int = 0, limit: int = 0, shuffle: bool = True, seed: int = 8888
) -> None:
    """Write a word-per-line tagging file for a slice of the corpus.

    Texts come from the global `loader` via `loader.iterate(offset, limit, shuffle, seed)`.
    For every text, each discourse contributes a (first, last) word-index pair taken from
    its `predictionstring`. Words are emitted as "<word> <tag>" lines where the tag is
    `DS` on the first word of a discourse, `DE` on its last word and `O` otherwise; a
    one-word discourse is tagged `DS` only because the start check comes first.
    Words after the last discourse of a text are not emitted. An empty line follows every
    word containing a ".", and texts are separated by a "<DOC>" line.

    Texts without any discourse are skipped (they used to abort the run with IndexError).

    Args:
        doc_type: Suffix of the output file, `data/NER_{doc_type}.txt` (relative to the
            working directory; the `data` directory must already exist).
        offset, limit, shuffle, seed: Forwarded unchanged to `loader.iterate`;
            `limit` is also shown in the progress counter.

    Raises:
        Exception: If more `DE` than `DS` tags have been emitted at any point.
    """
    output = []
    DS_count = 0
    DE_count = 0
    for text_no, text in enumerate(
        loader.iterate(offset=offset, limit=limit, shuffle=shuffle, seed=seed)
    ):
        print(f"\r{text_no + 1:3} / {limit}", end="")
        # Build the full list first (as before), then consume it front to back.
        spans = iter(
            [(disc.predictionstring[0], disc.predictionstring[-1]) for disc in text.discourses]
        )

        first_span = next(spans, None)
        if first_span is None:
            # Nothing to label in this text.
            continue

        curr_start, curr_end = first_span
        for word_ind, word in enumerate(text.words):
            if word_ind > curr_end:
                # Past the current discourse: move to the next one, or stop with this text.
                next_span = next(spans, None)
                if next_span is None:
                    break

                curr_start, curr_end = next_span

            if word_ind == curr_start:
                output.append(f"{word} DS\n")
                DS_count += 1
            elif word_ind == curr_end:
                output.append(f"{word} DE\n")
                DE_count += 1
            else:
                output.append(f"{word} O\n")

            # Sanity check on the running totals across all texts processed so far.
            if DE_count > DS_count:
                raise Exception(f"Wut for {word_ind}")

            # A word containing a dot closes the current sentence block.
            if "." in word:
                output.append("\n")

        output.append("<DOC>\n")

    output = output[:-1]  # remove last <DOC>

    # Explicit encoding so the result does not depend on the platform default.
    with open(f"data/NER_{doc_type}.txt", "w", encoding="utf-8") as f:
        f.writelines(output)


In [None]:
# Inspect one discourse of one text: print its own text, the character-offset slice of the
# full text, and the word slice addressed by its predictionstring.
text = loader.load_text_with_id("DBF7EB6A9E02")
# The single-target unpacking raises unless exactly one discourse has this id.
disc, = [disc for disc in text.discourses if disc.id == 1622489430075]
print(disc.text)
print(text[disc.ind_start:disc.ind_end].split())
pred_start, pred_end = int(disc.predictionstring[0]), int(disc.predictionstring[-1])
print(text.words[pred_start:pred_end+1]) # TODO: Problem with predictionstring

In [None]:
# For every discourse of one text, derive its word span from the character offsets and
# compare it word by word with the discourse's own text.
text = loader.load_text_with_id("DBF7EB6A9E02")
for disc in text.discourses:
    print(disc.id)
    char_start = disc.ind_start
    char_end = disc.ind_end
    # Words before the discourse give the index of its first word.
    word_start = len(text[:char_start].split())
    word_end = word_start + len(text[char_start:char_end].split())
    word_end = min(word_end, len(text.split()))
    print((word_start, word_end))

    d_text = disc.text.split()
    e_text = text.words[word_start:word_end]
    if d_text[0] != e_text[0]:
        # Drop by position. `list.remove(value)` deletes the first element equal to the
        # value, which is not the last element when the last word also occurs earlier.
        del d_text[0]
        del e_text[-1]

    for ind, d in enumerate(d_text):
        print(f"{d} == {e_text[ind]} --> {d == e_text[ind]}")

In [None]:
# (first, last) predictionstring entry of every discourse of the current text, displayed
# for inspection and consumed by the next cell.
current_ends = [
    (disc.predictionstring[0], disc.predictionstring[-1]) for disc in text.discourses
]
current_ends


In [None]:
# Print the tag each word of the current text would get: DS on the first word of a
# discourse, DE on its last word, O otherwise. Stops after the last discourse.
# Consumes `current_ends`, so the previous cell must be re-run before running this again;
# `pop(0)` raises IndexError when the list is empty.
curr_start, curr_end = current_ends.pop(0)
for word_ind, word in enumerate(text.words):
    if word_ind > curr_end:
        if not current_ends:
            break

        curr_start, curr_end = current_ends.pop(0)

    # The start check comes first, so a one-word discourse is printed as DS.
    if word_ind == curr_start:
        print(f"{word} ({word_ind}) DS")
    elif word_ind == curr_end:
        print(f"{word} ({word_ind}) DE")
    else:
        print(f"{word} ({word_ind}) O")


In [None]:
# Size of the loader; this is the population that the split sizes below are derived from.
len(loader)


In [None]:
# Target proportions: train 0.8, dev 0.15, test 0.05.
# Only the sizes are needed, so positions 0..len(loader)-1 are split twice: 20% is held
# out first, and a quarter of that hold-out (5% overall) becomes the test part.
train_part, holdout_part = train_test_split(range(len(loader)), test_size=0.2, random_state=8888)
dev_part, test_part = train_test_split(holdout_part, test_size=0.25, random_state=8888)

train_size = len(train_part)
test_size = len(test_part)
dev_size = len(dev_part)

print(f"Train size: {train_size}, Dev size: {dev_size}, Test size: {test_size}")


In [None]:
# Write the training file from the first `train_size` texts.
# NOTE(review): this call passes shuffle=False while the dev and test calls rely on the
# default shuffle=True. Whether offset/limit are applied before or after shuffling is
# decided inside `loader.iterate`, which is not visible here — confirm that the three
# slices cannot overlap.
create_doc("train", offset=0, limit=train_size, shuffle=False, seed=8888)

In [None]:
# Write the dev file from the `dev_size` texts that follow the training slice
# (default shuffle and seed).
create_doc("dev", offset=train_size, limit=dev_size)

In [None]:
# Write the test file from the `test_size` texts that follow the dev slice
# (default shuffle and seed).
create_doc("test", offset=train_size + dev_size, limit=test_size)