In [1]:
import sys
from pathlib import Path

In [2]:
# Project root is assumed to sit two levels above the notebook's working directory.
module_path = Path.cwd().parent.parent
# sys.path holds strings, so compare the string form; comparing the Path object
# itself never matches and would insert a duplicate entry on every re-run.
if str(module_path) not in sys.path:
    sys.path.insert(0, str(module_path))

In [3]:
# Display the resolved project root that was put on sys.path.
module_path

PosixPath('/home/marek/Projects/Python/evaluating-student-writing')

In [4]:
import pandas as pd


In [5]:
from src.loader import TextLoader


In [6]:
# Punctuation stripped from both ends of a string before text words and
# discourse text are compared in the predictionstring check.
STRIP_CHARS = ".,?"


In [None]:
# Loader used by the cells below to iterate over texts and to load a single text by id.
loader = TextLoader()


## Create formatted output

In [None]:
# Text ids flagged as broken by the original author; they are skipped entirely.
BROKEN_TEXT_IDS = ("F91D7BB4277C", "354946A1CA46", "EB3D0704BCF0", "B689C28463CB")  # Broken, to delete

# One tuple per discourse: (text id, discourse id, start index, end index, discourse text, discourse type).
formatted_output = []
for text in loader.iterate(verbose=True, purify_text=True, purify_discourses=True):
    if text.id in BROKEN_TEXT_IDS:
        continue

    # Position in the text from which the next discourse is searched for
    # (the end of the previously aligned discourse).
    curr_start = 0
    for discourse in text.discourses:
        # Raises ValueError when the discourse text is not found at or after curr_start.
        new_start_ind = text.index(discourse.text, curr_start)

        # Collect the space-delimited word that begins at the discourse start.
        first_word = ""
        curr_ind = new_start_ind
        while curr_ind < len(text) and (char := text[curr_ind]) != " ":
            first_word += char
            curr_ind += 1

        # Verify if char before discourse is a space
        no_chars_added = 0
        if new_start_ind > 0 and text[new_start_ind - 1] != " ":
            # The discourse starts mid-word: walk back to the previous space
            # (or the beginning of the text) to recover the full word.
            output = first_word
            curr_ind = new_start_ind - 1
            while curr_ind >= 0 and (char := text[curr_ind]) != " ":
                output = char + output
                curr_ind -= 1

            print()
            print(text.id)
            print(f"First word is: `{first_word}` but I think it should be: `{output}`")

            first_word = output

            # Number of characters prepended; the start index moves back by the same amount.
            no_chars_added = new_start_ind - curr_ind - 1
            new_start_ind = curr_ind + 1

        # End index (exclusive) of the discourse in the text. Adding no_chars_added
        # compensates for the start having been moved back above.
        curr_start = new_start_ind + len(discourse.text) + no_chars_added

        # Collect the space-delimited word that ends at the discourse end.
        last_word = ""
        curr_ind = curr_start - 1
        while curr_ind >= 0 and (char := text[curr_ind]) != " ":
            last_word = char + last_word
            curr_ind -= 1

        # Verify if char after discourse is a space or dot
        if curr_start < len(text) and text[curr_start] not in (" ", "."):
            # The discourse ends mid-word: walk forward to the next space or dot
            # (or the end of the text) to recover the full word.
            output = last_word
            curr_ind = curr_start
            while curr_ind < len(text) and (char := text[curr_ind]) not in (" ", "."):
                output += char
                curr_ind += 1

            print()
            print(text.id)
            print(f"Last word is: `{last_word}` but I think it should be: `{output}`")

            last_word = output
            curr_start = curr_ind

        # Replace the boundary words with the full words recovered from the text.
        # NOTE(review): if `discourse.words` returns the underlying list rather than
        # a copy, these assignments mutate the discourse object in place - confirm.
        discourse_words = discourse.words
        if discourse_words[0] != first_word:
            discourse_words[0] = first_word
        if discourse_words[-1] != last_word:
            discourse_words[-1] = last_word

        formatted_output.append(
            (
                text.id,  # Text ID
                discourse.id,  # Discourse ID
                new_start_ind,  # Discourse start index
                curr_start,  # Discourse end index
                " ".join(discourse_words),  # Discourse text
                discourse.type.value,  # Discourse type
            )
        )


In [None]:
# Turn the collected per-discourse tuples into a table with named columns.
df = pd.DataFrame(
    formatted_output,
    columns=[
        "text_id",
        "disc_id",
        "disc_start",
        "disc_end",
        "disc_text",
        "disc_type",
    ],
)


In [None]:
# Preview the first rows of the assembled table.
df.head()

In [None]:
def check_idx_validity(row: pd.Series) -> None:
    """Report a row whose start/end indices do not reproduce its discourse text.

    Loads the text for ``row["text_id"]`` through the notebook-level ``loader``
    and compares the slice ``text[disc_start:disc_end]`` with ``row["disc_text"]``.
    A message is printed on mismatch; nothing is returned.

    Args:
        row: One row of the discourse table. Reads ``text_id``, ``disc_id``,
            ``disc_start``, ``disc_end`` and ``disc_text``; ``row.name`` must be
            convertible to ``int`` because it is shown as the progress counter.
    """
    # Progress counter, redrawn in place on a single line.
    print(f"\r{int(row.name):>6}", end="")
    text = loader.load_text_with_id(row["text_id"], purify_text=True, purify_discourses=True)
    if text[row["disc_start"] : row["disc_end"]] != row["disc_text"]:
        # Start on a fresh line so the message is not glued to the progress counter.
        print(f"\nText with id: {row['text_id']} is invalid for discourse with id: {row['disc_id']}")
        print()

In [None]:
# Run the index check on every row; the check only prints, so the result is discarded.
_ = df.apply(check_idx_validity, axis=1)

#### Save the output — the file is written only if it does not already exist, so an existing file is never overwritten (delete it manually to regenerate it)

In [10]:
# Target archive for the table without prediction strings.
train_v1_no_predstr = module_path / "data" / "train_v1_no_predictionstring.xz"
# Write only when the archive is missing, so an existing file is left untouched.
if not train_v1_no_predstr.exists():
    df.to_csv(train_v1_no_predstr, index=False, compression="xz")


In [None]:
# Check if predictionstring will be valid
# The number of texts is loop-invariant, so compute it once for the progress counter.
n_texts = df["text_id"].nunique()
# Group the rows once instead of filtering the whole frame for every text;
# sort=False keeps the texts in order of first appearance.
for ind, (text_id, text_rows) in enumerate(df.groupby("text_id", sort=False)):
    print(f"\r{ind:>6} / {n_texts}", end="")

    text = loader.load_text_with_id(text_id, purify_text=True, purify_discourses=True)
    for disc_row in text_rows.itertuples():
        disc_words = disc_row.disc_text.split()
        # Index of the discourse's first word = number of words before its start offset.
        no_words_before = len(text[: disc_row.disc_start].split())

        start = no_words_before
        end = start + len(disc_words)

        # Compare ignoring punctuation at both ends (see STRIP_CHARS).
        if " ".join(text.words[start:end]).strip(STRIP_CHARS) != disc_row.disc_text.strip(STRIP_CHARS):
            print(f"\nText with id: {text_id} is invalid for discourse with id: {disc_row.disc_id}")


In [None]:
def create_predictionstring(row: pd.Series):
    """Build the space-separated word indices covered by one discourse.

    The text for ``row.text_id`` is loaded through the notebook-level ``loader``.
    The first index is the number of whitespace-separated words before
    ``row.disc_start``; one consecutive index follows per word of ``row.disc_text``.

    Args:
        row: One row of the discourse table. Reads ``text_id``, ``disc_start``
            and ``disc_text``; ``row.name`` is shown as the progress counter.

    Returns:
        The word indices joined by single spaces (empty string when the
        discourse text contains no words).
    """
    # Progress counter, redrawn in place on a single line.
    print(f"\r{int(row.name):>6}", end="")
    text = loader.load_text_with_id(row.text_id, purify_text=True, purify_discourses=True)

    word_count = len(row.disc_text.split())
    first_index = len(text[: row.disc_start].split())

    return " ".join(str(index) for index in range(first_index, first_index + word_count))

In [None]:
# Compute the prediction string for every row and store it as a new column.
df["predictionstring"] = df.apply(create_predictionstring, axis=1)

In [None]:
# Preview the table, now including the predictionstring column.
df.head()

#### Save the output — the file is written only if it does not already exist, so an existing file is never overwritten (delete it manually to regenerate it)

In [11]:
# Target archive for the table that includes prediction strings.
train_v1_with_predstr = module_path / "data" / "train_v1_with_predictionstring.xz"
# Write only when the archive is missing, so an existing file is left untouched.
if not train_v1_with_predstr.exists():
    df.to_csv(train_v1_with_predstr, index=False, compression="xz")


In [12]:
# Read the saved table without prediction strings back from disk and preview it.
df_no_predstr = pd.read_csv(train_v1_no_predstr, compression="xz")
df_no_predstr.head()

Unnamed: 0,text_id,disc_id,disc_start,disc_end,disc_text,disc_type
0,423A1CA112E2,1622627660524,7,228,Modern humans today are always on their phone....,Lead
1,423A1CA112E2,1622627653021,229,311,They are some really bad consequences when stu...,Position
2,423A1CA112E2,1622627671020,312,399,Some certain areas in the United States ban ph...,Evidence
3,423A1CA112E2,1622627696365,400,754,When people have phones they know about certai...,Evidence
4,423A1CA112E2,1622627759780,755,882,Driving is one of the way how to get around. P...,Claim


In [13]:
# Read the saved table with prediction strings back from disk and preview it.
df_with_predstr = pd.read_csv(train_v1_with_predstr, compression="xz")
df_with_predstr.head()

Unnamed: 0,text_id,disc_id,disc_start,disc_end,disc_text,disc_type,predictionstring
0,423A1CA112E2,1622627660524,7,228,Modern humans today are always on their phone....,Lead,1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...
1,423A1CA112E2,1622627653021,229,311,They are some really bad consequences when stu...,Position,45 46 47 48 49 50 51 52 53 54 55 56 57 58 59
2,423A1CA112E2,1622627671020,312,399,Some certain areas in the United States ban ph...,Evidence,60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
3,423A1CA112E2,1622627696365,400,754,When people have phones they know about certai...,Evidence,76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 9...
4,423A1CA112E2,1622627759780,755,882,Driving is one of the way how to get around. P...,Claim,139 140 141 142 143 144 145 146 147 148 149 15...
