# Imports

In [1]:
# Packages
from pandas import read_csv, DataFrame, concat
from scipy.stats import pearsonr
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from numpy import array, mean
from matplotlib.pyplot import savefig, xlim, ylim
from os import makedirs
from yaml import safe_load, dump as yaml_dump
from munch import munchify, unmunchify
from IPython.display import display, Markdown
import re

# Processing Code Rules
## Load YAML

In [2]:
# Read the rule definitions inside a context manager so the file handle is
# closed as soon as the YAML text has been read.
with open("big_data/rules.yaml", "r") as rules_file:
    rules = munchify(safe_load(rules_file.read()))

## Helpers

In [3]:
def __OR_from_list__(words):
    r"""Join the string entries of ``words`` into a ``|``-separated alternation.

    Entries that are not strings are skipped.  An entry ending in ``*`` has
    that trailing ``*`` replaced by ``\w*``; every other entry is copied
    verbatim, so entries may carry regex syntax of their own.

    Returns the alternation as a string (empty when there are no string
    entries).
    """
    alternatives = []
    for word in words:
        if not isinstance(word, str):
            continue
        if word.endswith("*"):
            # Raw string: "\w" is not a valid escape in a normal literal.
            word = word[:-1] + r"\w*"
        alternatives.append(word)

    return "|".join(alternatives)

def make_OR_clause(words):
    r"""Wrap the alternation built from ``words`` in a lookahead clause.

    The clause captures one alternative that is preceded by ``^`` or a
    non-word character and followed by a non-word character or ``$``.
    Because the result is a lookahead, several clauses can be concatenated
    and each one must then hold from the same starting position.
    """
    return r"(?=(?:^|.*\W)(" + __OR_from_list__(words) + r")(?:\W|$))"

def __query_from_word__(word):
    """Translate one rule word into a quoted boolean search term.

    ``.*`` between two parts becomes ``AND`` between two quoted terms, and a
    negative lookahead/lookbehind group ``(?!x)`` / ``(?<!x)`` becomes
    ``NOT("x")``.  The result is wrapped in parentheses.
    """
    # A wildcard gap splits the word into two terms that must both appear.
    query = word.replace(".*", '" AND "')
    # Negative lookaround groups turn into NOT(...) operands.
    query = re.sub(r'\(\?<?!([^)]+)\)', r'" AND NOT("\1") AND "', query)
    query = '("' + query + '")'
    # Drop the empty "" operands left over when a substitution landed at the
    # very end or the very start of the word (order matters: tail first).
    for leftover in (' AND ""', '"" AND '):
        query = query.replace(leftover, "")
    return query

def make_OR_query(words):
    """Combine the string entries of ``words`` into one ``OR`` group.

    Non-string entries are skipped, matching how ``__OR_from_list__`` treats
    the same word lists when it builds the regex, instead of failing on them.
    """
    terms = [__query_from_word__(word) for word in words if isinstance(word, str)]
    return "(" + " OR ".join(terms) + ")"

## Generating Regex

In [4]:
# Every entry of a code's `logic` mapping contributes one lookahead clause to
# the regex and one OR-group to the query; the clauses are concatenated and the
# groups are AND-ed, so all entries have to be satisfied together.
for code, spec in rules.codes.items():
    word_lists = list(spec.logic.values())
    spec.__regex__ = "".join(make_OR_clause(words) for words in word_lists)
    spec.__query__ = " AND ".join(make_OR_query(words) for words in word_lists)

print(rules)

# Serialize before opening the file: opening with "w" truncates it, so a
# failure inside yaml_dump would otherwise leave an empty rules file behind.
rules_yaml = "---\n" + yaml_dump(unmunchify(rules))
with open("big_data/rules.yaml", "w") as rules_file:
    rules_file.write(rules_yaml)

Munch({'codes': Munch({'Abduction': Munch({'__query__': '(("student*") OR ("them") OR ("their") OR ("they") OR ("these") OR ("people")) AND (("compar*") OR ("possible") OR ("could have") OR ("one-to-one") OR ("story") OR ("supposed to") OR ("looks like") OR ("seems to") OR ("seem to") OR ("if" AND "were"))', '__regex__': '(?=(?:^|.*\\W)(student\\w*|them|their|they|these|people)(?:\\W|$))(?=(?:^|.*\\W)(compar\\w*|possible|could have|one-to-one|story|supposed to|looks like|seems to|seem to|if.*were)(?:\\W|$))', 'definition': 'TODO', 'examples': ["it's possible that they were trying to game the system", 'TODO'], 'logic': Munch({'target': ['student*', 'them', 'their', 'they', 'these', 'people'], 'think': ['compar*', 'possible', 'could have', 'one-to-one', 'story', 'supposed to', 'looks like', 'seems to', 'seem to', 'if.*were']})}), 'Action': Munch({'__query__': '(("help*") OR ("question") OR ("you should")) AND (("this person") OR ("they") OR ("student") OR ("them"))', '__regex__': '(?=(?:

9802

## Testing Regex

In [5]:
# Try each code's generated regex against the examples listed for that code
# and report which clue words (captured groups) were found.
for code, spec in rules.codes.items():
    display(Markdown(f"**{code}:**"))
    pattern = re.compile(spec.__regex__, re.MULTILINE)
    for example in spec.examples:
        # apply_rule lower-cases each text before matching, so lower-case the
        # example as well; otherwise a capitalised example is reported as a
        # miss even though the same text would be coded in the real data.
        result = pattern.search(example.lower())
        if result is None:
            display(Markdown(f"Did not find a match in the example: *{example}*"))
            continue

        found = "//".join(
            group.strip() for group in result.groups() if group is not None
        )
        display(Markdown(f"Found **{found}** in the example: *{example}*"))

**Abduction:**

Found **they//possible** in the example: *it's possible that they were trying to game the system*

Did not find a match in the example: *TODO*

**Action:**

Did not find a match in the example: *TODO*

**Conception:**

Did not find a match in the example: *TODO*

**Design:**

Did not find a match in the example: *TODO*

**DesignTeam:**

Did not find a match in the example: *TODO*

**DiggingInto:**

Found **where did//that** in the example: *where did that come from?*

Did not find a match in the example: *TODO*

**Evaluation:**

Found **mental energy, and i don't have the time to work** in the example: *this tool requires a lot of mental energy, and i don't have the time to work on that*

Did not find a match in the example: *TODO*

**Motivation:**

Did not find a match in the example: *TODO*

**Operationalization:**

Did not find a match in the example: *TODO*

**Selection:**

Found **stand out//student** in the example: *stand out student*

Did not find a match in the example: *TODO*

**Shadowspect:**

Did not find a match in the example: *TODO*

**Transposition:**

Did not find a match in the example: *TODO*

# Processing Data
## Loading Data

In [6]:
# Load the full data table.
data = read_csv("big_data/fulldata.csv", sep=",")

# Row identifier: Session.Activity.Group.LineNo joined with dots.
data["S.A.G.L"] = (
    data["Session"]
    + "." + data["Activity"]
    + "." + data["Group"]
    + "." + data["LineNo"].astype("str")
)

# Flatten each text onto one line by turning newlines into spaces.
data["Text"] = data["Text"].str.replace("\n", " ")

## Helpers

In [7]:
def apply_rule(data, rule, code):
    """Code every row of ``data.Text`` with ``rule`` and store the result.

    ``rule`` is compiled with ``re.MULTILINE`` and matched from the start of
    the lower-cased text.  Two columns are written onto ``data`` in place:
    ``code`` holds 1 for a match and 0 otherwise, and ``f"{code}_clue"`` holds
    the stripped, non-``None`` captured groups joined by ``//`` (empty string
    when there is no match).  Values that are not ``str`` count as no match.
    Nothing is returned.
    """
    pattern = re.compile(rule, re.MULTILINE)

    # One entry per row: the match object, or None for a miss / non-string.
    outcomes = []
    for text in data.Text:
        outcomes.append(pattern.match(text.lower()) if type(text) is str else None)

    data[code] = [0 if hit is None else 1 for hit in outcomes]
    data[f"{code}_clue"] = [
        "" if hit is None
        else "//".join(part.strip() for part in hit.groups() if part is not None)
        for hit in outcomes
    ]

## Applying Code Rules

In [8]:
# Add a 0/1 column and a clue column to `data` for every code in the rules.
for code, spec in rules.codes.items():
    apply_rule(data, spec.__regex__, code)

# Generating Test Sets
## Loading Data

In [9]:
# Read the record of already-used rows inside a context manager so the file
# handle is closed as soon as the YAML text has been read.
with open("big_data/training.yaml", "r") as training_file:
    training = munchify(safe_load(training_file.read()))

## Helpers

In [10]:
def inflated_sample(data, code, random_state=42, size=16):
    """Draw a shuffled sample that over-represents rows coded positive.

    Rows whose ``S.A.G.L`` id is already listed under ``code`` in the
    module-level ``training`` mapping are excluded first.  From what is left,
    ``size`` rows with ``data[code] == 1`` are sampled, then ``size * 4`` more
    rows from everything that remains (whatever their code value), and the
    combined rows are shuffled.

    Parameters:
        data: frame with an ``S.A.G.L`` column and a ``code`` column.
        code: name of the code column to inflate.
        random_state: seed passed to every ``sample`` call.
        size: number of positive rows to draw.

    Returns the shuffled sample as a new frame; ``data`` itself is not
    modified.  ``sample`` raises if fewer rows are available than requested.
    """
    used_ids = training[code] if code in training.keys() else []
    # One vectorised membership test instead of one full-frame scan per id.
    already_used = data["S.A.G.L"].isin(used_ids)
    data = data.drop(data.index[already_used])

    positives = data[data[code] == 1].sample(n=size, random_state=random_state)
    others = data.drop(positives.index).sample(n=size * 4, random_state=random_state)
    sample = concat([positives, others])
    return sample.sample(frac=1, random_state=random_state)

def gen_test(code, random_state=42, size=16):
    """Generate a fresh test set for ``code`` and record the rows it used.

    Draws an inflated sample from the module-level ``data``, writes
    ``output/{code}-automated.csv`` (text, automated code and clue) and
    ``output/{code}-uncoded.csv`` (text only), appends the sampled ``S.A.G.L``
    ids to ``training[code]``, rewrites ``big_data/training.yaml``, and
    displays instructions for the human rater.

    Parameters:
        code: name of the code to generate a test set for.
        random_state: seed forwarded to ``inflated_sample``.
        size: number of positive rows forwarded to ``inflated_sample``.
    """
    sample = inflated_sample(data, code, size=size, random_state=random_state)

    # The CSV writes fail if the output directory has not been created yet.
    makedirs("output", exist_ok=True)
    sample[["Text", code, f"{code}_clue"]].to_csv(f"output/{code}-automated.csv")
    sample.Text.to_csv(f"output/{code}-uncoded.csv")

    if code not in training.keys():
        training[code] = []

    training[code] += list(sample["S.A.G.L"].values)

    # Serialize first: opening with "w" truncates the file, so a failure in
    # yaml_dump must not be able to wipe the existing training record.
    training_yaml = "---\n" + yaml_dump(unmunchify(training))
    with open("big_data/training.yaml", "w") as training_file:
        training_file.write(training_yaml)

    messages = (
        "**Done!**",
        f"Test/train set has been generated for `{code}`.",
        f"The human rater's file has been placed at `output/{code}-uncoded.csv`. Please download this file and code it manually *before* looking at the automated rater's results.",
        f"The automated rater's file has been placed at `output/{code}-automated.csv`.",
        f"*Please do not run this function for `{code}` again until you are ready to produce a new, unique test set!*",
        # The files this notebook reads and writes are YAML, not CSV.
        "And **please** remember to push changes to `big_data/rules.yaml` and `big_data/training.yaml` to OSF once you are done checking your results against the computer's.",
    )
    for message in messages:
        display(Markdown(message))

## Running Generation Method(s)

In [11]:
# Seed handed to gen_test for every code below.
random_state = 100
# The calls are left commented out on purpose: each gen_test call appends the
# sampled rows to big_data/training.yaml and overwrites the CSVs in output/,
# so uncomment only the code you want a new test set for, run the cell once,
# and comment the line out again.
# gen_test("Abduction", random_state=random_state)
# gen_test("Action", random_state=random_state)
# gen_test("Conception", random_state=random_state)
# gen_test("Design", random_state=random_state)
# gen_test("DesignTeam", random_state=random_state)
# gen_test("DiggingInto", random_state=random_state)
# gen_test("Evaluation", random_state=random_state)
# gen_test("Motivation", random_state=random_state)
# gen_test("Operationalization", random_state=random_state)
# gen_test("Selection", random_state=random_state)
# gen_test("Shadowspect", random_state=random_state)
# gen_test("Transposition", random_state=random_state)