In [2]:
from typing import Optional, List, Dict, Tuple, Set
import numpy as np
import pandas as pd
import spacy
import key_words
import os

In [8]:
import pickle
def save_obj(obj:object,name:str):
    ext = '.pickle'
    with open(name + ext, 'wb') as handle:
        pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)

def load_obj(name:str)->object:
    ext = '.pickle'
    with open(name + ext, 'rb') as handle:
        return pickle.load(handle)

In [3]:
EXCEPTION_ENTITES = set(["DATE","TIME","PERCENT","MONEY","QUANTITY","ORDINAL", "CARDINAL","WORK_OF_ART"])
NER = spacy.load('en_core_web_sm')

In [10]:
train_path = "data/train_phase1.tsv"
df = pd.read_csv(train_path, sep="\t")

In [11]:
def ner(
    paragraph: str)\
        -> Dict[str, str]:
    doc = NER(paragraph)
    res = {
        entity.text : entity.label_
        for entity in doc.ents
    }
    return res

In [12]:
ess_dict = {
    "ess_1_model_answers":  load_obj("data/essaySet_1_model_answers"),
    "ess_2_model_answers": load_obj("data/essaySet_2_model_answers"),
    "ess_3_model_answers": load_obj("data/essaySet_3_model_answers"),
    "ess_4_model_answers": load_obj("data/essaySet_4_model_answers"),
    "ess_5_model_answers": load_obj("data/essaySet_5_model_answers"),
    "ess_6_model_answers": load_obj("data/essaySet_6_model_answers"),
    "ess_7_model_answers": load_obj("data/essaySet_7_model_answers"),
    "ess_8_model_answers": load_obj("data/essaySet_8_model_answers"),
    "ess_9_model_answers": load_obj("data/essaySet_9_model_answers"),
    "ess_10_model_answers": load_obj("data/essaySet_10_model_answers"),
}

In [14]:
import string
# doc = 'You need to know how much vinegar was used in each container.'
# remove stopwords from doc
# tokens = [w for w in set(doc.split()) if w not in stop_words]
# remove punctuation from tokens
# tokens = [w.translate(str.maketrans('', '', string.punctuation)) for w in [doc]]

In [17]:
for essay in range(0,10):
    key = f'ess_{essay+1}_model_answers'
    ess_dict[key] = [w.translate(str.maketrans('', '', string.punctuation)) for w in ess_dict[key] ]

In [18]:
ner_res = {}
for essay in range(0,10):
    key = f'ess_{essay+1}_model_answers'
    ners = list(map(ner,ess_dict[key]))
    ner_res[key] = ners

In [19]:
ner_res["ess_10_model_answers"]

[{'45DEGC': 'CARDINAL', '53DEGC': 'CARDINAL'},
 {'Light Gray': 'WORK_OF_ART', 'the winter': 'DATE'},
 {'Light Gray Because': 'WORK_OF_ART', 'the summer': 'DATE'},
 {'1': 'CARDINAL', '54DEG': 'CARDINAL'},
 {'54 degrees': 'QUANTITY', '41 degrees': 'QUANTITY'},
 {'48 degrees': 'QUANTITY',
  '45': 'CARDINAL',
  'Li': 'PERSON',
  '53DEGC': 'CARDINAL'},
 {'48DEG': 'CARDINAL'},
 {'Dark': 'ORG', '54DEG': 'CARDINAL', '41DEG': 'CARDINAL'},
 {},
 {},
 {'53DEGc': 'CARDINAL', '42DEGc': 'CARDINAL'},
 {'just 10 minutes': 'TIME',
  '53': 'CARDINAL',
  '10 minutes': 'TIME',
  '48DEG': 'CARDINAL',
  '10 minut': 'QUANTITY',
  '45DEG': 'DATE'},
 {'Brandi': 'ORG', 'Jerry': 'PERSON'},
 {'One': 'CARDINAL', '53 degrees': 'QUANTITY', '42 degrees': 'QUANTITY'},
 {'54 degrees': 'QUANTITY'},
 {},
 {'Brandi': 'ORG',
  'Jerrys': 'PERSON',
  '6DEGC': 'CARDINAL',
  '10DEGC': 'CARDINAL',
  '12': 'CARDINAL',
  'DEGwarmer': 'PRODUCT'},
 {'54DEG 52DEG 53DEG': 'CARDINAL', '53DEGc': 'CARDINAL'},
 {},
 {'6º': 'CARDINAL'},
 

In [20]:
named_entites_dict = {}
for essay in range(0,10):
    key = f'ess_{essay+1}_model_answers'
    named_entites = list(filter(lambda x: x if x else None, ner_res[key]))
    named_entites = list(map(lambda named_entity: list(filter(lambda x: x not in EXCEPTION_ENTITES ,named_entity)), named_entites))
    # flatten the list
    named_entites = [*set([item for sublist in named_entites for item in sublist])]
    named_entites = [*set([str(item).lower() for item in named_entites])]
    named_entites_dict[key] = named_entites

In [21]:
[len(v) for i,v in named_entites_dict.items()]

[13, 8, 27, 21, 23, 9, 17, 16, 25, 36]

In [5]:
def match_grading(
    entities: List[str],
    doc: str)\
        -> float:
    # type check
    if not isinstance(entities, list):
        entities = [entities]
    #  entities contain stop words
    grade = [True
            for entity in entities
            if entity in doc]
    try:
        return len(grade)/len(entities)
    except ZeroDivisionError:
        # return -1.0
        return 0

In [24]:
docs = [ df.query(f'EssaySet == {i}')["EssayText"].values.tolist() for i in range(1,11)]

In [25]:
ner_grades = np.array(list(map(lambda essay: np.array(list(map(lambda student_answer:
        match_grading(named_entites,student_answer),
        essay))),docs)))

  ner_grades = np.array(list(map(lambda essay: np.array(list(map(lambda student_answer:


In [28]:
[len(i)for i in docs]

[1672, 1278, 1891, 1738, 1795, 1797, 1799, 1799, 1798, 1640]

In [32]:
[i.shape[0] for i in ner_grades.tolist()]

[1672, 1278, 1891, 1738, 1795, 1797, 1799, 1799, 1798, 1640]

In [34]:
# make a directory to save the results and save
os.makedirs('data/results', exist_ok=True)
save_obj(ner_grades,"data/results/ner_res")

# specail keys

## we can get around lack of this data by taking the intersection sets of each model answer

In [14]:
enclosure = "\"\""
ess_set_keys = {}

In [46]:
ess1_set = [set(model_ans.split()) for model_ans in ess_dict["ess_1_model_answers"]]

In [93]:
for essay in range(0,10):
    key = f'ess_{essay+1}_model_answers'
    ess_set = [set(model_ans.split()) for model_ans in ess_dict[key]]
    o = ess_set[0]
    for i in range(1,11):
        o = o.intersection(ess1_set[i])
    ess_set_keys[key] = [*o]

In [94]:
ess_set_keys

{'ess_1_model_answers': ['need', 'to'],
 'ess_2_model_answers': [],
 'ess_3_model_answers': ['to'],
 'ess_4_model_answers': ['to'],
 'ess_5_model_answers': ['to'],
 'ess_6_model_answers': ['to'],
 'ess_7_model_answers': ['to'],
 'ess_8_model_answers': ['to'],
 'ess_9_model_answers': ['to'],
 'ess_10_model_answers': ['to']}

In [95]:
ess_set_keys["ess_2_model_answers"]=[" "]
ess_set_keys

{'ess_1_model_answers': ['need', 'to'],
 'ess_2_model_answers': [' '],
 'ess_3_model_answers': ['to'],
 'ess_4_model_answers': ['to'],
 'ess_5_model_answers': ['to'],
 'ess_6_model_answers': ['to'],
 'ess_7_model_answers': ['to'],
 'ess_8_model_answers': ['to'],
 'ess_9_model_answers': ['to'],
 'ess_10_model_answers': ['to']}

In [105]:
special_keywords_res = {}
for essay in range(0,10):
    key = f'ess_{essay+1}_model_answers'
    # hard_keywords = list(map(lambda doc: key_words.get_str_between(doc, enclosure),ess_dict[key]))
    hard_keywords = ess_set_keys[key]
    # special_keywords = list(map(special_keywords,ess_dict[key]))
    docs = df.query(f'EssaySet == {essay+1}')["EssayText"].values.tolist()
    # hard_keywords = list(map(lambda keys: match_grading(keys,doc),hard_keywords))
    hard_keywords = key_words.hard_keywords_grading(hard_keywords,docs)
    special_keywords_res[key] = hard_keywords


In [107]:
special_keywords_res["ess_1_model_answers"].shape

(1672,)

In [109]:
# make a directory to save the results and save
os.makedirs('data/results', exist_ok=True)

save_obj(special_keywords_res,"data/results/special_keywords_res")