In [None]:
import boto3
import numpy as np
import pandas as pd
import re
import string

from gensim.models import Word2Vec, KeyedVectors
from gensim import matutils
from nltk import word_tokenize, sent_tokenize, TreebankWordTokenizer
from nltk.corpus import stopwords
from scipy import spatial
from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Single source of truth for the random seed (applied to NumPy below).
seed = 129081

bucket = "recover-sagemaker-nemours-980869347866"
s3_client = boto3.client('s3')

#word2vec set-up
#create semi-custom stopwords set
stops = set(stopwords.words("english"))
# Extra tokens treated as noise. "nd"/"th"/"st"/"rd" are presumably ordinal
# suffixes left behind once process_text_word2vec strips the digits.
stops.update({"mg", "ml", "md", "date", "nd", "th", "st", "rd"})
# Words that must survive stop-word filtering (negations and "too").
# difference_update is used instead of remove() so that an NLTK release whose
# list lacks one of these words does not abort the cell with a KeyError.
stops.difference_update({
    "no", "not", "don't", "don", "shouldn", "didn't", "too", "wasn",
    "hasn't", "aren", "aren't", "wasn't", "couldn't", "doesn't", "didn", "hasn",
})

# The "/" between bucket and key is required: without it the URI names a
# different (non-existent) bucket. Assumes the object lives at the bucket root,
# like the .npy files fetched later in this notebook.
# NOTE(review): reading an s3:// URI here relies on gensim's smart_open having
# S3 support installed - confirm in the target environment.
model = KeyedVectors.load_word2vec_format(f"s3://{bucket}/w2vs3.bin", binary=True)

np.random.seed(seed)

#takes docs (an iterable of token lists) and a word2vec model
#returns a list with one np vector per doc: the mean of that doc's in-vocabulary token vectors
def vector_docs(docs, model=model):
    """Embed each tokenised document as the mean of its token vectors.

    Parameters
    ----------
    docs : iterable of iterables of str
        One sequence of tokens per document.
    model : gensim Word2Vec or KeyedVectors, optional
        Source of the word vectors. Defaults to the module-level ``model``
        as it was bound when this function was defined.

    Returns
    -------
    list of np.ndarray
        One vector of length ``vector_size`` per document, in input order.
        A document with no in-vocabulary token is represented by a zero vector.
    """
    # A full Word2Vec model keeps its vectors under `.wv`; a bare KeyedVectors
    # (what KeyedVectors.load_word2vec_format returns) has no `.wv` in
    # gensim 4 and is itself the lookup table.
    keyed_vectors = getattr(model, "wv", model)

    features = []
    for tokens in docs:
        # The membership test already guarantees the lookup cannot raise KeyError.
        vectors = [keyed_vectors[token] for token in tokens if token in keyed_vectors]
        if vectors:
            features.append(np.asarray(vectors).mean(axis=0))
        else:
            features.append(np.zeros(keyed_vectors.vector_size))
    return features

#takes a sentence (str) and an nltk tokenizer
#returns a list of tokens
def process_text_word2vec(text, tokenizer=word_tokenize):
    """Clean a piece of text and split it into stop-word-filtered tokens.

    Parameters
    ----------
    text : str
        Raw text to clean.
    tokenizer : callable, optional
        Called with the cleaned string; must return an iterable of tokens.
        Defaults to ``word_tokenize``.

    Returns
    -------
    list
        Tokens produced by ``tokenizer`` that are not in the module-level
        ``stops`` set.
    """
    #clean text for better tokenization
    # Spell out "504" so it survives the digit stripping below. str.replace
    # returns a new string, so the result has to be assigned back.
    text = text.replace("504", "Fiveohfour")
    # These separators become spaces so the words on either side stay apart.
    text = re.sub("[-/:=]", " ", text)
    # Escape the punctuation before embedding it in a character class;
    # unescaped, the "\]" pair inside string.punctuation is read as an escaped
    # bracket and a literal backslash is never matched.
    remove = "[" + re.escape(string.punctuation) + string.digits + "¿¼½Â·°©®" + "]"
    text = text.lower()
    text = re.sub(remove, "", text)
    #remove multiple spaces
    text = re.sub(" +", " ", text)

    tokens = tokenizer(text)
    #remove common words
    return [x for x in tokens if x not in stops]


In [None]:
import io


def _load_npy_from_s3(key):
    """Download object `key` from `bucket` into memory and return what np.load reads from it."""
    buffer = io.BytesIO()
    s3_client.download_fileobj(Bucket=bucket, Key=key, Fileobj=buffer)
    # download_fileobj leaves the cursor at the end; rewind before reading.
    buffer.seek(0)
    return np.load(buffer)


# NOTE(review): the original read_csv call was left unterminated (SyntaxError).
# It is closed here with no extra options; if further read_csv arguments were
# intended, restore them.
w2v = pd.read_csv(f"s3://{bucket}/data/regex_overlap_subset_all.csv")

w2v.text = w2v.text.astype(str)
# Keep only characters 30..169 of each text before tokenising.
w2v.text = w2v.text.apply(lambda x: x[30:170])
w2v["processed_text"] = w2v.text.apply(process_text_word2vec)

# Reference arrays that get_prediction compares each row's vector against.
present_list = _load_npy_from_s3('present_list.npy')
absent_list = _load_npy_from_s3('absent_list.npy')
hypothetical_list = _load_npy_from_s3('hypothetical_list.npy')

def get_prediction(row, hypothetical_penalty=0.25):
    """Label a row 'Present' or 'Absent' from its word2vec similarity scores.

    The row's in-vocabulary tokens are averaged and scaled to unit length, then
    dotted with the module-level ``present_list``, ``absent_list`` and
    ``hypothetical_list`` arrays. 'Present' is returned only when the present
    score is the highest of the three; every other outcome is 'Absent'.

    Parameters
    ----------
    row : object with a ``processed_text`` attribute
        ``processed_text`` must be an iterable of tokens.
    hypothetical_penalty : float, optional
        Amount subtracted from the hypothetical score before comparison. The
        default of 0.25 is the value the original author noted as improving
        accuracy.

    Returns
    -------
    str
        'Present' or 'Absent'.
    """
    vectors = [model[word] for word in row.processed_text if word in model.key_to_index]
    if not vectors:
        # Without any known token the mean is NaN and unitvec cannot handle it,
        # which would abort the whole DataFrame.apply. There is no evidence of
        # presence, so use the same label as the non-present branch below.
        return 'Absent'

    doc_vector = matutils.unitvec(np.array(vectors).mean(axis=0))
    # NOTE(review): assumes the three reference arrays are 1-D vectors of the
    # model's dimensionality, so each dot product is a scalar - confirm.
    present_sim = np.dot(present_list, doc_vector)
    absent_sim = np.dot(absent_list, doc_vector)
    #This improves accuracy
    hypothetical_sim = np.dot(hypothetical_list, doc_vector) - hypothetical_penalty

    highest = max([present_sim, absent_sim, hypothetical_sim])
    return 'Present' if highest == present_sim else 'Absent'


w2v['w2v_assertion_binary'] = w2v.apply(get_prediction, axis=1)

In [None]:
# Normalise the label column in one chained pass: stringify, drop the first
# two and last two characters, title-case, then strip every apostrophe.
w2v.assertion = (
    w2v.assertion
    .astype(str)
    .apply(lambda label: label[2:-2])
    .str.title()
    .str.replace("'", "")
)

# Resolution of two-label values to a single label. Keys are written without
# apostrophes because change_assertion strips them before the lookup.
_MULTI_LABEL_RESOLUTION = {
    "Hypothetical, Family": "Hypothetical",
    "Hypothetical, Hypothetical": "Hypothetical",
    "Hypothetical, Present": "Present",
    "Present, None": "Present",
    "Possible, Family": "Present",
    "Possible, Present": "Present",
    "Present, Someoneelse": "Present",
    "Someoneelse, Family": "Other",
}

# Single labels that are folded into a coarser category.
_SINGLE_LABEL_RESOLUTION = {
    "Someoneelse": "Other",
    "Family": "Other",
    "Planned": "Absent",
    "Possible": "Present",
}


def change_assertion(row):
    """Collapse a normalised assertion label into the reduced label set.

    Parameters
    ----------
    row : str
        One title-cased assertion value, e.g. "Present" or
        "Hypothetical, Family". Apostrophes are ignored, so the quoted form
        "Hypothetical', 'Family" is treated identically.

    Returns
    -------
    str or None
        The resolved label, the input (minus apostrophes) when no rule
        applies, or None for an empty string.
    """
    if len(row) == 0:
        # Empty labels map to None.
        return None

    # Strip apostrophes here so the lookup works whether or not the caller has
    # already removed them. The previous comparisons contained "', '" and could
    # never match a column whose apostrophes had been stripped beforehand.
    ret = row.replace("'", "")
    ret = _MULTI_LABEL_RESOLUTION.get(ret, ret)
    return _SINGLE_LABEL_RESOLUTION.get(ret, ret)
# Replace each normalised label with its collapsed category.
w2v["assertion"] = w2v["assertion"].apply(change_assertion)

In [None]:
def bestguess(row):
    """Combine the assertion label, regex label and word2vec label into one guess.

    Rules, in order:
      * assertion is 'Other' or 'Past': 'Absent' only when both the regex and
        word2vec labels are 'Absent', otherwise 'Present'.
      * assertion is 'Hypothetical', regex says 'Present' and word2vec says
        'Absent': 'Absent'.
      * anything else: the regex label unchanged.

    ``row`` must expose ``assertion``, ``regex_assertion`` and
    ``w2v_assertion_binary`` attributes.
    """
    label = row.assertion
    regex_label = row.regex_assertion

    if label in ('Other', 'Past'):
        both_absent = regex_label == "Absent" and row.w2v_assertion_binary == "Absent"
        return 'Absent' if both_absent else 'Present'

    # word2vec may only overrule the regex for hypothetical mentions.
    overruled = (
        label == 'Hypothetical'
        and regex_label == 'Present'
        and row.w2v_assertion_binary == 'Absent'
    )
    if overruled:
        return 'Absent'
    return regex_label

# Compute the combined label for every row, then write the frame back to S3.
# Note that the destination is the same object the frame was read from.
best_guesses = w2v.apply(bestguess, axis=1)
w2v["best_guess"] = best_guesses

output_uri = f"s3://{bucket}/data/regex_overlap_subset_all.csv"
w2v.to_csv(output_uri, index=False)