In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from utils import cfg, load_data, get_labels, get_hypothesis, tokenize, clean_str

In [3]:
def clean_data(data: dict) -> None:
    for i in range(len(data['documents'])):
        data['documents'][i]['text'] = clean_str(data['documents'][i]['text'])
        data['documents'][i]['text'] = tokenize(data['documents'][i]['text'])

In [4]:
# For each document for each 17 hypothesis , we take all spans and make corosponding y as 1 if that span's hyposthesis is mentioned in "spans" else 0
# so x is [spantext(vectorizedd using tfidf) + hypothesis(vectorized using tfidf)] for all spans and y is list of 1 or 0 for that span
# For each 17 hypothesis of all document we mark 1 in which span it appears.
# doc1 : hypo1 -> x = [[span1+hypo1],[span2+hypo1],...], y = [1,0,1,0,....]

from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm
import scipy.sparse as sp
import nltk
def get_XY(data: dict, tfidf: TfidfVectorizer, hypothesis: dict, labels: dict, n_docs : int, threshold : float = 0.1) -> (list, list):

    X = []
    Y = []

    hypothesis_vecs = {}
    for key, val in hypothesis.items():
        hypothesis_vecs[key] = tfidf.transform([val])

    for i in tqdm(range(min(n_docs, len(data["documents"])))):
        doc_text = data["documents"][i]["text"]

        for key, val in hypothesis.items():
            choice = data["documents"][i]["annotation_sets"][0]["annotations"][key]["choice"]
            if choice == "NotMentioned":
                continue

            spans_for_hypothesis = data["documents"][i]["annotation_sets"][0]["annotations"][key]["spans"]

            for j, span in enumerate(data["documents"][i]["spans"]):
                start_idx = span[0]
                end_idx = span[1]

                span_text = doc_text[start_idx:end_idx]
                span_vector = tfidf.transform([span_text])

                input_vec = sp.hstack([span_vector, hypothesis_vecs[key]])
                # return X, Y
                X += [input_vec]
                Y += [1 if j in spans_for_hypothesis else 0]
        
    return sp.vstack(X), Y
        

In [5]:
train = load_data(cfg['train_path'])
clean_data(train)
hypothesis = get_hypothesis(train)
labels = get_labels()

In [6]:
all_text = ""

for i in range(len(train["documents"])):
    all_text += train["documents"][i]["text"] + " "

tfidf = TfidfVectorizer()
tfidf.fit([all_text])

In [7]:
X_train, Y_train = get_XY(train, tfidf, hypothesis, labels=labels, n_docs=100)

100%|██████████| 100/100 [01:10<00:00,  1.41it/s]


In [8]:
from sklearn.svm import SVC

model = SVC(kernel='linear', probability=True)
model.fit(X_train, Y_train)

In [9]:
test = load_data(cfg['test_path'])
clean_data(test)
X_test, Y_test = get_XY(test, tfidf, hypothesis, labels=labels, n_docs=100)

100%|██████████| 100/100 [00:58<00:00,  1.70it/s]


In [10]:
Y_pred = model.predict(X_test)

In [11]:
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import accuracy_score

def precision_at_80_recall(ypred, ytrue):
    precision, recall, thresholds = precision_recall_curve(ytrue, ypred)
    idx = (abs(recall - 0.8)).argmin()
    return precision[idx]

In [12]:
from sklearn.metrics import average_precision_score
def mean_average_precision(Y_pred, Y_test):
    aps = []
    aps = average_precision_score(Y_test, Y_pred)
    return np.mean(aps)

In [13]:
all_y_pred_test = Y_pred
all_y_true_test = Y_test

In [14]:
prec_arr = []
prec_arr = precision_at_80_recall(Y_pred, Y_test)

print("Precision @ 80% recall: ", np.mean(np.array(prec_arr)))
print("Mean Average Precision: ", mean_average_precision(Y_pred, Y_test))

Precision @ 80% recall:  0.02521623982193672
Mean Average Precision:  0.02521623982193672
