# Imports


In [1]:
use_annotator_1=True  #if False then train/val/test on annotator 2

In [2]:
%cd drive/My\ Drive/1\ Job/Product\ and\ Code/TermExtraction
!ls

/content/drive/My Drive/1 Job/Product and Code/TermExtraction
 ACL				    htfl_en.pickle
 ACTER-master			    htfl_fr.pickle
 ACTER-master.zip		    htfl_nl.pickle
 additionaltexts		    logs
 additionaltexts_extracted_de.txt  'saved models'
 additionaltexts_extracted_EN.txt   train_data_corp_en_new.pkl
 extracted_terms_htfl		    wnut17train.conll
 false_negatives.txt		    wnut17train.conll.1
 false_positives.txt


In [3]:
!pip install transformers
!pip install sacremoses
!pip install sentencepiece
!pip install seqeval

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/d5/43/cfe4ee779bbd6a678ac6a97c5a5cdeb03c35f9eaebbb9720b036680f9a2d/transformers-4.6.1-py3-none-any.whl (2.2MB)
[K     |████████████████████████████████| 2.3MB 9.0MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 30.7MB/s 
[?25hCollecting huggingface-hub==0.0.8
  Downloading https://files.pythonhosted.org/packages/a1/88/7b1e45720ecf59c6c6737ff332f41c955963090a18e72acbcbeac6b25e86/huggingface_hub-0.0.8-py3-none-any.whl
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/d4/e2/df3543e8ffdab68f5acc73f613de9c2b155ac47f162e725dcac87c521c11/tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3MB)
[K     |█

In [4]:
#torch and tranformers for model and training
import torch  
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from torch.utils.data import TensorDataset
from transformers import XLMRobertaTokenizerFast              
from transformers import XLMRobertaForTokenClassification
from transformers import AdamW                            
from transformers import get_linear_schedule_with_warmup
from transformers import DataCollatorForTokenClassification
from transformers import Trainer, TrainingArguments
import sentencepiece

#sklearn for evaluation
from sklearn import preprocessing                       
from sklearn.metrics import classification_report        
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import ParameterGrid         
from sklearn.model_selection import ParameterSampler      
from sklearn.utils.fixes import loguniform

#nlp preprocessing
from nltk import ngrams                                 
from spacy.pipeline import SentenceSegmenter
from spacy.lang.en import English
from spacy.pipeline import Sentencizer
from sacremoses import MosesTokenizer, MosesDetokenizer


#utilities
from seqeval.metrics import accuracy_score, f1_score, precision_score, recall_score
import pandas as pd
import glob, os
import time
import datetime
import random
import numpy as np
import matplotlib.pyplot as plt
% matplotlib inline
import seaborn as sns
import pickle         # for saving data structures
from pynvml import *  # for checking gpu memory

In [5]:
!ls

 ACL				    htfl_en.pickle
 ACTER-master			    htfl_fr.pickle
 ACTER-master.zip		    htfl_nl.pickle
 additionaltexts		    logs
 additionaltexts_extracted_de.txt  'saved models'
 additionaltexts_extracted_EN.txt   train_data_corp_en_new.pkl
 extracted_terms_htfl		    wnut17train.conll
 false_negatives.txt		    wnut17train.conll.1
 false_positives.txt


In [6]:
!pip freeze > requirements_TermExtraction.txt

In [None]:
# connect to GPU 
device = torch.device('cuda')

print('Connected to GPU:', torch.cuda.get_device_name(0))

Connected to GPU: Tesla P100-PCIE-16GB


# Prepare Data

**Functions for preprocessing and creating of Training Data**

In [None]:
#load sentence list 
with open("ACL/ACL2_annotator1_src.txt") as file_in:
    sentences_an1 = []
    for line in file_in:
        sentences_an1.append(line.strip())

with open("ACL/ACL2_annotator2_src.txt") as file_in:
    sentences_an2 = []
    for line in file_in:
        sentences_an2.append(line.strip())


In [None]:
#load label list 
with open("ACL/ACL2_annotator1_label.txt") as file_in:
    labels_an1 = []
    for line in file_in:
        labels_an1.append(line.strip().split("\t"))

with open("ACL/ACL2_annotator2_label.txt") as file_in:
    labels_an2 = []
    for line in file_in:
        labels_an2.append(line.strip().split("\t"))


In [None]:
# tokenize
def preprocess(sentences):
  sentences_tokenized=[]
  mt = MosesTokenizer(lang='en')
  for s in sentences:
    tokenized_text = mt.tokenize(s, return_str=True)            #append tuple of tokens and original senteence
    sentences_tokenized.append((tokenized_text.split(), s))     #append tuple of tokens and original senteence
  return sentences_tokenized


In [None]:
sentences_an1_tok = preprocess(sentences_an1)
sentences_an2_tok = preprocess(sentences_an2)

In [None]:
#find indices of a sublist sub in a list l
def find_sub_list(subl,l):
    results=[]
    subllen=len(subl)
    for ind in (i for i,e in enumerate(l) if e==subl[0]):
        if l[ind:ind+subllen]==subl:
            results.append((ind,ind+subllen-1))

    return results

In [None]:
#input is list of sentences and a list of corresponding terms
def create_training_data(sentences, terms):

  #create empty dataframe
  training_data = []
  
  md = MosesDetokenizer(lang='en')

  print(len(sentences))
  count=0

  for i in range(len(sentences)):
    count+=1
    #if count%100==0:print(count)

    s=sentences[i][0]  

    #create label list, with "n" for non-terms, "B-T" for beginning of a term and "T" for the continuation of a term
    tags=["n"]*len(s)

    # check all terms
    for t in terms[i]:
      #find indices of term in sentence token list
      t_as_list=t.split()
      #print(t_as_list, s)
      if len(t_as_list)>0:
        sublist_indices=find_sub_list(t_as_list, s)
        for indices in sublist_indices:
          for ind in range(indices[0],indices[1]+1):
            #if term start
            if ind==indices[0]:
              tags[ind]="B-T"
            #if continuation of a Term
            else: 
              tags[ind]="T"

    training_data.append((s,tags))
        

  return training_data

  

In [None]:
data_an1=create_training_data(sentences_an1_tok, labels_an1)
data_an2=create_training_data(sentences_an2_tok, labels_an2)

900
1301


In [None]:
len(labels_an1)==len(data_an1) and len(labels_an1)==len(sentences_an1)

True

In [None]:
len(labels_an2)==len(data_an2) and len(labels_an2)==len(sentences_an2)

True

In [None]:
#train val test split either with data of annotator 1 or 2

if use_annotator_1:
  trainings_data = data_an1[:540]
  val_data = data_an1[540:720]
  test_data = data_an1[720:]

  gold_set_for_validation=set()
  gold_set_for_test=set()

  for terms in labels_an1[540:720]:
    for t in terms:
      if t not in gold_set_for_validation:
        gold_set_for_validation.add(t)

  for terms in labels_an1[720:]:
    for t in terms:
      if t not in gold_set_for_test:
        gold_set_for_test.add(t)

else:
  trainings_data = data_an2[:781]
  val_data = data_an2[781:1041]
  test_data = data_an2[1041:]

  gold_set_for_validation=set()
  gold_set_for_test=set()

  for terms in labels_an2[781:1041]:
    for t in terms:
      if t not in gold_set_for_validation:
        gold_set_for_validation.add(t)

  for terms in labels_an2[1041:]:
    for t in terms:
      if t not in gold_set_for_test:
        gold_set_for_test.add(t)

print("An1", use_annotator_1)
print("Train",len(trainings_data))
print("Val",len(val_data))
print("Test",len(test_data))
print("Terms Val",len(gold_set_for_validation))
print("Terms Test",len(gold_set_for_test))

An1 True
Train 540
Val 180
Test 180
Terms Val 421
Terms Test 478


In [None]:
#look for some example sentences that contain multi word term
for i in range(100):
  if len(test_data[i][0])<12 and "T" in test_data[i][1]:
    print(test_data[i][0])
    print(test_data[i][1])
    print()

['Secondly', ',', 'we', 'exhibit', 'a', 'provably', 'optimal', 'free', 'indexation', 'algorithm', '.']
['n', 'n', 'n', 'n', 'n', 'n', 'n', 'B-T', 'T', 'T', 'n']

['This', 'paper', 'introduces', 'a', 'robust', 'interactive', 'method', 'for', 'speech', 'understanding', '.']
['n', 'n', 'n', 'n', 'n', 'B-T', 'T', 'T', 'T', 'T', 'n']

['The', 'generalized', 'LR', 'parsing', 'is', 'enhanced', 'in', 'this', 'approach', '.']
['n', 'B-T', 'T', 'T', 'n', 'n', 'n', 'n', 'n', 'n']

['A', 'pilot', 'system', 'has', 'shown', 'great', 'effectiveness', 'of', 'this', 'approach', '.']
['n', 'B-T', 'T', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n']

['The', 'interpretation', 'reflects', 'the', 'temporary', 'belief', 'about', 'the', 'world', '.']
['n', 'n', 'n', 'n', 'B-T', 'T', 'n', 'n', 'B-T', 'n']



In [None]:
#seperate tokens and tags

#train
train_tags=[tup[1] for tup in trainings_data]
train_texts=[tup[0] for tup in trainings_data]

#val
val_tags=[tup[1] for tup in val_data]
val_texts=[tup[0] for tup in val_data]

#test
test_tags=[tup[1] for tup in test_data]
test_texts=[tup[0] for tup in test_data]

In [None]:
val_texts[0]

['Their',
 'human',
 'listeners',
 'are',
 'usually',
 'able',
 'to',
 'cope',
 'with',
 'these',
 'deviations',
 'with',
 'little',
 'difficulty',
 '.']

In [None]:
print("train")
print(len(trainings_data), trainings_data[10])
print("validation")
print(len(val_data),val_data[10])
print("test")
print(len(test_data),test_data[10])

train
540 (['In', 'this', 'presentation', ',', 'we', 'describe', 'the', 'features', 'of', 'and', 'requirements', 'for', 'a', 'genuinely', 'useful', 'software', 'infrastructure', 'for', 'this', 'purpose', '.'], ['n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'n', 'B-T', 'n', 'n', 'n', 'n', 'B-T', 'T', 'n', 'n', 'n', 'n'])
validation
180 (['The', 'resulting', 'logical', 'expression', 'is', 'then', 'transformed', 'by', 'a', 'planning', 'algorithm', 'into', 'efficient', 'Prolog', ',', 'cf.', 'query', 'optimisation', 'in', 'a', 'relational', 'database', '.'], ['n', 'n', 'B-T', 'T', 'n', 'n', 'n', 'n', 'n', 'B-T', 'T', 'n', 'n', 'B-T', 'n', 'n', 'B-T', 'T', 'n', 'n', 'B-T', 'T', 'n'])
test
180 (['The', 'basic', 'goal', 'in', 'building', 'that', 'editor', 'was', 'to', 'provide', 'an', 'adequate', 'tool', 'to', 'help', 'lexicologists', 'produce', 'a', 'valid', 'and', 'coherent', 'dictionary', 'on', 'the', 'basis', 'of', 'a', 'linguistic', 'theory', '.'], ['n', 'n', 'n', 'n', 'n', 'n', 'B-T', 'n'

# Tokenize 

In [None]:
tokenizer = XLMRobertaTokenizerFast.from_pretrained("xlm-roberta-base")

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=5069051.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=9096718.0, style=ProgressStyle(descript…




In [None]:
label_list=["n", "B-T", "T"]
label_to_id = {l: i for i, l in enumerate(label_list)}
num_labels=len(label_list)

def tokenize_and_align_labels(texts, tags):
  tokenized_inputs = tokenizer(
      texts,
      padding=True,
      truncation=True,
      # We use this argument because the texts in our dataset are lists of words (with a label for each word).
      is_split_into_words=True,
  )
  labels = []
  for i, label in enumerate(tags):
      word_ids = tokenized_inputs.word_ids(batch_index=i)
      previous_word_idx = None
      label_ids = []
      for word_idx in word_ids:
          # Special tokens have a word id that is None. We set the label to -100 so they are automatically
          # ignored in the loss function.
          if word_idx is None:
              label_ids.append(-100)
          # We set the label for the first token of each word.
          elif word_idx != previous_word_idx:
              label_ids.append(label_to_id[label[word_idx]])
          # For the other tokens in a word, we set the label to either the current label or -100, depending on
          # the label_all_tokens flag.
          else:
              label_ids.append(-100)
          previous_word_idx = word_idx

      labels.append(label_ids)
  tokenized_inputs["labels"] = labels
  return tokenized_inputs  


train_input_and_labels = tokenize_and_align_labels(train_texts, train_tags)

val_input_and_labels = tokenize_and_align_labels(val_texts, val_tags)

test_input_and_labels = tokenize_and_align_labels(test_texts, test_tags)


In [None]:
# create dataset
class OurDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = OurDataset(train_input_and_labels, train_input_and_labels["labels"])

val_dataset = OurDataset(val_input_and_labels, val_input_and_labels["labels"])

test_dataset = OurDataset(test_input_and_labels, test_input_and_labels["labels"])

# Training

In [None]:
# extract set from true_predictions

def extract_terms(token_predictions, val_texts):
  extracted_terms = set()
  # go over all predictions
  for i in range(len(token_predictions)):
    pred = token_predictions[i]
    txt  = val_texts[i]
    for j in range(len(pred)):
      # if right tag build term and add it to the set otherwise just continue
      if pred[j]=="B-T":
        term=txt[j]
        for k in range(j+1,len(pred)):
          if pred[k]=="T": term+=" "+txt[k]
          else: break
        extracted_terms.add(term)
  return extracted_terms

In [None]:
# how to compute the metrics (we don't use this one for the trainer, only to get inference predictions later...)
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    return {
        "accuracy_score": accuracy_score(true_labels, true_predictions),
        "precision": precision_score(true_labels, true_predictions),
        "recall": recall_score(true_labels, true_predictions),
        "f1": f1_score(true_labels, true_predictions),
    }

In [None]:
# how to compute the metrics TermEval style for Trainer
# only for validation set since the gold_set is fixed to be the validation set !
def compute_metrics_2(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    extracted_terms=extract_terms(true_predictions, val_texts) # fixed validation set!!
    #extracted_terms = set([item.lower() for item in extracted_terms])  #for ACL we do not need to lowercase the data 
    gold_set=gold_set_for_validation      # fixed validation set!!
    true_pos=extracted_terms.intersection(gold_set)
    recall=len(true_pos)/len(gold_set)
    precision=len(true_pos)/len(extracted_terms)

    return {
        "precision": precision,
        "recall": recall,
        "f1": 2*(precision*recall)/(precision+recall),
    }

In [None]:
# initialize model
model = XLMRobertaForTokenClassification.from_pretrained("xlm-roberta-base", num_labels=num_labels)


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=512.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1115590446.0, style=ProgressStyle(descr…




Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForTokenClassification: ['lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.bias']
- This IS expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-st

In [None]:
# training arguments

training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=10,              # total # of training epochs
    per_device_train_batch_size=8,  # batch size per device during training
    per_device_eval_batch_size=16,   # batch size for evaluation
    warmup_steps=0,                  # number of warmup steps for learning rate scheduler
    weight_decay=0,                  # strength of weight decay
    learning_rate=2e-5,
    logging_dir='./logs',            # directory for storing logs
    evaluation_strategy= "epoch",#"steps", # or use epoch here
    eval_steps=100,
    #save_total_limit=1,
    load_best_model_at_end=True,   #loads the model with the best evaluation score
    metric_for_best_model="f1",
    greater_is_better=True
)

In [None]:
# initialize huggingface trainer
trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics_2,
    )

In [None]:
# train
trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.222186,0.642127,0.745843,0.69011
2,No log,0.174606,0.712719,0.771971,0.741163
3,No log,0.197761,0.74938,0.71734,0.73301
4,No log,0.212888,0.708609,0.76247,0.734554
5,No log,0.204812,0.741419,0.769596,0.755245
6,No log,0.239005,0.70614,0.764846,0.734322
7,No log,0.256164,0.735714,0.733967,0.734839
8,0.152900,0.270901,0.72973,0.769596,0.749133
9,0.152900,0.285894,0.719647,0.774347,0.745995
10,0.152900,0.287512,0.726027,0.755344,0.740396


TrainOutput(global_step=680, training_loss=0.12057683608111214, metrics={'train_runtime': 742.7479, 'train_samples_per_second': 0.916, 'total_flos': 0, 'epoch': 10.0, 'init_mem_cpu_alloc_delta': 0, 'init_mem_gpu_alloc_delta': 1110393856, 'init_mem_cpu_peaked_delta': 0, 'init_mem_gpu_peaked_delta': 0, 'train_mem_cpu_alloc_delta': -693235712, 'train_mem_gpu_alloc_delta': 3360015360, 'train_mem_cpu_peaked_delta': 702091264, 'train_mem_gpu_peaked_delta': 946824704})

In [None]:
trainer.save_model("./saved models/term_acl_1305_an1")

# Evaluation

In [None]:
#TODO REWRITE EVALUATE FUNCTION TO TAKE TXT + DF + DATASET AS PARAMETER AND THEN ITS ALL EZ PZ OMG

In [None]:
#load model
PATH = "./saved models/term_acl_1305_an1" 
model_trained = XLMRobertaForTokenClassification.from_pretrained(PATH)

In [None]:
# initialize huggingface trainer with already trained model
trainer = Trainer(
        model=model_trained,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

In [None]:
#en
predictions, labels, metrics = trainer.predict(val_dataset)
predictions = np.argmax(predictions, axis=2)

# Remove ignored index (special tokens)
true_predictions = [
    [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

#test
test_predictions, test_labels, test_metrics = trainer.predict(test_dataset)
test_predictions = np.argmax(test_predictions, axis=2)
# Remove ignored index (special tokens)
true_test_predictions = [
    [label_list[p] for (p, l) in zip(test_prediction, test_label) if l != -100]
    for test_prediction, test_label in zip(test_predictions, test_labels)
]



**List Evaluation**

In [None]:
def computeTermEvalMetrics(extracted_terms, gold_df):
  #make lower case cause gold standard is lower case
  #extracted_terms = set([item.lower() for item in extracted_terms])
  gold_set=set(gold_df)
  true_pos=extracted_terms.intersection(gold_set)
  recall=len(true_pos)/len(gold_set)
  precision=len(true_pos)/len(extracted_terms)

  print("Intersection",len(true_pos))
  print("Gold",len(gold_set))
  print("Extracted",len(extracted_terms))
  print("Recall:", recall)
  print("Precision:", precision)
  print("F1:", 2*(precision*recall)/(precision+recall))

In [None]:
#extracted_terms = extract_terms(true_predictions, val_texts)
extracted_terms = extract_terms(true_predictions, val_texts)

#test_extracted_terms = extract_terms(true_test_predictions, test_texts)
test_extracted_terms = extract_terms(true_test_predictions, test_texts)

In [None]:
computeTermEvalMetrics(extracted_terms, gold_set_for_validation)

Intersection 317
Gold 421
Extracted 421
Recall: 0.7529691211401425
Precision: 0.7529691211401425
F1: 0.7529691211401426


In [None]:
computeTermEvalMetrics(test_extracted_terms, gold_set_for_test)

Intersection 354
Gold 478
Extracted 473
Recall: 0.7405857740585774
Precision: 0.7484143763213531
F1: 0.7444794952681387


**Analysis FP and FN**

In [None]:
extracted_terms =  test_extracted_terms
gold_set=gold_set_for_test
true_pos=extracted_terms.intersection(gold_set)

In [None]:
recall=len(true_pos)/len(gold_set)
precision=len(true_pos)/len(extracted_terms)

print("Intersection",len(true_pos))
print("Gold",len(gold_set))
print("Extracted",len(extracted_terms))
print("Recall:", recall)
print("Precision:", precision)
print("F1:", 2*(precision*recall)/(precision+recall))

Intersection 354
Gold 478
Extracted 473
Recall: 0.7405857740585774
Precision: 0.7484143763213531
F1: 0.7444794952681387


In [None]:
#false negatives (what was missed)
fn=gold_set-extracted_terms
fn

{'',
 'ATIS (Air Travel Information System) domain',
 'C++',
 'CSR (Connected Speech Recognition) corpus',
 'Canadian Hansards',
 'Chinese newspapers',
 'DARPA Resource Management corpus',
 "February '92 benchmark evaluation",
 "February '92 test sentences",
 'February 1992 ATIS benchmark tests',
 "Grolier's Encyclopedia",
 'HMM with Gaussian mixture observation densities',
 'IDUS',
 'IDUS (Intelligent Document Understanding System)',
 'LRE project SmTA double check',
 'MADCOW (Multi-site ATIS Data COllection Working group)',
 'MIT ATIS (Air Travel Information Service) system',
 "NTHU's statistic-based system",
 'OCR',
 "October '91 dry-run test set",
 "October '91 test set",
 'Resource Management (RM) corpus',
 "Roget's Thesaurus",
 'Similarity-driven Transfer System (SimTran)',
 'Simulated annealing approach',
 'Spanish',
 'Syntactic analysis of the English coordinate sentences',
 'TDMT (Transfer-Driven Machine Translation)',
 'TDMT on APs',
 'adaptive dynamic word formation',
 'adap

In [None]:
#false positives (wrongly seen as term)
fp=extracted_terms-gold_set
fp

{'5K vocabulary',
 'ATIS benchmark tests',
 'C + +',
 'DARPA',
 'DCG formalism',
 'English coordinate sentences',
 'Gaussian mixture observation densities',
 'HMM',
 'IDUS development',
 'LR parsing',
 'LRE',
 'NTHU',
 'OCR accuracy',
 'Resource Management corpus',
 'SmTA double check',
 'Syntactic analysis',
 'TDMT on',
 'accounts based on processing',
 'adjoining',
 'analysis and generation',
 'annealing approach',
 'applications',
 'benchmark evaluation',
 'bracketed sentences',
 'browsing and editing',
 'chains',
 'characters',
 'combinatorics',
 'communications',
 'compound noun component',
 'computational systems',
 'constraints',
 'context',
 'contextual clues',
 'copying of unmodified subgraphs',
 'data',
 'data collection',
 'discourse effect',
 'discrimination and robustness oriented adaptive learning procedure',
 'dry-run test',
 'error',
 'errors',
 'evaluation metric',
 'examples',
 'frequent word',
 'full lexicalization',
 'further grammar',
 'general-purpose language und

In [None]:
#true pos
true_pos

{'APs',
 'ATNs',
 'BU recognition system',
 'Bayesian learning',
 'CSR',
 'CSR corpus',
 'CSR pilot corpus',
 'Chart-like parsing',
 'Chinese Natural Language Processing',
 'Chinese-English texts',
 'DARPA speech recognition technology',
 'Dynamic Grammars',
 'ER',
 'English',
 'English alphabets',
 'English coordinate structure analysis model',
 'English sentences',
 'English word',
 'English words',
 'English-Japanese MT system',
 'Graph unification',
 'HBG',
 'HBG model',
 'Horn logic program',
 'Human-Machine Communication',
 'IR',
 'Japanese',
 'Japanese bunsetsu',
 'Japanese homophone errors',
 'Japanese kanji-kana characters',
 'Japanese revision support systems',
 'Japanese texts',
 'KANA-KANJI conversion',
 'LHIP',
 'LIMSI',
 'Language understanding',
 'MADCOW',
 'MAP estimation approach',
 'MIT ATIS system',
 'MLE reestimation algorithms',
 'MT',
 'MT systems',
 'MUC-3 evaluation',
 'Markov probability',
 'NLP techniques',
 'Natural Language Processing',
 'Non Verbal and Mult