# Import

Run the following cell to import the necessary libraries and functions after installing them

In [2]:
!pip install -r requirements.txt

Collecting spacy==2.3.7
  Downloading spacy-2.3.7-cp38-cp38-win_amd64.whl (9.7 MB)
Collecting thinc<7.5.0,>=7.4.1
  Downloading thinc-7.4.5-cp38-cp38-win_amd64.whl (910 kB)
Collecting six~=1.15.0
  Using cached six-1.15.0-py2.py3-none-any.whl (10 kB)


Installing collected packages: six, thinc, spacy
  Attempting uninstall: six
    Found existing installation: six 1.16.0
    Uninstalling six-1.16.0:
      Successfully uninstalled six-1.16.0
  Attempting uninstall: thinc
    Found existing installation: thinc 7.4.0
    Uninstalling thinc-7.4.0:
      Successfully uninstalled thinc-7.4.0
Successfully installed six-1.15.0 spacy-2.3.7 thinc-7.4.5


In [2]:
import utils, importlib, sys
importlib.reload(sys.modules['utils'])
from utils import *

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\moi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package benepar_en3 to
[nltk_data]     C:\Users\moi\AppData\Roaming\nltk_data...
[nltk_data]   Package benepar_en3 is already up-to-date!
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/master/resources_1.2.0.json: 128kB [00:00, 33.0MB/s]
2022-01-10 14:45:36 INFO: Downloading default packages for language: en (English)...
2022-01-10 14:45:37 INFO: File exists: C:\Users\moi\stanza_resources\en\default.zip.
2022-01-10 14:45:42 INFO: Finished downloading models and saved to C:\Users\moi\stanza_resources.
[nltk_data] Downloading package verbnet to
[nltk_data]     C:\Users\moi\AppData\Roaming\nltk_data...
[nltk_data]   Package verbnet is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\moi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is 

# Preprocessing and Data Preparation

## Parse the document

Read the example file: **Example.txt**

In [3]:
exampleData=pd.read_csv("Example.txt",names=["Context"],sep="\t")

Show the content of the file to make sure it is properly parsed

In [4]:
exampleData

Unnamed: 0,Context
0,The S&T component shall send all approval requ...


## Apply the NLP pipeline and construct the triples

In [5]:
nlp = en_core_web_sm.load()

In [6]:
exampleData["Context"]=exampleData.apply(lambda x: applynlp(x["Context"],nlp),axis=1)

In [7]:
pronouns=["I","me","my","mine","myself","you","you","your","yours","yourself","he","him","his","his","himself","she","her","her","hers","herself","it","it","its","itself","we","us","our","ours","ourselves","you","you","your","yours","yourselves","they","them","their","theirs","themselves"]
li=[]
i,j=0,0
ids=[]
for context in exampleData.Context.unique():
    for pronoun in findPronouns(context,pronouns):
        Id=str(i)+"-"+pronoun.text+"-"+str(j)
        while Id in ids:
            j+=1
            Id=str(i)+"-"+pronoun.text+"-"+str(j)
        for candidateAntecedent in getNPs(context,pronoun):
            li.append([Id,context,pronoun,pronoun.i,candidateAntecedent])
            ids.append(Id)
    i+=1
exampleData=pd.DataFrame(li,columns=["Id","Context","Pronoun","Position","Candidate Antecedent"])

Show the data frame containing the triples to make sure they are properly constructed

In [8]:
exampleData.head()

Unnamed: 0,Id,Context,Pronoun,Position,Candidate Antecedent
0,0-it-0,"(The, S&T, component, shall, send, all, approv...",it,19,"(The, S&T, component)"
1,0-it-0,"(The, S&T, component, shall, send, all, approv...",it,19,"(all, approval, requests)"
2,0-it-0,"(The, S&T, component, shall, send, all, approv...",it,19,"(the, DBS)"
3,0-it-0,"(The, S&T, component, shall, send, all, approv...",it,19,"(the, request)"
4,0-it-0,"(The, S&T, component, shall, send, all, approv...",it,19,"(storage, parameters)"


# SpanBERT-based Solutions

## Prepare SpanBERT-based Solutions

In [10]:
fast_tokenizer = BertTokenizerFast.from_pretrained('SpanBERT/spanbert-base-cased')
nlpmodel = BertForTokenClassification.from_pretrained('SpanBERT-NLPv21.8.10')
remodel = BertForTokenClassification.from_pretrained('SpanBERT-REv21.9.01')

In [11]:
test=[]
for Id in exampleData.Id.unique():
    c=exampleData[exampleData.Id==Id].Context.unique()[0]
    pronoun=exampleData[exampleData.Id==Id].Pronoun.unique()[0]
    hashedpronoun=pronoun.text+"#1"
    hashedcontext=c[:pronoun.i].text+" "+hashedpronoun+" "+c[pronoun.i+1:].text
    test.append([Id,hashedcontext,hashedpronoun])
testdf=pd.DataFrame(test,columns=["Id","context","pronoun"])

In [12]:
test_data = SpanDetectionData(testdf, fast_tokenizer,train=False)

## Solution 1: SpanBERT<sub>NLP</sub>

In [13]:
for param in nlpmodel.base_model.parameters():
    param.requires_grad = False

In [14]:
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    save_steps=200,
    logging_steps= 200,          
    save_total_limit = 5,
    #evaluation_strategy="epoch",     # Evaluation is done at the end of each epoch.
    num_train_epochs=3,              # total number of training epochs
    learning_rate = 2e-5,
    per_device_train_batch_size=32,  # batch size per device during training
    per_device_eval_batch_size=32,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs'
)

nlp_trainer = Trainer(
    model=nlpmodel,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
)

In [15]:
nlp_predictions=nlp_trainer.predict(test_data)

***** Running Prediction *****
  Num examples = 1
  Batch size = 32


In [16]:
ttruncated_predictions,tpredicted_spans=processPred(nlp_predictions,test_data,testdf,fast_tokenizer,T=0.9)

In [17]:
spans=[]
for i,j in zip(testdf.index, tpredicted_spans):
    spans.append(findspans(testdf.context[i],j))
testdf['Resolution']=spans
testdf['Detection']=testdf['Resolution'].apply(lambda x: "Unambiguous" if len(x)!=0 else "Ambiguous")

### The anaphoric ambiguity handling results of SpanBERT<sub>NLP</sub>

In [18]:
for i in testdf.index:
    context=testdf.context[i]
    pronoun=testdf.pronoun[i]
    resolution=testdf.Resolution[i]
    split1=context.split(pronoun)
    if resolution and resolution[0] in split1[0]:
        split1[0]=split1[0].replace(resolution[0],color.UNDERLINE+color.GREEN+resolution[0]+color.END+color.END)
    print(split1[0]+'\033[1m'+pronoun+'\033[0m'+split1[1])
    print("Detected as: "+'\033[1m'+testdf.Detection[i]+'\033[0m')
    print()

[4m[92mThe S&T component[0m[0m shall send all approval requests to the DBS. If the request contains storage parameters, [1mit#1[0m shall create a configuration record from the parameters.
Detected as: [1mUnambiguous[0m



## Solution 2: SpanBERT<sub>RE</sub>

In [19]:
for param in remodel.base_model.parameters():
    param.requires_grad = False
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    save_steps=200,
    logging_steps= 200,          
    save_total_limit = 5,
    #evaluation_strategy="epoch",     # Evaluation is done at the end of each epoch.
    num_train_epochs=3,              # total number of training epochs
    learning_rate = 2e-5,
    per_device_train_batch_size=32,  # batch size per device during training
    per_device_eval_batch_size=32,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs'
)

re_trainer = Trainer(
    model=remodel,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
)
re_predictions=re_trainer.predict(test_data)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running Prediction *****
  Num examples = 1
  Batch size = 32


In [20]:
ttruncated_predictions,tpredicted_spans=processPred(re_predictions,test_data,testdf,fast_tokenizer,T=0.9)

In [21]:
spans=[]
for i,j in zip(testdf.index, tpredicted_spans):
    spans.append(findspans(testdf.context[i],j))
testdf['Resolution']=spans
testdf['Detection']=testdf['Resolution'].apply(lambda x: "Unambiguous" if len(x)!=0 else "Ambiguous")

### The anaphoric ambiguity handling results of SpanBERT<sub>RE</sub>

In [22]:
for i in testdf.index:
    context=testdf.context[i]
    pronoun=testdf.pronoun[i]
    resolution=testdf.Resolution[i]
    split1=context.split(pronoun)
    if resolution and resolution[0] in split1[0]:
        split1[0]=split1[0].replace(resolution[0],color.UNDERLINE+color.GREEN+resolution[0]+color.END+color.END)
    print(split1[0]+'\033[1m'+pronoun+'\033[0m'+split1[1])
    print("Detected as: "+'\033[1m'+testdf.Detection[i]+'\033[0m')
    print()

[4m[92mThe S&T component[0m[0m shall send all approval requests to the DBS. If the request contains storage parameters, [1mit#1[0m shall create a configuration record from the parameters.
Detected as: [1mUnambiguous[0m



# ML-based Solutions

*Note that the ML classifiers for anaphoric ambiguity detection are different from the ones for anaphora resoutions*

## Language Features Extraction

In [9]:
nlp = en_core_web_sm.load()
nlp.add_pipe(benepar.BeneparComponent("benepar_en3"))
nlp1 = stanza.Pipeline('en')

2022-01-10 14:46:10 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | combined  |
| pos       | combined  |
| lemma     | combined  |
| depparse  | combined  |
| sentiment | sstplus   |
| ner       | ontonotes |

2022-01-10 14:46:10 INFO: Use device: cpu
2022-01-10 14:46:10 INFO: Loading: tokenize
2022-01-10 14:46:10 INFO: Loading: pos
2022-01-10 14:46:10 INFO: Loading: lemma
2022-01-10 14:46:10 INFO: Loading: depparse
2022-01-10 14:46:11 INFO: Loading: sentiment
2022-01-10 14:46:12 INFO: Loading: ner
2022-01-10 14:46:13 INFO: Done loading processors!


In [10]:
exampleDataOriginal=exampleData.copy()
exampleData=extract_LF(exampleData,nlp1)

## Feature Embeddings Extraction

In [11]:
tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-cased')
model = transformers.BertModel.from_pretrained('bert-base-cased',output_hidden_states = True,)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


### Sum the embeddings derived from last hidden four layers of **BERT**

In [12]:
Hs4v = exampleData.apply(
    lambda x: get_4layers_emb(hashdouble(x['Context'],x['Pronoun'],x['Candidate Antecedent']).strip() + " [SEP] " + x['Pronoun'].text +
                           "#1 [SEP] " + x['Candidate Antecedent'].text+"#2",tokenizer,model,concat=False),
    axis=1)
Hs4=Hs4v.apply(lambda s: pd.Series(
    {i: float(s[i])
     for i in range(0, len(Hs4v[Hs4v.index[0]]))}))

### Extract the embeddings from **SBERT** 

In [27]:
from sentence_transformers import SentenceTransformer
smodel = SentenceTransformer('paraphrase-mpnet-base-v2')

loading configuration file C:\Users\moi/.cache\torch\sentence_transformers\sentence-transformers_paraphrase-mpnet-base-v2\config.json
Model config MPNetConfig {
  "_name_or_path": "old_models/paraphrase-mpnet-base-v2/0_Transformer",
  "architectures": [
    "MPNetModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "mpnet",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "relative_attention_num_buckets": 32,
  "transformers_version": "4.10.0",
  "vocab_size": 30527
}

loading weights file C:\Users\moi/.cache\torch\sentence_transformers\sentence-transformers_paraphrase-mpnet-base-v2\pytorch_model.bin
All model checkpoint weights were used when initializing MPNetModel.

All the weights of MPNetModel were initia

In [28]:
sbertv=exampleData.apply(
    lambda x: smodel.encode(hashdouble(x['Context'],x['Pronoun'],x['Candidate Antecedent']).strip() + " [SEP] " + x['Pronoun'].text +
                           "#1 [SEP] " + x['Candidate Antecedent'].text+"#2"),
    axis=1)
sbert=sbertv.apply(lambda s: pd.Series(
    {i: float(s[i])
     for i in range(0, len(sbertv[sbertv.index[0]]))}))

## Prepare the ML-based Solutions

In [13]:
X=exampleData.drop(["Context","Pronoun","Candidate Antecedent"],axis=1)

In [14]:
X.isNextVerbAnimate=X.isNextVerbAnimate.astype(bool)

In [15]:
object_cols = []
to_remove=['Id']
for col, types in zip(
        X.dtypes.index,
        X.dtypes):
    if types == object:
        if len(X[col].unique())<30:
            object_cols.append(col)
        else:
            to_remove.append(col)

In [16]:
object_cols.pop(0)

'Id'

In [17]:
X=X.drop(to_remove,axis=1)

In [18]:
X=pd.get_dummies(X,columns=object_cols)

In [19]:
trainCols=loadObj("trainingCols.list")

In [20]:
for i,j in zip(X.isna().sum().index,X.isna().sum()):
    if j>0:
        print(i,j)

In [21]:
X=X.fillna(value=0)

In [22]:
for col in list(X.columns):
    if col not in trainCols:
        X.drop(col,axis=1,inplace=True)

In [23]:
for col in trainCols:
    if col not in X.columns:
        X[col]=0

In [24]:
X['Id']=exampleData['Id']

## Solution 3: ML<sub>LF</sub>

### The anaphoric ambiguity detection results of ML<sub>LF</sub>

In [43]:
ML_LF_Detection=loadObj("gn.obj")

In [40]:
gn=GaussianNB().set_params(**{'var_smoothing': 1.0})

In [41]:
gn.fit(loadObj("X.tr").drop("Id",axis=1),loadObj("y.tr"))

GaussianNB(var_smoothing=1.0)

In [38]:
X.shape

(5, 75)

In [42]:
saveObj(gn,"gn.obj")

In [44]:
ML_LF_D_predictions=ML_LF_Detection.predict_proba(X.drop('Id',axis=1))

In [43]:
detdf=getprediction(X.drop('Id',axis=1).index,ML_LF_D_predictions,X.Id,0.5,exampleData)

In [44]:
for i in detdf.index:
    p=detdf.Pronoun[i]
    doc=p.doc
    print(doc[:p.i].text+'\033[1m'+" "+p.text+'\033[0m'+" "+doc[p.i+1:].text)
    print("Detected as: "+'\033[1m'+detdf.result[i]+'\033[0m')
    print()

The S&T component shall send all approval requests to the DBS. If the request contains storage parameters,[1m it[0m shall create a configuration record from the parameters.
Detected as: [1mAmbiguous[0m



### The anaphora resolution results of ML<sub>LF</sub>

In [45]:
ML_LF_Resolution=loadObj("ML_LF-resolution.Anaphora")

In [46]:
ML_LF_R_predictions=pd.Series(ML_LF_Resolution.predict_proba(X.drop("Id",axis=1)).tolist())

In [47]:
resdf=getResolution(ML_LF_R_predictions,X,exampleData,theta=0)

In [48]:
for Id,p in zip(resdf.Id,resdf.Predicted):
    doc=p.doc
    pronoun=exampleData[exampleData.Id==Id].Pronoun.unique()[0]
    if p:
        split1=doc[:p.start].text+" "+color.UNDERLINE+color.GREEN+p.text+color.END+color.END+" "+doc[p.end:pronoun.i].text
    print(split1+'\033[1m'+" "+pronoun.text+'\033[0m'+" "+doc[pronoun.i+1:].text)
    print()

The S&T component shall send all approval requests to the DBS. If [4m[92mthe request[0m[0m contains storage parameters,[1m it[0m shall create a configuration record from the parameters.



## Solution 4: ML<sub>FE</sub>

### The anaphoric ambiguity detection results of ML<sub>FE</sub>

In [32]:
ML_FE_Detection=loadObj("ML_FE-detection.Anaphora")

In [33]:
ML_FE_D_predictions=ML_FE_Detection.predict_proba(Hs4)

In [34]:
detdf=getprediction(Hs4.index,ML_FE_D_predictions,X.Id,0.5,exampleData)

In [35]:
for i in detdf.index:
    p=detdf.Pronoun[i]
    doc=p.doc
    print(doc[:p.i].text+'\033[1m'+" "+p.text+'\033[0m'+" "+doc[p.i+1:].text)
    print("Detected as: "+'\033[1m'+detdf.result[i]+'\033[0m')
    print()

The S&T component shall send all approval requests to the DBS. If the request contains storage parameters,[1m it[0m shall create a configuration record from the parameters.
Detected as: [1mAmbiguous[0m



### The anaphora resolution results of ML<sub>FE</sub>

In [53]:
ML_FE_Resolution=loadObj("ML_FE-resolution.Anaphora")

In [54]:
ML_FE_R_predictions=pd.Series(ML_FE_Resolution.predict_proba(sbert).tolist())

In [55]:
resdf=getResolution(ML_FE_R_predictions,X,exampleData,theta=0)

In [56]:
for Id,p in zip(resdf.Id,resdf.Predicted):
    doc=p.doc
    pronoun=exampleData[exampleData.Id==Id].Pronoun.unique()[0]
    if p:
        split1=doc[:p.start].text+" "+color.UNDERLINE+color.GREEN+p.text+color.END+color.END+" "+doc[p.end:pronoun.i].text
    print(split1+'\033[1m'+" "+pronoun.text+'\033[0m'+" "+doc[pronoun.i+1:].text)
    print()

The S&T component shall send all approval requests to [4m[92mthe DBS[0m[0m . If the request contains storage parameters,[1m it[0m shall create a configuration record from the parameters.



## Solution 5: ML<sub>ensemble</sub>

### The anaphoric ambiguity detection results of ML<sub>ensemble</sub>

In [57]:
ML_EnD_predictions=ensembleprobaN(ML_FE_D_predictions,ML_LF_D_predictions,theta=0.1)
detdf=getprediction(X.drop('Id',axis=1).index,ML_EnD_predictions,X.Id,0.5,exampleData)

In [58]:
for i in detdf.index:
    p=detdf.Pronoun[i]
    doc=p.doc
    print(doc[:p.i].text+'\033[1m'+" "+p.text+'\033[0m'+" "+doc[p.i+1:].text)
    print("Detected as: "+'\033[1m'+detdf.result[i]+'\033[0m')
    print()

The S&T component shall send all approval requests to the DBS. If the request contains storage parameters,[1m it[0m shall create a configuration record from the parameters.
Detected as: [1mAmbiguous[0m



### The anaphora resolution results of ML<sub>ensemble</sub>

In [59]:
ML_EnR_predictions=pd.Series(ensembleprobas(ML_FE_R_predictions,ML_LF_R_predictions).tolist())

In [60]:
resdf=getResolution(ML_EnR_predictions,X,exampleData,theta=0)

In [61]:
for Id,p in zip(resdf.Id,resdf.Predicted):
    doc=p.doc
    pronoun=exampleData[exampleData.Id==Id].Pronoun.unique()[0]
    if p:
        split1=doc[:p.start].text+" "+color.UNDERLINE+color.GREEN+p.text+color.END+color.END+" "+doc[p.end:pronoun.i].text
    print(split1+'\033[1m'+" "+pronoun.text+'\033[0m'+" "+doc[pronoun.i+1:].text)
    print()

The S&T component shall send all approval requests to the DBS. If [4m[92mthe request[0m[0m contains storage parameters,[1m it[0m shall create a configuration record from the parameters.

