# Training PamModels

This notebook provides an overview of the training pipeline for the PAM (Predicting Argument Modifiers) models in this project. It contains the code for training models with different feature sets, shows a few examples of the features extracted for each model, and includes each model's classification report.

In [2]:
import os  # nopep8
import sys  # nopep8
from pathlib import Path  # nopep8

# Make the project's 'src' directory importable.
# The path is derived from the current working directory, so the notebook
# must be started from the directory that contains 'src'.
current_working_directory = os.getcwd()
src_path = Path(current_working_directory).resolve() / "src"

# Guard the append so re-running this cell does not pile up duplicate
# entries in sys.path.
if str(src_path) not in sys.path:
    sys.path.append(str(src_path))

from corpus import *
from models.framework import *

# Directory under 'src' that holds the saved (pickled) models
models_path = src_path.joinpath("models", "trained_pkl")

# Location of the corpus pickle, followed by loading it via Corpus.load_corpus
corpus_path = src_path.resolve().joinpath("corpus", "md_corpus_ontonotes.pkl")
md_corpus_onto = Corpus.load_corpus(corpus_path)

from sklearn.linear_model import LogisticRegression

## Dummy Model

The features for the dummy model consist of a Bag of Words (BoW).

In [3]:
# Feature pipeline for the dummy model: bag-of-words only
bow_feature_methods = [FeatureExtractor.bow]
feature_extractor = FeatureExtractor(
    feature_methods=bow_feature_methods, corpus=md_corpus_onto
)

# Logistic regression is the classifier wrapped by the PAM model
bow_classifier = LogisticRegression(max_iter=300)
bow_model = PamModel(
    model=bow_classifier,
    feature_extractor=feature_extractor,
    corpus=md_corpus_onto,
    train_data="train",
    test_data="dev",
)

# Fit the model, then save it under the given .pkl filename
bow_model.train()
bow_model.save_model("dummy_bow_model.pkl")

In [3]:
# Restore the saved BoW model from the trained-models directory
bow_model_file = models_path / "dummy_bow_model.pkl"
bow_model = PamModel.load_model(filename=bow_model_file)

In [4]:
# Peek at ten entries (indices 90-99) of the vocabulary held by the BoW vectorizer
bow_vectorizer = bow_model.feature_extractor.vectorizer
feature_names = bow_vectorizer.get_feature_names_out()
print(feature_names[90:100])

# ['accordance' 'according' 'accordingly' 'account' 'accounts' 'acquisition'
#  'across' 'act' 'action' 'active']

['accordance' 'according' 'accordingly' 'account' 'accounts' 'acquisition'
 'across' 'act' 'action' 'active']


In [4]:
# Evaluate the BoW model; print_report=True prints the per-class
# precision/recall/F1 report recorded below.
bow_model.evaluate(print_report=True)

# weighted average f1-score: 0.84

              precision    recall  f1-score   support

         ADV       0.77      0.63      0.70      1074
         DIS       0.85      0.85      0.85       954
         LOC       0.80      0.85      0.82       766
         MNR       0.72      0.61      0.66       675
         MOD       0.99      0.98      0.99       898
         NEG       0.69      0.99      0.81       549
         TMP       0.90      0.92      0.91      1963

    accuracy                           0.84      6879
   macro avg       0.82      0.83      0.82      6879
weighted avg       0.84      0.84      0.84      6879



In [31]:
# Display the feature-importance table for the NEG label; the recorded output
# lists features from the most positive to the most negative importance.
bow_model.get_feature_importance("NEG")


# feature	importance
# 1185	not	4.369207
# 1165	never	2.541671
# 1191	nt	2.296133
# 1174	no	1.352227
# 1178	nor	1.097732
# ...	...	...
# 1224	only	-2.946917
# 1619	so	-2.969488
# 1237	or	-3.027511
# 155	and	-3.270357
# 861	in	-3.321753

Unnamed: 0,feature,importance
1185,not,4.369207
1165,never,2.541671
1191,nt,2.296133
1174,no,1.352227
1178,nor,1.097732
...,...,...
1224,only,-2.946917
1619,so,-2.969488
1237,or,-3.027511
155,and,-3.270357


In [32]:
# Predict the role of a raw string; the recorded result is a JSON-formatted
# string with the keys "string" and "predicted_role".
bow_model.predict_string("in November 2009")

# '{"string": "in November 2009", "predicted_role": "TMP"}'

'{"string": "in November 2009", "predicted_role": "TMP"}'

## TF-IDF Model

This functions similarly to the BoW model. However, it uses TF-IDF weighting for the vocabulary.

In [6]:
# Feature pipeline: TF-IDF weighted vocabulary
tfidf_feature_methods = [FeatureExtractor.tfidf]
feature_extractor = FeatureExtractor(
    feature_methods=tfidf_feature_methods, corpus=md_corpus_onto
)

# Logistic regression is the classifier wrapped by the PAM model
tfidf_classifier = LogisticRegression(max_iter=300)
tfidf_model = PamModel(
    model=tfidf_classifier,
    feature_extractor=feature_extractor,
    corpus=md_corpus_onto,
    train_data="train",
    test_data="dev",
)

# Fit the model, then save it under the given .pkl filename
tfidf_model.train()
tfidf_model.save_model("tfidf_model.pkl")

In [5]:
# Restore the saved TF-IDF model from the trained-models directory
tfidf_model_file = models_path / "tfidf_model.pkl"
tfidf_model = PamModel.load_model(filename=tfidf_model_file)

In [None]:
# Print the first hundred entries of the vocabulary held by the TF-IDF vectorizer
tfidf_vectorizer = tfidf_model.feature_extractor.vectorizer
feature_names = tfidf_vectorizer.get_feature_names_out()
print(feature_names[:100])

In [7]:
# Evaluate the TF-IDF model; print_report=True prints the per-class
# precision/recall/F1 report recorded below.
tfidf_model.evaluate(print_report=True)

# weighted average f1-score: 0.85

              precision    recall  f1-score   support

         ADV       0.76      0.66      0.71      1074
         DIS       0.87      0.85      0.86       954
         LOC       0.83      0.85      0.84       766
         MNR       0.75      0.64      0.69       675
         MOD       1.00      0.98      0.99       898
         NEG       0.75      0.99      0.85       549
         TMP       0.89      0.93      0.91      1963

    accuracy                           0.85      6879
   macro avg       0.83      0.84      0.84      6879
weighted avg       0.85      0.85      0.85      6879



In [8]:
# Display the feature-importance table for the NEG label; the recorded output
# lists features from the most positive to the most negative importance.
tfidf_model.get_feature_importance("NEG")

# 	feature	importance
# 2911	not	4.921588
# 2868	never	2.594258
# 2927	nt	2.244173
# 2889	no	1.501277
# 2900	nor	0.996759
# ...	...	...
# 328	and	-3.838011
# 436	at	-3.946682
# 2992	on	-4.450119
# 4395	the	-6.017031
# 2101	in	-7.461249

Unnamed: 0,feature,importance
2911,not,4.921588
2868,never,2.594258
2927,nt,2.244173
2889,no,1.501277
2900,nor,0.996759
...,...,...
328,and,-3.838011
436,at,-3.946682
2992,on,-4.450119
4395,the,-6.017031


In [14]:
# Predict the role of a raw string; the recorded result is a JSON-formatted
# string with the keys "string" and "predicted_role".
tfidf_model.predict_string("in November 2009")

# '{"string": "in November 2009", "predicted_role": "TMP"}'

'{"string": "in November 2009", "predicted_role": "TMP"}'

## N-Gram Model

This model uses TF-IDF weighted unigrams and bigrams as features.

In [10]:
# Feature pipeline: n-gram features only
ngram_feature_methods = [FeatureExtractor.ngram]
feature_extractor = FeatureExtractor(
    feature_methods=ngram_feature_methods, corpus=md_corpus_onto
)

# Logistic regression is the classifier wrapped by the PAM model
ngram_classifier = LogisticRegression(max_iter=300)
ngram_model = PamModel(
    model=ngram_classifier,
    feature_extractor=feature_extractor,
    corpus=md_corpus_onto,
    train_data="train",
    test_data="dev",
)

# Fit the model, then save it under the given .pkl filename
ngram_model.train()
ngram_model.save_model("ngram_model.pkl")

In [None]:
# Restore the saved n-gram model from the trained-models directory
ngram_model_file = models_path / "ngram_model.pkl"
ngram_model = PamModel.load_model(filename=ngram_model_file)

In [None]:
# Peek at ten entries (indices 90-99) of the vocabulary held by the n-gram vectorizer
ngram_vectorizer = ngram_model.feature_extractor.vectorizer
feature_names = ngram_vectorizer.get_feature_names_out()
print(feature_names[90:100])

In [11]:
# Evaluate the n-gram model; print_report=True prints the per-class
# precision/recall/F1 report recorded below.
ngram_model.evaluate(print_report=True)

# weighted average f1-score: 0.86

              precision    recall  f1-score   support

         ADV       0.77      0.69      0.73      1074
         DIS       0.87      0.86      0.87       954
         LOC       0.82      0.87      0.85       766
         MNR       0.78      0.64      0.71       675
         MOD       1.00      0.98      0.99       898
         NEG       0.75      0.99      0.85       549
         TMP       0.90      0.93      0.92      1963

    accuracy                           0.86      6879
   macro avg       0.84      0.85      0.84      6879
weighted avg       0.86      0.86      0.86      6879



In [12]:
# Display the feature-importance table for the NEG label; the recorded output
# lists features from the most positive to the most negative importance.
ngram_model.get_feature_importance("NEG")

# 	feature	importance
# 5459	not	4.993346
# 5372	never	2.703227
# 5497	nt	2.284522
# 5427	no	1.721516
# 5445	nor	1.035561
# ...	...	...
# 5532	of	-4.228029
# 1005	at	-4.245273
# 5731	on	-4.834429
# 7940	the	-6.809075
# 3819	in	-8.592979

Unnamed: 0,feature,importance
5459,not,4.993346
5372,never,2.703227
5497,nt,2.284522
5427,no,1.721516
5445,nor,1.035561
...,...,...
5532,of,-4.228029
1005,at,-4.245273
5731,on,-4.834429
7940,the,-6.809075


In [13]:
# Predict the role of a raw string; the recorded result is a JSON-formatted
# string with the keys "string" and "predicted_role".
ngram_model.predict_string("in November 2009")

# '{"string": "in November 2009", "predicted_role": "TMP"}'

'{"string": "in November 2009", "predicted_role": "TMP"}'

## N-Gram & POS

This model uses the n-gram model's features as before, but also adds the POS-tags.

In [16]:
# Feature pipeline: n-gram features plus POS tags
ngram_pos_feature_methods = [FeatureExtractor.ngram, FeatureExtractor.pos]
feature_extractor = FeatureExtractor(
    feature_methods=ngram_pos_feature_methods, corpus=md_corpus_onto
)

# Logistic regression is the classifier wrapped by the PAM model
ngram_pos_classifier = LogisticRegression(max_iter=300)
ngram_pos_model = PamModel(
    model=ngram_pos_classifier,
    feature_extractor=feature_extractor,
    corpus=md_corpus_onto,
    train_data="train",
    test_data="dev",
)

# Fit the model, then save it under the given .pkl filename
ngram_pos_model.train()
ngram_pos_model.save_model("ngram_pos_model.pkl")

In [None]:
# List the first ten feature names produced by the POS encoder.
# A POS tag inventory is presumably far smaller than 90 categories (the
# recorded importance table shows names such as "x0_PART"), so a [90:100]
# window would print an empty array; slicing from the start always shows
# whatever names exist.
feature_names = ngram_pos_model.feature_extractor.pos_encoder.get_feature_names_out()
print(feature_names[:10])

In [None]:
# Restore the saved n-gram + POS model from the trained-models directory
ngram_pos_model_file = models_path / "ngram_pos_model.pkl"
ngram_pos_model = PamModel.load_model(filename=ngram_pos_model_file)

In [17]:
# Evaluate the n-gram + POS model; print_report=True prints the per-class
# precision/recall/F1 report recorded below.
ngram_pos_model.evaluate(print_report=True)

# weighted average f1-score: 0.86

              precision    recall  f1-score   support

         ADV       0.76      0.69      0.72      1074
         DIS       0.86      0.87      0.86       954
         LOC       0.84      0.87      0.86       766
         MNR       0.74      0.72      0.73       675
         MOD       1.00      0.98      0.99       898
         NEG       0.90      0.93      0.91       549
         TMP       0.90      0.93      0.92      1963

    accuracy                           0.86      6879
   macro avg       0.86      0.86      0.86      6879
weighted avg       0.86      0.86      0.86      6879



In [18]:
# Display the feature-importance table for the NEG label; the recorded output
# lists features from the most positive to the most negative importance and
# now includes an encoded POS feature ("x0_PART").
ngram_pos_model.get_feature_importance("NEG")

# 	feature	importance
# 5459	not	5.188587
# 5372	never	4.430910
# 14636	x0_PART	4.236774
# 5427	no	2.887181
# 5497	nt	2.638690
# ...	...	...
# 596	and	-2.847569
# 5731	on	-2.977937
# 5532	of	-3.022960
# 7940	the	-4.990417
# 3819	in	-5.673898

Unnamed: 0,feature,importance
5459,not,5.188587
5372,never,4.430910
14636,x0_PART,4.236774
5427,no,2.887181
5497,nt,2.638690
...,...,...
596,and,-2.847569
5731,on,-2.977937
5532,of,-3.022960
7940,the,-4.990417


In [19]:
# Predict the role of a raw string; the recorded result is a JSON-formatted
# string with the keys "string" and "predicted_role".
ngram_pos_model.predict_string("in November 2009")

# '{"string": "in November 2009", "predicted_role": "TMP"}'

'{"string": "in November 2009", "predicted_role": "TMP"}'

## N-Gram, POS & NER Model

This model uses the n-gram_pos model's features as before, but also adds the NER-tags.

In [None]:
# Feature pipeline: n-gram features, POS tags and NER tags
ngram_pos_ner_feature_methods = [
    FeatureExtractor.ngram,
    FeatureExtractor.pos,
    FeatureExtractor.ner,
]
feature_extractor = FeatureExtractor(
    feature_methods=ngram_pos_ner_feature_methods, corpus=md_corpus_onto
)

# Logistic regression is the classifier wrapped by the PAM model
# (iteration cap raised to 500 for this feature set)
ngram_pos_ner_classifier = LogisticRegression(max_iter=500)
ngram_pos_ner_model = PamModel(
    model=ngram_pos_ner_classifier,
    feature_extractor=feature_extractor,
    corpus=md_corpus_onto,
    train_data="train",
    test_data="dev",
)

# Fit the model, then save it under the given .pkl filename
ngram_pos_ner_model.train()
ngram_pos_ner_model.save_model("ngram_pos_ner_model.pkl")

In [None]:
# Restore the saved n-gram + POS + NER model from the trained-models directory
ngram_pos_ner_model_file = models_path / "ngram_pos_ner_model.pkl"
ngram_pos_ner_model = PamModel.load_model(filename=ngram_pos_ner_model_file)

In [None]:
# List the first ten feature names produced by the NER encoder.
# NOTE(review): an entity-label inventory presumably has fewer than 90
# categories, in which case a [90:100] window would print an empty array;
# slicing from the start always shows whatever names exist. Confirm against
# the encoder's actual width.
feature_names = ngram_pos_ner_model.feature_extractor.ner_encoder.get_feature_names_out()
print(feature_names[:10])

In [21]:
# Evaluate the n-gram + POS + NER model; print_report=True prints the
# per-class precision/recall/F1 report recorded below.
ngram_pos_ner_model.evaluate(print_report=True)

# weighted average f1-score: 0.86

              precision    recall  f1-score   support

         ADV       0.76      0.69      0.72      1074
         DIS       0.87      0.87      0.87       954
         LOC       0.83      0.88      0.86       766
         MNR       0.74      0.72      0.73       675
         MOD       1.00      0.98      0.99       898
         NEG       0.90      0.93      0.91       549
         TMP       0.90      0.93      0.92      1963

    accuracy                           0.87      6879
   macro avg       0.86      0.86      0.86      6879
weighted avg       0.86      0.87      0.86      6879



In [22]:
# Display the feature-importance table for the NEG label; the recorded output
# lists features from the most positive to the most negative importance.
ngram_pos_ner_model.get_feature_importance("NEG")

# 	feature	importance
# 5459	not	5.208589
# 5372	never	4.407519
# 14636	x0_PART	4.108050
# 5427	no	2.890373
# 5497	nt	2.518744
# ...	...	...
# 596	and	-2.836572
# 5532	of	-2.979627
# 5731	on	-2.981826
# 7940	the	-4.977608
# 3819	in	-5.635395

Unnamed: 0,feature,importance
5459,not,5.208589
5372,never,4.407519
14636,x0_PART,4.108050
5427,no,2.890373
5497,nt,2.518744
...,...,...
596,and,-2.836572
5532,of,-2.979627
5731,on,-2.981826
7940,the,-4.977608


In [23]:
# Predict the role of a raw string; the recorded result is a JSON-formatted
# string with the keys "string" and "predicted_role".
ngram_pos_ner_model.predict_string("in November 2009")

# '{"string": "in November 2009", "predicted_role": "TMP"}'

'{"string": "in November 2009", "predicted_role": "TMP"}'

## N-Gram, POS, NER & Dependency Model

This model uses the n-gram_pos_ner model's features as before, but also adds the dependency structure as a feature.

In [None]:
# Feature pipeline: n-gram features, POS tags, NER tags and dependency features
ngram_pos_ner_dep_feature_methods = [
    FeatureExtractor.ngram,
    FeatureExtractor.pos,
    FeatureExtractor.ner,
    FeatureExtractor.get_dependency_features,
]
feature_extractor = FeatureExtractor(
    feature_methods=ngram_pos_ner_dep_feature_methods, corpus=md_corpus_onto
)

# Logistic regression is the classifier wrapped by the PAM model
# (iteration cap of 500 for this feature set)
ngram_pos_ner_dep_classifier = LogisticRegression(max_iter=500)
ngram_pos_ner_dep_model = PamModel(
    model=ngram_pos_ner_dep_classifier,
    feature_extractor=feature_extractor,
    corpus=md_corpus_onto,
    train_data="train",
    test_data="dev",
)

# Fit the model, then save it under the given .pkl filename
ngram_pos_ner_dep_model.train()
ngram_pos_ner_dep_model.save_model("ngram_pos_ner_dep_model.pkl")

In [None]:
# Restore the saved n-gram + POS + NER + dependency model from the trained-models directory
ngram_pos_ner_dep_model_file = models_path / "ngram_pos_ner_dep_model.pkl"
ngram_pos_ner_dep_model = PamModel.load_model(filename=ngram_pos_ner_dep_model_file)

In [None]:
# Peek at entries 90-99 of the feature names produced by the dependency encoder.
# NOTE(review): if the encoder yields fewer than 91 names this prints an empty
# array — confirm the window is appropriate for this encoder.
dep_encoder = ngram_pos_ner_dep_model.feature_extractor.dep_encoder
feature_names = dep_encoder.get_feature_names_out()
print(feature_names[90:100])

In [25]:
# Evaluate the n-gram + POS + NER + dependency model; print_report=True prints
# the per-class precision/recall/F1 report recorded below.
ngram_pos_ner_dep_model.evaluate(print_report=True)

# weighted average f1-score: 0.87

              precision    recall  f1-score   support

         ADV       0.74      0.70      0.72      1074
         DIS       0.88      0.84      0.86       954
         LOC       0.82      0.88      0.85       766
         MNR       0.76      0.75      0.75       675
         MOD       1.00      0.99      1.00       898
         NEG       0.97      0.99      0.98       549
         TMP       0.91      0.93      0.92      1963

    accuracy                           0.87      6879
   macro avg       0.87      0.87      0.87      6879
weighted avg       0.87      0.87      0.87      6879



In [26]:
# Display the feature-importance table for the NEG label; the recorded output
# lists features from the most positive to the most negative importance.
ngram_pos_ner_dep_model.get_feature_importance("NEG")

# 	feature	importance
# 5459	not	5.207504
# 5372	never	3.152983
# 5427	no	2.653509
# 5497	nt	2.618150
# 14636	x0_PART	2.238358
# ...	...	...
# 596	and	-2.719431
# 5532	of	-2.890461
# 5731	on	-2.947505
# 7940	the	-4.875818
# 3819	in	-5.579103

Unnamed: 0,feature,importance
5459,not,5.207504
5372,never,3.152983
5427,no,2.653509
5497,nt,2.618150
14636,x0_PART,2.238358
...,...,...
596,and,-2.719431
5532,of,-2.890461
5731,on,-2.947505
7940,the,-4.875818


In [27]:
# Predict the role of a raw string; the recorded result is a JSON-formatted
# string with the keys "string" and "predicted_role".
ngram_pos_ner_dep_model.predict_string("in November 2009")

# '{"string": "in November 2009", "predicted_role": "TMP"}'

'{"string": "in November 2009", "predicted_role": "TMP"}'

## N-Gram, POS, NER, DEP & Constituency Model

This model uses the n-gram_pos_ner_dep model's features as before, but also adds the constituency structure as a feature.

Note that extracting the constituency features takes some time as they are generated with the `benepar` model.

In [None]:
# Feature pipeline: n-gram, POS, NER, dependency and constituency features
ngram_pos_ner_dep_const_feature_methods = [
    FeatureExtractor.ngram,
    FeatureExtractor.pos,
    FeatureExtractor.ner,
    FeatureExtractor.get_dependency_features,
    FeatureExtractor.get_constituency_features,
]
feature_extractor = FeatureExtractor(
    feature_methods=ngram_pos_ner_dep_const_feature_methods, corpus=md_corpus_onto
)

# Logistic regression is the classifier wrapped by the PAM model.
# For faster training, use
# LogisticRegression(penalty='l2', C=1.0, class_weight='balanced', solver='saga', max_iter=600)
ngram_pos_ner_dep_const_classifier = LogisticRegression(max_iter=600)
ngram_pos_ner_dep_const_model = PamModel(
    model=ngram_pos_ner_dep_const_classifier,
    feature_extractor=feature_extractor,
    corpus=md_corpus_onto,
    train_data="train",
    test_data="dev",
)

# Fit the model, then save it under the given .pkl filename
ngram_pos_ner_dep_const_model.train()
ngram_pos_ner_dep_const_model.save_model("ngram_pos_ner_dep_const_model.pkl")

In [None]:
# Restore the saved model with constituency features from the trained-models directory
ngram_pos_ner_dep_const_model_file = models_path / "ngram_pos_ner_dep_const_model.pkl"
ngram_pos_ner_dep_const_model = PamModel.load_model(
    filename=ngram_pos_ner_dep_const_model_file
)

In [None]:
# Peek at entries 90-99 of the feature names produced by the constituency encoder.
# NOTE(review): if the encoder yields fewer than 91 names this prints an empty
# array — confirm the window is appropriate for this encoder.
constituency_encoder = ngram_pos_ner_dep_const_model.feature_extractor.constituency_encoder
feature_names = constituency_encoder.get_feature_names_out()
print(feature_names[90:100])

In [None]:
# Evaluate the model with constituency features, requesting the printed report.
# NOTE(review): this cell has no recorded output in the notebook — re-run it to
# obtain the classification report for this model.
ngram_pos_ner_dep_const_model.evaluate(print_report=True)

In [None]:
# Request the feature-importance table for the NEG label.
# NOTE(review): this cell has no recorded output in the notebook.
ngram_pos_ner_dep_const_model.get_feature_importance("NEG")

In [None]:
# Predict the role of "in November 2009" while also supplying the sentence it
# occurs in. The other sections call predict_string with the role string only;
# presumably the constituency features need the surrounding sentence — TODO confirm.
ngram_pos_ner_dep_const_model.predict_string_from_sentence(role_string="in November 2009", sentence_string="Susan married in November 2009.")

## Spacy-Sentence-Bert Model

This feature extraction method extracts a sentence embedding (vector) for each string in the data using the Sentence-BERT (`en_stsb_distilbert_base`) model.

In [None]:
# Feature pipeline: Sentence-BERT features only
sbert_feature_methods = [FeatureExtractor.get_sentence_bert_features]
feature_extractor = FeatureExtractor(
    feature_methods=sbert_feature_methods, corpus=md_corpus_onto
)

# Logistic regression is the classifier wrapped by the PAM model.
# For faster training, use
# LogisticRegression(penalty='l2', C=1.0, class_weight='balanced', solver='saga', max_iter=300)
sbert_classifier = LogisticRegression(max_iter=300)
sbert_model = PamModel(
    model=sbert_classifier,
    feature_extractor=feature_extractor,
    corpus=md_corpus_onto,
    train_data="train",
    test_data="dev",
)

# Fit the model, then save it under the given .pkl filename
sbert_model.train()
sbert_model.save_model("sbert_model.pkl")

In [None]:
# Restore the saved Sentence-BERT model from the trained-models directory
sbert_model_file = models_path / "sbert_model.pkl"
sbert_model = PamModel.load_model(filename=sbert_model_file)

In [36]:
# Evaluate the Sentence-BERT model; print_report=True prints the per-class
# precision/recall/F1 report recorded below.
sbert_model.evaluate(print_report=True)

# weighted average f1-score: 0.87

              precision    recall  f1-score   support

         ADV       0.73      0.68      0.71      1074
         DIS       0.86      0.90      0.88       954
         LOC       0.85      0.84      0.84       766
         MNR       0.76      0.72      0.74       675
         MOD       1.00      1.00      1.00       898
         NEG       0.98      0.99      0.98       549
         TMP       0.90      0.93      0.92      1963

    accuracy                           0.87      6879
   macro avg       0.87      0.87      0.87      6879
weighted avg       0.87      0.87      0.87      6879



In [37]:
# Predict the role of a raw string; the recorded result is a JSON-formatted
# string with the keys "string" and "predicted_role".
sbert_model.predict_string("in November 2009")

# '{"string": "in November 2009", "predicted_role": "TMP"}'

'{"string": "in November 2009", "predicted_role": "TMP"}'