In [1]:
import pandas as pd
import numpy as np
import nltk
from sklearn_crfsuite import CRF
from sklearn_crfsuite import metrics as crf_metrics
from sklearn.model_selection import train_test_split
import scipy
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer
import gensim
import json 
from tqdm.notebook import tqdm
import pickle
import glob

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\RAYMOND\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\RAYMOND\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

In [None]:
# !pip install sklearn-crfsuite -U
# !pip install -U 'scikit-learn<0.24'
# !pip install gensim

In [2]:
train_df = pd.concat([pd.read_csv(file, sep='\t') for file in glob.glob('../data/train/*-tokens.tsv')])
train_df.sort_values('document_ID', inplace=True)
train_df.to_csv('../data/train-full.tsv', index=False, sep='\t')

# Prepare the data

In [3]:
train_df = pd.read_csv('../data/train-full.tsv', sep='\t')
validation_df = pd.read_csv('../data/validation-full.tsv', sep='\t')

In [4]:
train_df['doc-sent'] = [str(row.document_ID) + '-' + str(row.sentence_ID) for index, row in train_df.iterrows()]
validation_df['doc-sent'] = [str(row.document_ID) + '-' + str(row.sentence_ID) for index, row in validation_df.iterrows()]

### Ratio transformations

In [5]:
# Make new columns
train_df['total_occurences'] = 0
train_df['class_occurences'] = 0
train_df['attribute_occurences'] = 0
validation_df['total_occurences'] = 0
validation_df['class_occurences'] = 0
validation_df['attribute_occurences'] = 0

In [6]:
with open('../data/genmymodel/genmymodel_uml_extracted_metadata_final.json') as json_file:
    gmm_data = json.load(json_file)

# Store all classes and attributes independent of eachother
all_classes = []
all_attrs = []

# Loop over all metadata and append to proper list
for file, metadata in gmm_data.items():
    if 'classes' in metadata.keys():
        all_classes.append(metadata['classes'])

    if 'attributes' in metadata.keys():
        all_attrs.append(metadata['attributes'])

flatten = lambda t: [item for sublist in t for item in sublist]

all_classes = flatten(all_classes)
all_attrs = flatten(all_attrs)

In [7]:
noungroup = []
noungroup_indices = []

for index, row in tqdm(validation_df.iterrows()):
    if isinstance(row['fine_POS_tag'], str) and row['fine_POS_tag'][:2] == 'NN':
        noungroup.append(row['word'])
        noungroup_indices.append(index)
    else:
        if len(noungroup) == 0:
            continue
        else:
            full_ng = ' '.join(noungroup).lower()
            attr_no = all_attrs.count(full_ng)
            class_no = all_classes.count(full_ng)
            
            for noun_index in noungroup_indices:
                validation_df.loc[noun_index, ['class_occurences', 'attribute_occurences', 'total_occurences']] = [class_no, attr_no, attr_no + class_no]
                
            noungroup = []
            noungroup_indices = []

0it [00:00, ?it/s]

### Prepare IOB format

In [8]:
columns = ['doc-sent', 'word', 'lemma', 'POS_tag', 'fine_POS_tag', 'dependency_relation', 'event', 'supersense_category', 'entity', 'entity_type', 'entity_category', 'total_occurences', 'class_occurences', 'attribute_occurences', 'IOB_tag']
train_df = train_df[columns]
validation_df = validation_df[columns]

In [9]:
agg_func = lambda s: list(map(lambda w: tuple(w), s.loc[:, s.columns != 'doc-sent'].values.tolist()))

In [10]:
train_grouped_df = train_df.groupby('doc-sent').apply(agg_func)
validation_grouped_df = validation_df.groupby('doc-sent').apply(agg_func)

train_sentences = [s for s in train_grouped_df]
validation_sentences = [s for s in validation_grouped_df]

In [11]:
train_grouped_df

doc-sent
0-0      [(Software, Software, PROPN, NNP, compound, O,...
0-1      [(of, of, ADP, IN, prep, O, nan, nan, nan, nan...
0-10     [(., ., PUNCT, ., punct, O, nan, nan, nan, nan...
0-100    [(metadata, metadata, NOUN, NN, nsubjpass, O, ...
0-101    [(., ., PUNCT, ., punct, O, nan, nan, nan, nan...
                               ...                        
9-95     [(to, to, PART, TO, aux, O, nan, nan, nan, nan...
9-96     [(The, the, DET, DT, det, O, nan, nan, nan, na...
9-97     [(The, the, DET, DT, det, O, nan, nan, nan, na...
9-98     [(against, against, ADP, IN, prep, O, nan, nan...
9-99     [(The, the, DET, DT, det, O, nan, nan, nan, na...
Length: 3130, dtype: object

In [12]:
train_df.dropna(subset=['doc-sent'], inplace=True)

In [13]:
# fastText model for embedding generation
vocab = train_df['word'].values.tolist() + validation_df['word'].values.tolist()
model = gensim.models.FastText(vocab, min_count=1)

In [None]:
pickle.dump(model, open('fasttext-model.pkl', 'wb'))

In [None]:
def word2features(sent, i, embedding, ratio):
    word = sent[i][1]
    postag = sent[i][3]
    fine_postag = sent[i][4]
    
    features = {
        label: data
        for label, data in zip(columns[1:-1], sent[i][:-1])
    }
    
    features.update({
        'word.lower()': word.lower(),
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'postag[:2]': postag[:2],
        'postag[:2]': postag[:2],
        'finepostag[:2]': fine_postag[:2],
        'finepostag[:2]': fine_postag[:2],
    })
    if i > 0:
        word1 = sent[i-1][1]
        postag1 = sent[i-1][3]
        finepostag1 = sent[i-1][4]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
            '-1:finepostag': finepostag1,
            '-1:finepostag[:2]': finepostag1[:2],
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        word1 = sent[i+1][1]
        postag1 = sent[i+1][3]
        finepostag1 = sent[i-1][4]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
            '+1:finepostag': finepostag1,
            '+1:finepostag[:2]': finepostag1[:2],
        })
    else:
        features['EOS'] = True
    
    if not ratio:
        for ratio_feature in ['total_occurences', 'class_occurences', 'attribute_occurences']:
            del features[ratio_feature]
        
    if embedding:
        word_embedding = model.wv.get_vector(word)
        
        features.update({
            f'emb_pos_{i}': word_embedding[i]
            for i in range(len(word_embedding))
        })

    return features


def sent2features(sent, embedding = False, ratio = False):
    return [word2features(sent, i, embedding, ratio) for i in range(len(sent))]

def sent2labels(sent):
    return list(map(lambda s: s[-1], sent))

In [None]:
sent2features(validation_sentences[0][:7])

# Default model

In [None]:
X_train = np.array([sent2features(s) for s in train_sentences])
X_test = np.array([sent2features(s) for s in validation_sentences])
y_train = np.array([sent2labels(s) for s in train_sentences])
y_test = np.array([sent2labels(s) for s in validation_sentences])

In [None]:
labels = list(train_df['IOB_tag'].unique())
labels.remove('O')

In [None]:
crf = CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True
)

In [None]:
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

In [None]:
# use the same metric for evaluation
f1_scorer = make_scorer(crf_metrics.flat_f1_score, average='weighted', labels=labels)

# search
rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=50,
                        scoring=f1_scorer)
rs.fit(X_train, y_train)

In [None]:
y_pred = rs.predict(X_test)

In [None]:
labels

In [None]:
print(crf_metrics.flat_classification_report(y_test, y_pred, labels=labels, digits=3))

In [None]:
[[[word['word'], pred] for word, pred in zip(sent, predictions)] for sent, predictions in zip(X_test, y_pred)]

# Default model + fastText

In [None]:
X_train = np.array([sent2features(s, embedding = True) for s in train_sentences])
X_test = np.array([sent2features(s, embedding = True) for s in validation_sentences])
y_train = np.array([sent2labels(s) for s in train_sentences])
y_test = np.array([sent2labels(s) for s in validation_sentences])

crf = CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True
)

params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# use the same metric for evaluation
f1_scorer = make_scorer(crf_metrics.flat_f1_score, average='weighted', labels=labels)

# search
rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=50,
                        scoring=f1_scorer)

In [None]:
rs.fit(X_train, y_train)

In [None]:
y_pred = rs.predict(X_test)
print(crf_metrics.flat_classification_report(y_test, y_pred, labels=labels, digits=3))

# Default model + class/attribute ratio

In [None]:
X_train = np.array([sent2features(s, ratio = True) for s in train_sentences])
X_test = np.array([sent2features(s, ratio = True) for s in validation_sentences])
y_train = np.array([sent2labels(s) for s in train_sentences])
y_test = np.array([sent2labels(s) for s in validation_sentences])

crf = CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True
)

params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# use the same metric for evaluation
f1_scorer = make_scorer(crf_metrics.flat_f1_score, average='weighted', labels=labels)

# search
rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=50,
                        scoring=f1_scorer)

In [None]:
rs.fit(X_train, y_train)

In [None]:
y_pred = rs.predict(X_test)
print(crf_metrics.flat_classification_report(y_test, y_pred, labels=labels, digits=3))

# All features together

In [None]:
X_train = np.array([sent2features(s, ratio = True, embedding = True) for s in train_sentences])
X_test = np.array([sent2features(s, ratio = True, embedding = True) for s in validation_sentences])
y_train = np.array([sent2labels(s) for s in train_sentences])
y_test = np.array([sent2labels(s) for s in validation_sentences])

crf = CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True
)

params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# use the same metric for evaluation
f1_scorer = make_scorer(crf_metrics.flat_f1_score, average='weighted', labels=labels)

# search
rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=50,
                        scoring=f1_scorer)

In [None]:
rs.fit(X_train, y_train)

In [None]:
y_pred = rs.predict(X_test)
print(crf_metrics.flat_classification_report(y_test, y_pred, labels=labels, digits=3))

In [None]:
pickle.dump(rs, open('model-new.pkl', 'wb'))