In [1]:
import pandas as pd
pd.set_option('display.max_columns', None)
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [2]:
# TODO: change to AR data
# Path is relative to the notebook's working directory.
filepath = './../../../PART1/srl/output.conll'
# TODO: add dev filepath

### Read in data, split train-test

In [3]:
# Change to fit our data for train and dev
# The column names are supplied explicitly through `names`; the file is read as
# tab-separated values.
df = pd.read_csv(
    filepath,
    sep='\t',
    names=[
        'FORM', 'PRED_FORM', 'LEMMA', 'POS', 'DEPREL', 'PRED',
        'FORM_-2', 'FORM_-1', 'FORM_+1', 'FORM_+2',
        'POS_-2', 'POS_-1', 'POS_+1', 'POS_+2',
        'FORM_HEAD', 'HEAD_predicate', 'HEAD_root', 'POS_HEAD',
        'PRED_LEMMA', 'PRED_POS', 'PRED_DEPREL', 'PRED_PRED',
        'LABEL',
    ],
)

In [4]:
# TODO: remove this ad-hoc split (presumably once separate train and dev files are loaded).
# Positional split: the first 804 rows form the training set, the rest the test set.
df_train = df.iloc[:804]
# The test rows are renumbered from 0 so that row labels match row positions.
df_test = df.iloc[804:].reset_index(drop=True)

In [6]:
# Preview the first rows of the training split.
df_train.head()

Unnamed: 0,FORM,PRED_FORM,LEMMA,POS,DEPREL,PRED,FORM_-2,FORM_-1,FORM_+1,FORM_+2,POS_-2,POS_-1,POS_+1,POS_+2,FORM_HEAD,HEAD_predicate,HEAD_root,POS_HEAD,PRED_LEMMA,PRED_POS,PRED_DEPREL,PRED_PRED,LABEL
0,The,temperature,the,DT,NMOD,_,.,.,economy,'s,.,.,NN,POS,economy,0.0,0.0,NN,temperature,NN,SBJ,temperature.01,0
1,economy,temperature,economy,NN,NMOD,_,.,The,'s,temperature,.,DT,POS,NN,temperature,1.0,0.0,NN,temperature,NN,SBJ,temperature.01,1
2,'s,temperature,'s,POS,SUFFIX,_,The,economy,temperature,will,DT,NN,NN,MD,economy,0.0,0.0,NN,temperature,NN,SBJ,temperature.01,0
3,temperature,temperature,temperature,NN,SBJ,temperature.01,economy,'s,will,be,NN,POS,MD,VB,will,0.0,1.0,MD,temperature,NN,SBJ,temperature.01,1
4,will,temperature,will,MD,ROOT,_,'s,temperature,be,taken,POS,NN,VB,VBN,ROOT,0.0,0.0,ROOT,temperature,NN,SBJ,temperature.01,0


In [7]:
# Preview the first rows of the test split.
df_test.head()

Unnamed: 0,FORM,PRED_FORM,LEMMA,POS,DEPREL,PRED,FORM_-2,FORM_-1,FORM_+1,FORM_+2,POS_-2,POS_-1,POS_+1,POS_+2,FORM_HEAD,HEAD_predicate,HEAD_root,POS_HEAD,PRED_LEMMA,PRED_POS,PRED_DEPREL,PRED_PRED,LABEL
0,Economists,see,economist,NNS,SBJ,_,plunge,.,are,divided,NN,.,VBP,VBN,are,0.0,1.0,VBP,see,VB,IM,see.01,0
1,are,see,be,VBP,ROOT,_,.,Economists,divided,as,.,NNS,VBN,IN,ROOT,0.0,0.0,ROOT,see,VB,IM,see.01,0
2,divided,see,divide,VBN,VC,divide.02,Economists,are,as,to,NNS,VBP,IN,TO,are,0.0,1.0,VBP,see,VB,IM,see.01,0
3,as,see,as,IN,ADV,_,are,divided,to,how,VBP,VBN,TO,WRB,divided,1.0,0.0,VBN,see,VB,IM,see.01,0
4,to,see,to,TO,PMOD,_,divided,as,how,much,VBN,IN,WRB,JJ,as,0.0,0.0,IN,see,VB,IM,see.01,0


## Features

In [8]:
def extract_features_and_labels_generalised(df, header='nerc', feature_selection='all'):
    """
    Adapted from Quirine's Machine Learning for NLP assignment.
    Split a feature table into one feature dictionary per row and a list of labels.

    Rows are read by position, so the frame does not need a 0..n-1 index: a slice
    such as ``df[804:]`` can be passed without calling ``reset_index`` first.

    :param df: pd.DataFrame containing features and labels.
    :param str header: name of the column containing the labels. Default is 'nerc'.
    :param str/list feature_selection: 'all' to use every column of df except ``header``,
        or a list of column names to use. The names must match column names in df.

    :returns: two lists of equal length. The first holds one dictionary per row
        (column name -> value), the second the value of ``header`` for each row.
    :raises ValueError: if ``header`` is not a column of df.
    :raises KeyError: if a selected feature name is not a column of df.
    """
    # Every column except the label column is a candidate feature.
    feature_labels = list(df)
    feature_labels.remove(header)

    # The isinstance guard keeps the comparison well-defined when a list (or array)
    # of column names is passed instead of the string 'all'.
    if isinstance(feature_selection, str) and feature_selection == 'all':
        selected = feature_labels
    else:
        selected = list(feature_selection)

    # One dict per row, built positionally (independent of the index labels).
    feature_list = df[selected].to_dict('records')

    # Labels in the same row order as the feature dictionaries.
    label_list = list(df[header])

    return feature_list, label_list

def train_pred_logreg(train_feats, train_labels, test_feats):
    """
    Train a logistic regression classifier on dictionary features and label the test set.

    :param list train_feats: list of feature dictionaries for training.
    :param list train_labels: labels, one per training dictionary.
    :param list test_feats: list of feature dictionaries to predict labels for.

    :returns: the model's predictions for test_feats.
    """
    vectorizer = DictVectorizer()
    classifier = LogisticRegression(multi_class='multinomial', solver='lbfgs')

    # The vectorizer is fitted on the training dictionaries only; the test
    # dictionaries are mapped with that same fitted vectorizer.
    classifier.fit(vectorizer.fit_transform(train_feats), train_labels)

    return classifier.predict(vectorizer.transform(test_feats))

def get_inputs(feats, model):
    '''
    Copied from Quirine's Machine Learning for NLP assignment.
    Looks up the embedding of every token in feats and collects the vectors in one flat list.

    One vector is appended per dictionary value, in iteration order, so a dictionary with
    several entries contributes several consecutive vectors. A token that is not in the
    model gets an all-zeros vector whose length is ``model.vector_size`` when the model
    exposes that attribute, and 300 otherwise.

    :param list feats: list of dictionaries. Dictionaries must contain only tokens.
    :param model: gensim.models.keyedvectors.Word2VecKeyedVectors, or any object that
        supports ``token in model`` and ``model[token]``

    Returns list of vectors
    '''
    # Size the out-of-vocabulary vector from the model instead of assuming 300 dimensions,
    # so that all returned vectors have the same length for any embedding size.
    dim = getattr(model, 'vector_size', 300)

    inputs = []
    for dictionary in feats:
        for token in dictionary.values():
            if token in model:
                inputs.append(model[token])
            else:
                # A fresh list per token, so the zero vectors are never shared objects.
                inputs.append([0] * dim)
    return inputs

In [9]:
# Extract feats/labels
# Every column except 'LABEL' is used as a feature, for both splits.
train_feats, train_labels = extract_features_and_labels_generalised(
    df_train,
    header='LABEL',
    feature_selection='all',
)
test_feats, test_labels = extract_features_and_labels_generalised(
    df_test,
    header='LABEL',
    feature_selection='all',
)

In [10]:
# Train and predict
# Fit on the training features/labels and predict a label for each test row.
predictions = train_pred_logreg(train_feats, train_labels, test_feats)

# Write to df
# Store the predictions in a new column of df_test, alongside the 'LABEL' column.
df_test['maxent_pred'] = predictions

In [11]:
# Preview the test split again, now with the 'maxent_pred' column.
df_test.head()

Unnamed: 0,FORM,PRED_FORM,LEMMA,POS,DEPREL,PRED,FORM_-2,FORM_-1,FORM_+1,FORM_+2,POS_-2,POS_-1,POS_+1,POS_+2,FORM_HEAD,HEAD_predicate,HEAD_root,POS_HEAD,PRED_LEMMA,PRED_POS,PRED_DEPREL,PRED_PRED,LABEL,maxent_pred
0,Economists,see,economist,NNS,SBJ,_,plunge,.,are,divided,NN,.,VBP,VBN,are,0.0,1.0,VBP,see,VB,IM,see.01,0,0
1,are,see,be,VBP,ROOT,_,.,Economists,divided,as,.,NNS,VBN,IN,ROOT,0.0,0.0,ROOT,see,VB,IM,see.01,0,0
2,divided,see,divide,VBN,VC,divide.02,Economists,are,as,to,NNS,VBP,IN,TO,are,0.0,1.0,VBP,see,VB,IM,see.01,0,0
3,as,see,as,IN,ADV,_,are,divided,to,how,VBP,VBN,TO,WRB,divided,1.0,0.0,VBN,see,VB,IM,see.01,0,0
4,to,see,to,TO,PMOD,_,divided,as,how,much,VBN,IN,WRB,JJ,as,0.0,0.0,IN,see,VB,IM,see.01,0,0


## Evaluation

In [12]:
# Compare the predicted labels with the 'LABEL' column of the test split.
print('CLASSIFICATION REPORT')
print(
    classification_report(
        y_true=df_test['LABEL'],
        y_pred=df_test['maxent_pred'],
    )
)

CLASSIFICATION REPORT
              precision    recall  f1-score   support

           0       0.91      1.00      0.95       229
           1       0.50      0.04      0.08        23

    accuracy                           0.91       252
   macro avg       0.71      0.52      0.52       252
weighted avg       0.87      0.91      0.87       252

