In [7]:
# all functions for feature extraction
from sklearn.linear_model import LogisticRegression 
from sklearn.naive_bayes import GaussianNB 
import csv
from collections import defaultdict
from sklearn.feature_extraction import DictVectorizer
import numpy as np
import pandas as pd

def extract_features_and_labels(trainingfile):
    """
    trainingfile: preprocessed conllu file

    """
    predicate_per_sent=[]
    with open(trainingfile, encoding='utf8') as file:
        file = csv.reader(file, delimiter='\t', quotechar='^')
        predicates=[]
        for row in file:
            # if current row is empty, it marks a sentence boundary
            if row == []:
                predicates.append('_') # add a empty predicate to sentences without predicate
                predicate_per_sent.append(predicates)
                predicates=[]
            
            # if token is a predicate: append to list of predicates of current sentence
            elif row !=[] and not row[0].startswith("#") and not row[0].startswith('"'):
                if row[10] != '_':
                    predicate = row[10]
                    predicates.append(predicate[:-3])
                    
    with open(trainingfile, encoding='utf8') as file:
        file = csv.reader(file, delimiter='\t', quotechar='^')
        features = []
        labels = []
        tokens = []
        prev_token = ''
        prev_pos = ''
        prev_deprel = ''
        
        current_sent = 0
        for row in file:
              
            if row == []:
                prev_token = ''
                prev_pos = ''
                prev_deprel = ''
            elif row != [] and row[0].startswith("#"):
                continue
            elif row != [] and not row[0].startswith("#") and not row[0].startswith('"'):
                token_labels = []
                token_features = {}
                current_token = row[1]
                current_pos = row[3]
                token_preds = []
                if prev_pos == "AUX":
                    passive = 1
                else:
                    passive = 0
                deprel_to_head = row[7] # dependency relation
                if row[6] > row[0]: # if current is before head
                    head_init = 0
                elif row[6] < row[0]:
                    head_init = 1
                else:
                    head_init = 2
                for i, element in enumerate(row):
                    if i > 10:
                        column_num = i -11
                        if element == '_':
                            head_pred = '_'
                            deprel_to_head = '_'
                        else:
                            head_pred = predicate_per_sent[current_sent][column_num]

                        token_preds.append(head_pred)
                        token_labels.append(element)
                
                        token_features = {'token':current_token, 'pos':current_pos, 
                                'prev_token':prev_token, 'prev_pos':prev_pos, 
                                'head_pred':head_pred, 'deprel_to_head':deprel_to_head,
                                'prev_deprel':prev_deprel, 'head_init':head_init, 
                                'passive':passive}

                # reassign prev token after adding to feature list
                prev_token = row[1]
                prev_pos = row[3]
                prev_deprel = deprel_to_head
            
                labels.append(element)
                features.append(token_features)

    return features, labels



In [8]:
training_features, training_labels = extract_features_and_labels("../data/en_ewt-up-train_preprocessed.conllu")

In [3]:
print(training_features[:10])

[{'token': 'Al', 'pos': 'PROPN', 'prev_token': '', 'prev_pos': '', 'head_pred': '_', 'deprel_to_head': '_', 'prev_deprel': '', 'head_init': 1, 'passive': 0}, {'token': '-', 'pos': 'PUNCT', 'prev_token': 'Al', 'prev_pos': 'PROPN', 'head_pred': '_', 'deprel_to_head': '_', 'prev_deprel': '_', 'head_init': 1, 'passive': 0}, {'token': 'Zaman', 'pos': 'PROPN', 'prev_token': '-', 'prev_pos': 'PUNCT', 'head_pred': '_', 'deprel_to_head': '_', 'prev_deprel': '_', 'head_init': 1, 'passive': 0}, {'token': ':', 'pos': 'PUNCT', 'prev_token': 'Zaman', 'prev_pos': 'PROPN', 'head_pred': '_', 'deprel_to_head': '_', 'prev_deprel': '_', 'head_init': 1, 'passive': 0}, {'token': 'American', 'pos': 'ADJ', 'prev_token': ':', 'prev_pos': 'PUNCT', 'head_pred': '_', 'deprel_to_head': '_', 'prev_deprel': '_', 'head_init': 0, 'passive': 0}, {'token': 'forces', 'pos': 'NOUN', 'prev_token': 'American', 'prev_pos': 'ADJ', 'head_pred': 'k', 'deprel_to_head': 'nsubj', 'prev_deprel': '_', 'head_init': 0, 'passive': 0}, 

In [4]:
print(training_labels[:10])

['_', '_', '_', '_', '_', 'ARG0', 'V', 'ARG1', '_', '_']


In [9]:
def extract_features_args(dev_test_data):
       
    predicate_per_sent=[]
    with open(dev_test_data, encoding='utf8') as file:
        file = csv.reader(file, delimiter='\t', quotechar='^')
        predicates=[]
        for row in file:
            # if current row is empty, it marks a sentence boundary
            if row == []:
                predicates.append('_') # add a empty predicate to sentences without predicate
                predicate_per_sent.append(predicates)
                predicates=[]
            
            # if token is a predicate: append to list of predicates of current sentence
            elif row !=[] and not row[0].startswith("#") and not row[0].startswith('"'):
                if row[10] != '_':
                    predicate = row[10]
                    predicates.append(predicate[:-3])
                    
    with open(dev_test_data, encoding='utf8') as file:
        file = csv.reader(file, delimiter='\t', quotechar='^')
        features = []
        labels = []
        tokens = []
        prev_token = ''
        prev_pos = ''
        prev_deprel = ''
        
        current_sent = 0
        for row in file:
              
            if row == []:
                prev_token = ''
                prev_pos = ''
                prev_deprel = ''
            elif row != [] and row[0].startswith("#"):
                continue
            elif row != [] and not row[0].startswith("#") and not row[0].startswith('"'):
                if row[-1] == "ARG":
                    token_labels = []
                    token_features = {}
                    current_token = row[1]
                    current_pos = row[3]
                    token_preds = []
                    if prev_pos == "AUX":
                        passive = 1
                    else:
                        passive = 0
                    deprel_to_head = row[7] # dependency relation
                    if row[6] > row[0]: # if current is before head
                        head_init = 0
                    elif row[6] < row[0]:
                        head_init = 1
                    else:
                        head_init = 2
                    for i, element in enumerate(row):
                        if i > 10:
                            column_num = i -11
                            if element == '_':
                                head_pred = '_'
                                deprel_to_head = '_'
                            else:
                                head_pred = predicate_per_sent[current_sent][column_num]

                            token_preds.append(head_pred)
                            token_labels.append(element)

                            token_features = {'token':current_token, 'pos':current_pos, 
                                    'prev_token':prev_token, 'prev_pos':prev_pos, 
                                    'head_pred':head_pred, 'deprel_to_head':deprel_to_head,
                                    'prev_deprel':prev_deprel, 'head_init':head_init, 
                                    'passive':passive}

                    # reassign prev token after adding to feature list
                    prev_token = row[1]
                    prev_pos = row[3]
                    prev_deprel = deprel_to_head

                    labels.append(element)
                    features.append(token_features)

    return features

In [10]:
dev_test_features, lab = extract_features_and_labels("en_ewt-up-dev_preprocessed_argclass.conllu")
# print(dev_test_features[:5])

In [11]:

def create_classifier(training_features, training_labels, modelname):
    """
    
    
    """
  
    modeltype = LogisticRegression(max_iter = 400)
        
    vec = DictVectorizer()
    features_vectorized = vec.fit_transform(training_features)
    model = modeltype.fit(features_vectorized, training_labels)
    return model, vec

In [36]:
logreg_features, logreg_vec= create_classifier(training_features, training_labels, "logreg")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [37]:
def classify_data(model, inputdata, outputfile, vec = None):
    """
    Use a trained classifier to classify inputdata and write the predictions to a file. Uses extract_features
    
    model: the trained classifier
    type model:
    vec: the vectorizer used to transform data
    type vec:
    inputdata: path to file with input data
    type inputdata: string
    outputfile: path to file where output should be stored
    type outputfile: string
    
    return: None
"""
    features = inputdata
    features = vec.transform(features)
    predictions = model.predict(features)
  
    predlist = []
    for item in predictions:
        predlist.append(item)
    return predlist
    


In [38]:
predlist = classify_data(logreg_features, dev_test_features, "trial_output.tsv", logreg_vec)

In [39]:
tokenlist = []
arglist = []
counter = 0
with open("en_ewt-up-dev_preprocessed.conllu", "r", encoding = "utf8") as inf:
    infile = csv.reader(inf, delimiter='\t', quotechar='*')
    for row in infile:
        if row != [] and not row[0].startswith("#") and not row[0].startswith('"'):
            tokenlist.append(row[1])
            arglist.append(row[-1])


    
    


In [40]:
from sklearn.metrics import classification_report

classification_report(arglist, predlist, average='macro')

  _warn_prf(average, modifier, msg_start, len(result))


(0.4883005772562087, 0.4521671853759478, 0.45015327876051126, None)