# Import modules

In [80]:
import pandas as pd
import numpy as np
#import nltk
#nltk.download('averaged_perceptron_tagger')
#from nltk.stem import WordNetLemmatizer
#nltk.download('wordnet')
from sklearn.feature_extraction import DictVectorizer
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
import seaborn as sn
import csv

from sklearn import metrics

# Code for our script below:

In [81]:
trainfile = 'c:/users/desir/Desktop/text_mining/applied TM/SEM-2012-SharedTask-CD-SCO-training-simple.v2.features.conll'
testfile = 'c:/users/desir/Desktop/text_mining/applied TM/SEM-2012-SharedTask-CD-SCO-dev-simple.v2.features.conll'

In [82]:
def create_vectorizer_and_classifier(features, labels):
    '''
    Function that takes feature-value pairs and gold labels as input and trains a logistic regression classifier
    
    :param features: feature-value pairs
    :param labels: gold labels
    :type features: a list of dictionaries
    :type labels: a list of strings
    
    :return lr_classifier: a trained LogisticRegression classifier
    :return vec: a DictVectorizer to which the feature values are fitted. 
    '''
    
    vec = DictVectorizer()
    #fit creates a mapping between observed feature values and dimensions in a one-hot vector, transform represents the current values as a vector 
    tokens_vectorized = vec.fit_transform(features)
    svm_classifier = LinearSVC()
    svm_classifier.fit(tokens_vectorized, labels)
    
    return svm_classifier, vec

def print_confusion_matrix(predictions, goldlabels):
    '''
    Function that prints out a confusion matrix
    
    :param predictions: predicted labels
    :param goldlabels: gold standard labels
    :type predictions, goldlabels: list of strings
    '''
    
    
    
    #based on example from https://datatofish.com/confusion-matrix-python/ 
    data = {'Gold':    goldlabels[1:], 'Predicted': predictions[1:]    }
    df = pd.DataFrame(data, columns=['Gold','Predicted'])

    confusion_matrix = pd.crosstab(df['Gold'], df['Predicted'], rownames=['Gold'], colnames=['Predicted'])
    print (confusion_matrix)


def print_precision_recall_fscore(predictions, goldlabels):
    '''
    Function that prints out precision, recall and f-score
    
    :param predictions: predicted output by classifier
    :param goldlabels: original gold labels
    :type predictions, goldlabels: list of strings
    '''
    
    precision = metrics.precision_score(y_true=goldlabels,
                        y_pred=predictions,
                        average='macro')

    recall = metrics.recall_score(y_true=goldlabels,
                     y_pred=predictions,
                     average='macro')


    fscore = metrics.f1_score(y_true=goldlabels,
                 y_pred=predictions,
                 average='macro')

    print('P:', precision, 'R:', recall, 'F1:', fscore)
    
#vectorizer and lr_classifier are the vectorizer and classifiers created in the previous cell.
#it is important that the same vectorizer is used for both training and testing: they should use the same mapping from values to dimensions
# predictions, goldlabels = get_predicted_and_gold_labels_token_only(testfile, vectorizer, lr_classifier)
# print_confusion_matrix(predictions, goldlabels)

In [83]:
# the functions with multiple features and analysis

#defines the column in which each feature is located (note: you can also define headers and use csv.DictReader)
#feature_to_index = {'TOKEN': 0, 'POS': 1, 'LEMMA': 2, 'PUNCTUATION': 3, 'STARTSWITH_CAPITAL_LETTER': 4, 'IS_STOPWORD': 5}


def extract_features_and_gold_labels(conllfile, selected_features):
    '''Function that extracts features and gold label from preprocessed conll (here: tokens only).
    
    :param conllfile: path to the (preprocessed) conll file
    :type conllfile: string
    
    
    :return features: a list of dictionaries, with key-value pair providing the value for the feature `token' for individual instances
    :return labels: a list of gold labels of individual instances
    '''
    feature_to_index = {'Token': 3 , 'Pre-token': 5, 'Next-token': 6, 'Lemma':7 , 'Pre-lemma':8 , 'Next-lemma':9, 'POS':10, 'Pre-POS':11 , 'Next-POS':12 , 'POS_classified':13 , 'Punctuation_python': 14, 'MatchesNegExp': 15, 'HasNegAffix':16, 'Negated event':17, 'NegAffix':18}
    features = []
    labels = []
    conllinput = open(conllfile, 'r')
    #delimiter indicates we are working with a tab separated value (default is comma)
    #quotechar has as default value '"', which is used to indicate the borders of a cell containing longer pieces of text
    #in this file, we have only one token as text, but this token can be '"', which then messes up the format. We set quotechar to a character that does not occur in our file
    csvreader = csv.reader(conllinput, delimiter='\t',quotechar='|')
    next(csvreader, None)
    for row in csvreader:
        #I preprocessed the file so that all rows with instances should contain 6 values, the others are empty lines indicating the beginning of a sentence
        if len(row) > 0:
            #structuring feature value pairs as key-value pairs in a dictionary
            #the first column in the conll file represents tokens
            feature_value = {}
            for feature_name in selected_features:
                row_index = feature_to_index.get(feature_name)
                feature_value[feature_name] = row[row_index]
            features.append(feature_value)
            #The last column provides the gold label (= the correct answer). 
            labels.append(row[4])
    return features, labels

def get_predicted_and_gold_labels(testfile, vectorizer, classifier, selected_features):
    '''
    Function that extracts features and runs classifier on a test file returning predicted and gold labels
    
    :param testfile: path to the (preprocessed) test file
    :param vectorizer: vectorizer in which the mapping between feature values and dimensions is stored
    :param classifier: the trained classifier
    :type testfile: string
    :type vectorizer: DictVectorizer
    :type classifier: LogisticRegression()
    
    
    
    :return predictions: list of output labels provided by the classifier on the test file
    :return goldlabels: list of gold labels as included in the test file
    '''
    
    #we use the same function as above (guarantees features have the same name and form)
    features, goldlabels = extract_features_and_gold_labels(testfile, selected_features)
    #we need to use the same fitting as before, so now we only transform the current features according to this mapping (using only transform)
    test_features_vectorized = vectorizer.transform(features)
    predictions = classifier.predict(test_features_vectorized)
    
    return predictions, goldlabels

#define which from the available features will be used (names must match key names of dictionary feature_to_index)
all_features = ['Token', 'Pre-token', 'Next-token', 'Lemma', 'Pre-lemma', 'Next-lemma', 'POS', 'Pre-POS', 'Next-POS', 'POS_classified', 'Punctuation_python', 'MatchesNegExp', 'HasNegAffix', 'Negated event', 'NegAffix']

sparse_feature_reps, labels = extract_features_and_gold_labels(trainfile, all_features)
#we can use the same function as before for creating the classifier and vectorizer
svm_classifier, vectorizer = create_vectorizer_and_classifier(sparse_feature_reps, labels)
#when applying our model to new data, we need to use the same features
predictions, goldlabels = get_predicted_and_gold_labels(testfile, vectorizer, svm_classifier, all_features)
print_confusion_matrix(predictions, goldlabels)
print_precision_recall_fscore(predictions, goldlabels)

Predicted  B-NEG  I-NEG      O
Gold                          
B-NEG        171      0      5
I-NEG          0      2      1
O              7      0  13380
P: 0.9867419870974384 R: 0.8792449064894537 F1: 0.921872055790301


In [84]:
report = classification_report(goldlabels,predictions,digits = 7, zero_division=0)

In [85]:
print(report)

              precision    recall  f1-score   support

       B-NEG  0.9606742 0.9715909 0.9661017       176
       I-NEG  1.0000000 0.6666667 0.8000000         3
           O  0.9995518 0.9994771 0.9995145     13388

    accuracy                      0.9990418     13567
   macro avg  0.9867420 0.8792449 0.9218721     13567
weighted avg  0.9990476 0.9990418 0.9990369     13567



In [88]:
# example of system with just one additional feature
#define which from the available features will be used (names must match key names of dictionary feature_to_index)
selected_features = ['Token', 'Pre-token', 'Pre-lemma', 'Pre-POS', 'NegAffix']

feature_values, labels = extract_features_and_gold_labels(trainfile, selected_features)
#we can use the same function as before for creating the classifier and vectorizer
svm_classifier, vectorizer = create_vectorizer_and_classifier(feature_values, labels)
#when applying our model to new data, we need to use the same features
predictions, goldlabels = get_predicted_and_gold_labels(testfile, vectorizer, svm_classifier, selected_features)
print_confusion_matrix(predictions, goldlabels)
print_precision_recall_fscore(predictions, goldlabels)
report = classification_report(goldlabels,predictions,digits = 7, zero_division=0)
print(report)

Predicted  B-NEG  I-NEG      O
Gold                          
B-NEG        173      0      3
I-NEG          0      1      2
O             13      0  13374
P: 0.9765779449346642 R: 0.7717722866550174 F1: 0.8183762200874246
              precision    recall  f1-score   support

       B-NEG  0.9301075 0.9829545 0.9558011       176
       I-NEG  1.0000000 0.3333333 0.5000000         3
           O  0.9996263 0.9990290 0.9993276     13388

    accuracy                      0.9986733     13567
   macro avg  0.9765779 0.7717723 0.8183762     13567
weighted avg  0.9987245 0.9986733 0.9986525     13567

