In [29]:
from nltk import SnowballStemmer
from sklearn import preprocessing

import numpy as np
import re

class FeatureSet:
    """Container pairing a feature array with three caller-supplied lengths."""

    def __init__(self, data, train_len, validation_len, test_len):
        """Keep the three length values as given and store ``data`` as a NumPy array."""
        self.train_len, self.validation_len, self.test_len = (
            train_len,
            validation_len,
            test_len,
        )
        # np.array copies/converts whatever sequence the caller handed in.
        self.data = np.array(data)


class Data:
    """Holder for one corpus and its ``train`` / ``validation`` / ``test`` parts.

    The constructor only stores ``corpus``; the three split attributes are read
    by the methods below and therefore have to be assigned by the caller first.
    """

    def __init__(self, corpus):
        self.corpus = corpus

    def print_stats(self):
        """Print the corpus identifier followed by each split's own statistics."""
        print("Corpus: ", self.corpus)
        for split_name in ('train', 'validation', 'test'):
            print("\n" + split_name + ":")
            getattr(self, split_name).print_stats()

    def get_set(self, set):
        """Return the split called ``set``, or ``None`` for any other name."""
        for split_name in ('train', 'validation', 'test'):
            if set == split_name:
                return getattr(self, split_name)
        return None


class Dataset:
    """A list of documents plus a per-entity-type counter.

    ``entity_counter`` maps an entity type to an integer count; the reader
    functions in this file increment it once per ``B-`` tag they encounter.
    """

    def __init__(self):
        self.documents = []
        self.entity_counter = {}

    def add_document(self, document):
        """Append ``document`` to this dataset."""
        self.documents.append(document)

    def print_stats(self):
        """Print every entity type together with its count."""
        print("Entity counters:")
        for key, count in self.entity_counter.items():
            print(key, " -> ", count)

    def last_doc(self):
        """Return the most recently added document (IndexError when empty)."""
        return self.documents[-1]

    def merge(self, dataset):
        """Fold another dataset into this one.

        Documents are appended and entity counts are *added*. A plain
        ``dict.update`` would replace this dataset's count for every entity
        type that also occurs in ``dataset``, under-reporting the merged total.
        """
        self.documents.extend(dataset.documents)
        for key, count in dataset.entity_counter.items():
            self.entity_counter[key] = self.entity_counter.get(key, 0) + count


class Document:
    """An ordered collection of sentences."""

    def __init__(self):
        self.sentences = []

    def add_sentence(self, sentence):
        """Append ``sentence`` to the document."""
        self.sentences.append(sentence)

    def remove_last_sentence(self):
        """Drop the most recently added sentence (IndexError when empty)."""
        self.sentences.pop()

    def __repr__(self):
        """Return the sentences rendered as text and separated by single spaces."""
        # str.join only accepts strings, while the readers store sentence
        # objects here; convert each element explicitly so repr() does not
        # raise TypeError. Plain strings pass through str() unchanged.
        return " ".join(str(sentence) for sentence in self.sentences)


class Sentence:
    """An ordered collection of words."""

    def __init__(self):
        self.words = []

    def add_word(self, word):
        """Append ``word`` to the sentence."""
        self.words.append(word)

    def __repr__(self):
        """Return the words rendered as text and separated by single spaces."""
        # str.join only accepts strings, while the readers store word objects
        # here; convert each element explicitly so repr() does not raise
        # TypeError. Plain strings pass through str() unchanged.
        return " ".join(str(word) for word in self.words)


class Word:
    """One token together with its POS tag, entity tag and stem."""

    def __init__(self, token, pos, entity, stem):
        self.token, self.pos, self.entity, self.stem = token, pos, entity, stem

    def __repr__(self):
        # The token itself is returned, so it has to be a str for repr() to work.
        return self.token


def get_data(corpus):
    """Wrap ``corpus`` in a fresh ``Data`` object and return it."""
    data = Data(corpus)
    return data


def read_datasets(path):
    """Pick a reader from the file name's prefix and return the parsed dataset.

    The part of ``path`` after the last ``/`` decides: ``eng``, ``esp`` and
    ``ned`` prefixes go to the matching reader; any other name yields ``None``.
    """
    filename = path.rsplit('/', 1)[-1]
    if filename.startswith('eng'):
        return read_eng_dataset(path)
    if filename.startswith('esp'):
        return read_esp_dataset(path)
    if filename.startswith('ned'):
        return read_ned_dataset(path)
    return None


def _read_conll_dataset(path, encoding, language, pos_column, entity_column, has_docstart):
    """Parse a whitespace-separated, one-token-per-line file into a ``Dataset``.

    :param path: file to read.
    :param encoding: text encoding used to open the file.
    :param language: language name handed to ``SnowballStemmer``.
    :param pos_column: column index of the POS tag, or ``None`` when the file
        has no POS column (the word's ``pos`` is then ``None``).
    :param entity_column: column index of the entity tag.
    :param has_docstart: when true, lines starting with ``-DOCSTART-`` open a
        new document; when false, a single document holds every sentence and
        such lines are treated like any other token line.
    :return: the populated ``Dataset``; ``entity_counter`` is incremented once
        per ``B-<type>`` tag.
    """
    dataset = Dataset()
    stemmer = SnowballStemmer(language)
    document = None
    sentence = None
    if not has_docstart:
        document = Document()
        dataset.add_document(document)

    with open(path, encoding=encoding, mode='r') as f:
        for line in f:
            if has_docstart and line.startswith('-DOCSTART-'):
                document = Document()
                dataset.add_document(document)
                # A new document never continues the previous document's sentence.
                sentence = None
                continue

            columns = line.split()
            if not columns:
                # Blank line: the next token starts a new sentence.
                sentence = None
                continue

            if document is None:
                # Tokens before any -DOCSTART- marker still need a home;
                # without this the first add_sentence would hit None.
                document = Document()
                dataset.add_document(document)
            if sentence is None:
                sentence = Sentence()
                document.add_sentence(sentence)

            token = columns[0]
            pos = None if pos_column is None else columns[pos_column]
            tag = columns[entity_column]
            sentence.add_word(Word(token, pos, tag, stemmer.stem(token)))

            # Split on the first hyphen only, so a hyphenated entity type
            # (e.g. "B-X-Y") keeps its full name instead of breaking unpacking.
            prefix, _, entity = tag.partition('-')
            if prefix == 'B' and entity:
                dataset.entity_counter[entity] = dataset.entity_counter.get(entity, 0) + 1
    return dataset


def read_eng_dataset(path):
    """Read an English file: UTF-8, POS in column 1, entity tag in column 3."""
    return _read_conll_dataset(path, 'utf-8', 'english', 1, 3, True)


def read_esp_dataset(path):
    """Read a Spanish file: Latin-1, no POS column, entity tag in column 1.

    No ``-DOCSTART-`` handling is applied, so all sentences land in one document.
    """
    return _read_conll_dataset(path, 'latin-1', 'spanish', None, 1, False)


def read_ned_dataset(path):
    """Read a Dutch file: Latin-1, POS in column 1, entity tag in column 2."""
    return _read_conll_dataset(path, 'latin-1', 'dutch', 1, 2, True)


def get_starting(entity):
    """Turn an inside tag (``I-<type>``) into the matching begin tag (``B-<type>``).

    Only the leading prefix is rewritten. ``str.replace`` would also rewrite any
    later ``I-`` inside the type name (``I-WIKI-X`` would become ``B-WIKB-X``).
    Tags that do not start with ``I-`` are returned unchanged.
    """
    if entity.startswith('I-'):
        return 'B-' + entity[2:]
    return entity


def normalize_flags(flags):
    """Drop every flag that is exactly one greater than the flag before it.

    The comparison starts from a sentinel of ``-2``, so a leading ``-1`` is
    dropped as well. The surviving flags are returned in their original order.
    """
    kept = []
    previous = -2
    for current in flags:
        follows_previous = current == previous + 1
        previous = current
        if follows_previous:
            continue
        kept.append(current)
    return kept


def next_entity(index, words, entity, f, counter):
    """Write the run of consecutive words tagged ``entity`` starting at ``index``.

    Each matching word is written to ``f`` as ``"<token> <pos> - <entity>"`` on
    its own line and bumps ``counter`` by one.

    :return: ``(index of the last word written, updated counter)``; when no word
        matches, the returned index is ``index - 1``.
    """
    position = index
    while position < len(words):
        word = words[position]
        if word.entity != entity:
            break
        f.write(" ".join([word.token, word.pos, "-", entity]) + "\n")
        position += 1
        counter += 1
    return position - 1, counter


def transfer_eng_to_bio(dataset, output):
    """Rewrite an English dataset so that every entity run begins with a ``B-`` tag.

    The input is loaded through ``read_datasets`` from the path returned by
    ``project_path.get_dataset('eng_old_encoding', dataset)`` and written to
    ``output`` with one ``"<token> <pos> - <entity>"`` line per word, a
    ``-DOCSTART-`` header (plus blank line) per document and a blank line after
    each sentence.

    :param dataset: value forwarded to ``project_path.get_dataset``.
    :param output: path of the file to write.
    :return: list of counters, one per word that already carried a ``B-`` tag in
        the input; each value is the number of output lines written before that
        word's line.
    """
    # `counter` tracks how many lines have been written to `output` so far.
    counter = 0
    flags = []
    # NOTE(review): `project_path` is not defined or imported in the visible
    # source — confirm where it is expected to come from.
    eng = read_datasets(project_path.get_dataset('eng_old_encoding', dataset))
    with open(output, 'w') as f:
        for doc in eng.documents:
            # The header string contains two newlines, hence two lines.
            f.write("-DOCSTART- -X- -X- O\n\n")
            counter += 2
            for sentence in doc.sentences:
                index = 0
                sentence_length = len(sentence.words)
                words = sentence.words
                while index < sentence_length:
                    entity = words[index].entity
                    if entity.startswith('B-'):
                        # Remember where the input already had an explicit begin tag.
                        flags.append(counter)
                    if entity == 'O':
                        f.write(words[index].token + " " + words[index].pos + " - " + entity + "\n")
                    else:
                        if entity.startswith('B'):
                            # Begin tag: keep it, then copy the following I-<type> words unchanged.
                            f.write(words[index].token + " " + words[index].pos + " - " + entity + "\n")
                            index, counter = next_entity(index + 1, words, entity.replace("B-", "I-"), f, counter)
                        else:
                            # Run that starts with an inside tag: re-tag its first word as B-<type>,
                            # then copy the remaining words carrying the same tag.
                            f.write(words[index].token + " " + words[index].pos + " - " + get_starting(entity) + "\n")
                            index, counter = next_entity(index + 1, words, entity, f, counter)
                    # Account for the word written directly above; next_entity already
                    # counted the words it consumed and returned the index of the last one.
                    index += 1
                    counter += 1

                # Blank line that terminates the sentence.
                f.write("\n")
                counter += 1
    return flags


def count_entity_size_dataset(dataset, counter):
    """Record the length (in words) of every entity mention in ``dataset``.

    ``counter`` is updated in place: for each entity type it holds a list with
    one length per mention. A mention opens at a ``B-`` tag, grows by one for
    each following ``I-`` tag and is closed by an ``O`` tag, by the next ``B-``
    tag or by the end of the sentence. ``I-`` tags seen while no mention is open
    are counted but never recorded.
    """
    def flush(label, length):
        # Store the finished mention, if one is open.
        if label is not None:
            counter.setdefault(label, []).append(length)

    for document in dataset.documents:
        for sentence in document.sentences:
            current, length = None, 0
            for word in sentence.words:
                tag = word.entity
                if tag == 'O':
                    flush(current, length)
                    current, length = None, 0
                elif tag.startswith('I-'):
                    length += 1
                elif tag.startswith('B-'):
                    flush(current, length)
                    current = tag.split('-', 2)[-1]
                    length = 1
            flush(current, length)


def count_entity_size(data):
    """Collect entity mention lengths over the train, validation and test splits.

    :return: dict mapping entity type to the list of mention lengths found in
        ``data.train``, ``data.validation`` and ``data.test`` (in that order).
    """
    sizes = {}
    for split in (data.train, data.validation, data.test):
        count_entity_size_dataset(split, sizes)
    return sizes


In [44]:
import argparse
import pickle
import sys
import numpy as np
import time

from sklearn.linear_model import Perceptron, LogisticRegressionCV
from sklearn.metrics import classification_report, precision_score, f1_score


# Token-shape patterns used by the predicate helpers below.
# Note that the three `*` patterns also accept the empty string.
capitalized = r"^[A-Z].*$"         # first character is an ASCII uppercase letter
allcapitalized = r"^[A-Z]*$"       # ASCII uppercase letters only
alldigits = r"^[0-9]*$"            # ASCII digits only
alphanumeric = r"^[A-Za-z0-9]*$"   # ASCII letters and digits only


def get_entity(word):
    """Return ``word.entity``, or ``'unknown'`` when there is no word."""
    return 'unknown' if word is None else word.entity


def get_token(word):
    """Return ``word.token``, or ``'unknown'`` when there is no word."""
    return 'unknown' if word is None else word.token


def get_stem(word):
    """Return ``word.stem``, or ``'unknown'`` when there is no word."""
    return 'unknown' if word is None else word.stem


def is_capitalized(token):
    """Return True when ``token`` matches the module-level ``capitalized`` pattern."""
    found = re.search(capitalized, token)
    return found is not None


def all_capitalized(token):
    """Return True when ``token`` matches the module-level ``allcapitalized`` pattern."""
    found = re.search(allcapitalized, token)
    return found is not None


def alpha_numeric(token):
    """Return True when ``token`` matches the module-level ``alphanumeric`` pattern."""
    found = re.search(alphanumeric, token)
    return found is not None


def all_digits(token):
    """Return True when ``token`` matches the module-level ``alldigits`` pattern."""
    found = re.search(alldigits, token)
    return found is not None


def make_feature_vec(word, prev, prev_prev, next, next_next):
    """Build the 10-element feature list for ``word`` from a two-word window.

    Layout: entity tags of the two preceding words, three shape flags of the
    current token, then a capitalisation flag for each of the five window
    positions. Missing neighbours (``None``) contribute the ``'unknown'``
    placeholder from ``get_entity`` / ``get_token``.
    """
    # NOTE(review): these are the entity tags stored on the neighbouring Word
    # objects, not model predictions — confirm that is intended at test time.
    previous_tags = [get_entity(prev_prev), get_entity(prev)]

    token_shape = [
        alpha_numeric(word.token),
        all_digits(word.token),
        all_capitalized(word.token),
    ]

    # Stem features for the same window were tried here and are currently disabled.
    window_capitalization = [
        is_capitalized(get_token(prev_prev)),
        is_capitalized(get_token(prev)),
        is_capitalized(word.token),
        is_capitalized(get_token(next)),
        is_capitalized(get_token(next_next)),
    ]

    return previous_tags + token_shape + window_capitalization

def fit_transform_column(encoders, input, column):
    """Fit a new ``LabelEncoder`` on one column and return the encoded column.

    The encoder is appended to ``encoders`` (mutated in place), so its position
    in that list follows the order in which columns are processed.
    """
    encoder = preprocessing.LabelEncoder()
    encoders.append(encoder)
    column_values = input[:, column]
    return encoder.fit_transform(column_values)


def transform_column(encoders, input, column):
    """Encode one feature column with the encoder fitted for that column.

    :param encoders: sequence of fitted encoders, indexed by column.
    :param input: either a 2-D feature matrix (rows are samples) or a 1-D
        feature vector describing a single sample.
    :param column: index of the feature to encode.
    :return: array of encoded values — one per row for a matrix, a single
        element for a vector.
    """
    values = np.asarray(input)
    if values.ndim == 1:
        # Single sample: keep the value wrapped in a 1-element array, because
        # encoders expect a 1-D array and callers index the result with [0].
        selected = values[[column]]
    else:
        # Feature matrix: take the column, mirroring fit_transform_column.
        # Indexing with input[column] would select a *row* instead.
        selected = values[:, column]
    return encoders[column].transform(selected)


def transform_vector(encoders, vector):
    """Encode a single feature vector element by element.

    Position ``k`` of the result is the first element of
    ``transform_column(encoders, vector, k)``.
    """
    encoded = np.zeros(vector.shape)
    for position, _ in enumerate(vector):
        encoded[position] = transform_column(encoders, vector, position)[0]
    return encoded


def transform_test_features(features, encoders):
    """Encode every column of ``features`` with the already-fitted ``encoders``.

    :return: a new float array with the same shape as ``features``.
    """
    encoded = np.zeros(features.shape)
    column_count = features.shape[1]
    for column in range(column_count):
        encoded[:, column] = transform_column(encoders, features, column)
    return np.array(encoded)


def transform_train_features(features):
    """Fit one label encoder per column and return the encoded matrix.

    :return: ``(encoded, encoders)`` — a float array shaped like ``features``
        and the list of fitted encoders in column order.
    """
    fitted = []
    encoded = np.zeros(features.shape)
    column_count = features.shape[1]
    for column in range(column_count):
        encoded[:, column] = fit_transform_column(fitted, features, column)
    return np.array(encoded), fitted

def get_prev(words, i, offset):
    """Return the word ``offset`` positions before index ``i``, or ``None`` if out of range."""
    return words[i - offset] if i >= offset else None


def get_next(words, i, offset):
    """Return the word ``offset`` positions after index ``i``, or ``None`` past the end."""
    target = i + offset
    return words[target] if target < len(words) else None


def get_features(data):
    """Build one feature row and one label per word of ``data``.

    Every word of every sentence of every document is turned into a feature
    vector from a window of two neighbours on each side (``None`` beyond the
    sentence boundary); the label is the word's entity tag.

    :return: ``(features, labels)`` as NumPy arrays in reading order.
    """
    rows, labels = [], []
    for doc in data.documents:
        for sentence in doc.sentences:
            words = sentence.words
            for position, word in enumerate(words):
                rows.append(
                    make_feature_vec(
                        word,
                        get_prev(words, position, 1),
                        get_prev(words, position, 2),
                        get_next(words, position, 1),
                        get_next(words, position, 2),
                    )
                )
                labels.append(word.entity)
    return np.array(rows), np.array(labels)



def current_milli_time():
    """Return the current wall-clock time as whole milliseconds since the epoch."""
    seconds = time.time()
    return int(round(seconds * 1000))


def print_ms(message, t1, t2):
    """Print ``message`` followed by the elapsed time ``t2 - t1`` and the unit ``ms``."""
    elapsed = t2 - t1
    print(message, elapsed, 'ms')


def print_help(parser, message):
    """Print the parser's help text and ``message``, then terminate with status 1.

    :param parser: object exposing ``print_help()`` (an ``ArgumentParser``).
    :param message: explanation of what was wrong with the arguments.
    :raises SystemExit: always, with exit code 1.
    """
    parser.print_help()
    print(message)
    # sys.exit is the supported way to stop the program. The bare `exit` name is
    # injected by the `site` module (and replaced by IPython inside notebooks),
    # so it is not guaranteed to exist or to behave like a plain exit.
    sys.exit(1)


def check_argument_set(arg_set, choices, parser):
    """Abort through ``print_help`` on the first value of ``arg_set`` not in ``choices``."""
    for arg in arg_set:
        if arg in choices:
            continue
        print_help(parser, "'" + arg + "' is not in possible choices: " + str(choices))


def get_set(_set, languages):
    """Merge the ``_set`` split of every language in ``languages`` into one ``Dataset``."""
    merged = Dataset()
    for lang in languages:
        # NOTE(review): Data.__init__ only stores the corpus name, so this relies
        # on the split attributes being populated elsewhere — confirm.
        merged.merge(get_data(lang).get_set(_set))
    return merged


def get_serialized_sets(_set, languages):
    """Load the pickled ``_set`` split of each language and merge them into one ``Dataset``.

    Files are read from ``serialization/<_set>.<lang>`` relative to the current
    working directory.
    """
    merged = Dataset()
    for lang in languages:
        pickle_path = 'serialization/' + _set + '.' + lang
        with open(pickle_path, 'rb') as handle:
            # NOTE(review): pickle.load can execute arbitrary code; only use
            # files produced by this project.
            merged.merge(pickle.load(handle))
    return merged


def save_data(train, name1, validation, name2, test, name3):
    """Pickle the three datasets into ``../serialization/`` under the given file names.

    NOTE(review): the loaders in this file read from ``serialization/`` (no
    leading ``../``) — confirm the two are meant to run from different
    working directories.
    """
    for payload, file_name in ((train, name1), (validation, name2), (test, name3)):
        with open('../serialization/' + file_name, 'wb') as handle:
            pickle.dump(payload, handle, protocol=pickle.HIGHEST_PROTOCOL)


def load_data(name1, name2, name3):
    """Unpickle three files from ``serialization/`` and return them as a tuple.

    :return: ``(train, validation, test)`` loaded from ``name1``, ``name2`` and
        ``name3`` respectively.
    """
    loaded = []
    for file_name in (name1, name2, name3):
        with open('serialization/' + file_name, 'rb') as handle:
            # NOTE(review): pickle.load can execute arbitrary code; only use
            # files produced by this project.
            loaded.append(pickle.load(handle))
    return tuple(loaded)


def parse_arguments(args):
    """Parse ``-train``, ``-validation`` and ``-test`` from an argv-style list.

    Each option takes a comma-separated list of languages out of ``eng``,
    ``esp`` and ``ned``. ``args[0]`` is skipped, as with ``sys.argv``.

    :param args: argv-style list whose first element is the program name.
    :return: ``(train_sets, validation_sets, test_sets)`` as lists of strings.
    :raises SystemExit: through ``print_help`` when an option is missing or
        names an unknown language.
    """
    choices = ['eng', 'esp', 'ned']
    parser = argparse.ArgumentParser()
    parser.add_argument('-train')
    parser.add_argument('-validation')
    parser.add_argument('-test')

    parsed_args = parser.parse_args(args[1:])
    if None in [parsed_args.train, parsed_args.test]:
        print_help(parser, 'Must provide both train and test sets.')
    # The validation option is split just like the other two, so a missing
    # value has to be rejected here instead of failing on None.split below.
    if parsed_args.validation is None:
        print_help(parser, 'Must provide a validation set.')

    train_sets = parsed_args.train.split(',')
    validation_sets = parsed_args.validation.split(',')
    test_sets = parsed_args.test.split(',')

    print('checking arguments')
    check_argument_set(train_sets, choices, parser)
    check_argument_set(validation_sets, choices, parser)
    check_argument_set(test_sets, choices, parser)
    return train_sets, validation_sets, test_sets


def get_datasets(train_sets, validation_sets, test_sets):
    """Load the serialized train, validation and test datasets and report the load time.

    :return: ``(train, validation, test)`` datasets, each merged over the
        languages listed for that split.
    """
    print('loading train, validation and test set')
    started = current_milli_time()
    requested = (
        ('train', train_sets),
        ('validation', validation_sets),
        ('test', test_sets),
    )
    # Only the pickled splits are used here; building them from the raw corpora
    # and re-saving them are alternative code paths that are currently disabled.
    loaded = tuple(get_serialized_sets(split, languages) for split, languages in requested)
    print_ms('data loaded in: ', started, current_milli_time())
    return loaded


def get_all_features(train, validation, test):
    """Extract features and labels for all three splits, timing each extraction.

    :return: ``(train_features, train_y, validation_features, validation_y,
        test_features, test_y)``.
    """
    collected = []
    checkpoint = current_milli_time()
    for label, dataset in (('train', train), ('validation', validation), ('test', test)):
        print('\ngetting ' + label + ' features')
        collected.extend(get_features(dataset))
        finished = current_milli_time()
        print_ms(label + ' features: ', checkpoint, finished)
        # The next split is timed from the moment this one finished.
        checkpoint = finished
    return tuple(collected)




def make_args(train, test):
    """Build an argv-style list that uses ``train`` for both training and validation."""
    # NOTE(review): absolute, machine-specific script path; it only fills the
    # argv[0] slot that parse_arguments skips.
    script = '/home/stipan/dev/fer/seminar/src/baseline.py'
    return [script, '-train', train, '-validation', train, '-test', test]


def all_combinations():
    """Run ``main`` for every (train language, test language) pair.

    NOTE(review): ``main`` is not defined in the visible source — confirm it
    exists before calling this.
    """
    languages = ['eng', 'esp', 'ned']
    for train_lang in languages:
        for test_lang in languages:
            main(make_args(train_lang, test_lang))



In [102]:

def transform_all_features(train_features, validation_features, test_features):
    """Label-encode and one-hot encode the three feature matrices together.

    The splits are stacked so that the encoders see every category once, then
    the encoded matrix is cut back into its three parts.

    :return: ``(train, validation, test)`` dense one-hot arrays whose row counts
        equal those of the corresponding inputs.
    """
    train_len, validation_len = len(train_features), len(validation_features)
    all_features = np.concatenate((train_features, validation_features, test_features), axis=0)

    # The encoder list has to be local: the name was previously never defined in
    # this function, so the call only worked when a global `encoders` happened
    # to be left over in the kernel (and that list grew on every call).
    encoders = []
    encoded = np.zeros(all_features.shape)
    for column in range(all_features.shape[1]):
        encoded[:, column] = fit_transform_column(encoders, all_features, column)

    ohe = preprocessing.OneHotEncoder()
    one_hot = ohe.fit_transform(encoded).toarray()

    validation_end = train_len + validation_len
    return one_hot[:train_len, :], one_hot[train_len:validation_end, :], \
           one_hot[validation_end:, :]


In [62]:
# Drive the pipeline as if it had been started from the command line:
# English data for training, validation and testing.
# NOTE(review): the first element is an absolute, machine-specific path; it only
# fills the argv[0] slot that parse_arguments skips.
args = ['/home/stipan/dev/fer/seminar/src/baseline.py', '-train', 'eng', '-validation', 'eng', '-test', 'eng']
train_sets, validation_sets, test_sets = parse_arguments(args)
# Loads the pickled datasets from the serialization/ directory.
train, validation, test = get_datasets(train_sets, validation_sets, test_sets)
# Raw (not yet encoded) feature rows and entity labels for each split.
train_features, train_y, validation_features, validation_y, test_features, test_y = get_all_features(train, validation, test)


checking arguments
loading train, validation and test set
data loaded in:  1514 ms

getting train features
train features:  3780 ms

getting validation features
validation features:  809 ms

getting test features
test features:  753 ms


In [57]:
# Encode the raw features of all three splits and report sizes before and after.
print('\ntransforming features')
print(len(train_features))
print(len(validation_features))
print(len(test_features))
print('###################')
t5 = current_milli_time()
# NOTE(review): this rebinds the *_features names to their encoded versions, so
# re-running the cell feeds already-encoded data back into the transform.
# Re-run the feature-extraction cell first.
train_features, validation_features, test_features = transform_all_features(train_features, validation_features, test_features)
print_ms('Features transform: ', t5, current_milli_time())
print(train_features.shape)
print(validation_features.shape)
print(test_features.shape)


transforming features
203621
51362
46435
###################
Features transform:  1625 ms
(203621, 36)
(51362, 36)
(46435, 36)


In [None]:



print('\ntraining model')

# Only the Perceptron sweep is run here; a LogisticRegressionCV variant was
# tried earlier and is not part of this cell.
best_estimator = None
max_f1_micro = None
max_f1_macro = None
# Sweep the L2 penalty strength and keep the model with the best micro-averaged
# F1 on the validation split in `best_estimator`.
for alpha in [10**i for i in range(-10, -8)]:
    # Restart the timer for every candidate so the reported time covers this fit
    # only, not the fits and evaluations of the earlier candidates.
    t6 = current_milli_time()
    p = Perceptron(n_jobs=-1, alpha=alpha, penalty='l2', shuffle=True)
    p.fit(train_features, train_y)

    print_ms('\ntraining done: ', t6, current_milli_time())
    predicted_y = p.predict(validation_features)
    temp_f1 = f1_score(validation_y, predicted_y, average='micro')
    if max_f1_micro is None or max_f1_micro < temp_f1:
        max_f1_micro = temp_f1
        max_f1_macro = f1_score(validation_y, predicted_y, average='macro')
        best_estimator = p
    print(p.get_params())
    print(classification_report(validation_y, predicted_y))
    # These two lines report *precision*; model selection above uses F1.
    print("micro: ", precision_score(validation_y, predicted_y, average='micro'))
    print("macro: ", precision_score(validation_y, predicted_y, average='macro'))
    print("#"*100)


In [None]:
# Refit the selected model on train + validation data, then evaluate on the test split.
# `a` holds the stacked feature rows and `b` the matching labels.
a = np.append(train_features, validation_features, axis=0)
b = np.append(train_y, validation_y)
# NOTE(review): fit() retrains `best_estimator` in place, so after this cell it
# is no longer the model that was scored on the validation split.
best_estimator.fit(a, b)
predicted_y = best_estimator.predict(test_features)
print(classification_report(test_y, predicted_y))
# The two summary lines below report precision (micro / macro averaged).
print("micro: ", precision_score(test_y, predicted_y, average='micro'))
print("macro: ", precision_score(test_y, predicted_y, average='macro'))
print("#" * 100)

In [3]:
# Scratch experiment: grid search over LinearSVC's C with an F-beta (beta=2) scorer.
from sklearn.metrics import fbeta_score, make_scorer
ftwo_scorer = make_scorer(fbeta_score, beta=2)
print(ftwo_scorer)
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC
from sklearn import datasets
# NOTE(review): the variable is called `iris`, but the breast-cancer dataset is loaded.
iris = datasets.load_breast_cancer()
X = iris.data[:, :2]  # we only take the first two features.
Y = iris.target
print(Y)
# NOTE(review): the saved output of this cell shows param_grid={'C': [1, 10]},
# which does not match the grid below — the output looks stale.
grid = GridSearchCV(LinearSVC(), param_grid={'C': [0.00001,0.0001,0.001,0.01, 0.1,1, 10]}, scoring=ftwo_scorer)
grid.fit(X,Y)

make_scorer(fbeta_score, beta=2)
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 0 1 0 1 1 1 1 1 0 0 1 0 0 1 1 1 1 0 1 0 0 1 1 1 1 0 1 0 0
 1 0 1 0 0 1 1 1 0 0 1 0 0 0 1 1 1 0 1 1 0 0 1 1 1 0 0 1 1 1 1 0 1 1 0 1 1
 1 1 1 1 1 1 0 0 0 1 0 0 1 1 1 0 0 1 0 1 0 0 1 0 0 1 1 0 1 1 0 1 1 1 1 0 1
 1 1 1 1 1 1 1 1 0 1 1 1 1 0 0 1 0 1 1 0 0 1 1 0 0 1 1 1 1 0 1 1 0 0 0 1 0
 1 0 1 1 1 0 1 1 0 0 1 0 0 0 0 1 0 0 0 1 0 1 0 1 1 0 1 0 0 0 0 1 1 0 0 1 1
 1 0 1 1 1 1 1 0 0 1 1 0 1 1 0 0 1 0 1 1 1 1 0 1 1 1 1 1 0 1 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 1 1 1 1 1 1 0 1 0 1 1 0 1 1 0 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1
 1 0 1 1 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 0 1 0 1 1 1 1 0 0 0 1 1
 1 1 0 1 0 1 0 1 1 1 0 1 1 1 1 1 1 1 0 0 0 1 1 1 1 1 1 1 1 1 1 1 0 0 1 0 0
 0 1 0 0 1 1 1 1 1 0 1 1 1 1 1 0 1 1 1 0 1 1 0 0 1 1 1 1 1 1 0 1 1 1 1 1 1
 1 0 1 1 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 0 1 0 1 1 1 1 1 0 1 1
 0 1 0 1 1 0 1 0 1 1 1 1 1 1 1 1 0 0 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 

GridSearchCV(cv=None, error_score='raise',
       estimator=LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
       fit_params={}, iid=True, n_jobs=1, param_grid={'C': [1, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=make_scorer(fbeta_score, beta=2), verbose=0)

In [4]:
# Show the estimator that the grid search in the previous cell selected.
grid.best_estimator_

LinearSVC(C=1, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)