In [1]:
import operator
import numpy as np
from sklearn.model_selection import KFold
import gensim
from gensim.models.keyedvectors import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
from sklearn.metrics import classification_report



In [2]:
kf = KFold(n_splits=3, shuffle=True)

In [3]:
glove_model = KeyedVectors.load_word2vec_format("D:/wiki.de.vec", binary=False)

In [4]:
glove_model.init_sims(replace=True)

In [5]:
def decapitalize(word):
    if len(word) > 1:
        word = word[0].lower() + word[1:]
    return word

In [6]:
#distance between compound parts: not normalized
def dist_not_norm(a,b):
    a_rev=a[::-1]
    b_rev=b[::-1]
    count = 0
    for i in range(min(len(a), len(b))):
        if a_rev[i] != b_rev[i]:
            break
        else:
            count += 1
    return count

In [7]:
#distance between compound parts: normalized
def dist_norm(a,b):
    return 1-(dist_not_norm(a,b)/((len(a)+len(b))/2))

In [8]:
#no suffixes
def all_dist(vector_1,vector_2):
    cur_dist = 0
    for i in range((len(vector_1))):
        if i == 0 or i == 2:
            ith_dist = dist_norm(vector_1[i], vector_2[i])
            cur_dist += ith_dist * ith_dist
            try:
                ith_dist = glove_model.similarity(decapitalize(vector_1[i]), decapitalize(vector_2[i]))
                cur_dist += (1-ith_dist)*(1-ith_dist)
            except:
                ith_dist = 0
                cur_dist += (1-ith_dist)*(1-ith_dist)
        else:
            if vector_1[i] != vector_2[i]:
                cur_dist += 1
    return cur_dist

In [9]:
file = open("C:/Users/1/Desktop/compounds_final.txt", "r", encoding="utf-8")
lines = file.readlines()

In [10]:
#no suffixes
matrix = []
for line in lines:
    vectors = []
    types = []
    line = line.strip("\n")
    items = line.split("/")
    items_new = items[1:3] + items[4:6] + [items[7]]
    for item in items_new:
        vectors.append(item)
    types.append(items[-1])
    tp = (vectors,types)
    matrix.append(tp)

In [11]:
for item in matrix[:2]:
    print (item)

(['Engel', '<NN>', 'Bild', '<+NN>', 's'], ['N'])
(['Wolke', '<NN>', 'Schatten', '<+NN>', 'n'], ['NN'])


In [12]:
matrixX = []
matrixY = []
for item in matrix:
    matrixX.append(item[0])
    matrixY.append(item[1][0])

In [13]:
X = np.array(matrixX)
y = np.array(matrixY)

In [14]:
kf.get_n_splits(X)

3

In [15]:
all_predicts = []
tests = []

for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    predictions = []
    for vector in X_test:
        distances = []
        types = []
        for x in range(len(X_train)):
            distance = all_dist(X_train[x], vector)
            d_t = (distance, y_train[x])
            distances.append(d_t)
            types.append(y_train[x])
        distances_sort = sorted(distances)
        types_sort = []
        for item in distances_sort[:3]:
            types_sort.append(item[1])
        maxx = max(set(types_sort), key=types_sort.count)
        predictions.append(maxx)
    y_test = y_test.tolist()
    all_predicts.extend(predictions)
    tests.extend(y_test)
    
print(classification_report(tests, all_predicts))

             precision    recall  f1-score   support

         AN       0.48      0.45      0.46      1349
          N       0.42      0.54      0.47      1250
         NN       0.24      0.11      0.15       540

avg / total       0.41      0.43      0.41      3139



In [16]:
#with suffixes
def all_dist(vector_1,vector_2):
    cur_dist = 0
    for i in range((len(vector_1))):
        if i == 0 or i == 3:
            ith_dist = dist_norm(vector_1[i], vector_2[i])
            cur_dist += ith_dist * ith_dist
            try:
                ith_dist = glove_model.similarity(decapitalize(vector_1[i]), decapitalize(vector_2[i]))
                cur_dist += (1-ith_dist)*(1-ith_dist)
            except:
                ith_dist = 0
                cur_dist += (1-ith_dist)*(1-ith_dist)
        else:
            if vector_1[i] != vector_2[i]:
                cur_dist += 1
    return cur_dist

In [17]:
#with suffixes
matrix = []
for line in lines:
    vectors = []
    types = []
    line = line.strip("\n")
    items = line.split("/")
    items_new = items[1:8]
    for item in items_new:
        vectors.append(item)
    types.append(items[-1])
    tp = (vectors,types)
    matrix.append(tp)

In [18]:
for item in matrix[:2]:
    print (item)

(['Engel', '<NN>', 'no', 'Bild', '<+NN>', 'no', 's'], ['N'])
(['Wolke', '<NN>', 'no', 'Schatten', '<+NN>', 'no', 'n'], ['NN'])


In [19]:
matrixX = []
matrixY = []
for item in matrix:
    matrixX.append(item[0])
    matrixY.append(item[1][0])

In [20]:
X = np.array(matrixX)
y = np.array(matrixY)

In [21]:
kf.get_n_splits(X)

3

In [22]:
all_predicts = []
tests = []

for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    predictions = []
    for vector in X_test:
        distances = []
        types = []
        for x in range(len(X_train)):
            distance = all_dist(X_train[x], vector)
            d_t = (distance, y_train[x])
            distances.append(d_t)
            types.append(y_train[x])
        distances_sort = sorted(distances)
        types_sort = []
        for item in distances_sort[:3]:
            types_sort.append(item[1])
        maxx = max(set(types_sort), key=types_sort.count)
        predictions.append(maxx)
    y_test = y_test.tolist()
    all_predicts.extend(predictions)
    tests.extend(y_test)
    
print(classification_report(tests, all_predicts))

             precision    recall  f1-score   support

         AN       0.47      0.44      0.46      1349
          N       0.42      0.55      0.48      1250
         NN       0.26      0.11      0.15       540

avg / total       0.42      0.43      0.41      3139

